adjusting the script with new paths

This commit is contained in:
2026-05-19 10:13:29 +02:00
parent 98df7c70f1
commit bb9fcf20ae
2 changed files with 71 additions and 455 deletions
-384
View File
@@ -1,384 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail

# ============================================================
# Barcelona EDSS project organizer
#   * takes a timestamped backup before anything is moved
#   * lays out a cleaner folder structure
#   * relocates files conservatively (existing targets are skipped)
#   * does not delete any files
# ============================================================

PROJECT_ROOT="$(pwd)"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
BACKUP_PARENT="${PROJECT_ROOT}/../Barcelona_backups"
BACKUP_DIR="${BACKUP_PARENT}/Barcelona_backup_${TIMESTAMP}"

printf '%s\n' \
  "Project root: ${PROJECT_ROOT}" \
  "Backup dir: ${BACKUP_DIR}" \
  ""

# Ask a yes/no question ($1 is the prompt text) and stop the whole run
# unless the reply is one of y, Y, yes, YES.
confirm_or_abort() {
  read -r -p "$1" answer
  case "$answer" in
    y|Y|yes|YES)
      return 0
      ;;
  esac
  echo "Aborted."
  exit 1
}

# ------------------------------------------------------------
# Safety checks
# ------------------------------------------------------------

# A missing README.md is taken as a hint that we are not in the project root.
if [[ ! -f "${PROJECT_ROOT}/README.md" ]]; then
  printf '%s\n' \
    "WARNING: README.md not found. Are you sure you are in the project root?" \
    "Current directory: ${PROJECT_ROOT}"
  confirm_or_abort "Continue anyway? [y/N] "
fi

# Inside a git checkout, refuse to run with unstaged or staged changes.
if [[ -d "${PROJECT_ROOT}/.git" ]]; then
  if ! { git diff --quiet && git diff --cached --quiet; }; then
    printf '%s\n' \
      "ERROR: Git working tree is not clean." \
      "Please commit or stash changes before organizing."
    exit 1
  fi
fi

printf '%s\n' \
  "This script will:" \
  "1. Create a full backup." \
  "2. Create organized folders." \
  "3. Move files into data/, prompts/, scripts/, results/, archive/." \
  "4. Keep your original files in the backup." \
  ""
confirm_or_abort "Proceed? [y/N] "
# ------------------------------------------------------------
# Backup
# ------------------------------------------------------------
mkdir -p "${BACKUP_PARENT}"

printf '%s\n' "" "Creating backup..."

# Folders that are left out of the backup copy.
backup_excludes=("enarcelona/" "env/" ".venv/" "__pycache__/")
rsync_args=(-a)
for excluded in "${backup_excludes[@]}"; do
  rsync_args+=(--exclude "$excluded")
done
rsync "${rsync_args[@]}" "${PROJECT_ROOT}/" "${BACKUP_DIR}/"

printf '%s\n' "Backup created at:" "${BACKUP_DIR}"

# ------------------------------------------------------------
# Target structure
# ------------------------------------------------------------
printf '%s\n' "" "Creating new directory structure..."

# Every folder of the new layout; mkdir -p leaves existing ones untouched.
layout_dirs=(
  data/raw
  data/processed
  data/ground_truth
  data/external
  prompts
  scripts
  results/benchmark_runs
  results/final_results/model_outputs
  results/figures
  results/tables
  results/logs
  manuscript/figures
  manuscript/tables
  archive/old_scripts
  archive/old_results
  archive/tmp
  archive/old_data
  archive/old_project_files
)
mkdir -p "${layout_dirs[@]}"
# ------------------------------------------------------------
# Helper move functions
# ------------------------------------------------------------
#######################################
# Move one file or directory to a new path without ever overwriting.
# Arguments:
#   $1 - source path
#   $2 - destination path (the final path, not the parent folder)
# Outputs:
#   "MOVE: <src> -> <dest>" or "SKIP: destination exists: <dest>" on stdout;
#   nothing when the source does not exist.
# Returns:
#   0 when the source is absent, the move was skipped, or the move succeeded;
#   the failing command's status otherwise.
#######################################
move_if_exists() {
  # Locals keep the caller's (and sibling helpers') variables untouched.
  local src="$1"
  local dest="$2"

  # -e follows symlinks, so a dangling symlink needs the extra -L test or it
  # would be treated as "missing" and silently left behind.
  if [ ! -e "$src" ] && [ ! -L "$src" ]; then
    return 0
  fi

  # "--" guards against paths that start with a dash.
  mkdir -p -- "$(dirname -- "$dest")" || return

  # A dangling symlink at the destination also counts as occupied: mv would
  # replace it.
  if [ -e "$dest" ] || [ -L "$dest" ]; then
    echo "SKIP: destination exists: $dest"
  else
    echo "MOVE: $src -> $dest"
    mv -- "$src" "$dest"
  fi
}
#######################################
# Move every path matching a glob pattern into a folder, never overwriting.
# Arguments:
#   $1 - glob pattern (may contain spaces in its literal parts)
#   $2 - destination folder (created if missing)
# Outputs:
#   One "MOVE: ..." or "SKIP: destination exists: ..." line per match on
#   stdout; nothing when the pattern matches no path.
# Returns:
#   0 on success (including "no matches"); the failing command's status
#   otherwise.
#######################################
move_glob_if_exists() {
  local pattern="$1"
  local dest_dir="$2"
  local -a matches=()
  local match dest
  local had_nullglob=0

  mkdir -p -- "$dest_dir" || return

  # Remember the caller's nullglob setting so it can be restored afterwards.
  if shopt -q nullglob; then
    had_nullglob=1
  fi
  shopt -s nullglob

  # An empty IFS switches off word splitting, so the unquoted expansion below
  # performs pathname expansion only and patterns with spaces stay intact.
  # IFS is local and reverts when the function returns.
  local IFS=
  # shellcheck disable=SC2206 -- globbing of $pattern is the whole point here
  matches=( $pattern )

  if (( ! had_nullglob )); then
    shopt -u nullglob
  fi

  # Expanding an empty array trips "set -u" on bash older than 4.4; leave early.
  if (( ${#matches[@]} == 0 )); then
    return 0
  fi

  for match in "${matches[@]}"; do
    dest="${dest_dir}/$(basename -- "$match")"
    if [ -e "$dest" ] || [ -L "$dest" ]; then
      echo "SKIP: destination exists: $dest"
    else
      echo "MOVE: $match -> $dest"
      mv -- "$match" "$dest"
    fi
  done
}
# ------------------------------------------------------------
# Prompts / attached instruction files
# ------------------------------------------------------------
printf '%s\n' "" "Moving prompt and instruction files..."

# Known prompt assets go from attach/ to prompts/ under the same name.
for prompt_file in Komplett.txt just_edss_schema.gbnf just_edss_text.txt; do
  move_if_exists "attach/${prompt_file}" "prompts/${prompt_file}"
done

# Leftover attach/ folder: archive it when it still has content,
# remove it when it is empty.
if [[ -d attach ]]; then
  if [[ -n "$(ls -A attach)" ]]; then
    move_if_exists "attach" "archive/old_project_files/attach"
  else
    rmdir attach
  fi
fi
# ------------------------------------------------------------
# Data files
# ------------------------------------------------------------
printf '%s\n' "" "Moving data files..."

# One "<entry under Data/>:<target folder>" pair per move, processed in order.
# Every entry keeps its own name inside the target folder.
data_moves=(
  "MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv:data/processed"
  "MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv:data/processed"
  "Join_edssandsub.tsv:data/ground_truth"
  "GT_Numbers.csv:data/ground_truth"
  "Annika1.csv:data/ground_truth"
  "comparison.tsv:data/ground_truth"
  "edss_distribution_summary.csv:data/processed"
  "empirical_confidence_table.csv:data/processed"
  "functional_system_colors.json:data/processed"
  "Test.csv:archive/tmp"
  "Hernan:data/external"
  "iteration:archive/old_data"
)
for data_move in "${data_moves[@]}"; do
  data_name="${data_move%%:*}"
  data_target="${data_move#*:}"
  move_if_exists "Data/${data_name}" "${data_target}/${data_name}"
done

# Previously generated JSON/TSV outputs that were stored in Data/.
for old_output_glob in "Data/*results*.json" "Data/join_*.tsv"; do
  move_glob_if_exists "$old_output_glob" "archive/old_results"
done

# Leftover Data/ folder: archive it when it still has content,
# remove it when it is empty.
if [[ -d Data ]]; then
  if [[ -n "$(ls -A Data)" ]]; then
    move_if_exists "Data" "archive/old_data/Data_remaining"
  else
    rmdir Data
  fi
fi
# ------------------------------------------------------------
# Move benchmark results
# ------------------------------------------------------------
echo
echo "Moving benchmark results..."
if [ -d "results_edss_benchmark" ]; then
  # Individual benchmark runs keep their run_* names.
  move_glob_if_exists "results_edss_benchmark/run_*" "results/benchmark_runs"

  # results/final_results/model_outputs is already created by the directory
  # setup step, so moving the endresults folder onto that path would always be
  # skipped as "destination exists". Move its entries into the folder instead.
  if [ -d "results_edss_benchmark/endresults" ]; then
    move_glob_if_exists "results_edss_benchmark/endresults/*" \
      "results/final_results/model_outputs"
    # Drop the source folder only once it is completely empty; skipped or
    # hidden entries stay put and are archived with the remainder below.
    if [ -z "$(ls -A results_edss_benchmark/endresults)" ]; then
      rmdir results_edss_benchmark/endresults
    fi
  fi

  move_if_exists "results_edss_benchmark/confusion_matrices" \
    "results/figures/confusion_matrices"

  # Leftover results_edss_benchmark/ folder: remove it when empty,
  # archive it otherwise.
  if [ -z "$(ls -A results_edss_benchmark 2>/dev/null || true)" ]; then
    rmdir results_edss_benchmark
  else
    move_if_exists "results_edss_benchmark" \
      "archive/old_results/results_edss_benchmark_remaining"
  fi
fi
# ------------------------------------------------------------
# Old / general results
# ------------------------------------------------------------
printf '%s\n' "" "Moving existing results files..."

if [[ -d results ]]; then
  # Image files directly under results/ are figures.
  for figure_ext in png PNG jpg jpeg svg; do
    move_glob_if_exists "results/*.${figure_ext}" "results/figures"
  done

  # Tabular files directly under results/ are tables.
  for table_ext in csv tsv xlsx; do
    move_glob_if_exists "results/*.${table_ext}" "results/tables"
  done

  # Named subfolders holding older results go to the archive.
  for old_results_dir in Jan_visual Lab_meeting just_edss; do
    move_if_exists "results/${old_results_dir}" "archive/old_results/${old_results_dir}"
  done
fi

# Result table and terminal log that sit in the project root.
move_if_exists "edss_distribution_summary.csv" \
  "results/tables/edss_distribution_summary.csv"
move_if_exists "edss_benchmark_terminal.log" \
  "results/logs/edss_benchmark_terminal.log"
# ------------------------------------------------------------
# Scripts
# ------------------------------------------------------------
printf '%s\n' "" "Moving scripts..."

# "<current path>:<new path>" pairs, processed in order. Some scripts are
# renamed on the way; old entry points and backups go to the archive, and the
# existing project/visuals folder becomes a figures subfolder.
script_moves=(
  "audit.py:scripts/audit_outputs.py"
  "certainty.py:scripts/analyze_certainty.py"
  "certainty_show.py:scripts/certainty_show.py"
  "figure1.py:scripts/figure1.py"
  "show_plots.py:scripts/show_plots.py"
  "show_plots.py.orig:archive/old_scripts/show_plots.py.orig"
  "app.py:archive/old_scripts/app.py"
  "total_app.py:archive/old_scripts/total_app.py"
  "project/visuals:results/figures/project_visuals"
)
for script_move in "${script_moves[@]}"; do
  move_if_exists "${script_move%%:*}" "${script_move#*:}"
done

# Leftover project/ folder: archive it when it still has content,
# remove it when it is empty.
if [[ -d project ]]; then
  if [[ -n "$(ls -A project)" ]]; then
    move_if_exists "project" "archive/old_project_files/project"
  else
    rmdir project
  fi
fi
# ------------------------------------------------------------
# Environment folder
# ------------------------------------------------------------
printf '%s\n' "" "Handling virtual environment..."

# The environment folder is never moved; only a notice is printed.
if [[ -d enarcelona ]]; then
  printf '%s\n' \
    "Leaving virtual environment in place: enarcelona/" \
    "It should remain ignored by .gitignore."
fi

# ------------------------------------------------------------
# README notes
# ------------------------------------------------------------
printf '%s\n' "" "Writing organization notes..."

# Quoted delimiter: the text is written literally, nothing is expanded.
# An existing PROJECT_STRUCTURE.md is overwritten.
cat > "PROJECT_STRUCTURE.md" <<'EOF'
# Project Structure
This project was reorganized into:
- `data/`
- `raw/`: original raw data, if retained locally
- `processed/`: cleaned or derived input data
- `ground_truth/`: manually annotated reference data
- `external/`: externally provided data
- `prompts/`
- EDSS instructions and prompt/schema assets
- `scripts/`
- runnable analysis and plotting scripts
- `results/`
- `benchmark_runs/`: full model benchmark runs
- `final_results/`: final selected model outputs
- `figures/`: generated figures
- `tables/`: generated tables
- `logs/`: terminal logs
- `manuscript/`
- final figures and tables for paper/thesis writing
- `archive/`
- old scripts, old results, temporary files, and unclear legacy files
Important:
The reorganization was performed after creating a full timestamped backup.
EOF

# ------------------------------------------------------------
# Final summary
# ------------------------------------------------------------
printf '%s\n' \
  "" \
  "Organization complete." \
  "" \
  "Backup is here:" \
  "${BACKUP_DIR}" \
  "" \
  "New top-level structure:"

# Folders down to two levels below the current directory, sorted.
find . -maxdepth 2 -type d | sort
printf '\n'

if [[ -d .git ]]; then
  printf '%s\n' "Git status:"
  git status --short
fi

printf '%s\n' \
  "" \
  "Next recommended commands:" \
  " git status" \
  " git add ." \
  ' git commit -m "Reorganize project structure"'
+71 -71
View File
@@ -1258,7 +1258,7 @@
# %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark
#
#import time
#import json
#import os
@@ -1279,9 +1279,9 @@
# print("Install with: pip install psutil")
#
#
## =========================
## CONFIGURATION
## =========================
# =========================
# CONFIGURATION
# =========================
#
#load_dotenv()
#
@@ -1289,21 +1289,21 @@
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#
#MODEL_CONFIGS = [
## {
## "model_name": "qwen3.6-35b-a3b",
## "use_response_format": False,
## "temperature": 0.0,
## "max_tokens": 4096,
##
## # If your backend is vLLM / Qwen chat-template compatible,
## # this may reduce long hidden reasoning and JSON truncation.
## # If your server errors because of extra_body, set this to None.
## "extra_body": {
## "chat_template_kwargs": {
## "enable_thinking": False
## }
## },
## },
# {
# "model_name": "qwen3.6-35b-a3b",
# "use_response_format": False,
# "temperature": 0.0,
# "max_tokens": 4096,
#
# # If your backend is vLLM / Qwen chat-template compatible,
# # this may reduce long hidden reasoning and JSON truncation.
# # If your server errors because of extra_body, set this to None.
# "extra_body": {
# "chat_template_kwargs": {
# "enable_thinking": False
# }
# },
# },
# {
# "model_name": "gemma-4-31B-it",
# "use_response_format": False,
@@ -1328,10 +1328,10 @@
#NUM_ITERATIONS = 10
#STOP_ON_FIRST_ERROR = False
#
## For testing, set to e.g. 2.
## For full run, set to None.
# For testing, set to e.g. 2.
# For full run, set to None.
#MAX_ROWS = 2
## MAX_ROWS = 2
# MAX_ROWS = 2
#
#MAX_TOKENS = 4096
#TEMPERATURE = 0.0
@@ -1340,14 +1340,14 @@
#
#SAVE_EVERY_N_ROWS = 1
#
## Retries for invalid JSON / truncated JSON
# Retries for invalid JSON / truncated JSON
#MAX_JSON_RETRIES = 2
#RETRY_SLEEP_SEC = 2
#
#
## =========================
## CLIENT
## =========================
# =========================
# CLIENT
# =========================
#
#client = OpenAI(
# api_key=OPENAI_API_KEY,
@@ -1355,9 +1355,9 @@
#)
#
#
## =========================
## HELPERS
## =========================
# =========================
# HELPERS
# =========================
#
#def safe_dir_name(name: str) -> str:
# name = str(name).strip()
@@ -1433,9 +1433,9 @@
# return max(self.samples_mb)
#
#
## =========================
## JSON EXTRACTION
## =========================
# =========================
# JSON EXTRACTION
# =========================
#
#def extract_json_from_text(text):
# if text is None:
@@ -1587,17 +1587,17 @@
# return None
#
#
## =========================
## READ INSTRUCTIONS
## =========================
# =========================
# READ INSTRUCTIONS
# =========================
#
#with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
#
## =========================
## PROMPT
## =========================
# =========================
# PROMPT
# =========================
#
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
@@ -1676,9 +1676,9 @@
#'''
#
#
## =========================
## VALIDATION / NORMALIZATION
## =========================
# =========================
# VALIDATION / NORMALIZATION
# =========================
#
#def normalize_model_output(parsed):
# if not isinstance(parsed, dict):
@@ -1755,9 +1755,9 @@
# return parsed
#
#
## =========================
## API CALL
## =========================
# =========================
# API CALL
# =========================
#
#def make_chat_completion(model_config, prompt):
# model_name = model_config["model_name"]
@@ -1794,9 +1794,9 @@
# return client.chat.completions.create(**kwargs)
#
#
## =========================
## INFERENCE FUNCTION WITH RETRIES
## =========================
# =========================
# INFERENCE FUNCTION WITH RETRIES
# =========================
#
#def run_inference(patient_text, model_config):
# model_name = model_config["model_name"]
@@ -1912,9 +1912,9 @@
# }
#
#
## =========================
## BUILD PATIENT TEXT
## =========================
# =========================
# BUILD PATIENT TEXT
# =========================
#
#def build_patient_text(row):
# return (
@@ -1925,9 +1925,9 @@
# )
#
#
## =========================
## FLATTEN RESULTS FOR CSV
## =========================
# =========================
# FLATTEN RESULTS FOR CSV
# =========================
#
#def flatten_result(record):
# flat = {
@@ -2012,9 +2012,9 @@
# return pd.DataFrame([summary])
#
#
## =========================
## INCREMENTAL SAVE HELPERS
## =========================
# =========================
# INCREMENTAL SAVE HELPERS
# =========================
#
#def append_jsonl(path, record):
# with open(path, "a", encoding="utf-8") as f:
@@ -2030,9 +2030,9 @@
# df_one.to_csv(path, mode="a", header=not file_exists, index=False)
#
#
## =========================
## MAIN LOOP
## =========================
# =========================
# MAIN LOOP
# =========================
#
#if __name__ == "__main__":
#
@@ -2277,26 +2277,26 @@ MODEL_CONFIGS = [
"max_tokens": 4096,
"extra_body": None,
},
# {
# "model_name": "GPT-OSS-120B",
# "use_response_format": True,
# "temperature": 0.0,
# "max_tokens": 4096,
# "extra_body": None,
# },
{
"model_name": "GPT-OSS-120B",
"use_response_format": True,
"temperature": 0.0,
"max_tokens": 4096,
"extra_body": None,
},
]
INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
INPUT_CSV ="/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt"
RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark"
RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs"
NUM_ITERATIONS = 10
NUM_ITERATIONS = 2
STOP_ON_FIRST_ERROR = False
# For testing, set to e.g. 2.
# For full run, set to None.
MAX_ROWS = None
MAX_ROWS = 2
# MAX_ROWS = 2
MAX_TOKENS = 4096