adjusting the script with new paths

2026-05-19 10:13:29 +02:00
parent 98df7c70f1
commit bb9fcf20ae
2 changed files with 71 additions and 455 deletions
@@ -1,384 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # ============================================================
 # Organize Barcelona EDSS project safely
 # - Creates a timestamped backup first
 # - Creates a cleaner folder structure
 # - Moves files conservatively
 # - Does NOT delete anything
 # ============================================================
 PROJECT_ROOT="$(pwd)"
 TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
 BACKUP_PARENT="${PROJECT_ROOT}/../Barcelona_backups"
 BACKUP_DIR="${BACKUP_PARENT}/Barcelona_backup_${TIMESTAMP}"
 echo "Project root: ${PROJECT_ROOT}"
 echo "Backup dir:   ${BACKUP_DIR}"
 echo
 # ------------------------------------------------------------
 # Safety checks
 # ------------------------------------------------------------
 # Sanity check 1: README.md is used as a marker for the project root.
 # Its absence is only a warning; the user may explicitly choose to continue.
 if [ ! -f "${PROJECT_ROOT}/README.md" ]; then
    echo "WARNING: README.md not found. Are you sure you are in the project root?"
    echo "Current directory: ${PROJECT_ROOT}"
    read -r -p "Continue anyway? [y/N] " answer
    case "$answer" in
        # Only an explicit yes continues; anything else (including Enter) aborts.
        y|Y|yes|YES) ;;
        *) echo "Aborted."; exit 1 ;;
    esac
 fi
 # Sanity check 2: when the project is a git repository, refuse to run if
 # there are unstaged (git diff) or staged (git diff --cached) modifications.
 # NOTE(review): git diff does not report untracked files, so a tree that only
 # contains untracked files passes this gate - confirm that this is intended.
 if [ -d "${PROJECT_ROOT}/.git" ]; then
    if ! git diff --quiet || ! git diff --cached --quiet; then
        echo "ERROR: Git working tree is not clean."
        echo "Please commit or stash changes before organizing."
        exit 1
    fi
 fi
 echo "This script will:"
 echo "1. Create a full backup."
 echo "2. Create organized folders."
 echo "3. Move files into data/, prompts/, scripts/, results/, archive/."
 echo "4. Keep your original files in the backup."
 echo
 read -r -p "Proceed? [y/N] " answer
 case "$answer" in
    y|Y|yes|YES) ;;
    *) echo "Aborted."; exit 1 ;;
 esac
 # ------------------------------------------------------------
 # Backup
 # ------------------------------------------------------------
 # Copy the whole project into a timestamped directory before touching anything.
 # BACKUP_PARENT is defined relative to "${PROJECT_ROOT}/..", i.e. next to the
 # project rather than inside it, so the backup is not part of the tree being
 # copied. The enarcelona/, env/, .venv/ and __pycache__/ directories are
 # excluded from the copy.
 # The script runs under "set -e", so a failing rsync stops it before any file
 # is moved.
 mkdir -p "${BACKUP_PARENT}"
 echo
 echo "Creating backup..."
 rsync -a \
    --exclude "enarcelona/" \
    --exclude "env/" \
    --exclude ".venv/" \
    --exclude "__pycache__/" \
    "${PROJECT_ROOT}/" "${BACKUP_DIR}/"
 echo "Backup created at:"
 echo "${BACKUP_DIR}"
 # ------------------------------------------------------------
 # Create target structure
 # ------------------------------------------------------------
 echo
 echo "Creating new directory structure..."
 mkdir -p \
    data/raw \
    data/processed \
    data/ground_truth \
    data/external \
    prompts \
    scripts \
    results/benchmark_runs \
    results/final_results/model_outputs \
    results/figures \
    results/tables \
    results/logs \
    manuscript/figures \
    manuscript/tables \
    archive/old_scripts \
    archive/old_results \
    archive/tmp \
    archive/old_data \
    archive/old_project_files
 # ------------------------------------------------------------
 # Helper move functions
 # ------------------------------------------------------------
 # Move one file or directory without ever overwriting an existing destination.
 #
 # Arguments:
 #   $1 - source path (file or directory); a missing source is silently ignored
 #   $2 - full destination path (the final name, not a directory to move into)
 # Outputs:
 #   "MOVE: <src> -> <dest>" or "SKIP: destination exists: <dest>" on stdout
 # Returns:
 #   0 when there was nothing to do, the move was skipped, or it succeeded;
 #   otherwise the status of the failing mkdir/mv.
 move_if_exists() {
    # Function-local so calls never clobber same-named variables elsewhere.
    local src="$1"
    local dest="$2"
    if [ -e "$src" ]; then
        # "--" stops option parsing so names starting with "-" are handled.
        mkdir -p -- "$(dirname -- "$dest")"
        if [ -e "$dest" ]; then
            echo "SKIP: destination exists: $dest"
        else
            echo "MOVE: $src -> $dest"
            mv -- "$src" "$dest"
        fi
    fi
 }
 # Move every path matching a single glob pattern into a directory, never
 # overwriting an existing destination.
 #
 # Arguments:
 #   $1 - one glob pattern (may contain spaces; it is not split into words)
 #   $2 - destination directory; created if missing
 # Outputs:
 #   one "MOVE: ..." or "SKIP: ..." line per matched path on stdout
 # Returns:
 #   0 when nothing matched or all moves succeeded/skipped; otherwise the
 #   status of the failing mkdir/mv.
 move_glob_if_exists() {
    local pattern="$1"
    local dest_dir="$2"
    local -a files=()
    local f base dest
    local restore_nullglob=0
    # Empty IFS disables word splitting of the unquoted pattern below while
    # still allowing pathname expansion; restored automatically on return.
    local IFS=

    mkdir -p -- "$dest_dir"

    # Enable nullglob only for the expansion and leave the caller's setting
    # exactly as it was.
    if ! shopt -q nullglob; then
        shopt -s nullglob
        restore_nullglob=1
    fi
    # shellcheck disable=SC2206 # intentional glob expansion
    files=( $pattern )
    if [ "$restore_nullglob" -eq 1 ]; then
        shopt -u nullglob
    fi

    # Nothing matched. Returning here also avoids expanding an empty array,
    # which is an "unbound variable" error under set -u on bash < 4.4.
    if [ "${#files[@]}" -eq 0 ]; then
        return 0
    fi

    for f in "${files[@]}"; do
        # A pattern without glob characters survives nullglob as a literal
        # word; skip it when no such path exists.
        if [ ! -e "$f" ] && [ ! -L "$f" ]; then
            continue
        fi
        base="$(basename -- "$f")"
        dest="${dest_dir}/${base}"
        if [ -e "$dest" ]; then
            echo "SKIP: destination exists: $dest"
        else
            echo "MOVE: $f -> $dest"
            mv -- "$f" "$dest"
        fi
    done
 }
 # ------------------------------------------------------------
 # Move prompts / attached instruction files
 # ------------------------------------------------------------
 echo
 echo "Moving prompt and instruction files..."
 move_if_exists "attach/Komplett.txt" "prompts/Komplett.txt"
 move_if_exists "attach/just_edss_schema.gbnf" "prompts/just_edss_schema.gbnf"
 move_if_exists "attach/just_edss_text.txt" "prompts/just_edss_text.txt"
 # Move leftover attach folder if empty or archive it
 if [ -d "attach" ]; then
    if [ -z "$(ls -A attach)" ]; then
        rmdir attach
    else
        move_if_exists "attach" "archive/old_project_files/attach"
    fi
 fi
 # ------------------------------------------------------------
 # Move important data files
 # ------------------------------------------------------------
 echo
 echo "Moving data files..."
 move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv" \
    "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv"
 move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" \
    "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
 move_if_exists "Data/Join_edssandsub.tsv" \
    "data/ground_truth/Join_edssandsub.tsv"
 move_if_exists "Data/GT_Numbers.csv" \
    "data/ground_truth/GT_Numbers.csv"
 move_if_exists "Data/Annika1.csv" \
    "data/ground_truth/Annika1.csv"
 move_if_exists "Data/comparison.tsv" \
    "data/ground_truth/comparison.tsv"
 move_if_exists "Data/edss_distribution_summary.csv" \
    "data/processed/edss_distribution_summary.csv"
 move_if_exists "Data/empirical_confidence_table.csv" \
    "data/processed/empirical_confidence_table.csv"
 move_if_exists "Data/functional_system_colors.json" \
    "data/processed/functional_system_colors.json"
 move_if_exists "Data/Test.csv" \
    "archive/tmp/Test.csv"
 move_if_exists "Data/Hernan" \
    "data/external/Hernan"
 move_if_exists "Data/iteration" \
    "archive/old_data/iteration"
 # Old generated JSON/results from Data folder
 move_glob_if_exists "Data/*results*.json" "archive/old_results"
 move_glob_if_exists "Data/join_*.tsv" "archive/old_results"
 # Move remaining Data folder if anything left
 if [ -d "Data" ]; then
    if [ -z "$(ls -A Data)" ]; then
        rmdir Data
    else
        move_if_exists "Data" "archive/old_data/Data_remaining"
    fi
 fi
 # ------------------------------------------------------------
 # Move benchmark results
 # ------------------------------------------------------------
 echo
 echo "Moving benchmark results..."
 # Distribute the old results_edss_benchmark/ tree over the new results/ layout,
 # then remove it if empty or archive whatever is left.
 if [ -d "results_edss_benchmark" ]; then
    move_glob_if_exists "results_edss_benchmark/run_*" "results/benchmark_runs"
    # results/final_results/model_outputs is already created by the mkdir -p of
    # the target structure, so moving the endresults directory onto that path
    # would always be skipped as "destination exists". Move its entries into
    # the existing directory instead.
    if [ -d "results_edss_benchmark/endresults" ]; then
        move_glob_if_exists "results_edss_benchmark/endresults/*" \
            "results/final_results/model_outputs"
        # Hidden files and name collisions stay behind; they are archived
        # together with the remainder below.
        if [ -z "$(ls -A results_edss_benchmark/endresults)" ]; then
            rmdir results_edss_benchmark/endresults
        fi
    fi
    move_if_exists "results_edss_benchmark/confusion_matrices" \
        "results/figures/confusion_matrices"
    if [ -z "$(ls -A results_edss_benchmark 2>/dev/null || true)" ]; then
        rmdir results_edss_benchmark
    else
        move_if_exists "results_edss_benchmark" \
            "archive/old_results/results_edss_benchmark_remaining"
    fi
 fi
 # ------------------------------------------------------------
 # Move old/general results
 # ------------------------------------------------------------
 echo
 echo "Moving existing results files..."
 if [ -d "results" ]; then
    # Figures
    move_glob_if_exists "results/*.png" "results/figures"
    move_glob_if_exists "results/*.PNG" "results/figures"
    move_glob_if_exists "results/*.jpg" "results/figures"
    move_glob_if_exists "results/*.jpeg" "results/figures"
    move_glob_if_exists "results/*.svg" "results/figures"
    # Tables
    move_glob_if_exists "results/*.csv" "results/tables"
    move_glob_if_exists "results/*.tsv" "results/tables"
    move_glob_if_exists "results/*.xlsx" "results/tables"
    # Subfolders that look like old results
    move_if_exists "results/Jan_visual" "archive/old_results/Jan_visual"
    move_if_exists "results/Lab_meeting" "archive/old_results/Lab_meeting"
    move_if_exists "results/just_edss" "archive/old_results/just_edss"
 fi
 # Root-level result tables
 move_if_exists "edss_distribution_summary.csv" \
    "results/tables/edss_distribution_summary.csv"
 # Logs
 move_if_exists "edss_benchmark_terminal.log" \
    "results/logs/edss_benchmark_terminal.log"
 # ------------------------------------------------------------
 # Move scripts
 # ------------------------------------------------------------
 echo
 echo "Moving scripts..."
 move_if_exists "audit.py" "scripts/audit_outputs.py"
 move_if_exists "certainty.py" "scripts/analyze_certainty.py"
 move_if_exists "certainty_show.py" "scripts/certainty_show.py"
 move_if_exists "figure1.py" "scripts/figure1.py"
 move_if_exists "show_plots.py" "scripts/show_plots.py"
 move_if_exists "show_plots.py.orig" "archive/old_scripts/show_plots.py.orig"
 # Apps / old entry points
 move_if_exists "app.py" "archive/old_scripts/app.py"
 move_if_exists "total_app.py" "archive/old_scripts/total_app.py"
 # Existing project visuals folder
 move_if_exists "project/visuals" "results/figures/project_visuals"
 if [ -d "project" ]; then
    if [ -z "$(ls -A project)" ]; then
        rmdir project
    else
        move_if_exists "project" "archive/old_project_files/project"
    fi
 fi
 # ------------------------------------------------------------
 # Environment folder
 # ------------------------------------------------------------
 echo
 echo "Handling virtual environment..."
 if [ -d "enarcelona" ]; then
    echo "Leaving virtual environment in place: enarcelona/"
    echo "It should remain ignored by .gitignore."
 fi
 # ------------------------------------------------------------
 # Create README notes
 # ------------------------------------------------------------
 echo
 echo "Writing organization notes..."
 cat > "PROJECT_STRUCTURE.md" <<'EOF'
 # Project Structure
 This project was reorganized into:
 - `data/`
  - `raw/`: original raw data, if retained locally
  - `processed/`: cleaned or derived input data
  - `ground_truth/`: manually annotated reference data
  - `external/`: externally provided data
 - `prompts/`
  - EDSS instructions and prompt/schema assets
 - `scripts/`
  - runnable analysis and plotting scripts
 - `results/`
  - `benchmark_runs/`: full model benchmark runs
  - `final_results/`: final selected model outputs
  - `figures/`: generated figures
  - `tables/`: generated tables
  - `logs/`: terminal logs
 - `manuscript/`
  - final figures and tables for paper/thesis writing
 - `archive/`
  - old scripts, old results, temporary files, and unclear legacy files
 Important:
 The reorganization was performed after creating a full timestamped backup.
 EOF
 # ------------------------------------------------------------
 # Final checks
 # ------------------------------------------------------------
 echo
 echo "Organization complete."
 echo
 echo "Backup is here:"
 echo "${BACKUP_DIR}"
 echo
 echo "New top-level structure:"
 find . -maxdepth 2 -type d | sort
 echo
 if [ -d ".git" ]; then
    echo "Git status:"
    git status --short
 fi
 echo
 echo "Next recommended commands:"
 echo "  git status"
 echo "  git add ."
 echo "  git commit -m \"Reorganize project structure\""
@@ -1258,7 +1258,7 @@
 # %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark
-#
+
 #import time
 #import json
 #import os
@@ -1279,9 +1279,9 @@
 #    print("Install with: pip install psutil")
 #
 #
-## =========================
+# =========================
-## CONFIGURATION
+# CONFIGURATION
-## =========================
+# =========================
 #
 #load_dotenv()
 #
@@ -1289,21 +1289,21 @@
 #OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
 #
 #MODEL_CONFIGS = [
-##    {
+#    {
-##        "model_name": "qwen3.6-35b-a3b",
+#        "model_name": "qwen3.6-35b-a3b",
-##        "use_response_format": False,
+#        "use_response_format": False,
-##        "temperature": 0.0,
+#        "temperature": 0.0,
-##        "max_tokens": 4096,
+#        "max_tokens": 4096,
-##
+#
-##        # If your backend is vLLM / Qwen chat-template compatible,
+#        # If your backend is vLLM / Qwen chat-template compatible,
-##        # this may reduce long hidden reasoning and JSON truncation.
+#        # this may reduce long hidden reasoning and JSON truncation.
-##        # If your server errors because of extra_body, set this to None.
+#        # If your server errors because of extra_body, set this to None.
-##        "extra_body": {
+#        "extra_body": {
-##            "chat_template_kwargs": {
+#            "chat_template_kwargs": {
-##                "enable_thinking": False
+#                "enable_thinking": False
-##            }
+#            }
-##        },
+#        },
-##    },
+#    },
 #    {
 #        "model_name": "gemma-4-31B-it",
 #        "use_response_format": False,
@@ -1328,10 +1328,10 @@
 #NUM_ITERATIONS = 10
 #STOP_ON_FIRST_ERROR = False
 #
-## For testing, set to e.g. 2.
+# For testing, set to e.g. 2.
-## For full run, set to None.
+# For full run, set to None.
 #MAX_ROWS = 2
 # MAX_ROWS = 2
 ## MAX_ROWS = 2
 #
 #MAX_TOKENS = 4096
 #TEMPERATURE = 0.0
@@ -1340,14 +1340,14 @@
 #
 #SAVE_EVERY_N_ROWS = 1
 #
-## Retries for invalid JSON / truncated JSON
+# Retries for invalid JSON / truncated JSON
 #MAX_JSON_RETRIES = 2
 #RETRY_SLEEP_SEC = 2
 #
 #
-## =========================
+# =========================
-## CLIENT
+# CLIENT
-## =========================
+# =========================
 #
 #client = OpenAI(
 #    api_key=OPENAI_API_KEY,
@@ -1355,9 +1355,9 @@
 #)
 #
 #
-## =========================
+# =========================
-## HELPERS
+# HELPERS
-## =========================
+# =========================
 #
 #def safe_dir_name(name: str) -> str:
 #    name = str(name).strip()
@@ -1433,9 +1433,9 @@
 #        return max(self.samples_mb)
 #
 #
-## =========================
+# =========================
-## JSON EXTRACTION
+# JSON EXTRACTION
-## =========================
+# =========================
 #
 #def extract_json_from_text(text):
 #    if text is None:
@@ -1587,17 +1587,17 @@
 #    return None
 #
 #
-## =========================
+# =========================
-## READ INSTRUCTIONS
+# READ INSTRUCTIONS
-## =========================
+# =========================
 #
 #with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
 #    EDSS_INSTRUCTIONS = f.read().strip()
 #
 #
-## =========================
+# =========================
-## PROMPT
+# PROMPT
-## =========================
+# =========================
 #
 #def build_prompt(patient_text):
 #    return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
@@ -1676,9 +1676,9 @@
 #'''
 #
 #
-## =========================
+# =========================
-## VALIDATION / NORMALIZATION
+# VALIDATION / NORMALIZATION
-## =========================
+# =========================
 #
 #def normalize_model_output(parsed):
 #    if not isinstance(parsed, dict):
@@ -1755,9 +1755,9 @@
 #    return parsed
 #
 #
-## =========================
+# =========================
-## API CALL
+# API CALL
-## =========================
+# =========================
 #
 #def make_chat_completion(model_config, prompt):
 #    model_name = model_config["model_name"]
@@ -1794,9 +1794,9 @@
 #    return client.chat.completions.create(**kwargs)
 #
 #
-## =========================
+# =========================
-## INFERENCE FUNCTION WITH RETRIES
+# INFERENCE FUNCTION WITH RETRIES
-## =========================
+# =========================
 #
 #def run_inference(patient_text, model_config):
 #    model_name = model_config["model_name"]
@@ -1912,9 +1912,9 @@
 #    }
 #
 #
-## =========================
+# =========================
-## BUILD PATIENT TEXT
+# BUILD PATIENT TEXT
-## =========================
+# =========================
 #
 #def build_patient_text(row):
 #    return (
@@ -1925,9 +1925,9 @@
 #    )
 #
 #
-## =========================
+# =========================
-## FLATTEN RESULTS FOR CSV
+# FLATTEN RESULTS FOR CSV
-## =========================
+# =========================
 #
 #def flatten_result(record):
 #    flat = {
@@ -2012,9 +2012,9 @@
 #    return pd.DataFrame([summary])
 #
 #
-## =========================
+# =========================
-## INCREMENTAL SAVE HELPERS
+# INCREMENTAL SAVE HELPERS
-## =========================
+# =========================
 #
 #def append_jsonl(path, record):
 #    with open(path, "a", encoding="utf-8") as f:
@@ -2030,9 +2030,9 @@
 #    df_one.to_csv(path, mode="a", header=not file_exists, index=False)
 #
 #
-## =========================
+# =========================
-## MAIN LOOP
+# MAIN LOOP
-## =========================
+# =========================
 #
 #if __name__ == "__main__":
 #
@@ -2277,26 +2277,26 @@ MODEL_CONFIGS = [
        "max_tokens": 4096,
        "extra_body": None,
    },
-    # {
+    {
-    #     "model_name": "GPT-OSS-120B",
+        "model_name": "GPT-OSS-120B",
-    #     "use_response_format": True,
+        "use_response_format": True,
-    #     "temperature": 0.0,
+        "temperature": 0.0,
-    #     "max_tokens": 4096,
+        "max_tokens": 4096,
-    #     "extra_body": None,
+        "extra_body": None,
-    # },
+    },
 ]
-INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
+INPUT_CSV ="/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
-EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
+EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt"
-RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark"
+RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs"
-NUM_ITERATIONS = 10
+NUM_ITERATIONS = 2
 STOP_ON_FIRST_ERROR = False
 # For testing, set to e.g. 2.
 # For full run, set to None.
-MAX_ROWS = None
+MAX_ROWS = 2
 # MAX_ROWS = 2
 MAX_TOKENS = 4096