Adjust the script to use the new paths

2026-05-19 10:13:29 +02:00
parent 98df7c70f1
commit bb9fcf20ae
2 changed files with 71 additions and 455 deletions
@@ -1,384 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-# ============================================================
-# Organize Barcelona EDSS project safely
-# - Creates a timestamped backup first
-# - Creates a cleaner folder structure
-# - Moves files conservatively
-# - Does NOT delete anything
-# ============================================================
-
-PROJECT_ROOT="$(pwd)"
-TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
-BACKUP_PARENT="${PROJECT_ROOT}/../Barcelona_backups"
-BACKUP_DIR="${BACKUP_PARENT}/Barcelona_backup_${TIMESTAMP}"
-
-echo "Project root: ${PROJECT_ROOT}"
-echo "Backup dir:   ${BACKUP_DIR}"
-echo
-
-# ------------------------------------------------------------
-# Safety checks
-# ------------------------------------------------------------
-
-if [ ! -f "${PROJECT_ROOT}/README.md" ]; then
-    echo "WARNING: README.md not found. Are you sure you are in the project root?"
-    echo "Current directory: ${PROJECT_ROOT}"
-    read -r -p "Continue anyway? [y/N] " answer
-    case "$answer" in
-        y|Y|yes|YES) ;;
-        *) echo "Aborted."; exit 1 ;;
-    esac
-fi
-
-if [ -d "${PROJECT_ROOT}/.git" ]; then
-    if ! git diff --quiet || ! git diff --cached --quiet; then
-        echo "ERROR: Git working tree is not clean."
-        echo "Please commit or stash changes before organizing."
-        exit 1
-    fi
-fi
-
-echo "This script will:"
-echo "1. Create a full backup."
-echo "2. Create organized folders."
-echo "3. Move files into data/, prompts/, scripts/, results/, archive/."
-echo "4. Keep your original files in the backup."
-echo
-read -r -p "Proceed? [y/N] " answer
-case "$answer" in
-    y|Y|yes|YES) ;;
-    *) echo "Aborted."; exit 1 ;;
-esac
-
-# ------------------------------------------------------------
-# Backup
-# ------------------------------------------------------------
-
-mkdir -p "${BACKUP_PARENT}"
-
-echo
-echo "Creating backup..."
-rsync -a \
-    --exclude "enarcelona/" \
-    --exclude "env/" \
-    --exclude ".venv/" \
-    --exclude "__pycache__/" \
-    "${PROJECT_ROOT}/" "${BACKUP_DIR}/"
-
-echo "Backup created at:"
-echo "${BACKUP_DIR}"
-
-# ------------------------------------------------------------
-# Create target structure
-# ------------------------------------------------------------
-
-echo
-echo "Creating new directory structure..."
-
-mkdir -p \
-    data/raw \
-    data/processed \
-    data/ground_truth \
-    data/external \
-    prompts \
-    scripts \
-    results/benchmark_runs \
-    results/final_results/model_outputs \
-    results/figures \
-    results/tables \
-    results/logs \
-    manuscript/figures \
-    manuscript/tables \
-    archive/old_scripts \
-    archive/old_results \
-    archive/tmp \
-    archive/old_data \
-    archive/old_project_files
-
-# ------------------------------------------------------------
-# Helper move functions
-# ------------------------------------------------------------
-
-move_if_exists() {
-    src="$1"
-    dest="$2"
-
-    if [ -e "$src" ]; then
-        mkdir -p "$(dirname "$dest")"
-
-        if [ -e "$dest" ]; then
-            echo "SKIP: destination exists: $dest"
-        else
-            echo "MOVE: $src -> $dest"
-            mv "$src" "$dest"
-        fi
-    fi
-}
-
-move_glob_if_exists() {
-    pattern="$1"
-    dest_dir="$2"
-
-    mkdir -p "$dest_dir"
-
-    shopt -s nullglob
-    files=( $pattern )
-    shopt -u nullglob
-
-    for f in "${files[@]}"; do
-        base="$(basename "$f")"
-        dest="${dest_dir}/${base}"
-
-        if [ -e "$dest" ]; then
-            echo "SKIP: destination exists: $dest"
-        else
-            echo "MOVE: $f -> $dest"
-            mv "$f" "$dest"
-        fi
-    done
-}
-
-# ------------------------------------------------------------
-# Move prompts / attached instruction files
-# ------------------------------------------------------------
-
-echo
-echo "Moving prompt and instruction files..."
-
-move_if_exists "attach/Komplett.txt" "prompts/Komplett.txt"
-move_if_exists "attach/just_edss_schema.gbnf" "prompts/just_edss_schema.gbnf"
-move_if_exists "attach/just_edss_text.txt" "prompts/just_edss_text.txt"
-
-# Move leftover attach folder if empty or archive it
-if [ -d "attach" ]; then
-    if [ -z "$(ls -A attach)" ]; then
-        rmdir attach
-    else
-        move_if_exists "attach" "archive/old_project_files/attach"
-    fi
-fi
-
-# ------------------------------------------------------------
-# Move important data files
-# ------------------------------------------------------------
-
-echo
-echo "Moving data files..."
-
-move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv" \
-    "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv"
-
-move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" \
-    "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
-
-move_if_exists "Data/Join_edssandsub.tsv" \
-    "data/ground_truth/Join_edssandsub.tsv"
-
-move_if_exists "Data/GT_Numbers.csv" \
-    "data/ground_truth/GT_Numbers.csv"
-
-move_if_exists "Data/Annika1.csv" \
-    "data/ground_truth/Annika1.csv"
-
-move_if_exists "Data/comparison.tsv" \
-    "data/ground_truth/comparison.tsv"
-
-move_if_exists "Data/edss_distribution_summary.csv" \
-    "data/processed/edss_distribution_summary.csv"
-
-move_if_exists "Data/empirical_confidence_table.csv" \
-    "data/processed/empirical_confidence_table.csv"
-
-move_if_exists "Data/functional_system_colors.json" \
-    "data/processed/functional_system_colors.json"
-
-move_if_exists "Data/Test.csv" \
-    "archive/tmp/Test.csv"
-
-move_if_exists "Data/Hernan" \
-    "data/external/Hernan"
-
-move_if_exists "Data/iteration" \
-    "archive/old_data/iteration"
-
-# Old generated JSON/results from Data folder
-move_glob_if_exists "Data/*results*.json" "archive/old_results"
-move_glob_if_exists "Data/join_*.tsv" "archive/old_results"
-
-# Move remaining Data folder if anything left
-if [ -d "Data" ]; then
-    if [ -z "$(ls -A Data)" ]; then
-        rmdir Data
-    else
-        move_if_exists "Data" "archive/old_data/Data_remaining"
-    fi
-fi
-
-# ------------------------------------------------------------
-# Move benchmark results
-# ------------------------------------------------------------
-
-echo
-echo "Moving benchmark results..."
-
-if [ -d "results_edss_benchmark" ]; then
-    move_glob_if_exists "results_edss_benchmark/run_*" "results/benchmark_runs"
-
-    move_if_exists "results_edss_benchmark/endresults" \
-        "results/final_results/model_outputs"
-
-    move_if_exists "results_edss_benchmark/confusion_matrices" \
-        "results/figures/confusion_matrices"
-
-    if [ -z "$(ls -A results_edss_benchmark 2>/dev/null || true)" ]; then
-        rmdir results_edss_benchmark
-    else
-        move_if_exists "results_edss_benchmark" \
-            "archive/old_results/results_edss_benchmark_remaining"
-    fi
-fi
-
-# ------------------------------------------------------------
-# Move old/general results
-# ------------------------------------------------------------
-
-echo
-echo "Moving existing results files..."
-
-if [ -d "results" ]; then
-    # Figures
-    move_glob_if_exists "results/*.png" "results/figures"
-    move_glob_if_exists "results/*.PNG" "results/figures"
-    move_glob_if_exists "results/*.jpg" "results/figures"
-    move_glob_if_exists "results/*.jpeg" "results/figures"
-    move_glob_if_exists "results/*.svg" "results/figures"
-
-    # Tables
-    move_glob_if_exists "results/*.csv" "results/tables"
-    move_glob_if_exists "results/*.tsv" "results/tables"
-    move_glob_if_exists "results/*.xlsx" "results/tables"
-
-    # Subfolders that look like old results
-    move_if_exists "results/Jan_visual" "archive/old_results/Jan_visual"
-    move_if_exists "results/Lab_meeting" "archive/old_results/Lab_meeting"
-    move_if_exists "results/just_edss" "archive/old_results/just_edss"
-fi
-
-# Root-level result tables
-move_if_exists "edss_distribution_summary.csv" \
-    "results/tables/edss_distribution_summary.csv"
-
-# Logs
-move_if_exists "edss_benchmark_terminal.log" \
-    "results/logs/edss_benchmark_terminal.log"
-
-# ------------------------------------------------------------
-# Move scripts
-# ------------------------------------------------------------
-
-echo
-echo "Moving scripts..."
-
-move_if_exists "audit.py" "scripts/audit_outputs.py"
-move_if_exists "certainty.py" "scripts/analyze_certainty.py"
-move_if_exists "certainty_show.py" "scripts/certainty_show.py"
-move_if_exists "figure1.py" "scripts/figure1.py"
-move_if_exists "show_plots.py" "scripts/show_plots.py"
-
-move_if_exists "show_plots.py.orig" "archive/old_scripts/show_plots.py.orig"
-
-# Apps / old entry points
-move_if_exists "app.py" "archive/old_scripts/app.py"
-move_if_exists "total_app.py" "archive/old_scripts/total_app.py"
-
-# Existing project visuals folder
-move_if_exists "project/visuals" "results/figures/project_visuals"
-
-if [ -d "project" ]; then
-    if [ -z "$(ls -A project)" ]; then
-        rmdir project
-    else
-        move_if_exists "project" "archive/old_project_files/project"
-    fi
-fi
-
-# ------------------------------------------------------------
-# Environment folder
-# ------------------------------------------------------------
-
-echo
-echo "Handling virtual environment..."
-
-if [ -d "enarcelona" ]; then
-    echo "Leaving virtual environment in place: enarcelona/"
-    echo "It should remain ignored by .gitignore."
-fi
-
-# ------------------------------------------------------------
-# Create README notes
-# ------------------------------------------------------------
-
-echo
-echo "Writing organization notes..."
-
-cat > "PROJECT_STRUCTURE.md" <<'EOF'
-# Project Structure
-
-This project was reorganized into:
-
- `data/`
-  - `raw/`: original raw data, if retained locally
-  - `processed/`: cleaned or derived input data
-  - `ground_truth/`: manually annotated reference data
-  - `external/`: externally provided data
-
- `prompts/`
-  - EDSS instructions and prompt/schema assets
-
- `scripts/`
-  - runnable analysis and plotting scripts
-
- `results/`
-  - `benchmark_runs/`: full model benchmark runs
-  - `final_results/`: final selected model outputs
-  - `figures/`: generated figures
-  - `tables/`: generated tables
-  - `logs/`: terminal logs
-
- `manuscript/`
-  - final figures and tables for paper/thesis writing
-
- `archive/`
-  - old scripts, old results, temporary files, and unclear legacy files
-
-Important:
-The reorganization was performed after creating a full timestamped backup.
-EOF
-
-# ------------------------------------------------------------
-# Final checks
-# ------------------------------------------------------------
-
-echo
-echo "Organization complete."
-echo
-echo "Backup is here:"
-echo "${BACKUP_DIR}"
-echo
-echo "New top-level structure:"
-find . -maxdepth 2 -type d | sort
-echo
-
-if [ -d ".git" ]; then
-    echo "Git status:"
-    git status --short
-fi
-
-echo
-echo "Next recommended commands:"
-echo "  git status"
-echo "  git add ."
-echo "  git commit -m \"Reorganize project structure\""
@@ -1258,7 +1258,7 @@


 # %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark
-#
+
 #import time
 #import json
 #import os
@@ -1279,9 +1279,9 @@
 #    print("Install with: pip install psutil")
 #
 #
-## =========================
-## CONFIGURATION
-## =========================
+# =========================
+# CONFIGURATION
+# =========================
 #
 #load_dotenv()
 #
@@ -1289,21 +1289,21 @@
 #OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
 #
 #MODEL_CONFIGS = [
-##    {
-##        "model_name": "qwen3.6-35b-a3b",
-##        "use_response_format": False,
-##        "temperature": 0.0,
-##        "max_tokens": 4096,
-##
-##        # If your backend is vLLM / Qwen chat-template compatible,
-##        # this may reduce long hidden reasoning and JSON truncation.
-##        # If your server errors because of extra_body, set this to None.
-##        "extra_body": {
-##            "chat_template_kwargs": {
-##                "enable_thinking": False
-##            }
-##        },
-##    },
+#    {
+#        "model_name": "qwen3.6-35b-a3b",
+#        "use_response_format": False,
+#        "temperature": 0.0,
+#        "max_tokens": 4096,
+#
+#        # If your backend is vLLM / Qwen chat-template compatible,
+#        # this may reduce long hidden reasoning and JSON truncation.
+#        # If your server errors because of extra_body, set this to None.
+#        "extra_body": {
+#            "chat_template_kwargs": {
+#                "enable_thinking": False
+#            }
+#        },
+#    },
 #    {
 #        "model_name": "gemma-4-31B-it",
 #        "use_response_format": False,
@@ -1328,10 +1328,10 @@
 #NUM_ITERATIONS = 10
 #STOP_ON_FIRST_ERROR = False
 #
-## For testing, set to e.g. 2.
-## For full run, set to None.
+# For testing, set to e.g. 2.
+# For full run, set to None.
 #MAX_ROWS = 2
-## MAX_ROWS = 2
+# MAX_ROWS = 2
 #
 #MAX_TOKENS = 4096
 #TEMPERATURE = 0.0
@@ -1340,14 +1340,14 @@
 #
 #SAVE_EVERY_N_ROWS = 1
 #
-## Retries for invalid JSON / truncated JSON
+# Retries for invalid JSON / truncated JSON
 #MAX_JSON_RETRIES = 2
 #RETRY_SLEEP_SEC = 2
 #
 #
-## =========================
-## CLIENT
-## =========================
+# =========================
+# CLIENT
+# =========================
 #
 #client = OpenAI(
 #    api_key=OPENAI_API_KEY,
@@ -1355,9 +1355,9 @@
 #)
 #
 #
-## =========================
-## HELPERS
-## =========================
+# =========================
+# HELPERS
+# =========================
 #
 #def safe_dir_name(name: str) -> str:
 #    name = str(name).strip()
@@ -1433,9 +1433,9 @@
 #        return max(self.samples_mb)
 #
 #
-## =========================
-## JSON EXTRACTION
-## =========================
+# =========================
+# JSON EXTRACTION
+# =========================
 #
 #def extract_json_from_text(text):
 #    if text is None:
@@ -1587,17 +1587,17 @@
 #    return None
 #
 #
-## =========================
-## READ INSTRUCTIONS
-## =========================
+# =========================
+# READ INSTRUCTIONS
+# =========================
 #
 #with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
 #    EDSS_INSTRUCTIONS = f.read().strip()
 #
 #
-## =========================
-## PROMPT
-## =========================
+# =========================
+# PROMPT
+# =========================
 #
 #def build_prompt(patient_text):
 #    return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
@@ -1676,9 +1676,9 @@
 #'''
 #
 #
-## =========================
-## VALIDATION / NORMALIZATION
-## =========================
+# =========================
+# VALIDATION / NORMALIZATION
+# =========================
 #
 #def normalize_model_output(parsed):
 #    if not isinstance(parsed, dict):
@@ -1755,9 +1755,9 @@
 #    return parsed
 #
 #
-## =========================
-## API CALL
-## =========================
+# =========================
+# API CALL
+# =========================
 #
 #def make_chat_completion(model_config, prompt):
 #    model_name = model_config["model_name"]
@@ -1794,9 +1794,9 @@
 #    return client.chat.completions.create(**kwargs)
 #
 #
-## =========================
-## INFERENCE FUNCTION WITH RETRIES
-## =========================
+# =========================
+# INFERENCE FUNCTION WITH RETRIES
+# =========================
 #
 #def run_inference(patient_text, model_config):
 #    model_name = model_config["model_name"]
@@ -1912,9 +1912,9 @@
 #    }
 #
 #
-## =========================
-## BUILD PATIENT TEXT
-## =========================
+# =========================
+# BUILD PATIENT TEXT
+# =========================
 #
 #def build_patient_text(row):
 #    return (
@@ -1925,9 +1925,9 @@
 #    )
 #
 #
-## =========================
-## FLATTEN RESULTS FOR CSV
-## =========================
+# =========================
+# FLATTEN RESULTS FOR CSV
+# =========================
 #
 #def flatten_result(record):
 #    flat = {
@@ -2012,9 +2012,9 @@
 #    return pd.DataFrame([summary])
 #
 #
-## =========================
-## INCREMENTAL SAVE HELPERS
-## =========================
+# =========================
+# INCREMENTAL SAVE HELPERS
+# =========================
 #
 #def append_jsonl(path, record):
 #    with open(path, "a", encoding="utf-8") as f:
@@ -2030,9 +2030,9 @@
 #    df_one.to_csv(path, mode="a", header=not file_exists, index=False)
 #
 #
-## =========================
-## MAIN LOOP
-## =========================
+# =========================
+# MAIN LOOP
+# =========================
 #
 #if __name__ == "__main__":
 #
@@ -2277,26 +2277,26 @@ MODEL_CONFIGS = [
        "max_tokens": 4096,
        "extra_body": None,
    },
-    # {
-    #     "model_name": "GPT-OSS-120B",
-    #     "use_response_format": True,
-    #     "temperature": 0.0,
-    #     "max_tokens": 4096,
-    #     "extra_body": None,
-    # },
+    {
+        "model_name": "GPT-OSS-120B",
+        "use_response_format": True,
+        "temperature": 0.0,
+        "max_tokens": 4096,
+        "extra_body": None,
+    },
 ]

-INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
-EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
+INPUT_CSV ="/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
+EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt"

-RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark"
+RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs"

-NUM_ITERATIONS = 10
+NUM_ITERATIONS = 2
 STOP_ON_FIRST_ERROR = False

 # For testing, set to e.g. 2.
 # For full run, set to None.
-MAX_ROWS = None
+MAX_ROWS = 2
 # MAX_ROWS = 2

 MAX_TOKENS = 4096