diff --git a/organize_project.sh b/organize_project.sh deleted file mode 100755 index af2c35a..0000000 --- a/organize_project.sh +++ /dev/null @@ -1,384 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# ============================================================ -# Organize Barcelona EDSS project safely -# - Creates a timestamped backup first -# - Creates a cleaner folder structure -# - Moves files conservatively -# - Does NOT delete anything -# ============================================================ - -PROJECT_ROOT="$(pwd)" -TIMESTAMP="$(date +%Y%m%d_%H%M%S)" -BACKUP_PARENT="${PROJECT_ROOT}/../Barcelona_backups" -BACKUP_DIR="${BACKUP_PARENT}/Barcelona_backup_${TIMESTAMP}" - -echo "Project root: ${PROJECT_ROOT}" -echo "Backup dir: ${BACKUP_DIR}" -echo - -# ------------------------------------------------------------ -# Safety checks -# ------------------------------------------------------------ - -if [ ! -f "${PROJECT_ROOT}/README.md" ]; then - echo "WARNING: README.md not found. Are you sure you are in the project root?" - echo "Current directory: ${PROJECT_ROOT}" - read -r -p "Continue anyway? [y/N] " answer - case "$answer" in - y|Y|yes|YES) ;; - *) echo "Aborted."; exit 1 ;; - esac -fi - -if [ -d "${PROJECT_ROOT}/.git" ]; then - if ! git diff --quiet || ! git diff --cached --quiet; then - echo "ERROR: Git working tree is not clean." - echo "Please commit or stash changes before organizing." - exit 1 - fi -fi - -echo "This script will:" -echo "1. Create a full backup." -echo "2. Create organized folders." -echo "3. Move files into data/, prompts/, scripts/, results/, archive/." -echo "4. Keep your original files in the backup." -echo -read -r -p "Proceed? [y/N] " answer -case "$answer" in - y|Y|yes|YES) ;; - *) echo "Aborted."; exit 1 ;; -esac - -# ------------------------------------------------------------ -# Backup -# ------------------------------------------------------------ - -mkdir -p "${BACKUP_PARENT}" - -echo -echo "Creating backup..." -rsync -a \ - --exclude "enarcelona/" \ - --exclude "env/" \ - --exclude ".venv/" \ - --exclude "__pycache__/" \ - "${PROJECT_ROOT}/" "${BACKUP_DIR}/" - -echo "Backup created at:" -echo "${BACKUP_DIR}" - -# ------------------------------------------------------------ -# Create target structure -# ------------------------------------------------------------ - -echo -echo "Creating new directory structure..." - -mkdir -p \ - data/raw \ - data/processed \ - data/ground_truth \ - data/external \ - prompts \ - scripts \ - results/benchmark_runs \ - results/final_results/model_outputs \ - results/figures \ - results/tables \ - results/logs \ - manuscript/figures \ - manuscript/tables \ - archive/old_scripts \ - archive/old_results \ - archive/tmp \ - archive/old_data \ - archive/old_project_files - -# ------------------------------------------------------------ -# Helper move functions -# ------------------------------------------------------------ - -move_if_exists() { - src="$1" - dest="$2" - - if [ -e "$src" ]; then - mkdir -p "$(dirname "$dest")" - - if [ -e "$dest" ]; then - echo "SKIP: destination exists: $dest" - else - echo "MOVE: $src -> $dest" - mv "$src" "$dest" - fi - fi -} - -move_glob_if_exists() { - pattern="$1" - dest_dir="$2" - - mkdir -p "$dest_dir" - - shopt -s nullglob - files=( $pattern ) - shopt -u nullglob - - for f in "${files[@]}"; do - base="$(basename "$f")" - dest="${dest_dir}/${base}" - - if [ -e "$dest" ]; then - echo "SKIP: destination exists: $dest" - else - echo "MOVE: $f -> $dest" - mv "$f" "$dest" - fi - done -} - -# ------------------------------------------------------------ -# Move prompts / attached instruction files -# ------------------------------------------------------------ - -echo -echo "Moving prompt and instruction files..." - -move_if_exists "attach/Komplett.txt" "prompts/Komplett.txt" -move_if_exists "attach/just_edss_schema.gbnf" "prompts/just_edss_schema.gbnf" -move_if_exists "attach/just_edss_text.txt" "prompts/just_edss_text.txt" - -# Move leftover attach folder if empty or archive it -if [ -d "attach" ]; then - if [ -z "$(ls -A attach)" ]; then - rmdir attach - else - move_if_exists "attach" "archive/old_project_files/attach" - fi -fi - -# ------------------------------------------------------------ -# Move important data files -# ------------------------------------------------------------ - -echo -echo "Moving data files..." - -move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv" \ - "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.csv" - -move_if_exists "Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" \ - "data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" - -move_if_exists "Data/Join_edssandsub.tsv" \ - "data/ground_truth/Join_edssandsub.tsv" - -move_if_exists "Data/GT_Numbers.csv" \ - "data/ground_truth/GT_Numbers.csv" - -move_if_exists "Data/Annika1.csv" \ - "data/ground_truth/Annika1.csv" - -move_if_exists "Data/comparison.tsv" \ - "data/ground_truth/comparison.tsv" - -move_if_exists "Data/edss_distribution_summary.csv" \ - "data/processed/edss_distribution_summary.csv" - -move_if_exists "Data/empirical_confidence_table.csv" \ - "data/processed/empirical_confidence_table.csv" - -move_if_exists "Data/functional_system_colors.json" \ - "data/processed/functional_system_colors.json" - -move_if_exists "Data/Test.csv" \ - "archive/tmp/Test.csv" - -move_if_exists "Data/Hernan" \ - "data/external/Hernan" - -move_if_exists "Data/iteration" \ - "archive/old_data/iteration" - -# Old generated JSON/results from Data folder -move_glob_if_exists "Data/*results*.json" "archive/old_results" -move_glob_if_exists "Data/join_*.tsv" "archive/old_results" - -# Move remaining Data folder if anything left -if [ -d "Data" ]; then - if [ -z "$(ls -A Data)" ]; then - rmdir Data - else - move_if_exists "Data" "archive/old_data/Data_remaining" - fi -fi - -# ------------------------------------------------------------ -# Move benchmark results -# ------------------------------------------------------------ - -echo -echo "Moving benchmark results..." - -if [ -d "results_edss_benchmark" ]; then - move_glob_if_exists "results_edss_benchmark/run_*" "results/benchmark_runs" - - move_if_exists "results_edss_benchmark/endresults" \ - "results/final_results/model_outputs" - - move_if_exists "results_edss_benchmark/confusion_matrices" \ - "results/figures/confusion_matrices" - - if [ -z "$(ls -A results_edss_benchmark 2>/dev/null || true)" ]; then - rmdir results_edss_benchmark - else - move_if_exists "results_edss_benchmark" \ - "archive/old_results/results_edss_benchmark_remaining" - fi -fi - -# ------------------------------------------------------------ -# Move old/general results -# ------------------------------------------------------------ - -echo -echo "Moving existing results files..." - -if [ -d "results" ]; then - # Figures - move_glob_if_exists "results/*.png" "results/figures" - move_glob_if_exists "results/*.PNG" "results/figures" - move_glob_if_exists "results/*.jpg" "results/figures" - move_glob_if_exists "results/*.jpeg" "results/figures" - move_glob_if_exists "results/*.svg" "results/figures" - - # Tables - move_glob_if_exists "results/*.csv" "results/tables" - move_glob_if_exists "results/*.tsv" "results/tables" - move_glob_if_exists "results/*.xlsx" "results/tables" - - # Subfolders that look like old results - move_if_exists "results/Jan_visual" "archive/old_results/Jan_visual" - move_if_exists "results/Lab_meeting" "archive/old_results/Lab_meeting" - move_if_exists "results/just_edss" "archive/old_results/just_edss" -fi - -# Root-level result tables -move_if_exists "edss_distribution_summary.csv" \ - "results/tables/edss_distribution_summary.csv" - -# Logs -move_if_exists "edss_benchmark_terminal.log" \ - "results/logs/edss_benchmark_terminal.log" - -# ------------------------------------------------------------ -# Move scripts -# ------------------------------------------------------------ - -echo -echo "Moving scripts..." - -move_if_exists "audit.py" "scripts/audit_outputs.py" -move_if_exists "certainty.py" "scripts/analyze_certainty.py" -move_if_exists "certainty_show.py" "scripts/certainty_show.py" -move_if_exists "figure1.py" "scripts/figure1.py" -move_if_exists "show_plots.py" "scripts/show_plots.py" - -move_if_exists "show_plots.py.orig" "archive/old_scripts/show_plots.py.orig" - -# Apps / old entry points -move_if_exists "app.py" "archive/old_scripts/app.py" -move_if_exists "total_app.py" "archive/old_scripts/total_app.py" - -# Existing project visuals folder -move_if_exists "project/visuals" "results/figures/project_visuals" - -if [ -d "project" ]; then - if [ -z "$(ls -A project)" ]; then - rmdir project - else - move_if_exists "project" "archive/old_project_files/project" - fi -fi - -# ------------------------------------------------------------ -# Environment folder -# ------------------------------------------------------------ - -echo -echo "Handling virtual environment..." - -if [ -d "enarcelona" ]; then - echo "Leaving virtual environment in place: enarcelona/" - echo "It should remain ignored by .gitignore." -fi - -# ------------------------------------------------------------ -# Create README notes -# ------------------------------------------------------------ - -echo -echo "Writing organization notes..." - -cat > "PROJECT_STRUCTURE.md" <<'EOF' -# Project Structure - -This project was reorganized into: - -- `data/` - - `raw/`: original raw data, if retained locally - - `processed/`: cleaned or derived input data - - `ground_truth/`: manually annotated reference data - - `external/`: externally provided data - -- `prompts/` - - EDSS instructions and prompt/schema assets - -- `scripts/` - - runnable analysis and plotting scripts - -- `results/` - - `benchmark_runs/`: full model benchmark runs - - `final_results/`: final selected model outputs - - `figures/`: generated figures - - `tables/`: generated tables - - `logs/`: terminal logs - -- `manuscript/` - - final figures and tables for paper/thesis writing - -- `archive/` - - old scripts, old results, temporary files, and unclear legacy files - -Important: -The reorganization was performed after creating a full timestamped backup. -EOF - -# ------------------------------------------------------------ -# Final checks -# ------------------------------------------------------------ - -echo -echo "Organization complete." -echo -echo "Backup is here:" -echo "${BACKUP_DIR}" -echo -echo "New top-level structure:" -find . -maxdepth 2 -type d | sort -echo - -if [ -d ".git" ]; then - echo "Git status:" - git status --short -fi - -echo -echo "Next recommended commands:" -echo " git status" -echo " git add ." -echo " git commit -m \"Reorganize project structure\"" diff --git a/scripts/analyze_certainty.py b/scripts/analyze_certainty.py index a9f1038..8766a0b 100644 --- a/scripts/analyze_certainty.py +++ b/scripts/analyze_certainty.py @@ -1258,7 +1258,7 @@ # %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark -# + #import time #import json #import os @@ -1279,9 +1279,9 @@ # print("Install with: pip install psutil") # # -## ========================= -## CONFIGURATION -## ========================= +# ========================= +# CONFIGURATION +# ========================= # #load_dotenv() # @@ -1289,21 +1289,21 @@ #OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL") # #MODEL_CONFIGS = [ -## { -## "model_name": "qwen3.6-35b-a3b", -## "use_response_format": False, -## "temperature": 0.0, -## "max_tokens": 4096, -## -## # If your backend is vLLM / Qwen chat-template compatible, -## # this may reduce long hidden reasoning and JSON truncation. -## # If your server errors because of extra_body, set this to None. -## "extra_body": { -## "chat_template_kwargs": { -## "enable_thinking": False -## } -## }, -## }, +# { +# "model_name": "qwen3.6-35b-a3b", +# "use_response_format": False, +# "temperature": 0.0, +# "max_tokens": 4096, +# +# # If your backend is vLLM / Qwen chat-template compatible, +# # this may reduce long hidden reasoning and JSON truncation. +# # If your server errors because of extra_body, set this to None. +# "extra_body": { +# "chat_template_kwargs": { +# "enable_thinking": False +# } +# }, +# }, # { # "model_name": "gemma-4-31B-it", # "use_response_format": False, @@ -1328,10 +1328,10 @@ #NUM_ITERATIONS = 10 #STOP_ON_FIRST_ERROR = False # -## For testing, set to e.g. 2. -## For full run, set to None. +# For testing, set to e.g. 2. +# For full run, set to None. #MAX_ROWS = 2 -## MAX_ROWS = 2 +# MAX_ROWS = 2 # #MAX_TOKENS = 4096 #TEMPERATURE = 0.0 @@ -1340,14 +1340,14 @@ # #SAVE_EVERY_N_ROWS = 1 # -## Retries for invalid JSON / truncated JSON +# Retries for invalid JSON / truncated JSON #MAX_JSON_RETRIES = 2 #RETRY_SLEEP_SEC = 2 # # -## ========================= -## CLIENT -## ========================= +# ========================= +# CLIENT +# ========================= # #client = OpenAI( # api_key=OPENAI_API_KEY, @@ -1355,9 +1355,9 @@ #) # # -## ========================= -## HELPERS -## ========================= +# ========================= +# HELPERS +# ========================= # #def safe_dir_name(name: str) -> str: # name = str(name).strip() @@ -1433,9 +1433,9 @@ # return max(self.samples_mb) # # -## ========================= -## JSON EXTRACTION -## ========================= +# ========================= +# JSON EXTRACTION +# ========================= # #def extract_json_from_text(text): # if text is None: @@ -1587,17 +1587,17 @@ # return None # # -## ========================= -## READ INSTRUCTIONS -## ========================= +# ========================= +# READ INSTRUCTIONS +# ========================= # #with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f: # EDSS_INSTRUCTIONS = f.read().strip() # # -## ========================= -## PROMPT -## ========================= +# ========================= +# PROMPT +# ========================= # #def build_prompt(patient_text): # return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten. @@ -1676,9 +1676,9 @@ #''' # # -## ========================= -## VALIDATION / NORMALIZATION -## ========================= +# ========================= +# VALIDATION / NORMALIZATION +# ========================= # #def normalize_model_output(parsed): # if not isinstance(parsed, dict): @@ -1755,9 +1755,9 @@ # return parsed # # -## ========================= -## API CALL -## ========================= +# ========================= +# API CALL +# ========================= # #def make_chat_completion(model_config, prompt): # model_name = model_config["model_name"] @@ -1794,9 +1794,9 @@ # return client.chat.completions.create(**kwargs) # # -## ========================= -## INFERENCE FUNCTION WITH RETRIES -## ========================= +# ========================= +# INFERENCE FUNCTION WITH RETRIES +# ========================= # #def run_inference(patient_text, model_config): # model_name = model_config["model_name"] @@ -1912,9 +1912,9 @@ # } # # -## ========================= -## BUILD PATIENT TEXT -## ========================= +# ========================= +# BUILD PATIENT TEXT +# ========================= # #def build_patient_text(row): # return ( @@ -1925,9 +1925,9 @@ # ) # # -## ========================= -## FLATTEN RESULTS FOR CSV -## ========================= +# ========================= +# FLATTEN RESULTS FOR CSV +# ========================= # #def flatten_result(record): # flat = { @@ -2012,9 +2012,9 @@ # return pd.DataFrame([summary]) # # -## ========================= -## INCREMENTAL SAVE HELPERS -## ========================= +# ========================= +# INCREMENTAL SAVE HELPERS +# ========================= # #def append_jsonl(path, record): # with open(path, "a", encoding="utf-8") as f: @@ -2030,9 +2030,9 @@ # df_one.to_csv(path, mode="a", header=not file_exists, index=False) # # -## ========================= -## MAIN LOOP -## ========================= +# ========================= +# MAIN LOOP +# ========================= # #if __name__ == "__main__": # @@ -2277,26 +2277,26 @@ MODEL_CONFIGS = [ "max_tokens": 4096, "extra_body": None, }, - # { - # "model_name": "GPT-OSS-120B", - # "use_response_format": True, - # "temperature": 0.0, - # "max_tokens": 4096, - # "extra_body": None, - # }, + { + "model_name": "GPT-OSS-120B", + "use_response_format": True, + "temperature": 0.0, + "max_tokens": 4096, + "extra_body": None, + }, ] -INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" -EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt" +INPUT_CSV ="/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" +EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt" -RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark" +RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs" -NUM_ITERATIONS = 10 +NUM_ITERATIONS = 2 STOP_ON_FIRST_ERROR = False # For testing, set to e.g. 2. # For full run, set to None. -MAX_ROWS = None +MAX_ROWS = 2 # MAX_ROWS = 2 MAX_TOKENS = 4096