Files
EDSS-calc/scripts/analyze_certainty.py

4542 lines
145 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# %% API call1
#import time
#import json
#import os
#from datetime import datetime
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
## Load environment variables
#load_dotenv()
#
## === CONFIGURATION ===
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#MODEL_NAME = "GPT-OSS-120B"
#HEALTH_URL = f"{OPENAI_BASE_URL}/health" # Placeholder - actual health check would need to be implemented
#CHAT_URL = f"{OPENAI_BASE_URL}/chat/completions"
#
## File paths
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
##GRAMMAR_FILE = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/just_edss_schema.gbnf"
#
## Initialize OpenAI client
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
## Read EDSS instructions from file
#with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
# EDSS_INSTRUCTIONS = f.read().strip()
## === RUN INFERENCE 2 ===
#def run_inference(patient_text):
# prompt = f'''
# Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale) aus klinischen Berichten zu extrahieren.
#### Regeln für die Ausgabe:
#1. **Reason**: Erstelle eine prägnante Zusammenfassung (max. 400 Zeichen) der Befunde auf **DEUTSCH**, die zur Einstufung führen.
#2. **klassifizierbar**:
# - Setze dies auf **true**, wenn ein EDSS-Wert identifiziert, berechnet oder basierend auf den klinischen Hinweisen plausibel geschätzt werden kann.
# - Setze dies auf **false**, NUR wenn die Daten absolut unzureichend oder so widersprüchlich sind, dass keinerlei Einstufung möglich ist.
#3. **EDSS**:
# - Dieses Feld ist **VERPFLICHTEND**, wenn "klassifizierbar" auf true steht.
# - Es muss eine Zahl zwischen 0.0 und 10.0 sein.
# - Versuche stets, den EDSS-Wert so präzise wie möglich zu bestimmen, auch wenn die Datenlage dünn ist (nutze verfügbare Informationen zu Gehstrecke und Funktionssystemen).
# - Dieses Feld **DARF NICHT ERSCHEINEN**, wenn "klassifizierbar" auf false steht.
#
#### Einschränkungen:
#- Erfinde keine Fakten, aber nutze klinische Herleitungen aus dem Bericht, um den EDSS zu bestimmen.
#- Priorisiere die Vergabe eines EDSS-Wertes gegenüber der Markierung als nicht klassifizierbar.
#- Halte dich strikt an die JSON-Struktur.
#
#EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
# start_time = time.time()
#
# try:
# # Make API call using OpenAI client
# response = client.chat.completions.create(
# messages=[
# {
# "role": "system",
# "content": "You extract EDSS scores. You prioritize providing a score even if data is partial, by using clinical inference."
# },
# {
# "role": "user",
# "content": prompt
# }
# ],
# model=MODEL_NAME,
# max_tokens=2048,
# temperature=0.0,
# response_format={"type": "json_object"}
# )
#
# # Extract content from response
# content = response.choices[0].message.content
#
# # Parse the JSON response
# parsed = json.loads(content)
#
# inference_time = time.time() - start_time
#
# return {
# "success": True,
# "result": parsed,
# "inference_time_sec": inference_time
# }
#
# except Exception as e:
# print(f"Inference error: {e}")
# return {
# "success": False,
# "error": str(e),
# "inference_time_sec": -1
# }
## === BUILD PATIENT TEXT ===
#def build_patient_text(row):
# return (
# str(row["T_Zusammenfassung"]) + "\n" +
# str(row["Diagnosen"]) + "\n" +
# str(row["T_KlinBef"]) + "\n" +
# str(row["T_Befunde"]) + "\n"
# )
#
#if __name__ == "__main__":
# # Read CSV file ONLY inside main block
# df = pd.read_csv(INPUT_CSV, sep=';')
# results = []
#
# # Process each row
# for idx, row in df.iterrows():
# print(f"Processing row {idx + 1}/{len(df)}")
# try:
# patient_text = build_patient_text(row)
# result = run_inference(patient_text)
#
# # Add unique_id and MedDatum to result for tracking
# result["unique_id"] = row.get("unique_id", f"row_{idx}")
# result["MedDatum"] = row.get("MedDatum", None)
#
# results.append(result)
# print(json.dumps(result, indent=2))
# except Exception as e:
# print(f"Error processing row {idx}: {e}")
# results.append({
# "success": False,
# "error": str(e),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None)
# })
#
# # Save results to a JSON file
# output_json = INPUT_CSV.replace(".csv", "_results_Nisch.json")
# with open(output_json, 'w') as f:
# json.dump(results, f, indent=2)
# print(f"Results saved to {output_json}")
##
# %% API call1 - Enhanced with certainty scoring
#import time
#import json
#import os
#from datetime import datetime
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
## Load environment variables
#load_dotenv()
#
## === CONFIGURATION ===
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#MODEL_NAME = "GPT-OSS-120B"
#
## File paths
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Test.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#
## Initialize OpenAI client
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
## Read EDSS instructions from file
#with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
## === PROMPT WITH CERTAINTY REQUEST ===
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale), alle Unterkategorien und die Bewertungssicherheit aus klinischen Berichten zu extrahieren.
#
#### Deine Aufgabe:
#1. Analysiere den Patientenbericht und extrahiere:
# - Den Gesamt-EDSS-Score (0.0–10.0)
# - Alle 8 EDSS-Unterkategorien (mit jeweils eigener Maximalpunktzahl)
#2. Schätze für jede Entscheidung die Sicherheit als Ganzzahl von 0–100 % ein.
#
#### Struktur der JSON-Ausgabe (VERPFLICHTEND):
#Gib NUR gültiges JSON zurück — kein Markdown, kein Text davor/dahinter.
#
#{{
# "reason": "Kernaussage zur EDSS-Begründung (max. 400 Zeichen, auf Deutsch).",
# "klassifizierbar": true/false,
# "EDSS": null ODER Zahl zwischen 0.0 und 10.0 (nur wenn klassifizierbar=true),
# "certainty_percent": 0 ODER Zahl zwischen 0 und 100 (Ganzzahl),
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BRAINSTEM_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "PYRAMIDAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBELLAR_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "SENSORY_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBRAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "AMBULATION": null ODER Zahl zwischen 0.0 und 10.0
# }}
#}}
#
#### Regeln:
#- **reason**: Kurze, prägnante Begründung (auf Deutsch, max. 400 Zeichen), warum du den EDSS-Wert und die Unterkategorien so bewertest.
#- **klassifizierbar**:
# - `true`, wenn EDSS und mindestens die wichtigsten Unterkategorien *eindeutig ableitbar* oder *plausibel inferierbar* sind.
# - `false`, **nur**, wenn keine relevanten Daten vorliegen, oder diese so widersprüchlich/inkonsistent sind, dass keine vernünftige Einschätzung möglich ist.
#- **EDSS**:
# - **VERPFLICHTEND**, wenn `klassifizierbar=true`.
# - Zahl zwischen 0.0 und 10.0 (z.B. 3.0, 5.5). Darf **nicht** erscheinen, wenn `klassifizierbar=false`.
#- **certainty_percent**:
# - **Immer present** — Ganzzahl (0–100), basierend auf:
# - Klarheit und Vollständigkeit der Berichtsangaben,
# - Stichhaltigkeit der Schlussfolgerung (inkl. Inferenz),
# - Konsistenz zwischen den Unterkategorien.
#- **subcategories**:
# - **Immer present** — **alle 8 Unterkategorien** müssen enthalten sein.
# - Jeder Wert ist entweder:
# - `null` (wenn keine ausreichende Information vorliegt), **oder**
# - eine Zahl ≤ jeweiliger Obergrenze (z.B. Ambulation ≤ 10.0).
# - Wenn die Unterkategorie plausibel inferiert werden kann (auch indirekt), gib einen sinnvollen Wert ab.
# - Beispiel: Wenn „Gang mit Krückstock auf ebenem Boden bis 200 m“ steht, setze `AMBULATION: 5.5`.
#
#### EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
#
## === INFERENCE FUNCTION ===
#def run_inference(patient_text):
# prompt = build_prompt(patient_text)
#
# start_time = time.time()
#
# try:
# response = client.chat.completions.create(
# messages=[
# {"role": "system", "content": "Du gibst EXKLUSIV gültiges JSON zurück — keine weiteren Erklärungen."}
# ] + [
# {"role": "user", "content": prompt}
# ],
# model=MODEL_NAME,
# max_tokens=2048,
# temperature=0.1, # Slightly higher for more natural certainty estimation (still low for reliability)
# response_format={"type": "json_object"}
# )
#
# content = response.choices[0].message.content
#
# # Parse and validate JSON
# try:
# parsed = json.loads(content)
# except json.JSONDecodeError as e:
# print(f"⚠️ JSON parsing failed: {e}")
# print("Raw response:", content[:500])
# raise ValueError("Model did not return valid JSON")
#
# # Enforce required keys
# if "certainty_percent" not in parsed:
# print("⚠️ Missing 'certainty_percent' in output! Force-adding fallback.")
# parsed["certainty_percent"] = 0 # fallback
# elif not isinstance(parsed["certainty_percent"], (int, float)):
# parsed["certainty_percent"] = int(parsed["certainty_percent"])
#
# # Clamp certainty to [0, 100]
# pct = parsed["certainty_percent"]
# parsed["certainty_percent"] =max(0, min(100, int(pct)))
#
# # Enforce EDSS rules: if not classifiable → remove EDSS
# if not parsed.get("klassifizierbar", False):
# if "EDSS" in parsed:
# del parsed["EDSS"] # per spec, must not appear if not classifiable
# else:
# if "EDSS" not in parsed:
# print("⚠️ 'klassifizierbar' is true but EDSS missing — adding fallback.")
# parsed["EDSS"] = 7.0 # last-resort fallback
#
# inference_time = time.time() - start_time
#
# return {
# "success": True,
# "result": parsed,
# "inference_time_sec": inference_time
# }
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
# return {
# "success": False,
# "error": str(e),
# "inference_time_sec": -1,
# "result": None # no structured output
# }
#
## === BUILD PATIENT TEXT ===
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
#if __name__ == "__main__":
# # Load data
# df = pd.read_csv(INPUT_CSV, sep=';')
# results = []
#
# # Optional: limit for testing
# # df = df.head(3)
#
# print(f"Processing {len(df)} rows...")
# for idx, row in df.iterrows():
# print(f"\n— Row {idx + 1}/{len(df)} —")
# try:
# patient_text = build_patient_text(row)
# result = run_inference(patient_text)
#
# # Attach metadata
# result["unique_id"] = row.get("unique_id", f"row_{idx}")
# result["MedDatum"] = row.get("MedDatum", None)
#
# results.append(result)
#
# # Print summary
# if result["success"]:
# res = result["result"]
# edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
# print(f"✅ Result → EDSS={edss}, certainty={res.get('certainty_percent', 'N/A')}%")
# print(f" Reason: {res.get('reason', 'N/A')[:100]}…")
# else:
# print(f"❌ Failed: {result.get('error', 'Unknown error')[:100]}")
#
# except Exception as e:
# print(f"⚠️ Error processing row {idx}: {e}")
# results.append({
# "success": False,
# "error": str(e),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
# "result": None
# })
#
# # Save results
# output_json = INPUT_CSV.replace(".csv", "_results_Nisch_certainty.json")
# with open(output_json, 'w', encoding='utf-8') as f:
# json.dump(results, f, indent=2, ensure_ascii=False)
# print(f"\n✅ Saved results to: {output_json}")
#
##
# %% API call - Multi-iteration EDSS + certainty extraction
#
#import time
#import json
#import os
#from datetime import datetime
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
## Load environment variables
#load_dotenv()
#
## === CONFIGURATION ===
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#MODEL_NAME = "GPT-OSS-120B"
#
## File paths
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#
## Iteration settings
#NUM_ITERATIONS = 20
#STOP_ON_FIRST_ERROR = False # Set to True for debugging
#
## Initialize OpenAI client
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
## Read EDSS instructions from file
#with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
## === PROMPT (unchanged from before) ===
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale), alle Unterkategorien und die Bewertungssicherheit aus klinischen Berichten zu extrahieren.
#
#### Deine Aufgabe:
#1. Analysiere den Patientenbericht und extrahiere:
# - Den Gesamt-EDSS-Score (0.0–10.0)
# - Alle 8 EDSS-Unterkategorien (mit jeweils eigener Maximalpunktzahl)
#2. Schätze für jede Entscheidung die Sicherheit als Ganzzahl von 0–100 % ein.
#
#### Struktur der JSON-Ausgabe (VERPFLICHTEND):
#Gib NUR gültiges JSON zurück — kein Markdown, kein Text davor/dahinter.
#
#{{
# "reason": "Kernaussage zur EDSS-Begründung (max. 400 Zeichen, auf Deutsch).",
# "klassifizierbar": true/false,
# "EDSS": null ODER Zahl zwischen 0.0 und 10.0 (nur wenn klassifizierbar=true),
# "certainty_percent": 0 ODER Zahl zwischen 0 und 100 (Ganzzahl),
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BRAINSTEM_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "PYRAMIDAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBELLAR_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "SENSORY_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBRAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "AMBULATION": null ODER Zahl zwischen 0.0 und 10.0
# }}
#}}
#
#### Regeln:
#- **reason**: Kurze, prägnante Begründung (auf Deutsch, max. 400 Zeichen), warum du den EDSS-Wert und die Unterkategorien so bewertest.
#- **klassifizierbar**:
# - `true`, wenn EDSS und mindestens die wichtigsten Unterkategorien *eindeutig ableitbar* oder *plausibel inferierbar* sind.
# - `false`, **nur**, wenn keine relevanten Daten vorliegen, oder diese so widersprüchlich/inkonsistent sind, dass keine vernünftige Einschätzung möglich ist.
#- **EDSS**:
# - **VERPFLICHTEND**, wenn `klassifizierbar=true`.
# - Zahl zwischen 0.0 und 10.0 (z.B. 3.0, 5.5). Darf **nicht** erscheinen, wenn `klassifizierbar=false`.
#- **certainty_percent**:
# - **Immer present** — Ganzzahl (0–100), basierend auf:
# - Klarheit und Vollständigkeit der Berichtsangaben,
# - Stichhaltigkeit der Schlussfolgerung (inkl. Inferenz),
# - Konsistenz zwischen den Unterkategorien.
#- **subcategories**:
# - **Immer present** — **alle 8 Unterkategorien** müssen enthalten sein.
# - Jeder Wert ist entweder:
# - `null` (wenn keine ausreichende Information vorliegt), **oder**
# - eine Zahl ≤ jeweiliger Obergrenze (z.B. Ambulation ≤ 10.0).
# - Wenn die Unterkategorie plausibel inferiert werden kann (auch indirekt), gib einen sinnvollen Wert ab.
# - Beispiel: Wenn „Gang mit Krückstock auf ebenem Boden bis 200 m“ steht, setze `AMBULATION: 5.5`.
#
#### EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
#
## === INFERENCE FUNCTION (unchanged) ===
#def run_inference(patient_text):
# prompt = build_prompt(patient_text)
#
# start_time = time.time()
#
# try:
# response = client.chat.completions.create(
# messages=[
# {"role": "system", "content": "Du gibst EXKLUSIV gültiges JSON zurück — keine weiteren Erklärungen."}
# ] + [
# {"role": "user", "content": prompt}
# ],
# model=MODEL_NAME,
# max_tokens=2048,
# temperature=0.1,
# response_format={"type": "json_object"}
# )
#
# content = response.choices[0].message.content
#
# # Parse and validate JSON
# try:
# parsed = json.loads(content)
# except json.JSONDecodeError as e:
# print(f"⚠️ JSON parsing failed: {e}")
# print("Raw response:", content[:500])
# raise ValueError("Model did not return valid JSON")
#
# # Enforce required keys
# if "certainty_percent" not in parsed:
# print("⚠️ Missing 'certainty_percent' in output! Force-adding fallback.")
# parsed["certainty_percent"] = 0
# elif not isinstance(parsed["certainty_percent"], (int, float)):
# parsed["certainty_percent"] = int(parsed["certainty_percent"])
#
# # Clamp certainty to [0, 100]
# pct = parsed["certainty_percent"]
# parsed["certainty_percent"] = max(0, min(100, int(pct)))
#
# # Enforce EDSS rules
# if not parsed.get("klassifizierbar", False):
# if "EDSS" in parsed:
# del parsed["EDSS"]
# else:
# if "EDSS" not in parsed:
# print("⚠️ 'klassifizierbar' is true but EDSS missing — adding fallback.")
# parsed["EDSS"] = 7.0
#
# inference_time = time.time() - start_time
#
# return {
# "success": True,
# "result": parsed,
# "inference_time_sec": inference_time
# }
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
# return {
# "success": False,
# "error": str(e),
# "inference_time_sec": -1,
# "result": None
# }
#
## === BUILD PATIENT TEXT ===
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
## === MAIN LOOP (NEW: MULTI-ITERATION) ===
#if __name__ == "__main__":
# # Load data ONCE (to avoid repeated I/O overhead)
# df = pd.read_csv(INPUT_CSV, sep=';')
# total_rows = len(df)
# print(f"Loaded {total_rows} patient records.")
#
# for iteration in range(1, NUM_ITERATIONS + 1):
# print(f"\n{'='*60}")
# print(f"🔄 ITERATION {iteration}/{NUM_ITERATIONS}")
# print(f"{'='*60}")
#
# iteration_results = []
# start_iter = time.time()
#
# for idx, row in df.iterrows():
# print(f"\rRow {idx+1}/{total_rows} | Iter {iteration}", end='', flush=True)
# try:
# patient_text = build_patient_text(row)
# result = run_inference(patient_text)
#
# # Attach metadata
# if result["success"]:
# res = result["result"].copy() # avoid mutation
# res["iteration"] = iteration
# res["unique_id"] = row.get("unique_id", f"row_{idx}")
# res["MedDatum"] = row.get("MedDatum", None)
# result["result"] = res
#
# else:
# result["iteration"] = iteration
# result["unique_id"] = row.get("unique_id", f"row_{idx}")
# result["MedDatum"] = row.get("MedDatum", None)
#
# iteration_results.append(result)
#
# if result["success"]:
# res = result["result"]
# edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
# print(f" ✅ EDSS={edss}, cert={res.get('certainty_percent', '?')}%")
# else:
# print(f" ❌ {result.get('error', 'Unknown')}")
#
# except Exception as e:
# print(f"\n⚠️ Row {idx} failed: {e}")
# iteration_results.append({
# "success": False,
# "error": str(e),
# "iteration": iteration,
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
# "result": None
# })
# if STOP_ON_FIRST_ERROR:
# break
#
# # Save per-iteration results
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# output_path = INPUT_CSV.replace(".csv", f"_results_iter_{iteration}_{timestamp}.json")
# with open(output_path, 'w', encoding='utf-8') as f:
# json.dump(iteration_results, f, indent=2, ensure_ascii=False)
# print(f"\n✅ Iteration {iteration} complete. Saved to: {output_path}")
#
# elapsed = time.time() - start_iter
# print(f"⏱️ Iteration {iteration} took {elapsed:.1f}s ({elapsed/total_rows:.1f}s/row)")
#
# print(f"\n🎉 All {NUM_ITERATIONS} iterations completed!")
#
#
##
# %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark
#
#import time
#import json
#import os
#import re
#import threading
#from datetime import datetime
#from pathlib import Path
#
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
#try:
# import psutil
#except ImportError:
# psutil = None
# print("⚠️ psutil is not installed. Resource metrics will be limited.")
# print("Install with: pip install psutil")
#
#
## =========================
## CONFIGURATION
## =========================
#
#load_dotenv()
#
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#
#MODEL_NAMES = [
# # "GPT-OSS-120B",
# "qwen3.6-27b",
# # "gemma-4-31B-it",
#]
#
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#
#RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark"
#
#NUM_ITERATIONS = 20
#STOP_ON_FIRST_ERROR = False
#
#MAX_TOKENS = 2048
#TEMPERATURE = 0.1
#
## Memory sampling interval during one inference call
#RESOURCE_SAMPLE_INTERVAL_SEC = 0.05
#
#
## =========================
## CLIENT
## =========================
#
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
#
## =========================
## HELPERS
## =========================
#
#def safe_dir_name(name: str) -> str:
# """
# Convert model name to a filesystem-safe directory name.
# """
# name = str(name).strip()
# name = re.sub(r"[^\w\-.]+", "_", name)
# return name[:150]
#
#
#def now_timestamp() -> str:
# return datetime.now().strftime("%Y%m%d_%H%M%S")
#
#
#def get_process():
# if psutil is None:
# return None
# return psutil.Process(os.getpid())
#
#
#def get_memory_rss_mb(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# return process.memory_info().rss / (1024 * 1024)
#
#
#def get_cpu_times_sec(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# cpu_times = process.cpu_times()
# return cpu_times.user + cpu_times.system
#
#
#class ResourceSampler:
# """
# Samples process RSS while an inference call is running.
# Useful for approximating peak memory usage per request.
# """
#
# def __init__(self, interval_sec=0.05):
# self.interval_sec = interval_sec
# self.process = get_process()
# self.running = False
# self.thread = None
# self.samples_mb = []
#
# def start(self):
# if psutil is None:
# return
#
# self.running = True
# self.samples_mb = []
# self.thread = threading.Thread(target=self._sample_loop, daemon=True)
# self.thread.start()
#
# def stop(self):
# if psutil is None:
# return
#
# self.running = False
# if self.thread is not None:
# self.thread.join(timeout=1.0)
#
# def _sample_loop(self):
# while self.running:
# try:
# rss_mb = get_memory_rss_mb(self.process)
# self.samples_mb.append(rss_mb)
# except Exception:
# pass
# time.sleep(self.interval_sec)
#
# @property
# def peak_rss_mb(self):
# if not self.samples_mb:
# return None
# return max(self.samples_mb)
#
#
## =========================
## READ INSTRUCTIONS
## =========================
#
#with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
#
## =========================
## PROMPT
## =========================
#
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale), alle Unterkategorien und die Bewertungssicherheit aus klinischen Berichten zu extrahieren.
#
#### Deine Aufgabe:
#1. Analysiere den Patientenbericht und extrahiere:
# - Den Gesamt-EDSS-Score (0.0–10.0)
# - Alle 8 EDSS-Unterkategorien (mit jeweils eigener Maximalpunktzahl)
#2. Schätze für jede Entscheidung die Sicherheit als Ganzzahl von 0–100 % ein.
#
#### Struktur der JSON-Ausgabe (VERPFLICHTEND):
#Gib NUR gültiges JSON zurück — kein Markdown, kein Text davor/dahinter.
#
#{{
# "reason": "Kernaussage zur EDSS-Begründung (max. 400 Zeichen, auf Deutsch).",
# "klassifizierbar": true/false,
# "EDSS": null ODER Zahl zwischen 0.0 und 10.0 (nur wenn klassifizierbar=true),
# "certainty_percent": 0 ODER Zahl zwischen 0 und 100 (Ganzzahl),
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BRAINSTEM_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "PYRAMIDAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBELLAR_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "SENSORY_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBRAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "AMBULATION": null ODER Zahl zwischen 0.0 und 10.0
# }}
#}}
#
#### Regeln:
#- **reason**: Kurze, prägnante Begründung (auf Deutsch, max. 400 Zeichen), warum du den EDSS-Wert und die Unterkategorien so bewertest.
#- **klassifizierbar**:
# - `true`, wenn EDSS und mindestens die wichtigsten Unterkategorien *eindeutig ableitbar* oder *plausibel inferierbar* sind.
# - `false`, **nur**, wenn keine relevanten Daten vorliegen, oder diese so widersprüchlich/inkonsistent sind, dass keine vernünftige Einschätzung möglich ist.
#- **EDSS**:
# - **VERPFLICHTEND**, wenn `klassifizierbar=true`.
# - Zahl zwischen 0.0 und 10.0 (z.B. 3.0, 5.5). Darf **nicht** erscheinen, wenn `klassifizierbar=false`.
#- **certainty_percent**:
# - **Immer present** — Ganzzahl (0–100), basierend auf:
# - Klarheit und Vollständigkeit der Berichtsangaben,
# - Stichhaltigkeit der Schlussfolgerung (inkl. Inferenz),
# - Konsistenz zwischen den Unterkategorien.
#- **subcategories**:
# - **Immer present** — **alle 8 Unterkategorien** müssen enthalten sein.
# - Jeder Wert ist entweder:
# - `null` (wenn keine ausreichende Information vorliegt), **oder**
# - eine Zahl ≤ jeweiliger Obergrenze (z.B. Ambulation ≤ 10.0).
# - Wenn die Unterkategorie plausibel inferiert werden kann (auch indirekt), gib einen sinnvollen Wert ab.
# - Beispiel: Wenn „Gang mit Krückstock auf ebenem Boden bis 200 m“ steht, setze `AMBULATION: 5.5`.
#
#### EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
#
#
## =========================
## VALIDATION / NORMALIZATION
## =========================
#
#def normalize_model_output(parsed):
# """
# Preserves the earlier validation behavior, with a few extra safety checks.
# """
#
# if not isinstance(parsed, dict):
# raise ValueError("Parsed model output is not a JSON object")
#
# if "certainty_percent" not in parsed:
# print("⚠️ Missing 'certainty_percent' in output. Force-adding fallback.")
# parsed["certainty_percent"] = 0
# elif not isinstance(parsed["certainty_percent"], (int, float)):
# parsed["certainty_percent"] = int(parsed["certainty_percent"])
#
# parsed["certainty_percent"] = max(0, min(100, int(parsed["certainty_percent"])))
#
# if "klassifizierbar" not in parsed:
# parsed["klassifizierbar"] = False
#
# if not parsed.get("klassifizierbar", False):
# parsed.pop("EDSS", None)
# else:
# if "EDSS" not in parsed:
# print("⚠️ 'klassifizierbar' is true but EDSS missing — adding fallback.")
# parsed["EDSS"] = 7.0
#
# required_subcategories = {
# "VISUAL_OPTIC_FUNCTIONS": 6.0,
# "BRAINSTEM_FUNCTIONS": 6.0,
# "PYRAMIDAL_FUNCTIONS": 6.0,
# "CEREBELLAR_FUNCTIONS": 6.0,
# "SENSORY_FUNCTIONS": 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": 6.0,
# "CEREBRAL_FUNCTIONS": 6.0,
# "AMBULATION": 10.0,
# }
#
# if "subcategories" not in parsed or not isinstance(parsed["subcategories"], dict):
# parsed["subcategories"] = {}
#
# for key in required_subcategories:
# if key not in parsed["subcategories"]:
# parsed["subcategories"][key] = None
#
# return parsed
#
#
## =========================
## INFERENCE FUNCTION
## =========================
#
#def run_inference(patient_text, model_name):
# prompt = build_prompt(patient_text)
#
# process = get_process()
# sampler = ResourceSampler(interval_sec=RESOURCE_SAMPLE_INTERVAL_SEC)
#
# wall_start = time.perf_counter()
# cpu_start = get_cpu_times_sec(process)
# rss_start_mb = get_memory_rss_mb(process)
#
# sampler.start()
#
# try:
# response = client.chat.completions.create(
# messages=[
# {
# "role": "system",
# "content": "Du gibst EXKLUSIV gültiges JSON zurück — keine weiteren Erklärungen."
# },
# {
# "role": "user",
# "content": prompt
# }
# ],
# model=model_name,
# max_tokens=MAX_TOKENS,
# temperature=TEMPERATURE,
# response_format={"type": "json_object"}
# )
#
# content = response.choices[0].message.content
#
# try:
# parsed = json.loads(content)
# except json.JSONDecodeError as e:
# print(f"⚠️ JSON parsing failed: {e}")
# print("Raw response:", content[:500])
# raise ValueError("Model did not return valid JSON")
#
# parsed = normalize_model_output(parsed)
#
# usage = getattr(response, "usage", None)
#
# if usage is not None:
# prompt_tokens = getattr(usage, "prompt_tokens", None)
# completion_tokens = getattr(usage, "completion_tokens", None)
# total_tokens = getattr(usage, "total_tokens", None)
# else:
# prompt_tokens = None
# completion_tokens = None
# total_tokens = None
#
# success = True
# error = None
# result = parsed
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
#
# success = False
# error = str(e)
# result = None
# prompt_tokens = None
# completion_tokens = None
# total_tokens = None
#
# finally:
# sampler.stop()
#
# wall_end = time.perf_counter()
# cpu_end = get_cpu_times_sec(process)
# rss_end_mb = get_memory_rss_mb(process)
#
# wall_time_sec = wall_end - wall_start
#
# if cpu_start is not None and cpu_end is not None:
# process_cpu_time_sec = cpu_end - cpu_start
# else:
# process_cpu_time_sec = None
#
# if rss_start_mb is not None and rss_end_mb is not None:
# rss_delta_mb = rss_end_mb - rss_start_mb
# else:
# rss_delta_mb = None
#
# return {
# "success": success,
# "error": error,
# "result": result,
#
# "model": model_name,
#
# # Main inference timing
# "inference_time_sec": wall_time_sec,
#
# # Resource metrics
# "process_cpu_time_sec": process_cpu_time_sec,
# "rss_before_mb": rss_start_mb,
# "rss_after_mb": rss_end_mb,
# "rss_delta_mb": rss_delta_mb,
# "peak_rss_mb": sampler.peak_rss_mb,
#
# # Token metrics, when available
# "prompt_tokens": prompt_tokens,
# "completion_tokens": completion_tokens,
# "total_tokens": total_tokens,
# }
#
#
## =========================
## BUILD PATIENT TEXT
## =========================
#
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
#
## =========================
## FLATTEN RESULTS FOR CSV
## =========================
#
#def flatten_result(record):
# """
# Converts one result record to a flat row for CSV export.
# """
#
# flat = {
# "model": record.get("model"),
# "iteration": record.get("iteration"),
# "row_index": record.get("row_index"),
# "unique_id": record.get("unique_id"),
# "MedDatum": record.get("MedDatum"),
#
# "success": record.get("success"),
# "error": record.get("error"),
#
# "inference_time_sec": record.get("inference_time_sec"),
# "process_cpu_time_sec": record.get("process_cpu_time_sec"),
# "rss_before_mb": record.get("rss_before_mb"),
# "rss_after_mb": record.get("rss_after_mb"),
# "rss_delta_mb": record.get("rss_delta_mb"),
# "peak_rss_mb": record.get("peak_rss_mb"),
#
# "prompt_tokens": record.get("prompt_tokens"),
# "completion_tokens": record.get("completion_tokens"),
# "total_tokens": record.get("total_tokens"),
# }
#
# result = record.get("result")
#
# if isinstance(result, dict):
# flat["reason"] = result.get("reason")
# flat["klassifizierbar"] = result.get("klassifizierbar")
# flat["EDSS"] = result.get("EDSS")
# flat["certainty_percent"] = result.get("certainty_percent")
#
# subcats = result.get("subcategories", {})
# if isinstance(subcats, dict):
# for key, value in subcats.items():
# flat[f"subcat_{key}"] = value
#
# return flat
#
#
#def summarize_records(records):
# """
# Creates summary statistics for one model over all iterations.
# """
#
# df = pd.DataFrame([flatten_result(r) for r in records])
#
# if df.empty:
# return pd.DataFrame()
#
# summary = {
# "model": df["model"].iloc[0] if "model" in df.columns else None,
# "n_records": len(df),
# "n_success": int(df["success"].sum()) if "success" in df.columns else None,
# "n_failed": int((~df["success"]).sum()) if "success" in df.columns else None,
# "success_rate": float(df["success"].mean()) if "success" in df.columns else None,
# }
#
# numeric_cols = [
# "inference_time_sec",
# "process_cpu_time_sec",
# "rss_delta_mb",
# "peak_rss_mb",
# "prompt_tokens",
# "completion_tokens",
# "total_tokens",
# "certainty_percent",
# "EDSS",
# ]
#
# for col in numeric_cols:
# if col in df.columns:
# values = pd.to_numeric(df[col], errors="coerce")
# summary[f"{col}_mean"] = values.mean()
# summary[f"{col}_median"] = values.median()
# summary[f"{col}_std"] = values.std()
# summary[f"{col}_min"] = values.min()
# summary[f"{col}_max"] = values.max()
#
# return pd.DataFrame([summary])
#
#
## =========================
## MAIN LOOP
## =========================
#
#if __name__ == "__main__":
#
# run_timestamp = now_timestamp()
#
# results_root = Path(RESULTS_ROOT)
# results_root.mkdir(parents=True, exist_ok=True)
#
# run_root = results_root / f"run_{run_timestamp}"
# run_root.mkdir(parents=True, exist_ok=True)
#
# print(f"Results root: {run_root}")
#
# df = pd.read_csv(INPUT_CSV, sep=";")
# total_rows = len(df)
#
# print(f"Loaded {total_rows} patient records.")
# print(f"Models: {MODEL_NAMES}")
# print(f"Iterations per model: {NUM_ITERATIONS}")
#
# all_model_summaries = []
#
# for model_name in MODEL_NAMES:
# safe_model = safe_dir_name(model_name)
# model_dir = run_root / safe_model
# model_dir.mkdir(parents=True, exist_ok=True)
#
# print(f"\n{'#' * 80}")
# print(f"MODEL: {model_name}")
# print(f"Saving to: {model_dir}")
# print(f"{'#' * 80}")
#
# model_records = []
# model_start = time.perf_counter()
#
# for iteration in range(1, NUM_ITERATIONS + 1):
# print(f"\n{'=' * 60}")
# print(f"🔄 MODEL {model_name} | ITERATION {iteration}/{NUM_ITERATIONS}")
# print(f"{'=' * 60}")
#
# iteration_results = []
# iteration_start = time.perf_counter()
#
# for idx, row in df.iterrows():
# print(
# f"\rModel={model_name} | Row {idx + 1}/{total_rows} | Iter {iteration}",
# end="",
# flush=True
# )
#
# try:
# patient_text = build_patient_text(row)
# record = run_inference(patient_text, model_name=model_name)
#
# record["iteration"] = iteration
# record["row_index"] = int(idx)
# record["unique_id"] = row.get("unique_id", f"row_{idx}")
# record["MedDatum"] = row.get("MedDatum", None)
#
# iteration_results.append(record)
# model_records.append(record)
#
# if record["success"]:
# res = record["result"]
# edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
# print(
# f" ✅ EDSS={edss}, "
# f"cert={res.get('certainty_percent', '?')}%, "
# f"time={record['inference_time_sec']:.2f}s"
# )
# else:
# print(f" ❌ {record.get('error', 'Unknown error')}")
#
# except Exception as e:
# print(f"\n⚠️ Row {idx} failed outside inference wrapper: {e}")
#
# fallback_record = {
# "success": False,
# "error": str(e),
# "result": None,
#
# "model": model_name,
# "iteration": iteration,
# "row_index": int(idx),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
#
# "inference_time_sec": None,
# "process_cpu_time_sec": None,
# "rss_before_mb": None,
# "rss_after_mb": None,
# "rss_delta_mb": None,
# "peak_rss_mb": None,
#
# "prompt_tokens": None,
# "completion_tokens": None,
# "total_tokens": None,
# }
#
# iteration_results.append(fallback_record)
# model_records.append(fallback_record)
#
# if STOP_ON_FIRST_ERROR:
# break
#
# iteration_elapsed = time.perf_counter() - iteration_start
#
# # Save per-iteration JSON
# iter_json_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.json"
# with open(iter_json_path, "w", encoding="utf-8") as f:
# json.dump(iteration_results, f, indent=2, ensure_ascii=False)
#
# # Save per-iteration CSV
# iter_csv_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.csv"
# iter_flat_df = pd.DataFrame([flatten_result(r) for r in iteration_results])
# iter_flat_df.to_csv(iter_csv_path, index=False)
#
# print(f"\n✅ Iteration {iteration} complete.")
# print(f"JSON saved to: {iter_json_path}")
# print(f"CSV saved to: {iter_csv_path}")
# print(
# f"⏱️ Iteration time: {iteration_elapsed:.1f}s "
# f"({iteration_elapsed / max(total_rows, 1):.2f}s/row)"
# )
#
# model_elapsed = time.perf_counter() - model_start
#
# # Save all records for this model
# model_json_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.json"
# with open(model_json_path, "w", encoding="utf-8") as f:
# json.dump(model_records, f, indent=2, ensure_ascii=False)
#
# model_csv_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.csv"
# model_flat_df = pd.DataFrame([flatten_result(r) for r in model_records])
# model_flat_df.to_csv(model_csv_path, index=False)
#
# # Save model summary
# model_summary_df = summarize_records(model_records)
# model_summary_df["model_total_wall_time_sec"] = model_elapsed
# model_summary_df["model_total_wall_time_min"] = model_elapsed / 60
#
# model_summary_path = model_dir / f"{safe_model}_summary_{run_timestamp}.csv"
# model_summary_df.to_csv(model_summary_path, index=False)
#
# all_model_summaries.append(model_summary_df)
#
# print(f"\n🎉 Model completed: {model_name}")
# print(f"All JSON: {model_json_path}")
# print(f"All CSV: {model_csv_path}")
# print(f"Summary: {model_summary_path}")
# print(f"Total model time: {model_elapsed / 60:.2f} min")
#
# # Save combined model summaries
# if all_model_summaries:
# combined_summary_df = pd.concat(all_model_summaries, ignore_index=True)
# combined_summary_path = run_root / f"all_models_summary_{run_timestamp}.csv"
# combined_summary_df.to_csv(combined_summary_path, index=False)
#
# print(f"\n📊 Combined summary saved to: {combined_summary_path}")
#
# print(f"\n🎉 All models and all iterations completed!")
#
##
# %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark
#import time
#import json
#import os
#import re
#import threading
#from datetime import datetime
#from pathlib import Path
#
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
#try:
# import psutil
#except ImportError:
# psutil = None
# print("⚠️ psutil is not installed. Resource metrics will be limited.")
# print("Install with: pip install psutil")
#
#
## =========================
## CONFIGURATION
## =========================
#
#load_dotenv()
#
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#
#MODEL_CONFIGS = [
# {
# "model_name": "qwen3.6-35b-a3b",
# "use_response_format": False,
# "temperature": 0.0,
# "max_tokens": 4096,
#
# # If your backend is vLLM / Qwen chat-template compatible,
# # this may reduce long hidden reasoning and JSON truncation.
# # If your server errors because of extra_body, set this to None.
# "extra_body": {
# "chat_template_kwargs": {
# "enable_thinking": False
# }
# },
# },
# {
# "model_name": "gemma-4-31B-it",
# "use_response_format": False,
# "temperature": 0.0,
# "max_tokens": 4096,
# "extra_body": None,
# },
# # {
# # "model_name": "GPT-OSS-120B",
# # "use_response_format": True,
# # "temperature": 0.0,
# # "max_tokens": 4096,
# # "extra_body": None,
# # },
#]
#
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#
#RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results_edss_benchmark"
#
#NUM_ITERATIONS = 10
#STOP_ON_FIRST_ERROR = False
#
## For testing, set to e.g. 2.
## For full run, set to None.
#MAX_ROWS = 2
## MAX_ROWS = None
#
#MAX_TOKENS = 4096
#TEMPERATURE = 0.0
#
#RESOURCE_SAMPLE_INTERVAL_SEC = 0.05
#
#SAVE_EVERY_N_ROWS = 1
#
## Retries for invalid JSON / truncated JSON
#MAX_JSON_RETRIES = 2
#RETRY_SLEEP_SEC = 2
#
#
## =========================
## CLIENT
## =========================
#
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
#
## =========================
## HELPERS
## =========================
#
#def safe_dir_name(name: str) -> str:
# name = str(name).strip()
# name = re.sub(r"[^\w\-.]+", "_", name)
# return name[:150]
#
#
#def now_timestamp() -> str:
# return datetime.now().strftime("%Y%m%d_%H%M%S")
#
#
#def get_process():
# if psutil is None:
# return None
# return psutil.Process(os.getpid())
#
#
#def get_memory_rss_mb(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# return process.memory_info().rss / (1024 * 1024)
#
#
#def get_cpu_times_sec(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# cpu_times = process.cpu_times()
# return cpu_times.user + cpu_times.system
#
#
#class ResourceSampler:
# def __init__(self, interval_sec=0.05):
# self.interval_sec = interval_sec
# self.process = get_process()
# self.running = False
# self.thread = None
# self.samples_mb = []
#
# def start(self):
# if psutil is None:
# return
#
# self.running = True
# self.samples_mb = []
# self.thread = threading.Thread(target=self._sample_loop, daemon=True)
# self.thread.start()
#
# def stop(self):
# if psutil is None:
# return
#
# self.running = False
# if self.thread is not None:
# self.thread.join(timeout=1.0)
#
# def _sample_loop(self):
# while self.running:
# try:
# rss_mb = get_memory_rss_mb(self.process)
# self.samples_mb.append(rss_mb)
# except Exception:
# pass
# time.sleep(self.interval_sec)
#
# @property
# def peak_rss_mb(self):
# if not self.samples_mb:
# return None
# return max(self.samples_mb)
#
#
## =========================
## JSON EXTRACTION
## =========================
#
#def extract_json_from_text(text):
# if text is None:
# raise ValueError("Model returned empty content: message.content is None")
#
# text = str(text).strip()
#
# if not text:
# raise ValueError("Model returned empty content")
#
# text = (
# text.replace("```json", "")
# .replace("```JSON", "")
# .replace("```Json", "")
# .replace("```", "")
# .strip()
# )
#
# # Direct parse
# try:
# parsed = json.loads(text)
# if isinstance(parsed, dict):
# return parsed
# except json.JSONDecodeError:
# pass
#
# # Balanced JSON candidates
# candidates = []
# stack = []
# start_idx = None
# in_string = False
# escape = False
#
# for i, ch in enumerate(text):
# if escape:
# escape = False
# continue
#
# if ch == "\\":
# escape = True
# continue
#
# if ch == '"':
# in_string = not in_string
# continue
#
# if in_string:
# continue
#
# if ch == "{":
# if not stack:
# start_idx = i
# stack.append(ch)
#
# elif ch == "}":
# if stack:
# stack.pop()
# if not stack and start_idx is not None:
# candidates.append(text[start_idx:i + 1])
# start_idx = None
#
# valid_objects = []
#
# for candidate in candidates:
# candidate = candidate.strip()
# lowered = candidate.lower()
#
# invalid_markers = [
# "true/false",
# "null or",
# "oder zahl",
# "0.0-6.0",
# "0.0-10.0",
# "zahl zwischen",
# "...",
# ]
#
# if any(marker in lowered for marker in invalid_markers):
# continue
#
# try:
# parsed = json.loads(candidate)
# if isinstance(parsed, dict):
# valid_objects.append(parsed)
# except json.JSONDecodeError:
# continue
#
# for obj in reversed(valid_objects):
# if (
# "klassifizierbar" in obj
# and "certainty_percent" in obj
# and "subcategories" in obj
# ):
# return obj
#
# if valid_objects:
# return valid_objects[-1]
#
# # Check if it looks like truncated JSON
# stripped = text.strip()
# if stripped.startswith("{") and not stripped.endswith("}"):
# raise ValueError(
# "Model output looks like truncated JSON. "
# f"Raw output starts with: {text[:1000]}"
# )
#
# raise ValueError(
# "No valid JSON object found in model output. "
# f"Raw output starts with: {text[:1000]}"
# )
#
#
#def extract_message_content(message):
# raw_content = getattr(message, "content", None)
#
# if raw_content is not None:
# return raw_content
#
# msg_dict = None
#
# try:
# msg_dict = message.model_dump()
# except Exception:
# try:
# msg_dict = dict(message)
# except Exception:
# msg_dict = None
#
# if not isinstance(msg_dict, dict):
# return None
#
# for key in ["content", "reasoning_content", "reasoning", "text", "output_text"]:
# value = msg_dict.get(key)
# if value:
# return value
#
# possible_content = msg_dict.get("content")
# if isinstance(possible_content, list):
# parts = []
# for block in possible_content:
# if isinstance(block, dict):
# if "text" in block:
# parts.append(str(block["text"]))
# elif "content" in block:
# parts.append(str(block["content"]))
# if parts:
# return "\n".join(parts).strip()
#
# return None
#
#
## =========================
## READ INSTRUCTIONS
## =========================
#
#with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
#
## =========================
## PROMPT
## =========================
#
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
#
#Extrahiere:
#1. Gesamt-EDSS-Score von 0.0 bis 10.0
#2. Alle 8 EDSS-Unterkategorien
#3. Sicherheit als Ganzzahl von 0 bis 100
#
#Antworte ausschließlich mit EINEM validen JSON-Objekt.
#Kein Markdown.
#Keine Code-Fences.
#Kein Text vor oder nach JSON.
#Keine Platzhalter.
#Kopiere kein Schema.
#
#Das JSON muss exakt diese Schlüssel enthalten:
#- reason
#- klassifizierbar
#- EDSS
#- certainty_percent
#- subcategories
#
#Die subcategories müssen exakt diese 8 Schlüssel enthalten:
#- VISUAL_OPTIC_FUNCTIONS
#- BRAINSTEM_FUNCTIONS
#- PYRAMIDAL_FUNCTIONS
#- CEREBELLAR_FUNCTIONS
#- SENSORY_FUNCTIONS
#- BOWEL_AND_BLADDER_FUNCTIONS
#- CEREBRAL_FUNCTIONS
#- AMBULATION
#
#Werte:
#- klassifizierbar: true oder false
#- EDSS: Zahl von 0.0 bis 10.0 oder null
#- certainty_percent: Ganzzahl von 0 bis 100
#- Unterkategorien: Zahl oder null
#- VISUAL_OPTIC_FUNCTIONS maximal 6.0
#- BRAINSTEM_FUNCTIONS maximal 6.0
#- PYRAMIDAL_FUNCTIONS maximal 6.0
#- CEREBELLAR_FUNCTIONS maximal 6.0
#- SENSORY_FUNCTIONS maximal 6.0
#- BOWEL_AND_BLADDER_FUNCTIONS maximal 6.0
#- CEREBRAL_FUNCTIONS maximal 6.0
#- AMBULATION maximal 10.0
#- reason: maximal 250 Zeichen, Deutsch
#
#Wenn klassifizierbar false ist, setze EDSS auf null.
#
#Valide Beispielausgabe:
#{{
# "reason": "Leichte Einschränkungen mit sicher ableitbarer Gehfähigkeit und geringen funktionellen Defiziten.",
# "klassifizierbar": true,
# "EDSS": 2.0,
# "certainty_percent": 90,
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null,
# "BRAINSTEM_FUNCTIONS": null,
# "PYRAMIDAL_FUNCTIONS": 1.0,
# "CEREBELLAR_FUNCTIONS": 1.0,
# "SENSORY_FUNCTIONS": 1.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null,
# "CEREBRAL_FUNCTIONS": null,
# "AMBULATION": 0.0
# }}
#}}
#
#EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#
#Gib ausschließlich das finale JSON-Objekt zurück.
#'''
#
#
## =========================
## VALIDATION / NORMALIZATION
## =========================
#
#def normalize_model_output(parsed):
# if not isinstance(parsed, dict):
# raise ValueError("Parsed model output is not a JSON object")
#
# if "certainty_percent" not in parsed:
# parsed["certainty_percent"] = 0
# elif not isinstance(parsed["certainty_percent"], (int, float)):
# try:
# parsed["certainty_percent"] = int(parsed["certainty_percent"])
# except Exception:
# parsed["certainty_percent"] = 0
#
# parsed["certainty_percent"] = max(0, min(100, int(parsed["certainty_percent"])))
#
# if "klassifizierbar" not in parsed:
# parsed["klassifizierbar"] = False
#
# if not isinstance(parsed["klassifizierbar"], bool):
# if str(parsed["klassifizierbar"]).lower() in ["true", "1", "yes", "ja"]:
# parsed["klassifizierbar"] = True
# else:
# parsed["klassifizierbar"] = False
#
# if not parsed.get("klassifizierbar", False):
# parsed["EDSS"] = None
# else:
# if "EDSS" not in parsed or parsed["EDSS"] is None:
# parsed["EDSS"] = 7.0
# else:
# try:
# parsed["EDSS"] = float(parsed["EDSS"])
# parsed["EDSS"] = max(0.0, min(10.0, parsed["EDSS"]))
# except Exception:
# parsed["EDSS"] = 7.0
#
# required_subcategories = {
# "VISUAL_OPTIC_FUNCTIONS": 6.0,
# "BRAINSTEM_FUNCTIONS": 6.0,
# "PYRAMIDAL_FUNCTIONS": 6.0,
# "CEREBELLAR_FUNCTIONS": 6.0,
# "SENSORY_FUNCTIONS": 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": 6.0,
# "CEREBRAL_FUNCTIONS": 6.0,
# "AMBULATION": 10.0,
# }
#
# if "subcategories" not in parsed or not isinstance(parsed["subcategories"], dict):
# parsed["subcategories"] = {}
#
# for key, max_value in required_subcategories.items():
# value = parsed["subcategories"].get(key, None)
#
# if value is None:
# parsed["subcategories"][key] = None
# else:
# try:
# value = float(value)
# value = max(0.0, min(max_value, value))
# parsed["subcategories"][key] = value
# except Exception:
# parsed["subcategories"][key] = None
#
# parsed["subcategories"] = {
# key: parsed["subcategories"].get(key, None)
# for key in required_subcategories.keys()
# }
#
# if "reason" not in parsed or parsed["reason"] is None:
# parsed["reason"] = ""
#
# parsed["reason"] = str(parsed["reason"])[:250]
#
# return parsed
#
#
## =========================
## API CALL
## =========================
#
#def make_chat_completion(model_config, prompt):
# model_name = model_config["model_name"]
#
# kwargs = dict(
# messages=[
# {
# "role": "system",
# "content": (
# "Du bist ein JSON-Generator. "
# "Antworte ausschließlich mit einem einzigen validen JSON-Objekt. "
# "Keine Erklärung. Kein Markdown. Keine Code-Fences. "
# "Keine Platzhalter. Kein Schema kopieren. "
# "Das JSON muss vollständig geschlossen sein."
# )
# },
# {
# "role": "user",
# "content": prompt
# }
# ],
# model=model_name,
# max_tokens=model_config.get("max_tokens", MAX_TOKENS),
# temperature=model_config.get("temperature", TEMPERATURE),
# )
#
# if model_config.get("use_response_format", False):
# kwargs["response_format"] = {"type": "json_object"}
#
# extra_body = model_config.get("extra_body")
# if extra_body is not None:
# kwargs["extra_body"] = extra_body
#
# return client.chat.completions.create(**kwargs)
#
#
## =========================
## INFERENCE FUNCTION WITH RETRIES
## =========================
#
#def run_inference(patient_text, model_config):
# model_name = model_config["model_name"]
# prompt = build_prompt(patient_text)
#
# process = get_process()
# sampler = ResourceSampler(interval_sec=RESOURCE_SAMPLE_INTERVAL_SEC)
#
# wall_start = time.perf_counter()
# cpu_start = get_cpu_times_sec(process)
# rss_start_mb = get_memory_rss_mb(process)
#
# sampler.start()
#
# raw_content = None
# raw_response_debug = None
# last_error = None
# prompt_tokens = None
# completion_tokens = None
# total_tokens = None
#
# try:
# for attempt in range(1, MAX_JSON_RETRIES + 2):
# try:
# response = make_chat_completion(
# model_config=model_config,
# prompt=prompt
# )
#
# message = response.choices[0].message
# raw_content = extract_message_content(message)
#
# try:
# raw_response_debug = response.model_dump()
# except Exception:
# raw_response_debug = str(response)
#
# usage = getattr(response, "usage", None)
# if usage is not None:
# prompt_tokens = getattr(usage, "prompt_tokens", None)
# completion_tokens = getattr(usage, "completion_tokens", None)
# total_tokens = getattr(usage, "total_tokens", None)
#
# parsed = extract_json_from_text(raw_content)
# parsed = normalize_model_output(parsed)
#
# success = True
# error = None
# result = parsed
# break
#
# except Exception as e:
# last_error = str(e)
#
# if attempt <= MAX_JSON_RETRIES:
# print(
# f"\n⚠️ JSON failed on attempt {attempt}. "
# f"Retrying row. Error: {last_error[:300]}"
# )
# time.sleep(RETRY_SLEEP_SEC)
# continue
#
# raise
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
#
# success = False
# error = str(e)
# result = None
#
# finally:
# sampler.stop()
#
# wall_end = time.perf_counter()
# cpu_end = get_cpu_times_sec(process)
# rss_end_mb = get_memory_rss_mb(process)
#
# wall_time_sec = wall_end - wall_start
#
# if cpu_start is not None and cpu_end is not None:
# process_cpu_time_sec = cpu_end - cpu_start
# else:
# process_cpu_time_sec = None
#
# if rss_start_mb is not None and rss_end_mb is not None:
# rss_delta_mb = rss_end_mb - rss_start_mb
# else:
# rss_delta_mb = None
#
# return {
# "success": success,
# "error": error,
# "result": result,
#
# "model": model_name,
#
# "inference_time_sec": wall_time_sec,
#
# "process_cpu_time_sec": process_cpu_time_sec,
# "rss_before_mb": rss_start_mb,
# "rss_after_mb": rss_end_mb,
# "rss_delta_mb": rss_delta_mb,
# "peak_rss_mb": sampler.peak_rss_mb,
#
# "prompt_tokens": prompt_tokens,
# "completion_tokens": completion_tokens,
# "total_tokens": total_tokens,
#
# "raw_content": raw_content if not success else None,
# "raw_response_debug": raw_response_debug if not success else None,
# "last_error": last_error,
# }
#
#
## =========================
## BUILD PATIENT TEXT
## =========================
#
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
#
## =========================
## FLATTEN RESULTS FOR CSV
## =========================
#
#def flatten_result(record):
# flat = {
# "model": record.get("model"),
# "iteration": record.get("iteration"),
# "row_index": record.get("row_index"),
# "row_number_in_run": record.get("row_number_in_run"),
# "unique_id": record.get("unique_id"),
# "MedDatum": record.get("MedDatum"),
#
# "success": record.get("success"),
# "error": record.get("error"),
# "last_error": record.get("last_error"),
#
# "inference_time_sec": record.get("inference_time_sec"),
# "process_cpu_time_sec": record.get("process_cpu_time_sec"),
# "rss_before_mb": record.get("rss_before_mb"),
# "rss_after_mb": record.get("rss_after_mb"),
# "rss_delta_mb": record.get("rss_delta_mb"),
# "peak_rss_mb": record.get("peak_rss_mb"),
#
# "prompt_tokens": record.get("prompt_tokens"),
# "completion_tokens": record.get("completion_tokens"),
# "total_tokens": record.get("total_tokens"),
#
# "raw_content": record.get("raw_content"),
# }
#
# result = record.get("result")
#
# if isinstance(result, dict):
# flat["reason"] = result.get("reason")
# flat["klassifizierbar"] = result.get("klassifizierbar")
# flat["EDSS"] = result.get("EDSS")
# flat["certainty_percent"] = result.get("certainty_percent")
#
# subcats = result.get("subcategories", {})
# if isinstance(subcats, dict):
# for key, value in subcats.items():
# flat[f"subcat_{key}"] = value
#
# return flat
#
#
#def summarize_records(records):
# df = pd.DataFrame([flatten_result(r) for r in records])
#
# if df.empty:
# return pd.DataFrame()
#
# success_series = df["success"].fillna(False).astype(bool)
#
# summary = {
# "model": df["model"].iloc[0] if "model" in df.columns else None,
# "n_records": len(df),
# "n_success": int(success_series.sum()),
# "n_failed": int((~success_series).sum()),
# "success_rate": float(success_series.mean()),
# }
#
# numeric_cols = [
# "inference_time_sec",
# "process_cpu_time_sec",
# "rss_delta_mb",
# "peak_rss_mb",
# "prompt_tokens",
# "completion_tokens",
# "total_tokens",
# "certainty_percent",
# "EDSS",
# ]
#
# for col in numeric_cols:
# if col in df.columns:
# values = pd.to_numeric(df[col], errors="coerce")
# summary[f"{col}_mean"] = values.mean()
# summary[f"{col}_median"] = values.median()
# summary[f"{col}_std"] = values.std()
# summary[f"{col}_min"] = values.min()
# summary[f"{col}_max"] = values.max()
#
# return pd.DataFrame([summary])
#
#
## =========================
## INCREMENTAL SAVE HELPERS
## =========================
#
#def append_jsonl(path, record):
# with open(path, "a", encoding="utf-8") as f:
# f.write(json.dumps(record, ensure_ascii=False) + "\n")
# f.flush()
# os.fsync(f.fileno())
#
#
#def append_csv(path, record):
# flat = flatten_result(record)
# df_one = pd.DataFrame([flat])
# file_exists = Path(path).exists()
# df_one.to_csv(path, mode="a", header=not file_exists, index=False)
#
#
## =========================
## MAIN LOOP
## =========================
#
#if __name__ == "__main__":
#
# run_timestamp = now_timestamp()
#
# results_root = Path(RESULTS_ROOT)
# results_root.mkdir(parents=True, exist_ok=True)
#
# run_root = results_root / f"run_{run_timestamp}"
# run_root.mkdir(parents=True, exist_ok=True)
#
# print(f"Results root: {run_root}")
#
# df = pd.read_csv(INPUT_CSV, sep=";")
#
# if MAX_ROWS is not None:
# df = df.head(MAX_ROWS)
#
# total_rows = len(df)
#
# model_names_for_print = [m["model_name"] for m in MODEL_CONFIGS]
#
# print(f"Loaded {total_rows} patient records.")
# print(f"Models: {model_names_for_print}")
# print(f"Iterations per model: {NUM_ITERATIONS}")
#
# all_model_summaries = []
#
# for model_config in MODEL_CONFIGS:
# model_name = model_config["model_name"]
# safe_model = safe_dir_name(model_name)
#
# model_dir = run_root / safe_model
# model_dir.mkdir(parents=True, exist_ok=True)
#
# print(f"\n{'#' * 80}")
# print(f"MODEL: {model_name}")
# print(f"use_response_format: {model_config.get('use_response_format', False)}")
# print(f"temperature: {model_config.get('temperature', TEMPERATURE)}")
# print(f"max_tokens: {model_config.get('max_tokens', MAX_TOKENS)}")
# print(f"Saving to: {model_dir}")
# print(f"{'#' * 80}")
#
# model_records = []
# model_start = time.perf_counter()
#
# for iteration in range(1, NUM_ITERATIONS + 1):
# print(f"\n{'=' * 60}")
# print(f"🔄 MODEL {model_name} | ITERATION {iteration}/{NUM_ITERATIONS}")
# print(f"{'=' * 60}")
#
# iteration_results = []
# iteration_start = time.perf_counter()
#
# incremental_jsonl_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.jsonl"
# incremental_csv_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.csv"
#
# print(f"Incremental JSONL: {incremental_jsonl_path}")
# print(f"Incremental CSV: {incremental_csv_path}")
#
# for loop_i, (idx, row) in enumerate(df.iterrows(), start=1):
# print(
# f"\rModel={model_name} | Row {loop_i}/{total_rows} | Iter {iteration}",
# end="",
# flush=True
# )
#
# try:
# patient_text = build_patient_text(row)
#
# record = run_inference(
# patient_text=patient_text,
# model_config=model_config
# )
#
# record["iteration"] = iteration
# record["row_index"] = int(idx)
# record["row_number_in_run"] = int(loop_i)
# record["unique_id"] = row.get("unique_id", f"row_{idx}")
# record["MedDatum"] = row.get("MedDatum", None)
#
# iteration_results.append(record)
# model_records.append(record)
#
# if loop_i % SAVE_EVERY_N_ROWS == 0:
# append_jsonl(incremental_jsonl_path, record)
# append_csv(incremental_csv_path, record)
#
# if record["success"]:
# res = record["result"]
# edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
# print(
# f" ✅ EDSS={edss}, "
# f"cert={res.get('certainty_percent', '?')}%, "
# f"time={record['inference_time_sec']:.2f}s"
# )
# else:
# print(f" ❌ {record.get('error', 'Unknown error')}")
#
# except Exception as e:
# print(f"\n⚠️ Row {idx} failed outside inference wrapper: {e}")
#
# fallback_record = {
# "success": False,
# "error": str(e),
# "last_error": str(e),
# "result": None,
#
# "model": model_name,
# "iteration": iteration,
# "row_index": int(idx),
# "row_number_in_run": int(loop_i),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
#
# "inference_time_sec": None,
# "process_cpu_time_sec": None,
# "rss_before_mb": None,
# "rss_after_mb": None,
# "rss_delta_mb": None,
# "peak_rss_mb": None,
#
# "prompt_tokens": None,
# "completion_tokens": None,
# "total_tokens": None,
#
# "raw_content": None,
# "raw_response_debug": None,
# }
#
# iteration_results.append(fallback_record)
# model_records.append(fallback_record)
#
# append_jsonl(incremental_jsonl_path, fallback_record)
# append_csv(incremental_csv_path, fallback_record)
#
# if STOP_ON_FIRST_ERROR:
# break
#
# iteration_elapsed = time.perf_counter() - iteration_start
#
# iter_json_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.json"
# with open(iter_json_path, "w", encoding="utf-8") as f:
# json.dump(iteration_results, f, indent=2, ensure_ascii=False)
#
# iter_csv_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.csv"
# iter_flat_df = pd.DataFrame([flatten_result(r) for r in iteration_results])
# iter_flat_df.to_csv(iter_csv_path, index=False)
#
# print(f"\n✅ Iteration {iteration} complete.")
# print(f"Incremental JSONL saved to: {incremental_jsonl_path}")
# print(f"Incremental CSV saved to: {incremental_csv_path}")
# print(f"Final JSON saved to: {iter_json_path}")
# print(f"Final CSV saved to: {iter_csv_path}")
# print(
# f"⏱️ Iteration time: {iteration_elapsed:.1f}s "
# f"({iteration_elapsed / max(total_rows, 1):.2f}s/row)"
# )
#
# model_elapsed = time.perf_counter() - model_start
#
# model_json_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.json"
# with open(model_json_path, "w", encoding="utf-8") as f:
# json.dump(model_records, f, indent=2, ensure_ascii=False)
#
# model_csv_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.csv"
# model_flat_df = pd.DataFrame([flatten_result(r) for r in model_records])
# model_flat_df.to_csv(model_csv_path, index=False)
#
# model_summary_df = summarize_records(model_records)
# model_summary_df["model_total_wall_time_sec"] = model_elapsed
# model_summary_df["model_total_wall_time_min"] = model_elapsed / 60
#
# model_summary_path = model_dir / f"{safe_model}_summary_{run_timestamp}.csv"
# model_summary_df.to_csv(model_summary_path, index=False)
#
# all_model_summaries.append(model_summary_df)
#
# print(f"\n🎉 Model completed: {model_name}")
# print(f"All JSON: {model_json_path}")
# print(f"All CSV: {model_csv_path}")
# print(f"Summary: {model_summary_path}")
# print(f"Total model time: {model_elapsed / 60:.2f} min")
#
# if all_model_summaries:
# combined_summary_df = pd.concat(all_model_summaries, ignore_index=True)
# combined_summary_path = run_root / f"all_models_summary_{run_timestamp}.csv"
# combined_summary_df.to_csv(combined_summary_path, index=False)
#
# print(f"\n📊 Combined summary saved to: {combined_summary_path}")
#
# print(f"\n🎉 All models and all iterations completed!")
##
# %% API call - Multi-model, multi-iteration EDSS + timing/resource benchmark2
#
#import time
#import json
#import os
#import re
#import threading
#from datetime import datetime
#from pathlib import Path
#
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
#try:
# import psutil
#except ImportError:
# psutil = None
# print("⚠️ psutil is not installed. Resource metrics will be limited.")
# print("Install with: pip install psutil")
#
#
## =========================
## CONFIGURATION
## =========================
#
#load_dotenv()
#
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#
#MODEL_CONFIGS = [
# {
# "model_name": "gpt-oss-120b",
# "use_response_format": True,
# "temperature": 0.0,
# "max_tokens": 4096,
# "extra_body": None,
# },
# {
# "model_name": "qwen3.6-27b",
# "use_response_format": False,
# "temperature": 0.0,
# "max_tokens": 4096,
# "extra_body": {
# "chat_template_kwargs": {
# "enable_thinking": False
# }
# },
# },
# {
# "model_name": "gemma-4-31B-it",
# "use_response_format": False,
# "temperature": 0.0,
# "max_tokens": 4096,
# "extra_body": None,
# },
# ]
#
#INPUT_CSV ="/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt"
#
#RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs"
#
#NUM_ITERATIONS = 10
#STOP_ON_FIRST_ERROR = False
#
## For testing, set to e.g. 2.
## For full run, set to None.
#MAX_ROWS = None
## MAX_ROWS = 2
#
#MAX_TOKENS = 4096
#TEMPERATURE = 0.0
#
#RESOURCE_SAMPLE_INTERVAL_SEC = 0.05
#
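## NOTE(review): keep this at 1. The main loop appends only the record whose
## row number is a multiple of this value, so any value > 1 leaves the rows in
## between out of the incremental JSONL/CSV files.
#SAVE_EVERY_N_ROWS = 1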
#
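## Extra attempts per row after a failed attempt. The retry loop catches any
## exception, so API/transport errors are retried too, not only invalid or
## truncated JSON.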
#MAX_JSON_RETRIES = 2
#RETRY_SLEEP_SEC = 2
#
#
## =========================
## VALID CLINICAL RANGES
## =========================
#
#EDSS_MIN = 0.0
#EDSS_MAX = 10.0
#
#FUNCTIONAL_SYSTEM_RANGES = {
# "VISUAL_OPTIC_FUNCTIONS": (0.0, 6.0),
# "BRAINSTEM_FUNCTIONS": (0.0, 6.0),
# "PYRAMIDAL_FUNCTIONS": (0.0, 6.0),
# "CEREBELLAR_FUNCTIONS": (0.0, 6.0),
# "SENSORY_FUNCTIONS": (0.0, 6.0),
# "BOWEL_AND_BLADDER_FUNCTIONS": (0.0, 6.0),
# "CEREBRAL_FUNCTIONS": (0.0, 6.0),
# "AMBULATION": (0.0, 10.0),
#}
#
#REQUIRED_TOP_LEVEL_FIELDS = [
# "reason",
# "klassifizierbar",
# "EDSS",
# "certainty_percent",
# "subcategories",
#]
#
#
## =========================
## CLIENT
## =========================
#
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
#
## =========================
## HELPERS
## =========================
#
#def safe_dir_name(name: str) -> str:
# name = str(name).strip()
# name = re.sub(r"[^\w\-.]+", "_", name)
# return name[:150]
#
#
#def now_timestamp() -> str:
# return datetime.now().strftime("%Y%m%d_%H%M%S")
#
#
#def get_process():
# if psutil is None:
# return None
# return psutil.Process(os.getpid())
#
#
#def get_memory_rss_mb(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# return process.memory_info().rss / (1024 * 1024)
#
#
#def get_cpu_times_sec(process=None):
# if psutil is None:
# return None
# if process is None:
# process = get_process()
# cpu_times = process.cpu_times()
# return cpu_times.user + cpu_times.system
#
#
#class ResourceSampler:
# def __init__(self, interval_sec=0.05):
# self.interval_sec = interval_sec
# self.process = get_process()
# self.running = False
# self.thread = None
# self.samples_mb = []
#
# def start(self):
# if psutil is None:
# return
#
# self.running = True
# self.samples_mb = []
# self.thread = threading.Thread(target=self._sample_loop, daemon=True)
# self.thread.start()
#
# def stop(self):
# if psutil is None:
# return
#
# self.running = False
# if self.thread is not None:
# self.thread.join(timeout=1.0)
#
# def _sample_loop(self):
# while self.running:
# try:
# rss_mb = get_memory_rss_mb(self.process)
# self.samples_mb.append(rss_mb)
# except Exception:
# pass
# time.sleep(self.interval_sec)
#
# @property
# def peak_rss_mb(self):
# if not self.samples_mb:
# return None
# return max(self.samples_mb)
#
#
## =========================
## JSON EXTRACTION
## =========================
#
#def extract_json_from_text(text):
# if text is None:
# raise ValueError("Model returned empty content: message.content is None")
#
# text = str(text).strip()
#
# if not text:
# raise ValueError("Model returned empty content")
#
# text = (
# text.replace("```json", "")
# .replace("```JSON", "")
# .replace("```Json", "")
# .replace("```", "")
# .strip()
# )
#
# # Direct parse
# try:
# parsed = json.loads(text)
# if isinstance(parsed, dict):
# return parsed
# except json.JSONDecodeError:
# pass
#
# # Balanced JSON candidates
# candidates = []
# stack = []
# start_idx = None
# in_string = False
# escape = False
#
# for i, ch in enumerate(text):
# if escape:
# escape = False
# continue
#
# if ch == "\\":
# escape = True
# continue
#
# if ch == '"':
# in_string = not in_string
# continue
#
# if in_string:
# continue
#
# if ch == "{":
# if not stack:
# start_idx = i
# stack.append(ch)
#
# elif ch == "}":
# if stack:
# stack.pop()
# if not stack and start_idx is not None:
# candidates.append(text[start_idx:i + 1])
# start_idx = None
#
# valid_objects = []
#
# for candidate in candidates:
# candidate = candidate.strip()
# lowered = candidate.lower()
#
# invalid_markers = [
# "true/false",
# "null or",
# "oder zahl",
# "0.0-6.0",
# "0.0-10.0",
# "zahl zwischen",
# "...",
# ]
#
# if any(marker in lowered for marker in invalid_markers):
# continue
#
# try:
# parsed = json.loads(candidate)
# if isinstance(parsed, dict):
# valid_objects.append(parsed)
# except json.JSONDecodeError:
# continue
#
# for obj in reversed(valid_objects):
# if (
# "klassifizierbar" in obj
# and "certainty_percent" in obj
# and "subcategories" in obj
# ):
# return obj
#
# if valid_objects:
# return valid_objects[-1]
#
# stripped = text.strip()
# if stripped.startswith("{") and not stripped.endswith("}"):
# raise ValueError(
# "Model output looks like truncated JSON. "
# f"Raw output starts with: {text[:1000]}"
# )
#
# raise ValueError(
# "No valid JSON object found in model output. "
# f"Raw output starts with: {text[:1000]}"
# )
#
#
#def extract_message_content(message):
# raw_content = getattr(message, "content", None)
#
# if raw_content is not None:
# return raw_content
#
# msg_dict = None
#
# try:
# msg_dict = message.model_dump()
# except Exception:
# try:
# msg_dict = dict(message)
# except Exception:
# msg_dict = None
#
# if not isinstance(msg_dict, dict):
# return None
#
# for key in ["content", "reasoning_content", "reasoning", "text", "output_text"]:
# value = msg_dict.get(key)
# if value:
# return value
#
# possible_content = msg_dict.get("content")
# if isinstance(possible_content, list):
# parts = []
# for block in possible_content:
# if isinstance(block, dict):
# if "text" in block:
# parts.append(str(block["text"]))
# elif "content" in block:
# parts.append(str(block["content"]))
# if parts:
# return "\n".join(parts).strip()
#
# return None
#
#
## =========================
## READ INSTRUCTIONS
## =========================
#
#with open(EDSS_INSTRUCTIONS_PATH, "r", encoding="utf-8") as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
#
## =========================
## PROMPT
## =========================
#
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
#
#Extrahiere:
#1. Gesamt-EDSS-Score von 0.0 bis 10.0
#2. Alle 8 EDSS-Unterkategorien
#3. Sicherheit als Ganzzahl von 0 bis 100
#
#Antworte ausschließlich mit EINEM validen JSON-Objekt.
#Kein Markdown.
#Keine Code-Fences.
#Kein Text vor oder nach JSON.
#Keine Platzhalter.
#Kopiere kein Schema.
#
#Das JSON muss exakt diese Schlüssel enthalten:
#- reason
#- klassifizierbar
#- EDSS
#- certainty_percent
#- subcategories
#
#Die subcategories müssen exakt diese 8 Schlüssel enthalten:
#- VISUAL_OPTIC_FUNCTIONS
#- BRAINSTEM_FUNCTIONS
#- PYRAMIDAL_FUNCTIONS
#- CEREBELLAR_FUNCTIONS
#- SENSORY_FUNCTIONS
#- BOWEL_AND_BLADDER_FUNCTIONS
#- CEREBRAL_FUNCTIONS
#- AMBULATION
#
#Werte:
#- klassifizierbar: true oder false
#- EDSS: Zahl von 0.0 bis 10.0 oder null
#- certainty_percent: Ganzzahl von 0 bis 100
#- Unterkategorien: Zahl oder null
#- VISUAL_OPTIC_FUNCTIONS maximal 6.0
#- BRAINSTEM_FUNCTIONS maximal 6.0
#- PYRAMIDAL_FUNCTIONS maximal 6.0
#- CEREBELLAR_FUNCTIONS maximal 6.0
#- SENSORY_FUNCTIONS maximal 6.0
#- BOWEL_AND_BLADDER_FUNCTIONS maximal 6.0
#- CEREBRAL_FUNCTIONS maximal 6.0
#- AMBULATION maximal 10.0
#- reason: maximal 250 Zeichen, Deutsch
#
#Wenn klassifizierbar false ist, setze EDSS auf null.
#
#Valide Beispielausgabe:
#{{
# "reason": "Leichte Einschränkungen mit sicher ableitbarer Gehfähigkeit und geringen funktionellen Defiziten.",
# "klassifizierbar": true,
# "EDSS": 2.0,
# "certainty_percent": 90,
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null,
# "BRAINSTEM_FUNCTIONS": null,
# "PYRAMIDAL_FUNCTIONS": 1.0,
# "CEREBELLAR_FUNCTIONS": 1.0,
# "SENSORY_FUNCTIONS": 1.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null,
# "CEREBRAL_FUNCTIONS": null,
# "AMBULATION": 0.0
# }}
#}}
#
#EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#
#Gib ausschließlich das finale JSON-Objekt zurück.
#'''
#
#
## =========================
## VALIDATION, NOT NORMALIZATION
## =========================
#
#def parse_float_preserve_raw(value):
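#    """
#    Try to parse a value as float without clipping it.
#
#    The value is converted with str(), commas are replaced by dots (decimal
#    comma support), and the result is passed to float(). None and bool are
#    never treated as numeric. Note that float() also accepts strings such as
#    "nan" or "inf"; those count as numeric here.
#
#    Returns:
#        raw_value: original value exactly as present in parsed JSON
#        numeric_value: float or None
#        is_numeric: bool
#    """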
# raw_value = value
#
# if value is None:
# return raw_value, None, False
#
# if isinstance(value, bool):
# return raw_value, None, False
#
# try:
# numeric_value = float(str(value).replace(",", "."))
# return raw_value, numeric_value, True
# except Exception:
# return raw_value, None, False
#
#
#def is_in_range(value, min_value, max_value):
# """
# Range check without clipping.
# """
# if value is None:
# return False
# return min_value <= value <= max_value
#
#
#def validate_model_output(parsed):
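#    """
#    Validate parsed model output without repairing/clipping clinical values.
#
#    Important:
#    - Does NOT clip EDSS.
#    - Does NOT clip functional system values.
#    - Does NOT insert default EDSS.
#    - Does NOT insert default certainty_percent.
#    - Missing fields are kept as None.
#    - Adds explicit validity flags for scientific transparency.
#
#    Note:
#    - A null (None) value is treated as non-numeric and therefore as not in
#      range. clinical_range_valid is thus False whenever EDSS or any
#      functional system is null, even though the prompt allows null there.
#    - required_schema_success only checks that the required keys exist
#      (top-level fields and all subcategory keys), not their values.
#    """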
#
# validation = {
# "json_parse_success": isinstance(parsed, dict),
# "required_fields_present": False,
# "required_schema_success": False,
# "clinical_range_valid": False,
# "certainty_present": False,
#
# "missing_required_fields": [],
# "missing_subcategory_fields": [],
#
# "EDSS_is_numeric": False,
# "EDSS_in_valid_range": False,
# }
#
# if not isinstance(parsed, dict):
# return {
# "raw_output": parsed,
# "validated_output": {},
# "validation": validation,
# }
#
# missing_required = [
# field for field in REQUIRED_TOP_LEVEL_FIELDS
# if field not in parsed
# ]
#
# validation["missing_required_fields"] = missing_required
# validation["required_fields_present"] = len(missing_required) == 0
#
# validated = {}
#
# validated["reason"] = parsed.get("reason", None)
# validated["klassifizierbar"] = parsed.get("klassifizierbar", None)
#
# raw_certainty = parsed.get("certainty_percent", None)
# validated["raw_certainty_percent"] = raw_certainty
# validation["certainty_present"] = "certainty_percent" in parsed and raw_certainty is not None
#
# _, certainty_numeric, certainty_is_numeric = parse_float_preserve_raw(raw_certainty)
# validated["certainty_percent"] = certainty_numeric if certainty_is_numeric else None
# validated["certainty_percent_is_numeric"] = certainty_is_numeric
# validated["certainty_percent_in_valid_range"] = (
# is_in_range(certainty_numeric, 0.0, 100.0)
# if certainty_is_numeric else False
# )
#
# raw_edss = parsed.get("EDSS", None)
# raw_edss, edss_numeric, edss_is_numeric = parse_float_preserve_raw(raw_edss)
#
# validated["raw_EDSS"] = raw_edss
# validated["EDSS_numeric"] = edss_numeric
# validated["EDSS"] = edss_numeric # Backward-compatible; parsed only, not clipped
# validated["EDSS_is_numeric"] = edss_is_numeric
# validated["EDSS_in_valid_range"] = (
# is_in_range(edss_numeric, EDSS_MIN, EDSS_MAX)
# if edss_is_numeric else False
# )
#
# validation["EDSS_is_numeric"] = validated["EDSS_is_numeric"]
# validation["EDSS_in_valid_range"] = validated["EDSS_in_valid_range"]
#
# raw_subcategories = parsed.get("subcategories", None)
#
# if isinstance(raw_subcategories, dict):
# subcategories = raw_subcategories
# else:
# subcategories = {}
#
# validated["subcategories"] = {}
# validated["raw_subcategories"] = {}
# validated["subcategory_validation"] = {}
#
# missing_subcats = []
#
# for subcat, (min_value, max_value) in FUNCTIONAL_SYSTEM_RANGES.items():
# if subcat not in subcategories:
# missing_subcats.append(subcat)
#
# raw_value = subcategories.get(subcat, None)
# raw_value, numeric_value, is_numeric_value = parse_float_preserve_raw(raw_value)
# in_valid_range = (
# is_in_range(numeric_value, min_value, max_value)
# if is_numeric_value else False
# )
#
# validated["raw_subcategories"][subcat] = raw_value
# validated["subcategories"][subcat] = numeric_value
#
# validated["subcategory_validation"][subcat] = {
# "is_numeric": is_numeric_value,
# "in_valid_range": in_valid_range,
# "min_allowed": min_value,
# "max_allowed": max_value,
# }
#
# validation["missing_subcategory_fields"] = missing_subcats
#
# subcategory_schema_present = len(missing_subcats) == 0
#
# all_subcats_numeric = all(
# validated["subcategory_validation"][subcat]["is_numeric"]
# for subcat in FUNCTIONAL_SYSTEM_RANGES
# )
#
# all_subcats_in_range = all(
# validated["subcategory_validation"][subcat]["in_valid_range"]
# for subcat in FUNCTIONAL_SYSTEM_RANGES
# )
#
# validated["all_functional_systems_numeric"] = all_subcats_numeric
# validated["all_functional_systems_in_valid_range"] = all_subcats_in_range
#
# validation["clinical_range_valid"] = (
# validated["EDSS_in_valid_range"]
# and all_subcats_in_range
# )
#
# validation["required_schema_success"] = (
# validation["required_fields_present"]
# and subcategory_schema_present
# )
#
# return {
# "raw_output": parsed,
# "validated_output": validated,
# "validation": validation,
# }
#
#
## =========================
## API CALL
## =========================
#
#def make_chat_completion(model_config, prompt):
# model_name = model_config["model_name"]
#
# kwargs = dict(
# messages=[
# {
# "role": "system",
# "content": (
# "Du bist ein JSON-Generator. "
# "Antworte ausschließlich mit einem einzigen validen JSON-Objekt. "
# "Keine Erklärung. Kein Markdown. Keine Code-Fences. "
# "Keine Platzhalter. Kein Schema kopieren. "
# "Das JSON muss vollständig geschlossen sein."
# )
# },
# {
# "role": "user",
# "content": prompt
# }
# ],
# model=model_name,
# max_tokens=model_config.get("max_tokens", MAX_TOKENS),
# temperature=model_config.get("temperature", TEMPERATURE),
# )
#
# if model_config.get("use_response_format", False):
# kwargs["response_format"] = {"type": "json_object"}
#
# extra_body = model_config.get("extra_body")
# if extra_body is not None:
# kwargs["extra_body"] = extra_body
#
# return client.chat.completions.create(**kwargs)
#
#
## =========================
## INFERENCE FUNCTION WITH RETRIES
## =========================
#
#def run_inference(patient_text, model_config):
# model_name = model_config["model_name"]
# prompt = build_prompt(patient_text)
#
# process = get_process()
# sampler = ResourceSampler(interval_sec=RESOURCE_SAMPLE_INTERVAL_SEC)
#
# wall_start = time.perf_counter()
# cpu_start = get_cpu_times_sec(process)
# rss_start_mb = get_memory_rss_mb(process)
#
# sampler.start()
#
# raw_content = None
# raw_response_debug = None
# raw_parsed_output = None
# validation = None
# last_error = None
#
# prompt_tokens = None
# completion_tokens = None
# total_tokens = None
#
# try:
# for attempt in range(1, MAX_JSON_RETRIES + 2):
# try:
# response = make_chat_completion(
# model_config=model_config,
# prompt=prompt
# )
#
# message = response.choices[0].message
# raw_content = extract_message_content(message)
#
# try:
# raw_response_debug = response.model_dump()
# except Exception:
# raw_response_debug = str(response)
#
# usage = getattr(response, "usage", None)
# if usage is not None:
# prompt_tokens = getattr(usage, "prompt_tokens", None)
# completion_tokens = getattr(usage, "completion_tokens", None)
# total_tokens = getattr(usage, "total_tokens", None)
#
# parsed = extract_json_from_text(raw_content)
# validation_package = validate_model_output(parsed)
#
# success = True
# error = None
#
# result = validation_package["validated_output"]
# validation = validation_package["validation"]
# raw_parsed_output = validation_package["raw_output"]
#
# break
#
# except Exception as e:
# last_error = str(e)
#
# if attempt <= MAX_JSON_RETRIES:
# print(
# f"\n⚠️ JSON failed on attempt {attempt}. "
# f"Retrying row. Error: {last_error[:300]}"
# )
# time.sleep(RETRY_SLEEP_SEC)
# continue
#
# raise
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
#
# success = False
# error = str(e)
# result = None
# raw_parsed_output = None
#
# validation = {
# "json_parse_success": False,
# "required_fields_present": False,
# "required_schema_success": False,
# "clinical_range_valid": False,
# "certainty_present": False,
# "missing_required_fields": [],
# "missing_subcategory_fields": [],
# "EDSS_is_numeric": False,
# "EDSS_in_valid_range": False,
# }
#
# finally:
# sampler.stop()
#
# wall_end = time.perf_counter()
# cpu_end = get_cpu_times_sec(process)
# rss_end_mb = get_memory_rss_mb(process)
#
# wall_time_sec = wall_end - wall_start
#
# if cpu_start is not None and cpu_end is not None:
# process_cpu_time_sec = cpu_end - cpu_start
# else:
# process_cpu_time_sec = None
#
# if rss_start_mb is not None and rss_end_mb is not None:
# rss_delta_mb = rss_end_mb - rss_start_mb
# else:
# rss_delta_mb = None
#
# return {
# "success": success,
# "error": error,
# "result": result,
#
# "validation": validation,
# "raw_parsed_output": raw_parsed_output,
#
# "model": model_name,
#
# "inference_time_sec": wall_time_sec,
#
# "process_cpu_time_sec": process_cpu_time_sec,
# "rss_before_mb": rss_start_mb,
# "rss_after_mb": rss_end_mb,
# "rss_delta_mb": rss_delta_mb,
# "peak_rss_mb": sampler.peak_rss_mb,
#
# "prompt_tokens": prompt_tokens,
# "completion_tokens": completion_tokens,
# "total_tokens": total_tokens,
#
# # Keeping raw content improves auditability but can make files large.
# # To save space, change this to: raw_content if not success else None
# "raw_content": raw_content,
# "raw_response_debug": raw_response_debug if not success else None,
# "last_error": last_error,
# }
#
#
## =========================
## BUILD PATIENT TEXT
## =========================
#
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
#
## =========================
## FLATTEN RESULTS FOR CSV
## =========================
#
#def flatten_result(record):
# """
# Flatten one benchmark record for CSV export.
#
# This preserves:
# - raw model values
# - parsed numeric values without clipping
# - validity flags
# - backward-compatible columns where possible
# """
#
# validation = record.get("validation") or {}
# result = record.get("result") or {}
#
# flat = {
# "model": record.get("model"),
# "iteration": record.get("iteration"),
# "row_index": record.get("row_index"),
# "row_number_in_run": record.get("row_number_in_run"),
# "unique_id": record.get("unique_id"),
# "MedDatum": record.get("MedDatum"),
#
# "success": record.get("success"),
# "error": record.get("error"),
# "last_error": record.get("last_error"),
#
# "json_parse_success": validation.get("json_parse_success"),
# "required_fields_present": validation.get("required_fields_present"),
# "required_schema_success": validation.get("required_schema_success"),
# "clinical_range_valid": validation.get("clinical_range_valid"),
# "certainty_present": validation.get("certainty_present"),
#
# "missing_required_fields": json.dumps(
# validation.get("missing_required_fields", []),
# ensure_ascii=False
# ),
# "missing_subcategory_fields": json.dumps(
# validation.get("missing_subcategory_fields", []),
# ensure_ascii=False
# ),
#
# "inference_time_sec": record.get("inference_time_sec"),
# "process_cpu_time_sec": record.get("process_cpu_time_sec"),
# "rss_before_mb": record.get("rss_before_mb"),
# "rss_after_mb": record.get("rss_after_mb"),
# "rss_delta_mb": record.get("rss_delta_mb"),
# "peak_rss_mb": record.get("peak_rss_mb"),
#
# "prompt_tokens": record.get("prompt_tokens"),
# "completion_tokens": record.get("completion_tokens"),
# "total_tokens": record.get("total_tokens"),
#
# "raw_content": record.get("raw_content"),
# "raw_parsed_output": json.dumps(record.get("raw_parsed_output"), ensure_ascii=False),
#
# # Backward-compatible fields
# "reason": result.get("reason"),
# "klassifizierbar": result.get("klassifizierbar"),
#
# "raw_certainty_percent": result.get("raw_certainty_percent"),
# "certainty_percent": result.get("certainty_percent"),
# "certainty_percent_is_numeric": result.get("certainty_percent_is_numeric"),
# "certainty_percent_in_valid_range": result.get("certainty_percent_in_valid_range"),
#
# # EDSS raw/numeric/validity fields
# "raw_EDSS": result.get("raw_EDSS"),
# "EDSS_numeric": result.get("EDSS_numeric"),
# "EDSS": result.get("EDSS"), # backward-compatible; same as EDSS_numeric, not clipped
# "EDSS_is_numeric": result.get("EDSS_is_numeric"),
# "EDSS_in_valid_range": result.get("EDSS_in_valid_range"),
#
# "all_functional_systems_numeric": result.get("all_functional_systems_numeric"),
# "all_functional_systems_in_valid_range": result.get("all_functional_systems_in_valid_range"),
# }
#
# raw_subcategories = result.get("raw_subcategories", {})
# numeric_subcategories = result.get("subcategories", {})
# subcat_validation = result.get("subcategory_validation", {})
#
# for subcat in FUNCTIONAL_SYSTEM_RANGES:
# raw_value = None
# numeric_value = None
# is_numeric = False
# in_valid_range = False
#
# if isinstance(raw_subcategories, dict):
# raw_value = raw_subcategories.get(subcat)
#
# if isinstance(numeric_subcategories, dict):
# numeric_value = numeric_subcategories.get(subcat)
#
# if isinstance(subcat_validation, dict):
# flags = subcat_validation.get(subcat, {})
# if isinstance(flags, dict):
# is_numeric = flags.get("is_numeric", False)
# in_valid_range = flags.get("in_valid_range", False)
#
# # New transparent columns
# flat[f"raw_subcat_{subcat}"] = raw_value
# flat[f"numeric_subcat_{subcat}"] = numeric_value
# flat[f"subcat_{subcat}_is_numeric"] = is_numeric
# flat[f"subcat_{subcat}_in_valid_range"] = in_valid_range
#
# # Backward-compatible old column name.
# # This is numeric but NOT clipped.
# flat[f"subcat_{subcat}"] = numeric_value
#
# return flat
#
#
## =========================
## SUMMARY STATISTICS
## =========================
#
#def summarize_records(records):
# """
# Create transparent summary statistics per model.
#
# Separates:
# - JSON/schema validity
# - numeric parse validity
# - clinical range validity
# - out-of-range outputs
# """
#
# df = pd.DataFrame([flatten_result(r) for r in records])
#
# if df.empty:
# return pd.DataFrame()
#
# def bool_mean(col):
# if col not in df.columns:
# return None
# return df[col].fillna(False).astype(bool).mean()
#
# def bool_sum(col):
# if col not in df.columns:
# return None
# return int(df[col].fillna(False).astype(bool).sum())
#
# n_records = len(df)
#
# summary = {
# "model": df["model"].iloc[0] if "model" in df.columns else None,
# "n_total_responses": n_records,
#
# "n_success": bool_sum("success"),
# "success_rate": bool_mean("success"),
#
# "n_json_parse_success": bool_sum("json_parse_success"),
# "json_parse_success_rate": bool_mean("json_parse_success"),
#
# "n_required_fields_present": bool_sum("required_fields_present"),
# "required_fields_present_rate": bool_mean("required_fields_present"),
#
# "n_required_schema_success": bool_sum("required_schema_success"),
# "required_schema_success_rate": bool_mean("required_schema_success"),
#
# "n_clinical_range_valid": bool_sum("clinical_range_valid"),
# "clinical_range_valid_rate": bool_mean("clinical_range_valid"),
#
# "n_certainty_present": bool_sum("certainty_present"),
# "certainty_present_rate": bool_mean("certainty_present"),
#
# "n_EDSS_numeric": bool_sum("EDSS_is_numeric"),
# "EDSS_numeric_rate": bool_mean("EDSS_is_numeric"),
#
# "n_EDSS_in_valid_range": bool_sum("EDSS_in_valid_range"),
# "EDSS_valid_range_rate": bool_mean("EDSS_in_valid_range"),
# }
#
# # EDSS out-of-range among numeric EDSS outputs
# if "EDSS_is_numeric" in df.columns and "EDSS_in_valid_range" in df.columns:
# edss_numeric = df["EDSS_is_numeric"].fillna(False).astype(bool)
# edss_valid = df["EDSS_in_valid_range"].fillna(False).astype(bool)
# edss_out_of_range = edss_numeric & (~edss_valid)
#
# summary["n_EDSS_out_of_range"] = int(edss_out_of_range.sum())
# summary["EDSS_out_of_range_rate_total"] = float(edss_out_of_range.mean())
# summary["EDSS_out_of_range_rate_among_numeric"] = (
# float(edss_out_of_range.sum() / edss_numeric.sum())
# if edss_numeric.sum() > 0 else None
# )
#
# # Functional system rates
# fs_out_of_range_any = pd.Series(False, index=df.index)
# fs_valid_all = pd.Series(True, index=df.index)
#
# for subcat in FUNCTIONAL_SYSTEM_RANGES:
# numeric_col = f"subcat_{subcat}_is_numeric"
# valid_col = f"subcat_{subcat}_in_valid_range"
#
# if numeric_col in df.columns:
# numeric_series = df[numeric_col].fillna(False).astype(bool)
# else:
# numeric_series = pd.Series(False, index=df.index)
#
# if valid_col in df.columns:
# valid_series = df[valid_col].fillna(False).astype(bool)
# else:
# valid_series = pd.Series(False, index=df.index)
#
# out_of_range_series = numeric_series & (~valid_series)
#
# summary[f"n_{subcat}_numeric"] = int(numeric_series.sum())
# summary[f"{subcat}_numeric_rate"] = float(numeric_series.mean())
#
# summary[f"n_{subcat}_in_valid_range"] = int(valid_series.sum())
# summary[f"{subcat}_valid_range_rate"] = float(valid_series.mean())
#
# summary[f"n_{subcat}_out_of_range"] = int(out_of_range_series.sum())
# summary[f"{subcat}_out_of_range_rate_total"] = float(out_of_range_series.mean())
# summary[f"{subcat}_out_of_range_rate_among_numeric"] = (
# float(out_of_range_series.sum() / numeric_series.sum())
# if numeric_series.sum() > 0 else None
# )
#
# fs_out_of_range_any = fs_out_of_range_any | out_of_range_series
# fs_valid_all = fs_valid_all & valid_series
#
# summary["n_any_functional_system_out_of_range"] = int(fs_out_of_range_any.sum())
# summary["any_functional_system_out_of_range_rate_total"] = float(fs_out_of_range_any.mean())
#
# summary["n_all_functional_systems_in_valid_range"] = int(fs_valid_all.sum())
# summary["all_functional_systems_valid_range_rate"] = float(fs_valid_all.mean())
#
# numeric_cols = [
# "inference_time_sec",
# "process_cpu_time_sec",
# "rss_delta_mb",
# "peak_rss_mb",
# "prompt_tokens",
# "completion_tokens",
# "total_tokens",
# "certainty_percent",
# "EDSS_numeric",
# ]
#
# for col in numeric_cols:
# if col in df.columns:
# values = pd.to_numeric(df[col], errors="coerce")
# summary[f"{col}_mean"] = values.mean()
# summary[f"{col}_median"] = values.median()
# summary[f"{col}_std"] = values.std()
# summary[f"{col}_min"] = values.min()
# summary[f"{col}_max"] = values.max()
#
# if "EDSS_is_numeric" in df.columns and "EDSS_in_valid_range" in df.columns:
# primary_valid_only = (
# df["EDSS_is_numeric"].fillna(False).astype(bool)
# & df["EDSS_in_valid_range"].fillna(False).astype(bool)
# )
#
# sensitivity_all_numeric = df["EDSS_is_numeric"].fillna(False).astype(bool)
#
# summary["n_primary_valid_only_EDSS"] = int(primary_valid_only.sum())
# summary["primary_valid_only_EDSS_rate"] = float(primary_valid_only.mean())
#
# summary["n_sensitivity_all_numeric_EDSS"] = int(sensitivity_all_numeric.sum())
# summary["sensitivity_all_numeric_EDSS_rate"] = float(sensitivity_all_numeric.mean())
#
# return pd.DataFrame([summary])
#
#
## =========================
## ANALYSIS DATASET HELPERS
## =========================
#
#def create_analysis_datasets(records):
# """
# Create two transparent EDSS analysis datasets:
#
# 1. primary_valid_only:
# Only numeric EDSS predictions within the valid clinical range.
#
# 2. sensitivity_all_numeric:
# All numeric EDSS predictions, including out-of-range values.
# No clipping is applied.
# """
#
# df = pd.DataFrame([flatten_result(r) for r in records])
#
# if df.empty:
# return df.copy(), df.copy()
#
# primary_valid_only = df[
# df["EDSS_is_numeric"].fillna(False).astype(bool)
# & df["EDSS_in_valid_range"].fillna(False).astype(bool)
# ].copy()
#
# sensitivity_all_numeric = df[
# df["EDSS_is_numeric"].fillna(False).astype(bool)
# ].copy()
#
# return primary_valid_only, sensitivity_all_numeric
#
#
## =========================
## INCREMENTAL SAVE HELPERS
## =========================
#
#def append_jsonl(path, record):
# with open(path, "a", encoding="utf-8") as f:
# f.write(json.dumps(record, ensure_ascii=False) + "\n")
# f.flush()
# os.fsync(f.fileno())
#
#
#def append_csv(path, record):
# flat = flatten_result(record)
# df_one = pd.DataFrame([flat])
# file_exists = Path(path).exists()
# df_one.to_csv(path, mode="a", header=not file_exists, index=False)
#
#
## =========================
## MAIN LOOP
## =========================
#
#if __name__ == "__main__":
#
# run_timestamp = now_timestamp()
#
# results_root = Path(RESULTS_ROOT)
# results_root.mkdir(parents=True, exist_ok=True)
#
# run_root = results_root / f"run_{run_timestamp}"
# run_root.mkdir(parents=True, exist_ok=True)
#
# print(f"Results root: {run_root}")
#
# df = pd.read_csv(INPUT_CSV, sep=";")
#
# if MAX_ROWS is not None:
# df = df.head(MAX_ROWS)
#
# total_rows = len(df)
#
# model_names_for_print = [m["model_name"] for m in MODEL_CONFIGS]
#
# print(f"Loaded {total_rows} patient records.")
# print(f"Models: {model_names_for_print}")
# print(f"Iterations per model: {NUM_ITERATIONS}")
#
# all_model_summaries = []
#
# for model_config in MODEL_CONFIGS:
# model_name = model_config["model_name"]
# safe_model = safe_dir_name(model_name)
#
# model_dir = run_root / safe_model
# model_dir.mkdir(parents=True, exist_ok=True)
#
# print(f"\n{'#' * 80}")
# print(f"MODEL: {model_name}")
# print(f"use_response_format: {model_config.get('use_response_format', False)}")
# print(f"temperature: {model_config.get('temperature', TEMPERATURE)}")
# print(f"max_tokens: {model_config.get('max_tokens', MAX_TOKENS)}")
# print(f"Saving to: {model_dir}")
# print(f"{'#' * 80}")
#
# model_records = []
# model_start = time.perf_counter()
#
# for iteration in range(1, NUM_ITERATIONS + 1):
# print(f"\n{'=' * 60}")
# print(f"🔄 MODEL {model_name} | ITERATION {iteration}/{NUM_ITERATIONS}")
# print(f"{'=' * 60}")
#
# iteration_results = []
# iteration_start = time.perf_counter()
#
# incremental_jsonl_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.jsonl"
# incremental_csv_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.csv"
#
# print(f"Incremental JSONL: {incremental_jsonl_path}")
# print(f"Incremental CSV: {incremental_csv_path}")
#
# for loop_i, (idx, row) in enumerate(df.iterrows(), start=1):
# print(
# f"\rModel={model_name} | Row {loop_i}/{total_rows} | Iter {iteration}",
# end="",
# flush=True
# )
#
# try:
# patient_text = build_patient_text(row)
#
# record = run_inference(
# patient_text=patient_text,
# model_config=model_config
# )
#
# record["iteration"] = iteration
# record["row_index"] = int(idx)
# record["row_number_in_run"] = int(loop_i)
# record["unique_id"] = row.get("unique_id", f"row_{idx}")
# record["MedDatum"] = row.get("MedDatum", None)
#
# iteration_results.append(record)
# model_records.append(record)
#
# if loop_i % SAVE_EVERY_N_ROWS == 0:
# append_jsonl(incremental_jsonl_path, record)
# append_csv(incremental_csv_path, record)
#
# if record["success"]:
# res = record["result"] or {}
# edss_display = res.get("EDSS_numeric", None)
# edss_valid = res.get("EDSS_in_valid_range", False)
#
# print(
# f" ✅ EDSS={edss_display}, "
# f"valid_range={edss_valid}, "
# f"time={record['inference_time_sec']:.2f}s"
# )
# else:
# print(f" ❌ {record.get('error', 'Unknown error')}")
#
# except Exception as e:
# print(f"\n⚠️ Row {idx} failed outside inference wrapper: {e}")
#
# fallback_record = {
# "success": False,
# "error": str(e),
# "last_error": str(e),
# "result": None,
#
# "validation": {
# "json_parse_success": False,
# "required_fields_present": False,
# "required_schema_success": False,
# "clinical_range_valid": False,
# "certainty_present": False,
# "missing_required_fields": [],
# "missing_subcategory_fields": [],
# "EDSS_is_numeric": False,
# "EDSS_in_valid_range": False,
# },
# "raw_parsed_output": None,
#
# "model": model_name,
# "iteration": iteration,
# "row_index": int(idx),
# "row_number_in_run": int(loop_i),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
#
# "inference_time_sec": None,
# "process_cpu_time_sec": None,
# "rss_before_mb": None,
# "rss_after_mb": None,
# "rss_delta_mb": None,
# "peak_rss_mb": None,
#
# "prompt_tokens": None,
# "completion_tokens": None,
# "total_tokens": None,
#
# "raw_content": None,
# "raw_response_debug": None,
# }
#
# iteration_results.append(fallback_record)
# model_records.append(fallback_record)
#
# append_jsonl(incremental_jsonl_path, fallback_record)
# append_csv(incremental_csv_path, fallback_record)
#
# if STOP_ON_FIRST_ERROR:
# break
#
# iteration_elapsed = time.perf_counter() - iteration_start
#
# # Final full per-iteration JSON
# iter_json_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.json"
# with open(iter_json_path, "w", encoding="utf-8") as f:
# json.dump(iteration_results, f, indent=2, ensure_ascii=False)
#
# # Final full per-iteration CSV
# iter_csv_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.csv"
# iter_flat_df = pd.DataFrame([flatten_result(r) for r in iteration_results])
# iter_flat_df.to_csv(iter_csv_path, index=False)
#
# # Transparent analysis datasets
# primary_valid_only_df, sensitivity_all_numeric_df = create_analysis_datasets(iteration_results)
#
# primary_valid_only_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}_primary_valid_only.csv"
# sensitivity_all_numeric_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}_sensitivity_all_numeric.csv"
#
# primary_valid_only_df.to_csv(primary_valid_only_path, index=False)
# sensitivity_all_numeric_df.to_csv(sensitivity_all_numeric_path, index=False)
#
# print(f"\n✅ Iteration {iteration} complete.")
# print(f"Incremental JSONL saved to: {incremental_jsonl_path}")
# print(f"Incremental CSV saved to: {incremental_csv_path}")
# print(f"Final JSON saved to: {iter_json_path}")
# print(f"Final CSV saved to: {iter_csv_path}")
# print(f"Primary valid-only CSV saved to: {primary_valid_only_path}")
# print(f"Sensitivity all-numeric CSV: {sensitivity_all_numeric_path}")
# print(
# f"⏱️ Iteration time: {iteration_elapsed:.1f}s "
# f"({iteration_elapsed / max(total_rows, 1):.2f}s/row)"
# )
#
# model_elapsed = time.perf_counter() - model_start
#
# # Save all records for this model
# model_json_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.json"
# with open(model_json_path, "w", encoding="utf-8") as f:
# json.dump(model_records, f, indent=2, ensure_ascii=False)
#
# model_csv_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.csv"
# model_flat_df = pd.DataFrame([flatten_result(r) for r in model_records])
# model_flat_df.to_csv(model_csv_path, index=False)
#
# # Save model-level analysis datasets
# primary_valid_only_df, sensitivity_all_numeric_df = create_analysis_datasets(model_records)
#
# model_primary_valid_only_path = model_dir / f"{safe_model}_all_results_{run_timestamp}_primary_valid_only.csv"
# model_sensitivity_all_numeric_path = model_dir / f"{safe_model}_all_results_{run_timestamp}_sensitivity_all_numeric.csv"
#
# primary_valid_only_df.to_csv(model_primary_valid_only_path, index=False)
# sensitivity_all_numeric_df.to_csv(model_sensitivity_all_numeric_path, index=False)
#
# # Save model summary
# model_summary_df = summarize_records(model_records)
# model_summary_df["model_total_wall_time_sec"] = model_elapsed
# model_summary_df["model_total_wall_time_min"] = model_elapsed / 60
#
# model_summary_path = model_dir / f"{safe_model}_summary_{run_timestamp}.csv"
# model_summary_df.to_csv(model_summary_path, index=False)
#
# all_model_summaries.append(model_summary_df)
#
# print(f"\n🎉 Model completed: {model_name}")
# print(f"All JSON: {model_json_path}")
# print(f"All CSV: {model_csv_path}")
# print(f"All primary valid-only CSV: {model_primary_valid_only_path}")
# print(f"All sensitivity all-numeric CSV: {model_sensitivity_all_numeric_path}")
# print(f"Summary: {model_summary_path}")
# print(f"Total model time: {model_elapsed / 60:.2f} min")
#
# if all_model_summaries:
# combined_summary_df = pd.concat(all_model_summaries, ignore_index=True)
# combined_summary_path = run_root / f"all_models_summary_{run_timestamp}.csv"
# combined_summary_df.to_csv(combined_summary_path, index=False)
#
# print(f"\n📊 Combined summary saved to: {combined_summary_path}")
#
# print(f"\n🎉 All models and all iterations completed!")
##
# %% Minimal parallel EDSS benchmark with correct klassifizierbar/EDSS logic
import os
import re
import json
import time
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
# =========================
# CONFIGURATION
# =========================
# Load variables from a local .env file into the environment before they are
# read below.
load_dotenv()
# Credentials and endpoint handed to the OpenAI client constructed further down.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
# Input table; the main block reads it with sep=";".
INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/data/processed/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
# Text file whose content is embedded into every prompt by build_prompt().
EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/prompts/Komplett.txt"
# Each run creates a "run_<timestamp>" directory below this root.
RESULTS_ROOT = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/benchmark_runs"
# One entry per benchmarked model. Keys consumed by make_chat_completion():
#   model_name          -> "model" request parameter
#   use_response_format -> when true, response_format={"type": "json_object"} is sent
#   temperature         -> "temperature" request parameter
#   max_tokens          -> "max_tokens" request parameter
#   extra_body          -> forwarded as "extra_body" when not None
MODEL_CONFIGS = [
    {
        "model_name": "gpt-oss-120b",
        "use_response_format": True,
        "temperature": 0.0,
        "max_tokens": 4096,
        "extra_body": None,
    },
    {
        "model_name": "qwen3.6-27b",
        "use_response_format": False,
        "temperature": 0.0,
        "max_tokens": 4096,
        "extra_body": {
            "chat_template_kwargs": {
                "enable_thinking": False
            }
        },
    },
    {
        "model_name": "gemma-4-31B-it",
        "use_response_format": False,
        "temperature": 0.0,
        "max_tokens": 4096,
        "extra_body": None,
    },
]
# Number of full passes over the input rows per model.
NUM_ITERATIONS = 10
# Row limit applied via df.head(MAX_ROWS); None processes every row.
MAX_ROWS = None
# MAX_ROWS = 2
# max_workers of the per-batch ThreadPoolExecutor.
PARALLEL_WORKERS = 10
# Number of rows submitted to the executor at once.
BATCH_SIZE = 100
# Retries after the first attempt in run_inference() (total attempts = retries + 1).
MAX_JSON_RETRIES = 5
# Seconds slept between attempts in run_inference().
RETRY_SLEEP_SEC = 2
# When true, the main loop raises RuntimeError on the first failed record.
STOP_ON_FIRST_ERROR = False
# =========================
# CONSTANTS
# =========================
# Inclusive bounds checked for the overall EDSS value.
EDSS_MIN = 0.0
EDSS_MAX = 10.0
# Expected subcategory keys with their inclusive (min, max) bounds.
FUNCTIONAL_SYSTEM_RANGES = {
    "VISUAL_OPTIC_FUNCTIONS": (0.0, 6.0),
    "BRAINSTEM_FUNCTIONS": (0.0, 6.0),
    "PYRAMIDAL_FUNCTIONS": (0.0, 6.0),
    "CEREBELLAR_FUNCTIONS": (0.0, 6.0),
    "SENSORY_FUNCTIONS": (0.0, 6.0),
    "BOWEL_AND_BLADDER_FUNCTIONS": (0.0, 6.0),
    "CEREBRAL_FUNCTIONS": (0.0, 6.0),
    "AMBULATION": (0.0, 10.0),
}
# Keys that must be present in the model's JSON object (see validate_model_output()).
REQUIRED_TOP_LEVEL_FIELDS = [
    "reason",
    "klassifizierbar",
    "EDSS",
    "certainty_percent",
    "subcategories",
]
# =========================
# CLIENT
# =========================
# Module-level client used by make_chat_completion() for every request.
client = OpenAI(
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_BASE_URL,
)
# =========================
# HELPERS
# =========================
def safe_dir_name(name):
    """Turn *name* into a string usable as a directory or file name.

    The value is stringified and stripped; every run of characters that is
    not a word character, hyphen or dot becomes a single underscore, and the
    result is cut to at most 150 characters.
    """
    cleaned = re.sub(r"[^\w\-.]+", "_", str(name).strip())
    return cleaned[:150]
def now_timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"
def parse_float(value):
    """Try to interpret *value* as a finite float.

    A decimal comma is accepted ("2,5" -> 2.5). ``None`` and booleans are
    never treated as numbers. Non-finite results ("nan", "inf", ...) are
    rejected as well, so NaN/Infinity cannot leak into the numeric result
    fields or into the JSON/CSV output.

    Returns:
        A tuple ``(raw, numeric, is_numeric)``:
        - raw: the value exactly as passed in
        - numeric: the parsed float, or None when parsing failed
        - is_numeric: True only when ``numeric`` is a finite float
    """
    import math  # local import keeps the block self-contained

    if value is None or isinstance(value, bool):
        return value, None, False
    try:
        number = float(str(value).replace(",", "."))
    except (TypeError, ValueError):
        return value, None, False
    if not math.isfinite(number):
        return value, None, False
    return value, number, True
def is_in_range(value, min_value, max_value):
    """Return True when *value* is not None and lies in [min_value, max_value]."""
    return value is not None and min_value <= value <= max_value
def build_patient_text(row):
    """Join the four free-text report fields of *row* into one prompt text.

    Args:
        row: Any mapping-like object with ``.get(key, default)`` (a dict or a
            pandas Series) holding the columns ``T_Zusammenfassung``,
            ``Diagnosen``, ``T_KlinBef`` and ``T_Befunde``.

    Returns:
        The four fields separated by newlines. Missing cells (``None`` or a
        float NaN, which is how pandas represents empty CSV cells) contribute
        an empty string instead of the literal text "None"/"nan".
    """
    def as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    field_names = ("T_Zusammenfassung", "Diagnosen", "T_KlinBef", "T_Befunde")
    return "\n".join(as_text(row.get(name, "")) for name in field_names)
# =========================
# READ EDSS INSTRUCTIONS
# =========================
# Read the guideline text once at import time; surrounding whitespace is stripped.
EDSS_INSTRUCTIONS = Path(EDSS_INSTRUCTIONS_PATH).read_text(encoding="utf-8").strip()
# =========================
# PROMPT
# =========================
def build_prompt(patient_text):
    """Build the German user prompt for one patient report.

    The template lists the required JSON keys and the eight subcategory keys,
    states the klassifizierbar/EDSS consistency rules, shows one example for a
    classifiable and one for a non-classifiable report, and then embeds the
    module-level EDSS_INSTRUCTIONS followed by *patient_text*.

    Args:
        patient_text: Report text inserted after "Patientenbericht:".

    Returns:
        The complete prompt string.
    """
    # Doubled braces ({{ }}) are literal braces of the JSON examples inside the f-string.
    return f"""Du bist ein medizinischer Assistent für EDSS-Extraktion aus klinischen Berichten.
Extrahiere:
1. Ob der Bericht für eine EDSS-Einschätzung klassifizierbar ist.
2. Falls klassifizierbar: Gesamt-EDSS-Score von 0.0 bis 10.0.
3. Alle 8 EDSS-Unterkategorien, soweit ableitbar.
4. Sicherheit als Ganzzahl von 0 bis 100.
Antworte ausschließlich mit EINEM validen JSON-Objekt.
Kein Markdown. Keine Code-Fences. Kein Text vor oder nach JSON.
Keine Platzhalter. Kopiere kein Schema.
Das JSON muss exakt diese Schlüssel enthalten:
- reason
- klassifizierbar
- EDSS
- certainty_percent
- subcategories
Die subcategories müssen exakt diese 8 Schlüssel enthalten:
- VISUAL_OPTIC_FUNCTIONS
- BRAINSTEM_FUNCTIONS
- PYRAMIDAL_FUNCTIONS
- CEREBELLAR_FUNCTIONS
- SENSORY_FUNCTIONS
- BOWEL_AND_BLADDER_FUNCTIONS
- CEREBRAL_FUNCTIONS
- AMBULATION
Wichtige Regeln:
- klassifizierbar muss true oder false sein.
- Wenn klassifizierbar=true:
- EDSS muss eine Zahl zwischen 0.0 und 10.0 sein.
- EDSS darf dann niemals null sein.
- Wenn klassifizierbar=false:
- EDSS muss null sein.
- Verwende klassifizierbar=false nur, wenn keine vernünftige EDSS-Einschätzung möglich ist.
- Setze klassifizierbar=true, wenn irgendeine plausible EDSS-Einschätzung aus neurologischem Befund, Gehfähigkeit, Funktionssystemen, Diagnoseverlauf oder explizitem EDSS-Wert ableitbar ist.
- Fehlende einzelne Unterkategorien sind kein Grund für klassifizierbar=false.
- Einzelne Unterkategorien dürfen null sein, wenn sie nicht ausreichend ableitbar sind.
- certainty_percent muss eine Ganzzahl von 0 bis 100 sein.
- reason: maximal 250 Zeichen, Deutsch.
Valide Beispielausgabe bei klassifizierbarem Bericht:
{{
"reason": "Leichte Einschränkungen mit sicher ableitbarer Gehfähigkeit.",
"klassifizierbar": true,
"EDSS": 2.0,
"certainty_percent": 90,
"subcategories": {{
"VISUAL_OPTIC_FUNCTIONS": null,
"BRAINSTEM_FUNCTIONS": null,
"PYRAMIDAL_FUNCTIONS": 1.0,
"CEREBELLAR_FUNCTIONS": 1.0,
"SENSORY_FUNCTIONS": 1.0,
"BOWEL_AND_BLADDER_FUNCTIONS": null,
"CEREBRAL_FUNCTIONS": null,
"AMBULATION": 0.0
}}
}}
Valide Beispielausgabe bei nicht klassifizierbarem Bericht:
{{
"reason": "Keine ausreichenden neurologischen Informationen für eine EDSS-Einschätzung.",
"klassifizierbar": false,
"EDSS": null,
"certainty_percent": 0,
"subcategories": {{
"VISUAL_OPTIC_FUNCTIONS": null,
"BRAINSTEM_FUNCTIONS": null,
"PYRAMIDAL_FUNCTIONS": null,
"CEREBELLAR_FUNCTIONS": null,
"SENSORY_FUNCTIONS": null,
"BOWEL_AND_BLADDER_FUNCTIONS": null,
"CEREBRAL_FUNCTIONS": null,
"AMBULATION": null
}}
}}
EDSS-Bewertungsrichtlinien:
{EDSS_INSTRUCTIONS}
Patientenbericht:
{patient_text}
Gib ausschließlich das finale JSON-Objekt zurück.
"""
# =========================
# JSON EXTRACTION
# =========================
def extract_json_from_text(text):
    """Extract a JSON object (dict) from raw model output.

    Markdown code fences are removed first. If the remaining text is itself a
    JSON object it is returned directly. Otherwise every ``{`` is tried as the
    start of an object using ``json.JSONDecoder.raw_decode`` and the last
    object that decodes successfully is returned.

    Letting the JSON decoder find the end of each object (instead of counting
    braces and quotes by hand) keeps stray quotes or unbalanced braces in the
    surrounding prose from hiding a valid object that follows them.

    Args:
        text: Raw model output; ``None`` and blank strings are rejected.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: If the text is empty or contains no decodable JSON object.
    """
    if text is None:
        raise ValueError("Model returned empty content")
    text = str(text).strip()
    if not text:
        raise ValueError("Model returned empty content")

    for fence in ("```json", "```JSON", "```Json", "```"):
        text = text.replace(fence, "")
    text = text.strip()

    # Fast path: the whole answer is one JSON document.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    decoder = json.JSONDecoder()
    last_object = None
    pos = text.find("{")
    while pos != -1:
        try:
            candidate, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not an object start (or truncated); try the next opening brace.
            pos = text.find("{", pos + 1)
            continue
        if isinstance(candidate, dict):
            last_object = candidate
        # Continue after the decoded object so nested objects are not
        # reported as separate candidates.
        pos = text.find("{", end)

    if last_object is not None:
        return last_object
    raise ValueError(f"No valid JSON object found. Raw starts with: {text[:500]}")
def extract_message_content(message):
    """Return the text payload of a chat-completion message.

    ``message.content`` is used when it is non-empty. When it is ``None`` or
    empty, the message is dumped via ``model_dump()`` and the first non-empty
    value among ``content``, ``reasoning_content``, ``reasoning``, ``text``
    and ``output_text`` is returned. Previously an empty-string ``content``
    was returned immediately, so the alternative fields were never consulted.

    Args:
        message: Message object of a chat completion choice.

    Returns:
        The first non-empty payload found; otherwise the original
        ``message.content`` value (``None`` or an empty value).
    """
    content = getattr(message, "content", None)
    if content:
        return content

    try:
        dumped = message.model_dump()
    except Exception:
        # Best effort: objects without a usable model_dump() simply have no
        # alternative fields to inspect.
        return content

    if isinstance(dumped, dict):
        for key in ("content", "reasoning_content", "reasoning", "text", "output_text"):
            candidate = dumped.get(key)
            if candidate:
                return candidate
    return content
# =========================
# VALIDATION WITHOUT CLIPPING
# =========================
def validate_model_output(parsed):
    """Check a parsed model answer and report the findings without clipping.

    Values are never altered to fit a range; the function only records
    whether each value is present, numeric and inside its allowed bounds.

    Checks performed:
    - all REQUIRED_TOP_LEVEL_FIELDS are present
    - ``klassifizierbar`` is a real boolean
    - EDSS is consistent with ``klassifizierbar`` (a number within
      EDSS_MIN..EDSS_MAX when true, ``None`` when false)
    - every key of FUNCTIONAL_SYSTEM_RANGES is present in ``subcategories``
      and is either null or a number inside its range

    Args:
        parsed: The dict decoded from the model's JSON answer.

    Returns:
        A dict with the raw values, their numeric interpretation and the
        individual and combined validation flags.
    """
    # ---- top-level keys -------------------------------------------------
    missing_required = [key for key in REQUIRED_TOP_LEVEL_FIELDS if key not in parsed]
    has_all_required = not missing_required

    klass_flag = parsed.get("klassifizierbar", None)
    klass_is_bool = isinstance(klass_flag, bool)

    # ---- certainty (reported, not part of the validity decision) --------
    certainty_raw = parsed.get("certainty_percent")
    _, certainty_value, certainty_numeric_ok = parse_float(certainty_raw)
    certainty_in_range = certainty_numeric_ok and is_in_range(certainty_value, 0.0, 100.0)

    # ---- overall EDSS ---------------------------------------------------
    edss_raw, edss_value, edss_numeric_ok = parse_float(parsed.get("EDSS"))
    edss_in_range = edss_numeric_ok and is_in_range(edss_value, EDSS_MIN, EDSS_MAX)

    if not klass_is_bool:
        edss_consistent = False
    elif klass_flag:
        edss_consistent = edss_numeric_ok and edss_in_range
    else:
        edss_consistent = edss_raw is None

    # ---- subcategories --------------------------------------------------
    subcats_in = parsed.get("subcategories")
    if not isinstance(subcats_in, dict):
        subcats_in = {}

    missing_subcats = [name for name in FUNCTIONAL_SYSTEM_RANGES if name not in subcats_in]

    raw_by_name = {}
    numeric_by_name = {}
    checks_by_name = {}
    for name, (lower, upper) in FUNCTIONAL_SYSTEM_RANGES.items():
        raw_value, numeric_value, numeric_ok = parse_float(subcats_in.get(name))
        absent = raw_value is None
        if absent:
            # A null/missing subcategory is acceptable and counts as valid.
            range_ok = True
            present_ok = True
        else:
            range_ok = numeric_ok and is_in_range(numeric_value, lower, upper)
            present_ok = numeric_ok and range_ok

        raw_by_name[name] = raw_value
        numeric_by_name[name] = numeric_value
        checks_by_name[name] = {
            "is_numeric": numeric_ok,
            "in_valid_range": range_ok,
            "valid_when_present": present_ok,
            "is_missing_or_null": absent,
        }

    subcats_all_ok = all(
        check["valid_when_present"] or check["is_missing_or_null"]
        for check in checks_by_name.values()
    )

    # ---- combined verdicts ----------------------------------------------
    schema_ok = has_all_required and not missing_subcats and klass_is_bool
    output_ok = schema_ok and edss_consistent and subcats_all_ok

    return {
        "reason": parsed.get("reason"),
        "klassifizierbar": klass_flag,
        "klassifizierbar_is_bool": klass_is_bool,
        "raw_certainty_percent": certainty_raw,
        "certainty_percent": certainty_value if certainty_numeric_ok else None,
        "certainty_present": certainty_raw is not None,
        "certainty_percent_is_numeric": certainty_numeric_ok,
        "certainty_percent_in_valid_range": certainty_in_range,
        "raw_EDSS": edss_raw,
        "EDSS_numeric": edss_value,
        "EDSS": edss_value,
        "EDSS_is_numeric": edss_numeric_ok,
        "EDSS_in_valid_range": edss_in_range,
        "edss_logic_valid": edss_consistent,
        "json_parse_success": True,
        "required_fields_present": has_all_required,
        "required_schema_success": schema_ok,
        # Both keys carry the same verdict.
        "clinical_range_valid": output_ok,
        "clinical_output_valid": output_ok,
        "missing_required_fields": missing_required,
        "missing_subcategory_fields": missing_subcats,
        "subcategories": numeric_by_name,
        "raw_subcategories": raw_by_name,
        "subcategory_validation": checks_by_name,
        "all_functional_systems_valid_when_present": subcats_all_ok,
    }
# =========================
# API CALL
# =========================
def make_chat_completion(model_config, prompt):
    """Send one chat-completion request for *prompt* using *model_config*.

    Args:
        model_config: Dict with ``model_name``, ``max_tokens`` and
            ``temperature``; optional ``use_response_format`` (adds a
            ``json_object`` response format) and ``extra_body`` (forwarded
            unchanged when not None).
        prompt: Content of the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    system_message = {
        "role": "system",
        "content": (
            "Du bist ein JSON-Generator. "
            "Antworte ausschließlich mit einem einzigen validen JSON-Objekt. "
            "Keine Erklärung. Kein Markdown. Keine Code-Fences."
        ),
    }
    user_message = {"role": "user", "content": prompt}

    request = dict(
        messages=[system_message, user_message],
        model=model_config["model_name"],
        max_tokens=model_config["max_tokens"],
        temperature=model_config["temperature"],
    )

    if model_config.get("use_response_format", False):
        request["response_format"] = {"type": "json_object"}

    extra_body = model_config.get("extra_body")
    if extra_body is not None:
        request["extra_body"] = extra_body

    return client.chat.completions.create(**request)
def run_inference(patient_text, model_config):
    """Query one model for one report, retrying until the answer validates.

    Up to ``MAX_JSON_RETRIES + 1`` attempts are made, with
    ``RETRY_SLEEP_SEC`` seconds between them. An attempt succeeds when the
    answer can be decoded by extract_json_from_text() and
    validate_model_output() reports ``clinical_output_valid``.

    After a rejected *answer* the next attempt uses a correction prompt that
    quotes the error and repeats the original task. If the attempt failed
    before any answer was received (for example the request itself raised),
    the current prompt is re-sent unchanged: there is no previous answer to
    correct, and embedding a transport error in the prompt would only
    mislead the model.

    Args:
        patient_text: Report text passed to build_prompt().
        model_config: Model settings passed to make_chat_completion().

    Returns:
        A record dict with ``success``, ``error``, ``last_error``, ``model``,
        ``result`` (validated output or None), ``raw_parsed_output``,
        ``raw_content``, ``raw_response_debug`` (only kept on failure),
        ``inference_time_sec`` (wall time over all attempts) and the token
        counts of the most recent response.
    """
    base_prompt = build_prompt(patient_text)
    prompt = base_prompt
    model_name = model_config["model_name"]
    start = time.perf_counter()
    raw_content = None
    raw_response_debug = None
    last_error = None
    prompt_tokens = None
    completion_tokens = None
    total_tokens = None

    for attempt in range(1, MAX_JSON_RETRIES + 2):
        answer_received = False
        try:
            response = make_chat_completion(model_config, prompt)
            message = response.choices[0].message
            raw_content = extract_message_content(message)
            answer_received = True

            try:
                raw_response_debug = response.model_dump()
            except Exception:
                raw_response_debug = str(response)

            usage = getattr(response, "usage", None)
            if usage is not None:
                prompt_tokens = getattr(usage, "prompt_tokens", None)
                completion_tokens = getattr(usage, "completion_tokens", None)
                total_tokens = getattr(usage, "total_tokens", None)

            parsed = extract_json_from_text(raw_content)
            validated = validate_model_output(parsed)
            if not validated.get("clinical_output_valid", False):
                raise ValueError(
                    "Model output is logically invalid. "
                    f"klassifizierbar={validated.get('klassifizierbar')}, "
                    f"raw_EDSS={validated.get('raw_EDSS')}, "
                    f"EDSS_numeric={validated.get('EDSS_numeric')}, "
                    f"edss_logic_valid={validated.get('edss_logic_valid')}, "
                    f"missing_required={validated.get('missing_required_fields')}, "
                    f"missing_subcats={validated.get('missing_subcategory_fields')}"
                )

            return {
                "success": True,
                "error": None,
                # Error of the last rejected attempt, if any preceded this one.
                "last_error": last_error,
                "model": model_name,
                "result": validated,
                "raw_parsed_output": parsed,
                "raw_content": raw_content,
                # The full response dump is only kept for failed records.
                "raw_response_debug": None,
                "inference_time_sec": time.perf_counter() - start,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            }
        except Exception as e:
            # Deliberately broad: any failure of an attempt (request, parsing,
            # validation) is turned into a retry or a failure record.
            last_error = str(e)
            if attempt <= MAX_JSON_RETRIES:
                if answer_received:
                    prompt = _build_retry_prompt(base_prompt, last_error)
                time.sleep(RETRY_SLEEP_SEC)
                continue
            return {
                "success": False,
                "error": str(e),
                "last_error": last_error,
                "model": model_name,
                "result": None,
                "raw_parsed_output": None,
                "raw_content": raw_content,
                "raw_response_debug": raw_response_debug,
                "inference_time_sec": time.perf_counter() - start,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": total_tokens,
            }


def _build_retry_prompt(base_prompt, error_text):
    """Return the correction prompt sent after a rejected model answer."""
    return f"""
Deine vorherige Antwort war ungültig.
Fehler:
{error_text}
Antworte erneut mit genau EINEM validen JSON-Objekt.
Strikte Regeln:
- Kein Markdown.
- Keine Code-Fences.
- Kein Text außerhalb des JSON.
- klassifizierbar muss true oder false sein.
- Wenn klassifizierbar=true: EDSS muss eine Zahl zwischen 0.0 und 10.0 sein.
- Wenn klassifizierbar=false: EDSS muss null sein.
- Verwende klassifizierbar=false nur, wenn keine vernünftige EDSS-Einschätzung möglich ist.
- Fehlende einzelne Unterkategorien sind kein Grund für klassifizierbar=false.
- Alle 8 subcategories-Schlüssel müssen vorhanden sein.
- Unterkategorien dürfen null sein, wenn nicht ausreichend ableitbar.
Ursprüngliche Aufgabe:
{base_prompt}
"""
# =========================
# FLATTEN / SAVE
# =========================
def flatten_result(record):
    """Flatten one inference record into a single-level dict (one CSV row).

    Record-level fields are copied as they are, validation fields are taken
    from ``record["result"]`` (an empty dict when the record has none), list
    and dict values are JSON-encoded, and every subcategory of
    FUNCTIONAL_SYSTEM_RANGES is expanded into its own columns.

    The key order is fixed because it determines the CSV column order.
    """
    result = record.get("result") or {}
    flat = {}

    # Identification and status of the record.
    for key in (
        "model",
        "iteration",
        "row_index",
        "row_number_in_run",
        "unique_id",
        "MedDatum",
        "success",
        "error",
        "last_error",
    ):
        flat[key] = record.get(key)

    # Validation flags that default to False when there is no result.
    for key in (
        "json_parse_success",
        "required_fields_present",
        "required_schema_success",
        "clinical_range_valid",
        "clinical_output_valid",
    ):
        flat[key] = result.get(key, False)

    # Validation flags that stay None when there is no result.
    for key in (
        "klassifizierbar_is_bool",
        "edss_logic_valid",
        "all_functional_systems_valid_when_present",
    ):
        flat[key] = result.get(key)

    # Timing, token usage and raw answer.
    for key in (
        "inference_time_sec",
        "prompt_tokens",
        "completion_tokens",
        "total_tokens",
        "raw_content",
    ):
        flat[key] = record.get(key)
    flat["raw_parsed_output"] = json.dumps(
        record.get("raw_parsed_output"), ensure_ascii=False
    )

    # Extracted values.
    for key in (
        "reason",
        "klassifizierbar",
        "raw_certainty_percent",
        "certainty_percent",
        "certainty_present",
        "certainty_percent_is_numeric",
        "certainty_percent_in_valid_range",
        "raw_EDSS",
        "EDSS_numeric",
        "EDSS",
        "EDSS_is_numeric",
        "EDSS_in_valid_range",
    ):
        flat[key] = result.get(key)

    for key in ("missing_required_fields", "missing_subcategory_fields"):
        flat[key] = json.dumps(result.get(key, []), ensure_ascii=False)

    # One group of columns per functional system.
    raw_values = result.get("raw_subcategories", {})
    numeric_values = result.get("subcategories", {})
    checks = result.get("subcategory_validation", {})
    for name in FUNCTIONAL_SYSTEM_RANGES:
        check = checks.get(name, {})
        numeric_value = numeric_values.get(name)
        flat[f"raw_subcat_{name}"] = raw_values.get(name)
        flat[f"numeric_subcat_{name}"] = numeric_value
        flat[f"subcat_{name}_is_numeric"] = check.get("is_numeric", False)
        flat[f"subcat_{name}_in_valid_range"] = check.get("in_valid_range", False)
        flat[f"subcat_{name}_valid_when_present"] = check.get("valid_when_present", False)
        flat[f"subcat_{name}"] = numeric_value
    return flat
def append_jsonl(path, record):
    """Append *record* as one JSON line to *path* and force it to disk.

    The file is flushed and fsync'ed after the write.
    """
    with open(path, mode="a", encoding="utf-8") as handle:
        line = json.dumps(record, ensure_ascii=False)
        handle.write(line + "\n")
        handle.flush()
        os.fsync(handle.fileno())
def append_csv(path, record):
    """Append the flattened *record* as one row to the CSV at *path*.

    The header row is written only when the file does not exist yet.
    """
    write_header = not Path(path).exists()
    frame = pd.DataFrame([flatten_result(record)])
    frame.to_csv(path, mode="a", header=write_header, index=False)
def save_json(path, records):
    """Write *records* to *path* as indented UTF-8 JSON, replacing the file."""
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)
def save_csv(path, records):
    """Write all *records*, flattened with flatten_result(), to a CSV at *path*."""
    flat_rows = list(map(flatten_result, records))
    pd.DataFrame(flat_rows).to_csv(path, index=False)
def summarize_records(records):
    """Aggregate *records* into a one-row summary DataFrame.

    Every rate is the mean of the corresponding boolean column, with missing
    values counted as False. The model name is taken from the first record.

    Returns:
        An empty DataFrame when there are no records, otherwise a DataFrame
        with exactly one row.
    """
    frame = pd.DataFrame([flatten_result(rec) for rec in records])
    if frame.empty:
        return pd.DataFrame()

    def rate(column):
        # Share of truthy values in `column`; None when the column is absent.
        if column in frame.columns:
            return frame[column].fillna(False).astype(bool).mean()
        return None

    rate_sources = (
        ("success_rate", "success"),
        ("json_parse_success_rate", "json_parse_success"),
        ("required_schema_success_rate", "required_schema_success"),
        ("clinical_output_valid_rate", "clinical_output_valid"),
        ("edss_logic_valid_rate", "edss_logic_valid"),
        ("EDSS_numeric_rate", "EDSS_is_numeric"),
        ("EDSS_valid_range_rate", "EDSS_in_valid_range"),
        ("klassifizierbar_true_rate", "klassifizierbar"),
    )

    summary = {
        "model": frame["model"].iloc[0],
        "n_total": len(frame),
    }
    for summary_key, column in rate_sources:
        summary[summary_key] = rate(column)

    inference_times = pd.to_numeric(frame["inference_time_sec"], errors="coerce")
    summary["mean_inference_time_sec"] = inference_times.mean()

    return pd.DataFrame([summary])
# =========================
# PARALLEL ROW PROCESSING
# =========================
def process_one_row(payload, model_config, iteration):
    """Run inference for one input row and tag the record with its origin.

    Args:
        payload: Tuple ``(idx, row_dict, row_number, total_rows)`` as built in
            the main block; ``total_rows`` is not used here.
        model_config: Model settings passed on to run_inference().
        iteration: Number of the current iteration, stored in the record.

    Returns:
        The record from run_inference() extended with ``iteration``,
        ``row_index``, ``row_number_in_run``, ``unique_id`` and ``MedDatum``.
    """
    idx, row_dict, row_number, _total_rows = payload
    row = pd.Series(row_dict)

    record = run_inference(build_patient_text(row), model_config)
    record.update(
        iteration=iteration,
        row_index=int(idx),
        row_number_in_run=int(row_number),
        unique_id=row.get("unique_id", f"row_{idx}"),
        MedDatum=row.get("MedDatum", None),
    )
    return record
# =========================
# MAIN
# =========================
# Benchmark driver: for every model in MODEL_CONFIGS, run NUM_ITERATIONS passes
# over the input rows in parallel and write incremental, per-iteration,
# per-model and combined result files below RESULTS_ROOT/run_<timestamp>/.
if __name__ == "__main__":
    # One timestamp names the run directory and every file written in this run.
    run_timestamp = now_timestamp()
    results_root = Path(RESULTS_ROOT)
    run_root = results_root / f"run_{run_timestamp}"
    run_root.mkdir(parents=True, exist_ok=True)
    print(f"Results root: {run_root}")
    df = pd.read_csv(INPUT_CSV, sep=";")
    if MAX_ROWS is not None:
        df = df.head(MAX_ROWS)
    total_rows = len(df)
    print(f"Loaded rows: {total_rows}")
    print(f"Parallel workers: {PARALLEL_WORKERS}")
    print(f"Batch size: {BATCH_SIZE}")
    print(f"Iterations: {NUM_ITERATIONS}")
    print(f"Models: {[m['model_name'] for m in MODEL_CONFIGS]}")
    # Payloads are built once and reused for every model and iteration;
    # row_number is the 1-based position of the row in this run.
    row_payloads = [
        (idx, row.to_dict(), row_number, total_rows)
        for row_number, (idx, row) in enumerate(df.iterrows(), start=1)
    ]
    all_summaries = []
    for model_config in MODEL_CONFIGS:
        model_name = model_config["model_name"]
        safe_model = safe_dir_name(model_name)
        model_dir = run_root / safe_model
        model_dir.mkdir(parents=True, exist_ok=True)
        print("\n" + "#" * 80)
        print(f"MODEL: {model_name}")
        print(f"Saving to: {model_dir}")
        print("#" * 80)
        # Collects the records of all iterations of this model.
        model_records = []
        model_start = time.perf_counter()
        for iteration in range(1, NUM_ITERATIONS + 1):
            print("\n" + "=" * 80)
            print(f"MODEL {model_name} | ITERATION {iteration}/{NUM_ITERATIONS}")
            print("=" * 80)
            iteration_start = time.perf_counter()
            iteration_results = []
            # Incremental files receive every record as soon as it completes.
            incremental_jsonl_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.jsonl"
            incremental_csv_path = model_dir / f"{safe_model}_iter_{iteration}_{run_timestamp}_incremental.csv"
            completed = 0
            # Rows are submitted in slices of BATCH_SIZE; a new executor is
            # created for each slice.
            for batch_start in range(0, len(row_payloads), BATCH_SIZE):
                batch = row_payloads[batch_start:batch_start + BATCH_SIZE]
                print(
                    f"\nSubmitting rows {batch_start + 1}-"
                    f"{batch_start + len(batch)} / {total_rows}"
                )
                with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
                    futures = [
                        executor.submit(
                            process_one_row,
                            payload,
                            model_config,
                            iteration
                        )
                        for payload in batch
                    ]
                    # Results arrive in completion order, not in row order;
                    # they are sorted before the final files are written.
                    for future in as_completed(futures):
                        record = future.result()
                        completed += 1
                        iteration_results.append(record)
                        model_records.append(record)
                        append_jsonl(incremental_jsonl_path, record)
                        append_csv(incremental_csv_path, record)
                        if record["success"]:
                            result = record.get("result") or {}
                            print(
                                f"Done {completed}/{total_rows} | "
                                f"row={record.get('row_number_in_run')} | "
                                f"klassifizierbar={result.get('klassifizierbar')} | "
                                f"EDSS={result.get('EDSS_numeric')} | "
                                f"edss_logic_valid={result.get('edss_logic_valid')} | "
                                f"clinical_output_valid={result.get('clinical_output_valid')} | "
                                f"time={record.get('inference_time_sec'):.2f}s"
                            )
                        else:
                            print(
                                f"Done {completed}/{total_rows} | "
                                f"row={record.get('row_number_in_run')} | "
                                f"ERROR={record.get('error')}"
                            )
                            # The failed record has already been appended to
                            # the incremental files at this point.
                            if STOP_ON_FIRST_ERROR:
                                raise RuntimeError(record.get("error"))
            # Restore input order for the final per-iteration files.
            iteration_results = sorted(
                iteration_results,
                key=lambda r: r.get("row_number_in_run", 10**9)
            )
            iter_json_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.json"
            iter_csv_path = model_dir / f"{safe_model}_results_iter_{iteration}_{run_timestamp}.csv"
            save_json(iter_json_path, iteration_results)
            save_csv(iter_csv_path, iteration_results)
            elapsed = time.perf_counter() - iteration_start
            print(f"\nIteration {iteration} complete.")
            print(f"JSON: {iter_json_path}")
            print(f"CSV: {iter_csv_path}")
            print(f"Time: {elapsed / 60:.2f} min")
        # All iterations of this model, ordered by iteration and then by row.
        model_records = sorted(
            model_records,
            key=lambda r: (
                r.get("iteration", 10**9),
                r.get("row_number_in_run", 10**9)
            )
        )
        model_json_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.json"
        model_csv_path = model_dir / f"{safe_model}_all_results_{run_timestamp}.csv"
        model_summary_path = model_dir / f"{safe_model}_summary_{run_timestamp}.csv"
        save_json(model_json_path, model_records)
        save_csv(model_csv_path, model_records)
        summary_df = summarize_records(model_records)
        # Wall time covers all iterations of this model, including file writes.
        summary_df["model_total_wall_time_sec"] = time.perf_counter() - model_start
        summary_df["model_total_wall_time_min"] = summary_df["model_total_wall_time_sec"] / 60
        summary_df.to_csv(model_summary_path, index=False)
        all_summaries.append(summary_df)
        print(f"\nModel complete: {model_name}")
        print(f"All JSON: {model_json_path}")
        print(f"All CSV: {model_csv_path}")
        print(f"Summary: {model_summary_path}")
    # One combined table with a summary row per model.
    if all_summaries:
        combined = pd.concat(all_summaries, ignore_index=True)
        combined_path = run_root / f"all_models_summary_{run_timestamp}.csv"
        combined.to_csv(combined_path, index=False)
        print(f"\nCombined summary: {combined_path}")
    print("\nAll models and iterations completed.")
##