7 Commits

Author SHA1 Message Date
816c50e467 Config Dashboard 2026-02-23 18:19:50 +01:00
118e3e63b3 refinement 2026-02-23 15:06:54 +01:00
99862629b8 update gitignore 2026-02-23 00:43:33 +01:00
9cc80cd3e6 Audit code 2026-02-23 00:42:41 +01:00
424d38ad1c certainty Delta show 2026-02-18 17:12:31 +01:00
f1d22b28ad updated plot certainty 2026-02-13 09:22:53 +01:00
8e4a43c557 add certainty 2026-02-12 13:39:36 +01:00
6 changed files with 4689 additions and 91 deletions

2
.gitignore vendored
View File

@@ -6,7 +6,7 @@
.env .env
__pycache__/ __pycache__/
*.pyc *.pyc
*.csv
======= =======
/reference/ /reference/
*.svg *.svg

2371
audit.py Normal file

File diff suppressed because it is too large Load Diff

600
certainty.py Normal file
View File

@@ -0,0 +1,600 @@
# %% API call1
#import time
#import json
#import os
#from datetime import datetime
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
## Load environment variables
#load_dotenv()
#
## === CONFIGURATION ===
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#MODEL_NAME = "GPT-OSS-120B"
#HEALTH_URL = f"{OPENAI_BASE_URL}/health" # Placeholder - actual health check would need to be implemented
#CHAT_URL = f"{OPENAI_BASE_URL}/chat/completions"
#
## File paths
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
##GRAMMAR_FILE = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/just_edss_schema.gbnf"
#
## Initialize OpenAI client
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
## Read EDSS instructions from file
#with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
# EDSS_INSTRUCTIONS = f.read().strip()
## === RUN INFERENCE 2 ===
#def run_inference(patient_text):
# prompt = f'''
# Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale) aus klinischen Berichten zu extrahieren.
#### Regeln für die Ausgabe:
#1. **Reason**: Erstelle eine prägnante Zusammenfassung (max. 400 Zeichen) der Befunde auf **DEUTSCH**, die zur Einstufung führen.
#2. **klassifizierbar**:
# - Setze dies auf **true**, wenn ein EDSS-Wert identifiziert, berechnet oder basierend auf den klinischen Hinweisen plausibel geschätzt werden kann.
# - Setze dies auf **false**, NUR wenn die Daten absolut unzureichend oder so widersprüchlich sind, dass keinerlei Einstufung möglich ist.
#3. **EDSS**:
# - Dieses Feld ist **VERPFLICHTEND**, wenn "klassifizierbar" auf true steht.
# - Es muss eine Zahl zwischen 0.0 und 10.0 sein.
# - Versuche stets, den EDSS-Wert so präzise wie möglich zu bestimmen, auch wenn die Datenlage dünn ist (nutze verfügbare Informationen zu Gehstrecke und Funktionssystemen).
# - Dieses Feld **DARF NICHT ERSCHEINEN**, wenn "klassifizierbar" auf false steht.
#
#### Einschränkungen:
#- Erfinde keine Fakten, aber nutze klinische Herleitungen aus dem Bericht, um den EDSS zu bestimmen.
#- Priorisiere die Vergabe eines EDSS-Wertes gegenüber der Markierung als nicht klassifizierbar.
#- Halte dich strikt an die JSON-Struktur.
#
#EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
# start_time = time.time()
#
# try:
# # Make API call using OpenAI client
# response = client.chat.completions.create(
# messages=[
# {
# "role": "system",
# "content": "You extract EDSS scores. You prioritize providing a score even if data is partial, by using clinical inference."
# },
# {
# "role": "user",
# "content": prompt
# }
# ],
# model=MODEL_NAME,
# max_tokens=2048,
# temperature=0.0,
# response_format={"type": "json_object"}
# )
#
# # Extract content from response
# content = response.choices[0].message.content
#
# # Parse the JSON response
# parsed = json.loads(content)
#
# inference_time = time.time() - start_time
#
# return {
# "success": True,
# "result": parsed,
# "inference_time_sec": inference_time
# }
#
# except Exception as e:
# print(f"Inference error: {e}")
# return {
# "success": False,
# "error": str(e),
# "inference_time_sec": -1
# }
## === BUILD PATIENT TEXT ===
#def build_patient_text(row):
# return (
# str(row["T_Zusammenfassung"]) + "\n" +
# str(row["Diagnosen"]) + "\n" +
# str(row["T_KlinBef"]) + "\n" +
# str(row["T_Befunde"]) + "\n"
# )
#
#if __name__ == "__main__":
# # Read CSV file ONLY inside main block
# df = pd.read_csv(INPUT_CSV, sep=';')
# results = []
#
# # Process each row
# for idx, row in df.iterrows():
# print(f"Processing row {idx + 1}/{len(df)}")
# try:
# patient_text = build_patient_text(row)
# result = run_inference(patient_text)
#
# # Add unique_id and MedDatum to result for tracking
# result["unique_id"] = row.get("unique_id", f"row_{idx}")
# result["MedDatum"] = row.get("MedDatum", None)
#
# results.append(result)
# print(json.dumps(result, indent=2))
# except Exception as e:
# print(f"Error processing row {idx}: {e}")
# results.append({
# "success": False,
# "error": str(e),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None)
# })
#
# # Save results to a JSON file
# output_json = INPUT_CSV.replace(".csv", "_results_Nisch.json")
# with open(output_json, 'w') as f:
# json.dump(results, f, indent=2)
# print(f"Results saved to {output_json}")
##
# %% API call1 - Enhanced with certainty scoring
#import time
#import json
#import os
#from datetime import datetime
#import pandas as pd
#from openai import OpenAI
#from dotenv import load_dotenv
#
## Load environment variables
#load_dotenv()
#
## === CONFIGURATION ===
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
#OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
#MODEL_NAME = "GPT-OSS-120B"
#
## File paths
#INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Test.csv"
#EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#
## Initialize OpenAI client
#client = OpenAI(
# api_key=OPENAI_API_KEY,
# base_url=OPENAI_BASE_URL
#)
#
## Read EDSS instructions from file
#with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
# EDSS_INSTRUCTIONS = f.read().strip()
#
## === PROMPT WITH CERTAINTY REQUEST ===
#def build_prompt(patient_text):
# return f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale), alle Unterkategorien und die Bewertungssicherheit aus klinischen Berichten zu extrahieren.
#
#### Deine Aufgabe:
#1. Analysiere den Patientenbericht und extrahiere:
# - Den Gesamt-EDSS-Score (0.010.0)
# - Alle 8 EDSS-Unterkategorien (mit jeweils eigener Maximalpunktzahl)
#2. Schätze für jede Entscheidung die Sicherheit als Ganzzahl von 0100 % ein.
#
#### Struktur der JSON-Ausgabe (VERPFLICHTEND):
#Gib NUR gültiges JSON zurück — kein Markdown, kein Text davor/dahinter.
#
#{{
# "reason": "Kernaussage zur EDSS-Begründung (max. 400 Zeichen, auf Deutsch).",
# "klassifizierbar": true/false,
# "EDSS": null ODER Zahl zwischen 0.0 und 10.0 (nur wenn klassifizierbar=true)",
# "certainty_percent": 0 ODER Zahl zwischen 0 und 100 (Ganzzahl)",
# "subcategories": {{
# "VISUAL_OPTIC_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BRAINSTEM_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "PYRAMIDAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBELLAR_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "SENSORY_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "BOWEL_AND_BLADDER_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "CEREBRAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
# "AMBULATION": null ODER Zahl zwischen 0.0 und 10.0
# }}
#}}
#
#### Regeln:
#- **reason**: Kurze, prägnante Begründung (auf Deutsch, max. 400 Zeichen), warum du den EDSS-Wert und die Unterkategorien so bewertest.
#- **klassifizierbar**:
# - `true`, wenn EDSS und mindestens die wichtigsten Unterkategorien *eindeutig ableitbar* oder *plausibel inferierbar* sind.
# - `false`, **nur**, wenn keine relevanten Daten vorliegen, oder diese so widersprüchlich/inkonsistent sind, dass keine vernünftige Einschätzung möglich ist.
#- **EDSS**:
# - **VERPFLICHTEND**, wenn `klassifizierbar=true`.
# - Zahl zwischen 0.0 und 10.0 (z.B. 3.0, 5.5). Darf **nicht** erscheinen, wenn `klassifizierbar=false`.
#- **certainty_percent**:
# - **Immer present** — Ganzzahl (0100), basierend auf:
# - Klarheit und Vollständigkeit der Berichtsangaben,
# - Stichhaltigkeit der Schlussfolgerung (inkl. Inferenz),
# - Konsistenz zwischen den Unterkategorien.
#- **subcategories**:
# - **Immer present** — **alle 8 Unterkategorien** müssen enthalten sein.
# - Jeder Wert ist entweder:
# - `null` (wenn keine ausreichende Information vorliegt), **oder**
# - eine Zahl ≤ jeweiliger Obergrenze (z.B. Ambulation ≤ 10.0).
# - Wenn die Unterkategorie plausibel inferiert werden kann (auch indirekt), gib einen sinnvollen Wert ab.
# - Beispiel: Wenn „Gang mit Krückstock auf ebenem Boden bis 200 m“ steht, setze `AMBULATION: 5.5`.
#
#### EDSS-Bewertungsrichtlinien:
#{EDSS_INSTRUCTIONS}
#
#Patientenbericht:
#{patient_text}
#'''
#
## === INFERENCE FUNCTION ===
#def run_inference(patient_text):
# prompt = build_prompt(patient_text)
#
# start_time = time.time()
#
# try:
# response = client.chat.completions.create(
# messages=[
# {"role": "system", "content": "Du gibst EXKLUSIV gültiges JSON zurück — keine weiteren Erklärungen."}
# ] + [
# {"role": "user", "content": prompt}
# ],
# model=MODEL_NAME,
# max_tokens=2048,
# temperature=0.1, # Slightly higher for more natural certainty estimation (still low for reliability)
# response_format={"type": "json_object"}
# )
#
# content = response.choices[0].message.content
#
# # Parse and validate JSON
# try:
# parsed = json.loads(content)
# except json.JSONDecodeError as e:
# print(f"⚠️ JSON parsing failed: {e}")
# print("Raw response:", content[:500])
# raise ValueError("Model did not return valid JSON")
#
# # Enforce required keys
# if "certainty_percent" not in parsed:
# print("⚠️ Missing 'certainty_percent' in output! Force-adding fallback.")
# parsed["certainty_percent"] = 0 # fallback
# elif not isinstance(parsed["certainty_percent"], (int, float)):
# parsed["certainty_percent"] = int(parsed["certainty_percent"])
#
# # Clamp certainty to [0, 100]
# pct = parsed["certainty_percent"]
# parsed["certainty_percent"] =max(0, min(100, int(pct)))
#
# # Enforce EDSS rules: if not classifiable → remove EDSS
# if not parsed.get("klassifizierbar", False):
# if "EDSS" in parsed:
# del parsed["EDSS"] # per spec, must not appear if not classifiable
# else:
# if "EDSS" not in parsed:
# print("⚠️ 'klassifizierbar' is true but EDSS missing — adding fallback.")
# parsed["EDSS"] = 7.0 # last-resort fallback
#
# inference_time = time.time() - start_time
#
# return {
# "success": True,
# "result": parsed,
# "inference_time_sec": inference_time
# }
#
# except Exception as e:
# print(f"❌ Inference error: {e}")
# return {
# "success": False,
# "error": str(e),
# "inference_time_sec": -1,
# "result": None # no structured output
# }
#
## === BUILD PATIENT TEXT ===
#def build_patient_text(row):
# return (
# str(row.get("T_Zusammenfassung", "")) + "\n" +
# str(row.get("Diagnosen", "")) + "\n" +
# str(row.get("T_KlinBef", "")) + "\n" +
# str(row.get("T_Befunde", ""))
# )
#
#if __name__ == "__main__":
# # Load data
# df = pd.read_csv(INPUT_CSV, sep=';')
# results = []
#
# # Optional: limit for testing
# # df = df.head(3)
#
# print(f"Processing {len(df)} rows...")
# for idx, row in df.iterrows():
# print(f"\n— Row {idx + 1}/{len(df)} —")
# try:
# patient_text = build_patient_text(row)
# result = run_inference(patient_text)
#
# # Attach metadata
# result["unique_id"] = row.get("unique_id", f"row_{idx}")
# result["MedDatum"] = row.get("MedDatum", None)
#
# results.append(result)
#
# # Print summary
# if result["success"]:
# res = result["result"]
# edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
# print(f"✅ Result → EDSS={edss}, certainty={res.get('certainty_percent', 'N/A')}%")
# print(f" Reason: {res.get('reason', 'N/A')[:100]}…")
# else:
# print(f"❌ Failed: {result.get('error', 'Unknown error')[:100]}")
#
# except Exception as e:
# print(f"⚠️ Error processing row {idx}: {e}")
# results.append({
# "success": False,
# "error": str(e),
# "unique_id": row.get("unique_id", f"row_{idx}"),
# "MedDatum": row.get("MedDatum", None),
# "result": None
# })
#
# # Save results
# output_json = INPUT_CSV.replace(".csv", "_results_Nisch_certainty.json")
# with open(output_json, 'w', encoding='utf-8') as f:
# json.dump(results, f, indent=2, ensure_ascii=False)
# print(f"\n✅ Saved results to: {output_json}")
#
##
# %% API call - Multi-iteration EDSS + certainty extraction
import time
import json
import os
from datetime import datetime
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (API key and base URL) from a local .env file.
load_dotenv()

# === CONFIGURATION ===
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
MODEL_NAME = "GPT-OSS-120B"

# File paths
INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"

# Iteration settings
NUM_ITERATIONS = 20  # how many times the whole CSV is sent through the model
STOP_ON_FIRST_ERROR = False  # Set to True for debugging

# Initialize OpenAI client
client = OpenAI(
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_BASE_URL,
)

# Read EDSS instructions from file.
# The encoding is pinned so the text does not depend on the platform's
# default locale; the result files of this script are written as UTF-8 too.
# NOTE(review): assumes Komplett.txt is stored as UTF-8 — confirm.
with open(EDSS_INSTRUCTIONS_PATH, 'r', encoding='utf-8') as f:
    EDSS_INSTRUCTIONS = f.read().strip()
# === PROMPT (unchanged from before) ===
def build_prompt(patient_text, edss_instructions=None):
    """Build the German extraction prompt for one patient report.

    The prompt asks the model for a JSON object containing the overall EDSS
    score, the eight sub-scores, a classifiability flag, a short reason and
    a certainty percentage.

    Args:
        patient_text: Free-text clinical report that is appended to the prompt.
        edss_instructions: Scoring guidelines embedded in the prompt. When
            omitted, the module-level ``EDSS_INSTRUCTIONS`` is used.

    Returns:
        The complete prompt as a single string.
    """
    # Looked up lazily so an explicit argument never touches the global.
    guidelines = EDSS_INSTRUCTIONS if edss_instructions is None else edss_instructions
    # Doubled braces are literal braces of the JSON template inside the f-string.
    return f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale), alle Unterkategorien und die Bewertungssicherheit aus klinischen Berichten zu extrahieren.

### Deine Aufgabe:
1. Analysiere den Patientenbericht und extrahiere:
   - Den Gesamt-EDSS-Score (0.0–10.0)
   - Alle 8 EDSS-Unterkategorien (mit jeweils eigener Maximalpunktzahl)
2. Schätze für jede Entscheidung die Sicherheit als Ganzzahl von 0–100 % ein.

### Struktur der JSON-Ausgabe (VERPFLICHTEND):
Gib NUR gültiges JSON zurück — kein Markdown, kein Text davor/dahinter.

{{
  "reason": "Kernaussage zur EDSS-Begründung (max. 400 Zeichen, auf Deutsch).",
  "klassifizierbar": true/false,
  "EDSS": null ODER Zahl zwischen 0.0 und 10.0 (nur wenn klassifizierbar=true),
  "certainty_percent": 0 ODER Zahl zwischen 0 und 100 (Ganzzahl),
  "subcategories": {{
    "VISUAL_OPTIC_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "BRAINSTEM_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "PYRAMIDAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "CEREBELLAR_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "SENSORY_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "BOWEL_AND_BLADDER_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "CEREBRAL_FUNCTIONS": null ODER Zahl zwischen 0.0 und 6.0,
    "AMBULATION": null ODER Zahl zwischen 0.0 und 10.0
  }}
}}

### Regeln:
- **reason**: Kurze, prägnante Begründung (auf Deutsch, max. 400 Zeichen), warum du den EDSS-Wert und die Unterkategorien so bewertest.
- **klassifizierbar**:
  - `true`, wenn EDSS und mindestens die wichtigsten Unterkategorien *eindeutig ableitbar* oder *plausibel inferierbar* sind.
  - `false`, **nur**, wenn keine relevanten Daten vorliegen, oder diese so widersprüchlich/inkonsistent sind, dass keine vernünftige Einschätzung möglich ist.
- **EDSS**:
  - **VERPFLICHTEND**, wenn `klassifizierbar=true`.
  - Zahl zwischen 0.0 und 10.0 (z.B. 3.0, 5.5). Darf **nicht** erscheinen, wenn `klassifizierbar=false`.
- **certainty_percent**:
  - **Immer present** — Ganzzahl (0–100), basierend auf:
    - Klarheit und Vollständigkeit der Berichtsangaben,
    - Stichhaltigkeit der Schlussfolgerung (inkl. Inferenz),
    - Konsistenz zwischen den Unterkategorien.
- **subcategories**:
  - **Immer present** — **alle 8 Unterkategorien** müssen enthalten sein.
  - Jeder Wert ist entweder:
    - `null` (wenn keine ausreichende Information vorliegt), **oder**
    - eine Zahl ≤ jeweiliger Obergrenze (z.B. Ambulation ≤ 10.0).
  - Wenn die Unterkategorie plausibel inferiert werden kann (auch indirekt), gib einen sinnvollen Wert ab.
  - Beispiel: Wenn „Gang mit Krückstock auf ebenem Boden bis 200 m“ steht, setze `AMBULATION: 5.5`.

### EDSS-Bewertungsrichtlinien:
{guidelines}

Patientenbericht:
{patient_text}
'''
# === INFERENCE FUNCTION (unchanged) ===
def _coerce_certainty(raw):
    """Turn the model's ``certainty_percent`` value into an int in [0, 100].

    Accepts ints, floats (truncated) and numeric strings such as ``"85"``,
    ``"85%"`` or ``"85,5"``. Anything that cannot be read as a number
    (``None``, free text, NaN) falls back to 0 instead of raising, so one
    malformed field does not discard an otherwise valid extraction.
    """
    if isinstance(raw, str):
        raw = raw.strip().rstrip("%").strip().replace(",", ".")
    try:
        value = int(raw) if isinstance(raw, int) else int(float(raw))
    except (TypeError, ValueError, OverflowError):
        print(f"⚠️ Unusable 'certainty_percent' value {raw!r} — falling back to 0.")
        return 0
    return max(0, min(100, value))


def run_inference(patient_text, *, api_client=None, model_name=None, prompt_builder=None):
    """Send one patient report to the model and return the normalised result.

    Args:
        patient_text: Clinical report text passed to the prompt builder.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the module-level ``client``.
        model_name: Model identifier; defaults to the module-level ``MODEL_NAME``.
        prompt_builder: Callable mapping the report text to a prompt;
            defaults to ``build_prompt``.

    Returns:
        On success ``{"success": True, "result": <dict>, "inference_time_sec": <float>}``.
        On any error ``{"success": False, "error": <str>, "inference_time_sec": -1,
        "result": None}``; errors are reported this way rather than raised.
    """
    # Defaults are resolved at call time so explicit arguments never touch the globals.
    api_client = client if api_client is None else api_client
    model_name = MODEL_NAME if model_name is None else model_name
    prompt_builder = build_prompt if prompt_builder is None else prompt_builder

    prompt = prompt_builder(patient_text)
    start_time = time.perf_counter()  # monotonic clock for elapsed time
    try:
        response = api_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "Du gibst EXKLUSIV gültiges JSON zurück — keine weiteren Erklärungen."},
                {"role": "user", "content": prompt},
            ],
            model=model_name,
            max_tokens=2048,
            temperature=0.1,
            response_format={"type": "json_object"},
        )
        content = response.choices[0].message.content
        if not content:
            raise ValueError("Model returned an empty response")

        # Parse and validate JSON
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"⚠️ JSON parsing failed: {e}")
            print("Raw response:", content[:500])
            raise ValueError("Model did not return valid JSON") from e
        if not isinstance(parsed, dict):
            raise ValueError("Model did not return a JSON object")

        # Enforce required keys; the certainty is always an int in [0, 100].
        if "certainty_percent" not in parsed:
            print("⚠️ Missing 'certainty_percent' in output! Force-adding fallback.")
            parsed["certainty_percent"] = 0
        else:
            parsed["certainty_percent"] = _coerce_certainty(parsed["certainty_percent"])

        # Enforce EDSS rules
        if not parsed.get("klassifizierbar", False):
            # An unclassifiable report must not carry an EDSS value.
            parsed.pop("EDSS", None)
        elif "EDSS" not in parsed:
            # NOTE(review): 7.0 is a placeholder, not derived from the report;
            # downstream statistics cannot tell it from a real score.
            print("⚠️ 'klassifizierbar' is true but EDSS missing — adding fallback.")
            parsed["EDSS"] = 7.0

        return {
            "success": True,
            "result": parsed,
            "inference_time_sec": time.perf_counter() - start_time,
        }
    except Exception as e:
        # Boundary handler: callers expect a failure record, never an exception.
        print(f"❌ Inference error: {e}")
        return {
            "success": False,
            "error": str(e),
            "inference_time_sec": -1,
            "result": None,
        }
# === BUILD PATIENT TEXT ===
def build_patient_text(row):
    """Join the four free-text report columns of one CSV row into one string.

    Args:
        row: Mapping-like record (e.g. a pandas row or a dict) supporting ``.get``.

    Returns:
        The values of ``T_Zusammenfassung``, ``Diagnosen``, ``T_KlinBef`` and
        ``T_Befunde`` separated by newlines. Missing cells (absent key,
        ``None`` or float NaN) contribute an empty line instead of the
        literal text ``"None"`` / ``"nan"``, which would otherwise end up in
        the prompt as if it were report content.
    """
    columns = ("T_Zusammenfassung", "Diagnosen", "T_KlinBef", "T_Befunde")

    def as_text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    return "\n".join(as_text(row.get(name, "")) for name in columns)
# === MAIN LOOP (NEW: MULTI-ITERATION) ===
def _tag_result(result, row, idx, iteration):
    """Attach iteration number and patient identifiers to one inference result.

    Successful results get the metadata inside a copy of ``result["result"]``;
    failed results get it on the top-level record.
    """
    metadata = {
        "iteration": iteration,
        "unique_id": row.get("unique_id", f"row_{idx}"),
        "MedDatum": row.get("MedDatum", None),
    }
    if result["success"]:
        result["result"] = {**result["result"], **metadata}  # copy, avoid mutation
    else:
        result.update(metadata)
    return result


if __name__ == "__main__":
    # Load data ONCE (to avoid repeated I/O overhead)
    df = pd.read_csv(INPUT_CSV, sep=';')
    total_rows = len(df)
    print(f"Loaded {total_rows} patient records.")

    for iteration in range(1, NUM_ITERATIONS + 1):
        print(f"\n{'='*60}")
        print(f"🔄 ITERATION {iteration}/{NUM_ITERATIONS}")
        print(f"{'='*60}")
        iteration_results = []
        aborted = False
        start_iter = time.time()

        for idx, row in df.iterrows():
            print(f"\rRow {idx+1}/{total_rows} | Iter {iteration}", end='', flush=True)
            try:
                patient_text = build_patient_text(row)
                result = _tag_result(run_inference(patient_text), row, idx, iteration)
            except Exception as e:
                # Keep a failure record so the row is not silently missing from the output.
                print(f"\n⚠️ Row {idx} failed: {e}")
                result = {
                    "success": False,
                    "error": str(e),
                    "iteration": iteration,
                    "unique_id": row.get("unique_id", f"row_{idx}"),
                    "MedDatum": row.get("MedDatum", None),
                    "result": None,
                }
            else:
                if result["success"]:
                    res = result["result"]
                    edss = res.get("EDSS", "N/A") if res.get("klassifizierbar") else "N/A"
                    print(f" ✅ EDSS={edss}, cert={res.get('certainty_percent', '?')}%")
                else:
                    print(f"{result.get('error', 'Unknown')}")
            iteration_results.append(result)

            # run_inference reports its own errors as success=False instead of
            # raising, so the debug flag has to look at the record as well.
            if STOP_ON_FIRST_ERROR and not result["success"]:
                aborted = True
                break

        # Save per-iteration results (also the partial ones of an aborted run)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = INPUT_CSV.replace(".csv", f"_results_iter_{iteration}_{timestamp}.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(iteration_results, f, indent=2, ensure_ascii=False)
        print(f"\n✅ Iteration {iteration} complete. Saved to: {output_path}")

        elapsed = time.time() - start_iter
        per_row = elapsed / total_rows if total_rows else 0.0  # empty CSV: no division by zero
        print(f"⏱️ Iteration {iteration} took {elapsed:.1f}s ({per_row:.1f}s/row)")

        if aborted:
            print("\n⚠️ Stopped after first error (STOP_ON_FIRST_ERROR=True).")
            break
    else:
        # Only reached when no iteration was aborted.
        print(f"\n🎉 All {NUM_ITERATIONS} iterations completed!")
##

1540
certainty_show.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -320,3 +320,63 @@ plt.tight_layout()
plt.show() plt.show()
## ##
# %% Patientjourney Bubble chart
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
# Global text defaults for this figure.
mpl.rcParams["font.family"] = "DejaVu Sans"  # or "Arial", "Calibri", "Times New Roman", ...
mpl.rcParams["font.size"] = 12  # default size for text
mpl.rcParams["axes.titlesize"] = 14
mpl.rcParams["axes.titleweight"] = "bold"

# Data (your counts): patient_count[i] patients have exactly visits[i] visits.
visits = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
patient_count = np.array([32, 24, 28, 17, 13, 6, 3, 3, 2])
# "Remaining" = patients with >= that many visits (cumulative sum from the right)
remaining = patient_count[::-1].cumsum()[::-1]

# --- Plot ---
fig, ax = plt.subplots(figsize=(12, 3))
y = 0.0  # all bubbles on one horizontal line

# Horizontal line
ax.hlines(y, visits.min() - 0.4, visits.max() + 0.4, color="#1f77b4", linewidth=3)

# Bubble sizes (scale as needed)
# (Matplotlib scatter uses area in points^2)
sizes = patient_count * 35  # tweak this multiplier if you want bigger/smaller bubbles
ax.scatter(visits, np.full_like(visits, y), s=sizes, color="#1f77b4", zorder=3)

# Title
#ax.set_title("Patient Journey by Visit Count", fontsize=14, pad=18)

# Top labels: "1 visit", "2 visits", ...
for x in visits:
    label = f"{x} visit" if x == 1 else f"{x} visits"
    ax.text(x, y + 0.18, label, ha="center", va="bottom", fontsize=10)

# Bottom labels: "X patients" and "Y remaining"
for x, pc, rem in zip(visits, patient_count, remaining):
    ax.text(x, y - 0.20, f"{pc} patients", ha="center", va="top", fontsize=9)
    ax.text(x, y - 0.32, f"{rem} remaining", ha="center", va="top", fontsize=9)

# Cosmetics: remove axes, keep spacing nice
ax.set_xlim(visits.min() - 0.6, visits.max() + 0.6)
ax.set_ylim(-0.5, 0.35)
ax.set_xticks([])
ax.set_yticks([])
for spine in ax.spines.values():
    spine.set_visible(False)

plt.tight_layout()
# Save BEFORE showing: once a blocking show() returns, the figure may already
# be closed and a later pyplot-level savefig would write an empty canvas.
# Saving through `fig` also pins the output to this figure explicitly.
fig.savefig("patient_journey.svg", format="svg", bbox_inches="tight")
plt.show()
##

View File

@@ -718,128 +718,155 @@ plt.show()
## ##
# %% Dashboard
# %% Dashboard
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import matplotlib.dates as mdates
from datetime import datetime
import numpy as np import numpy as np
from matplotlib.gridspec import GridSpec
def to_numeric_comma(s: pd.Series) -> pd.Series:
    """Parse a column of scores written with either decimal separator.

    Every value is turned into text, commas are replaced by dots (so both
    ``1.5`` and ``1,5`` are understood) and the text is converted to
    numbers; entries that are not numeric become NaN.
    """
    as_text = s.astype(str)
    dotted = as_text.str.replace(",", ".", regex=False)
    return pd.to_numeric(dotted, errors="coerce")
# Load the data # Load the data
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv' file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv'
df = pd.read_csv(file_path, sep='\t') df = pd.read_csv(file_path, sep='\t')
# Rename columns to remove 'result.' prefix and handle spaces # Rename columns to remove 'result.' prefix and replace spaces
column_mapping = {} column_mapping = {}
for col in df.columns: for col in df.columns:
if col.startswith('result.'): if col.startswith('result.'):
new_name = col.replace('result.', '') new_name = col.replace('result.', '').replace(' ', '_')
# Handle spaces in column names (replace with underscores if needed)
new_name = new_name.replace(' ', '_')
column_mapping[col] = new_name column_mapping[col] = new_name
df = df.rename(columns=column_mapping) df = df.rename(columns=column_mapping)
# Convert MedDatum to datetime # Parse MedDatum safely
df['MedDatum'] = pd.to_datetime(df['MedDatum']) df['MedDatum'] = pd.to_datetime(df['MedDatum'], errors='coerce')
# Check what columns actually exist in the dataset # Patient
print("Available columns:") patient_id = '6389d658'
print(df.columns.tolist()) patient_data = df[df['unique_id'] == patient_id].sort_values('MedDatum').copy()
print("\nFirst few rows:") if patient_data.empty:
print(df.head()) raise ValueError(f"No data found for patient: {patient_id}")
# Hardcode specific patient names # Functional systems + EDSS
patient_names = ['6b56865d'] edss_col, edss_title = ('GT.EDSS', 'EDSS')
# Define the functional systems (columns to plot) - adjust based on actual column names functional_systems = [
functional_systems = ['EDSS', 'Visual', 'Sensory', 'Motor', 'Brainstem', 'Cerebellar', 'Autonomic', 'Bladder', 'Intellectual'] ('GT.VISUAL_OPTIC_FUNCTIONS', 'Visual / Optic'),
('GT.CEREBELLAR_FUNCTIONS', 'Cerebellar'),
('GT.BRAINSTEM_FUNCTIONS', 'Brainstem'),
('GT.SENSORY_FUNCTIONS', 'Sensory'),
('GT.PYRAMIDAL_FUNCTIONS', 'Pyramidal (Motor)'),
('GT.AMBULATION', 'Ambulation'),
('GT.CEREBRAL_FUNCTIONS', 'Cerebral'),
('GT.BOWEL_AND_BLADDER_FUNCTIONS', 'Bowel & Bladder'),
]
# Create subplots horizontally (2 columns, adjust rows as needed) # y-axis max rules
num_plots = len(functional_systems) ymax_by_col = {
num_cols = 2 'GT.PYRAMIDAL_FUNCTIONS': 6,
num_rows = (num_plots + num_cols - 1) // num_cols # Ceiling division 'GT.SENSORY_FUNCTIONS': 6,
'GT.BOWEL_AND_BLADDER_FUNCTIONS': 6,
'GT.VISUAL_OPTIC_FUNCTIONS': 6,
'GT.CEREBELLAR_FUNCTIONS': 5,
'GT.CEREBRAL_FUNCTIONS': 5,
'GT.BRAINSTEM_FUNCTIONS': 5,
'GT.EDSS': 10,
}
default_ymax = 6
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 4*num_rows), sharex=False) # Changed sharex=False # ---------- Build shared "event dates" ticks ----------
if num_plots == 1: cols_for_dates = [edss_col] + [c for c, _ in functional_systems]
axes = [axes] event_dates = []
elif num_rows == 1:
axes = axes
else:
axes = axes.flatten()
# Plot for the hardcoded patient for c in cols_for_dates:
for i, system in enumerate(functional_systems): if c in patient_data.columns:
# Filter data for this specific patient y = to_numeric_comma(patient_data[c]) # <-- changed
patient_data = df[df['unique_id'] == patient_names[0]].sort_values('MedDatum') x = patient_data['MedDatum']
tmp = pd.DataFrame({"x": x, "y": y}).dropna(subset=["x", "y"])
event_dates.extend(tmp["x"].tolist())
# Check if patient data exists event_dates = sorted(pd.Series(event_dates).drop_duplicates().tolist())
if patient_data.empty:
print(f"No data found for patient: {patient_names[0]}")
continue
# Check if the system column exists in the data max_ticks = 8
if system in patient_data.columns: if len(event_dates) > max_ticks:
# Plot the specific functional system idx = np.linspace(0, len(event_dates) - 1, max_ticks, dtype=int)
if not patient_data[system].isna().all(): event_dates = [event_dates[i] for i in idx]
axes[i].plot(patient_data['MedDatum'], patient_data[system], marker='o', linewidth=2, label=system)
axes[i].set_ylabel('Score') # ---------- A4 figure ----------
axes[i].set_title(f'Functional System: {system}') fig = plt.figure(figsize=(11.69, 8.27))
axes[i].grid(True, alpha=0.3) gs = GridSpec(nrows=3, ncols=4, figure=fig, height_ratios=[2.0, 1.0, 1.0], hspace=0.5, wspace=0.35)
axes[i].legend()
else: def style_time_axis(ax, show_labels=True):
axes[i].set_title(f'Functional System: {system} (No data)') ax.set_xticks(event_dates)
axes[i].set_ylabel('Score') ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
axes[i].grid(True, alpha=0.3) ax.tick_params(axis='x', rotation=30, labelsize=8, pad=2)
if not show_labels:
ax.tick_params(labelbottom=False)
# ---------- EDSS main plot ----------
ax_main = fig.add_subplot(gs[0, :])
if edss_col in patient_data.columns:
y = to_numeric_comma(patient_data[edss_col]) # <-- changed
x = patient_data['MedDatum']
plot_df = pd.DataFrame({"x": x, "y": y}).dropna(subset=["x", "y"]).sort_values("x")
ax_main.set_title(edss_title, fontsize=14, fontweight='bold')
ax_main.set_ylabel("Score")
ax_main.set_ylim(0, ymax_by_col.get(edss_col, default_ymax))
ax_main.grid(True, alpha=0.3)
if not plot_df.empty:
ax_main.plot(plot_df["x"], plot_df["y"], marker='o', linewidth=3, color='tab:red')
else: else:
# Try to find column with similar name (case insensitive) ax_main.set_title("EDSS (no numeric data)", fontsize=14, fontweight='bold')
found_column = None else:
for col in df.columns: ax_main.set_title("EDSS (missing column GT.EDSS)", fontsize=14, fontweight='bold')
if system.lower() in col.lower(): ax_main.set_ylim(0, ymax_by_col.get(edss_col, 10))
found_column = col ax_main.grid(True, alpha=0.3)
break
if found_column: style_time_axis(ax_main)
print(f"Found similar column: {found_column}")
if not patient_data[found_column].isna().all(): # ---------- Small aligned plots ----------
axes[i].plot(patient_data['MedDatum'], patient_data[found_column], marker='o', linewidth=2, label=found_column) small_axes = []
axes[i].set_ylabel('Score') for k, (col, title) in enumerate(functional_systems):
axes[i].set_title(f'Functional System: {system} (found as: {found_column})') r = 1 + (k // 4)
axes[i].grid(True, alpha=0.3) c = (k % 4)
axes[i].legend() ax = fig.add_subplot(gs[r, c], sharex=ax_main)
small_axes.append(ax)
ymax = ymax_by_col.get(col, default_ymax)
ax.set_title(title, fontsize=10)
ax.set_ylabel("Score")
ax.set_ylim(0, ymax)
ax.grid(True, alpha=0.3)
if col in patient_data.columns:
y = to_numeric_comma(patient_data[col]) # <-- changed
x = patient_data['MedDatum']
plot_df = pd.DataFrame({"x": x, "y": y}).dropna(subset=["x", "y"]).sort_values("x")
if not plot_df.empty:
ax.plot(plot_df["x"], plot_df["y"], marker='o', linewidth=2, color='tab:blue')
else: else:
axes[i].set_title(f'Functional System: {system} (Column not found)') ax.set_title(f"{title} (no data)", fontsize=10)
axes[i].set_ylabel('Score') else:
axes[i].grid(True, alpha=0.3) ax.set_title(f"{title} (missing)", fontsize=10)
# Hide empty subplots style_time_axis(ax)
for i in range(len(functional_systems), len(axes)):
axes[i].set_visible(False)
# Set x-axis label for the last row only # Hide x tick labels on first row of small plots
for i in range(len(functional_systems)): for ax in small_axes[:4]:
if i >= len(axes) - num_cols: # Last row ax.tick_params(labelbottom=False)
axes[i].set_xlabel('Date')
# Force date formatting on all axes
for ax in axes:
ax.tick_params(axis='x', rotation=45)
ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))
ax.xaxis.set_major_locator(plt.matplotlib.dates.MonthLocator())
# Automatically format x-axis dates
plt.gcf().autofmt_xdate()
plt.tight_layout() plt.tight_layout()
fig.subplots_adjust(hspace=0.7)
plt.show() plt.show()
## ##
# %% Table # %% Table
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt