From 118e3e63b30d4822f182a6f1d75d354717024d9f Mon Sep 17 00:00:00 2001 From: Shahin Ramezanzadeh Date: Mon, 23 Feb 2026 15:06:54 +0100 Subject: [PATCH] refinement --- Data/audit.py | 415 ---- audit.py | 2371 +++++++++++++++++++ Data/certainty_show.py => certainty_show.py | 0 figure1.py | 60 + Data/show_plots.py => show_plots.py | 0 5 files changed, 2431 insertions(+), 415 deletions(-) delete mode 100644 Data/audit.py create mode 100644 audit.py rename Data/certainty_show.py => certainty_show.py (100%) rename Data/show_plots.py => show_plots.py (100%) diff --git a/Data/audit.py b/Data/audit.py deleted file mode 100644 index cbac7cb..0000000 --- a/Data/audit.py +++ /dev/null @@ -1,415 +0,0 @@ -# %% Confirm EDSS missing -import pandas as pd -import numpy as np - -def clean_series(s): - return s.astype(str).str.strip().str.lower() - -def gt_edss_audit(ground_truth_path, edss_col="EDSS"): - df_gt = pd.read_csv(ground_truth_path, sep=';') - - # normalize keys - df_gt['unique_id'] = clean_series(df_gt['unique_id']) - df_gt['MedDatum'] = clean_series(df_gt['MedDatum']) - df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum'] - - print("GT rows:", len(df_gt)) - print("GT unique keys:", df_gt['key'].nunique()) - - # IMPORTANT: parse EDSS robustly (German decimal commas etc.) - if edss_col in df_gt.columns: - edss_raw = df_gt[edss_col] - edss_num = pd.to_numeric( - edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(), - errors="coerce" - ) - df_gt["_edss_num"] = edss_num - - print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}") - print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}") - - # duplicates on key - dup = df_gt['key'].duplicated(keep=False) - print("GT duplicate-key rows:", dup.sum()) - if dup.any(): - # how many duplicate keys exist? - print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique()) - # of duplicate-key rows, how many have missing EDSS? 
- print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum()) - - # show the worst offenders - print("\nTop duplicate keys (by count):") - print(df_gt.loc[dup, 'key'].value_counts().head(10)) - else: - print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist()) - - return df_gt - -df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS") - -## - - - - -# %% trace missing ones - -import json, glob, os -import pandas as pd - -def load_preds(json_dir_path): - all_preds = [] - for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): - with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) - file_name = os.path.basename(file_path) - for entry in data: - if entry.get("success"): - res = entry["result"] - all_preds.append({ - "unique_id": str(res.get("unique_id")).strip().lower(), - "MedDatum": str(res.get("MedDatum")).strip().lower(), - "file": file_name - }) - df_pred = pd.DataFrame(all_preds) - df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"] - return df_pred - -df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration") -print("Pred rows:", len(df_pred)) -print("Pred unique keys:", df_pred["key"].nunique()) - -# Suppose df_gt was returned from step 1 and has _edss_num + key -missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"]) - -df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys) - -print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum()) -print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique()) - -print("\nTop files contributing to missing-GT-EDSS rows:") -print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20)) - -print("\nTop keys replicated in predictions (why count inflates):") -print(df_pred.loc[df_pred["gt_key_missing_edss"], 
"key"].value_counts().head(20)) - - -## - - -# %% verify - -merged = df_pred.merge( - df_gt[["key", "_edss_num"]], # use the numeric-coerced GT EDSS - on="key", - how="left", - validate="many_to_one" # will ERROR if GT has duplicate keys (GOOD!) -) - -print("Merged rows:", len(merged)) -print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum()) - - -## - - -# %% 1json (rewritten with robust parsing + detailed data log) -import pandas as pd -import numpy as np -import json -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.patches import Patch -from matplotlib.lines import Line2D - -def plot_single_json_error_analysis_with_log( - json_file_path, - ground_truth_path, - edss_gt_col="EDSS", - min_bin_count=5, -): - def norm_str(x): - # normalize identifiers and dates consistently - return str(x).strip().lower() - - def parse_edss(x): - # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc. - if x is None: - return np.nan - s = str(x).strip() - if s == "" or s.lower() in {"nan", "none", "null"}: - return np.nan - s = s.replace(",", ".") - return pd.to_numeric(s, errors="coerce") - - print("\n" + "="*80) - print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)") - print("="*80) - print(f"JSON: {json_file_path}") - print(f"GT: {ground_truth_path}") - - # ------------------------------------------------------------------ - # 1) Load Ground Truth - # ------------------------------------------------------------------ - df_gt = pd.read_csv(ground_truth_path, sep=";") - - required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} - missing_cols = required_gt_cols - set(df_gt.columns) - if missing_cols: - raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}") - - df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) - df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) - df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] - - # Robust EDSS parsing (important!) 
- df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) - - # GT logs - print("\n--- GT LOG ---") - print(f"GT rows: {len(df_gt)}") - print(f"GT unique keys: {df_gt['key'].nunique()}") - gt_dup = df_gt["key"].duplicated(keep=False).sum() - print(f"GT duplicate-key rows: {gt_dup}") - print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") - print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") - - if gt_dup > 0: - print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:") - print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) - - # ------------------------------------------------------------------ - # 2) Load Predictions from the specific JSON - # ------------------------------------------------------------------ - with open(json_file_path, "r", encoding="utf-8") as f: - data = json.load(f) - - total_entries = len(data) - success_entries = sum(1 for e in data if e.get("success")) - - all_preds = [] - skipped = { - "not_success": 0, - "missing_uid_or_date": 0, - "missing_edss": 0, - "missing_conf": 0, - } - - for entry in data: - if not entry.get("success"): - skipped["not_success"] += 1 - continue - - res = entry.get("result", {}) - uid = res.get("unique_id") - md = res.get("MedDatum") - - if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": - skipped["missing_uid_or_date"] += 1 - continue - - edss_pred = parse_edss(res.get("EDSS")) - conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce") - - if pd.isna(edss_pred): - skipped["missing_edss"] += 1 - if pd.isna(conf): - skipped["missing_conf"] += 1 - - all_preds.append({ - "unique_id": norm_str(uid), - "MedDatum": norm_str(md), - "key": norm_str(uid) + "_" + norm_str(md), - "EDSS_pred": edss_pred, - "confidence": conf, - }) - - df_pred = pd.DataFrame(all_preds) - - # Pred logs - print("\n--- PRED LOG ---") - print(f"JSON total entries: {total_entries}") - 
print(f"JSON success entries: {success_entries}") - print(f"Pred rows loaded (success + has keys): {len(df_pred)}") - if len(df_pred) == 0: - print("[ERROR] No usable prediction rows found. Nothing to plot.") - return - - print(f"Pred unique keys: {df_pred['key'].nunique()}") - print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") - print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") - print("Skipped counts:", skipped) - - # Are keys duplicated within this JSON? (often yes if multiple notes map to same key) - key_counts = df_pred["key"].value_counts() - dup_pred_rows = (key_counts > 1).sum() - max_rep = int(key_counts.max()) - print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}") - print(f"Max repetitions of a single key in this JSON: {max_rep}") - if max_rep > 1: - print("Top repeated keys in this JSON:") - print(key_counts.head(10)) - - # ------------------------------------------------------------------ - # 3) Merge (and diagnose why rows drop) - # ------------------------------------------------------------------ - # Diagnose how many pred keys exist in GT - gt_key_set = set(df_gt["key"]) - df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) - not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] - - print("\n--- KEY MATCH LOG ---") - print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") - print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}") - if len(not_in_gt) > 0: - print("[WARNING] Some prediction keys are not present in GT. First 10:") - print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10)) - - # Now merge; we expect GT is one-to-many with pred (many_to_one) - # If GT had duplicates, validate would raise. 
- df_merged = df_pred.merge( - df_gt[["key", "EDSS_gt"]], - on="key", - how="inner", - validate="many_to_one" - ) - - print("\n--- MERGE LOG ---") - print(f"Merged rows (inner join): {len(df_merged)}") - print(f"Merged unique keys: {df_merged['key'].nunique()}") - print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}") - print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}") - print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}") - - # How many rows will be removed by dropna() in your old code? - # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too. - rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) - print("\n--- FILTER LOG (what will be used for stats/plot) ---") - print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}") - if len(rows_complete) == 0: - print("[ERROR] No complete rows after filtering. Nothing to plot.") - return - - # Compute abs error - rows_complete = rows_complete.copy() - rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs() - - # ------------------------------------------------------------------ - # 4) Binning + stats (with guardrails) - # ------------------------------------------------------------------ - bins = [0, 70, 80, 90, 100] - labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] - - # Confidence outside bins becomes NaN; log it - rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True) - conf_outside = rows_complete["conf_bin"].isna().sum() - print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}") - if conf_outside > 0: - print("Example confidences outside bins:") - print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) - - df_plot = rows_complete.dropna(subset=["conf_bin"]) - stats = ( - 
df_plot.groupby("conf_bin", observed=True)["abs_error"] - .agg(mean="mean", std="std", count="count") - .reindex(labels) - .reset_index() - ) - - print("\n--- BIN STATS ---") - print(stats) - - # Warn about low counts - low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] - if not low_bins.empty: - print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:") - print(low_bins) - - # ------------------------------------------------------------------ - # 5) Plot - # ------------------------------------------------------------------ - plt.figure(figsize=(13, 8)) - colors = sns.color_palette("Blues", n_colors=len(labels)) - - # Replace NaNs in mean for plotting bars (empty bins) - means = stats["mean"].to_numpy() - counts = stats["count"].fillna(0).astype(int).to_numpy() - stds = stats["std"].to_numpy() - - # For bins with no data, bar height 0 (and no errorbar) - means_plot = np.nan_to_num(means, nan=0.0) - - bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85) - - # Error bars only where count>1 and std is not NaN - sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) - plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5) - - # Trend line only if at least 2 non-empty bins - valid_idx = np.where(~np.isnan(means))[0] - if len(valid_idx) >= 2: - x_idx = np.arange(len(labels)) - z = np.polyfit(valid_idx, means[valid_idx], 1) - p = np.poly1d(z) - plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) - trend_label = "Trend Line" - else: - trend_label = "Trend Line (insufficient bins)" - print("\n[INFO] Not enough non-empty bins to fit a trend line.") - - # Data labels - for i, bar in enumerate(bars): - n_count = int(counts[i]) - mae_val = means[i] - if np.isnan(mae_val) or n_count == 0: - txt = "empty" - y = 0.02 - else: - txt = f"MAE: {mae_val:.2f}\nn={n_count}" - y = bar.get_height() + 
0.04 - plt.text( - bar.get_x() + bar.get_width()/2, - y, - txt, - ha="center", - va="bottom", - fontweight="bold", - fontsize=10 - ) - - # Legend - legend_elements = [ - Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), - Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), - Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), - Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), - Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label), - Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"), - Patch(color="none", label="Metric: Mean Absolute Error (MAE)") - ] - plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend") - - plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30) - plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12) - plt.xlabel("LLM Confidence Bracket", fontsize=12) - plt.grid(axis="y", linestyle=":", alpha=0.5) - - ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 - plt.ylim(0, max(0.5, float(ymax) + 0.6)) - plt.tight_layout() - plt.show() - - print("\n" + "="*80) - print("DONE") - print("="*80) - - -# --- RUN --- -json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" -gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" - -plot_single_json_error_analysis_with_log(json_path, gt_path) - - - -## diff --git a/audit.py b/audit.py new file mode 100644 index 0000000..7fa3039 --- /dev/null +++ b/audit.py @@ -0,0 +1,2371 @@ +# %% Confirm EDSS missing +import pandas as pd +import numpy as np + +def clean_series(s): + return s.astype(str).str.strip().str.lower() + +def gt_edss_audit(ground_truth_path, edss_col="EDSS"): + df_gt = pd.read_csv(ground_truth_path, sep=';') + + # 
normalize keys + df_gt['unique_id'] = clean_series(df_gt['unique_id']) + df_gt['MedDatum'] = clean_series(df_gt['MedDatum']) + df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum'] + + print("GT rows:", len(df_gt)) + print("GT unique keys:", df_gt['key'].nunique()) + + # IMPORTANT: parse EDSS robustly (German decimal commas etc.) + if edss_col in df_gt.columns: + edss_raw = df_gt[edss_col] + edss_num = pd.to_numeric( + edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(), + errors="coerce" + ) + df_gt["_edss_num"] = edss_num + + print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}") + print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}") + + # duplicates on key + dup = df_gt['key'].duplicated(keep=False) + print("GT duplicate-key rows:", dup.sum()) + if dup.any(): + # how many duplicate keys exist? + print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique()) + # of duplicate-key rows, how many have missing EDSS? 
+ print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum()) + + # show the worst offenders + print("\nTop duplicate keys (by count):") + print(df_gt.loc[dup, 'key'].value_counts().head(10)) + else: + print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist()) + + return df_gt + +df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS") + +## + + + + +# %% trace missing ones + +import json, glob, os +import pandas as pd + +def load_preds(json_dir_path): + all_preds = [] + for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + file_name = os.path.basename(file_path) + for entry in data: + if entry.get("success"): + res = entry["result"] + all_preds.append({ + "unique_id": str(res.get("unique_id")).strip().lower(), + "MedDatum": str(res.get("MedDatum")).strip().lower(), + "file": file_name + }) + df_pred = pd.DataFrame(all_preds) + df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"] + return df_pred + +df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration") +print("Pred rows:", len(df_pred)) +print("Pred unique keys:", df_pred["key"].nunique()) + +# Suppose df_gt was returned from step 1 and has _edss_num + key +missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"]) + +df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys) + +print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum()) +print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique()) + +print("\nTop files contributing to missing-GT-EDSS rows:") +print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20)) + +print("\nTop keys replicated in predictions (why count inflates):") +print(df_pred.loc[df_pred["gt_key_missing_edss"], 
"key"].value_counts().head(20)) + + +## + + +# %% verify + +merged = df_pred.merge( + df_gt[["key", "_edss_num"]], # use the numeric-coerced GT EDSS + on="key", + how="left", + validate="many_to_one" # will ERROR if GT has duplicate keys (GOOD!) +) + +print("Merged rows:", len(merged)) +print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum()) + + +## + + +# %% 1json (rewritten with robust parsing + detailed data log) +import pandas as pd +import numpy as np +import json +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def plot_single_json_error_analysis_with_log( + json_file_path, + ground_truth_path, + edss_gt_col="EDSS", + min_bin_count=5, +): + def norm_str(x): + # normalize identifiers and dates consistently + return str(x).strip().lower() + + def parse_edss(x): + # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc. + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + print("\n" + "="*80) + print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)") + print("="*80) + print(f"JSON: {json_file_path}") + print(f"GT: {ground_truth_path}") + + # ------------------------------------------------------------------ + # 1) Load Ground Truth + # ------------------------------------------------------------------ + df_gt = pd.read_csv(ground_truth_path, sep=";") + + required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} + missing_cols = required_gt_cols - set(df_gt.columns) + if missing_cols: + raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + + # Robust EDSS parsing (important!) 
+ df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) + + # GT logs + print("\n--- GT LOG ---") + print(f"GT rows: {len(df_gt)}") + print(f"GT unique keys: {df_gt['key'].nunique()}") + gt_dup = df_gt["key"].duplicated(keep=False).sum() + print(f"GT duplicate-key rows: {gt_dup}") + print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") + print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") + + if gt_dup > 0: + print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:") + print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) + + # ------------------------------------------------------------------ + # 2) Load Predictions from the specific JSON + # ------------------------------------------------------------------ + with open(json_file_path, "r", encoding="utf-8") as f: + data = json.load(f) + + total_entries = len(data) + success_entries = sum(1 for e in data if e.get("success")) + + all_preds = [] + skipped = { + "not_success": 0, + "missing_uid_or_date": 0, + "missing_edss": 0, + "missing_conf": 0, + } + + for entry in data: + if not entry.get("success"): + skipped["not_success"] += 1 + continue + + res = entry.get("result", {}) + uid = res.get("unique_id") + md = res.get("MedDatum") + + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + skipped["missing_uid_or_date"] += 1 + continue + + edss_pred = parse_edss(res.get("EDSS")) + conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce") + + if pd.isna(edss_pred): + skipped["missing_edss"] += 1 + if pd.isna(conf): + skipped["missing_conf"] += 1 + + all_preds.append({ + "unique_id": norm_str(uid), + "MedDatum": norm_str(md), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": edss_pred, + "confidence": conf, + }) + + df_pred = pd.DataFrame(all_preds) + + # Pred logs + print("\n--- PRED LOG ---") + print(f"JSON total entries: {total_entries}") + 
print(f"JSON success entries: {success_entries}") + print(f"Pred rows loaded (success + has keys): {len(df_pred)}") + if len(df_pred) == 0: + print("[ERROR] No usable prediction rows found. Nothing to plot.") + return + + print(f"Pred unique keys: {df_pred['key'].nunique()}") + print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") + print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") + print("Skipped counts:", skipped) + + # Are keys duplicated within this JSON? (often yes if multiple notes map to same key) + key_counts = df_pred["key"].value_counts() + dup_pred_rows = (key_counts > 1).sum() + max_rep = int(key_counts.max()) + print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}") + print(f"Max repetitions of a single key in this JSON: {max_rep}") + if max_rep > 1: + print("Top repeated keys in this JSON:") + print(key_counts.head(10)) + + # ------------------------------------------------------------------ + # 3) Merge (and diagnose why rows drop) + # ------------------------------------------------------------------ + # Diagnose how many pred keys exist in GT + gt_key_set = set(df_gt["key"]) + df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) + not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] + + print("\n--- KEY MATCH LOG ---") + print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") + print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}") + if len(not_in_gt) > 0: + print("[WARNING] Some prediction keys are not present in GT. First 10:") + print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10)) + + # Now merge; we expect GT is one-to-many with pred (many_to_one) + # If GT had duplicates, validate would raise. 
+ df_merged = df_pred.merge( + df_gt[["key", "EDSS_gt"]], + on="key", + how="inner", + validate="many_to_one" + ) + + print("\n--- MERGE LOG ---") + print(f"Merged rows (inner join): {len(df_merged)}") + print(f"Merged unique keys: {df_merged['key'].nunique()}") + print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}") + print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}") + print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}") + + # How many rows will be removed by dropna() in your old code? + # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too. + rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) + print("\n--- FILTER LOG (what will be used for stats/plot) ---") + print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}") + if len(rows_complete) == 0: + print("[ERROR] No complete rows after filtering. Nothing to plot.") + return + + # Compute abs error + rows_complete = rows_complete.copy() + rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs() + + # ------------------------------------------------------------------ + # 4) Binning + stats (with guardrails) + # ------------------------------------------------------------------ + bins = [0, 70, 80, 90, 100] + labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] + + # Confidence outside bins becomes NaN; log it + rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True) + conf_outside = rows_complete["conf_bin"].isna().sum() + print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}") + if conf_outside > 0: + print("Example confidences outside bins:") + print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) + + df_plot = rows_complete.dropna(subset=["conf_bin"]) + stats = ( + 
df_plot.groupby("conf_bin", observed=True)["abs_error"] + .agg(mean="mean", std="std", count="count") + .reindex(labels) + .reset_index() + ) + + print("\n--- BIN STATS ---") + print(stats) + + # Warn about low counts + low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] + if not low_bins.empty: + print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:") + print(low_bins) + + # ------------------------------------------------------------------ + # 5) Plot + # ------------------------------------------------------------------ + plt.figure(figsize=(13, 8)) + colors = sns.color_palette("Blues", n_colors=len(labels)) + + # Replace NaNs in mean for plotting bars (empty bins) + means = stats["mean"].to_numpy() + counts = stats["count"].fillna(0).astype(int).to_numpy() + stds = stats["std"].to_numpy() + + # For bins with no data, bar height 0 (and no errorbar) + means_plot = np.nan_to_num(means, nan=0.0) + + bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85) + + # Error bars only where count>1 and std is not NaN + sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) + plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5) + + # Trend line only if at least 2 non-empty bins + valid_idx = np.where(~np.isnan(means))[0] + if len(valid_idx) >= 2: + x_idx = np.arange(len(labels)) + z = np.polyfit(valid_idx, means[valid_idx], 1) + p = np.poly1d(z) + plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) + trend_label = "Trend Line" + else: + trend_label = "Trend Line (insufficient bins)" + print("\n[INFO] Not enough non-empty bins to fit a trend line.") + + # Data labels + for i, bar in enumerate(bars): + n_count = int(counts[i]) + mae_val = means[i] + if np.isnan(mae_val) or n_count == 0: + txt = "empty" + y = 0.02 + else: + txt = f"MAE: {mae_val:.2f}\nn={n_count}" + y = bar.get_height() + 
0.04 + plt.text( + bar.get_x() + bar.get_width()/2, + y, + txt, + ha="center", + va="bottom", + fontweight="bold", + fontsize=10 + ) + + # Legend + legend_elements = [ + Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), + Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), + Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), + Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), + Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label), + Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"), + Patch(color="none", label="Metric: Mean Absolute Error (MAE)") + ] + plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend") + + plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30) + plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12) + plt.xlabel("LLM Confidence Bracket", fontsize=12) + plt.grid(axis="y", linestyle=":", alpha=0.5) + + ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 + plt.ylim(0, max(0.5, float(ymax) + 0.6)) + plt.tight_layout() + plt.show() + + print("\n" + "="*80) + print("DONE") + print("="*80) + + +# --- RUN --- +json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" +gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" + +plot_single_json_error_analysis_with_log(json_path, gt_path) + + + +## + + +# %% Certainty vs Delta (rewritten with robust parsing + detailed data loss logs) +import pandas as pd +import numpy as np +import json +import glob +import os +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def plot_confidence_vs_abs_error_with_log( + json_dir_path, + ground_truth_path, + 
def plot_confidence_vs_abs_error_with_log(
    json_dir_path,
    ground_truth_path,
    edss_gt_col="EDSS",
    min_bin_count=5,
    include_lowest=True,
):
    """
    Compare LLM self-reported confidence against absolute EDSS error across
    ALL iteration JSONs in a folder, with verbose data-loss logging at every
    stage (ingest, key match, merge, filtering, binning), then plot a bar
    chart of MAE per confidence bracket.

    Parameters
    ----------
    json_dir_path : str
        Folder containing iteration result JSONs (``*.json``).
    ground_truth_path : str
        Semicolon-separated GT CSV with ``unique_id``, ``MedDatum`` and the
        EDSS column named by ``edss_gt_col``.
    edss_gt_col : str
        Name of the ground-truth EDSS column.
    min_bin_count : int
        Bins with fewer rows than this are flagged as unstable.
    include_lowest : bool
        Passed to ``pd.cut`` so that confidence == 0 lands in the first bin.

    Returns
    -------
    None. Side effects: prints logs and shows a matplotlib figure.
    """

    def norm_str(x):
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust numeric parse: handles German comma decimals and empty tokens.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    print("\n" + "=" * 90)
    print("CERTAINTY vs ABS ERROR (ALL JSONs) — WITH DATA LOSS LOG")
    print("=" * 90)
    print(f"JSON DIR: {json_dir_path}")
    print(f"GT FILE: {ground_truth_path}")

    # ------------------------------------------------------------------
    # 1) Load GT
    # ------------------------------------------------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=";")
    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
    missing_cols = required_gt_cols - set(df_gt.columns)
    if missing_cols:
        raise ValueError(f"GT missing columns: {missing_cols}. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)

    # GT logs
    print("\n--- GT LOG ---")
    print(f"GT rows: {len(df_gt)}")
    print(f"GT unique keys: {df_gt['key'].nunique()}")
    gt_dup_rows = df_gt["key"].duplicated(keep=False).sum()
    print(f"GT duplicate-key rows: {gt_dup_rows}")
    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")
    if gt_dup_rows > 0:
        print("\n[WARNING] GT has duplicate keys; merge can explode rows. Top duplicate keys:")
        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))
        # FIX: the merge below uses validate="many_to_one", which would raise
        # pandas.errors.MergeError on duplicate GT keys instead of merely
        # warning. Keep the first row per key so the merge stays valid.
        df_gt = df_gt.drop_duplicates(subset="key", keep="first")
        print(f"[FIX] Deduplicated GT on key (keep='first'); GT rows now: {len(df_gt)}")

    gt_key_set = set(df_gt["key"])

    # ------------------------------------------------------------------
    # 2) Load predictions from all JSON files (with per-file logs)
    # ------------------------------------------------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    all_preds = []
    per_file_summary = []

    total_entries_all = 0
    total_success_all = 0
    skipped_all = {"not_success": 0, "missing_uid_or_date": 0}

    for file_path in json_files:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        total_entries = len(data)
        success_entries = sum(1 for e in data if e.get("success"))

        total_entries_all += total_entries
        total_success_all += success_entries

        skipped = {"not_success": 0, "missing_uid_or_date": 0}
        loaded_rows = 0

        for entry in data:
            if not entry.get("success"):
                skipped["not_success"] += 1
                continue
            res = entry.get("result", {})
            uid = res.get("unique_id")
            md = res.get("MedDatum")

            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                skipped["missing_uid_or_date"] += 1
                continue

            all_preds.append({
                "file": os.path.basename(file_path),
                "unique_id": norm_str(uid),
                "MedDatum": norm_str(md),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
                "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
            })
            loaded_rows += 1

        skipped_all["not_success"] += skipped["not_success"]
        skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"]

        per_file_summary.append({
            "file": os.path.basename(file_path),
            "entries_total": total_entries,
            "entries_success": success_entries,
            "pred_rows_loaded": loaded_rows,
            "skipped_not_success": skipped["not_success"],
            "skipped_missing_uid_or_date": skipped["missing_uid_or_date"],
        })

    df_pred = pd.DataFrame(all_preds)
    df_file = pd.DataFrame(per_file_summary)

    # PRED logs
    print("\n--- PRED LOG (ALL FILES) ---")
    print(f"JSON files found: {len(json_files)}")
    print(f"Total JSON entries: {total_entries_all}")
    print(f"Total success entries:{total_success_all}")
    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows found. Nothing to plot.")
        return

    print(f"Pred unique keys (across all files): {df_pred['key'].nunique()}")
    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
    print("Skipped totals:", skipped_all)

    # Per-file quick check (useful when one iteration is broken).
    print("\nPer-file loaded rows (head):")
    print(df_file.sort_values("file").head(10))

    # ------------------------------------------------------------------
    # 3) Key match log (pred -> GT)
    # ------------------------------------------------------------------
    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]

    print("\n--- KEY MATCH LOG ---")
    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
    print(f"Pred rows with key NOT in GT: {len(not_in_gt)}")
    if len(not_in_gt) > 0:
        print("[WARNING] Example keys not found in GT (first 10):")
        print(not_in_gt[["file", "unique_id", "MedDatum", "key"]].head(10))
        print("\n[WARNING] Files contributing most to key-mismatch:")
        print(not_in_gt["file"].value_counts().head(10))

    # ------------------------------------------------------------------
    # 4) Merge (no dropna yet) + detailed data loss accounting
    # ------------------------------------------------------------------
    df_merged = df_pred.merge(
        df_gt[["key", "EDSS_gt"]],
        on="key",
        how="inner",
        validate="many_to_one",  # GT was deduplicated above, so this holds
    )

    print("\n--- MERGE LOG ---")
    print(f"Merged rows (inner join): {len(df_merged)}")
    print(f"Merged unique keys: {df_merged['key'].nunique()}")

    n0 = len(df_merged)

    miss_gt = df_merged["EDSS_gt"].isna()
    miss_pred = df_merged["EDSS_pred"].isna()
    miss_conf = df_merged["confidence"].isna()

    print("\n--- MISSINGNESS IN MERGED ---")
    print(f"Missing GT EDSS: {miss_gt.sum()}")
    print(f"Missing Pred EDSS: {miss_pred.sum()}")
    print(f"Missing Confidence: {miss_conf.sum()}")

    # NOTE: a bare .dropna() would drop a row if ANY column is NaN;
    # the subset makes the intended requirement explicit and logged.
    df_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
    n1 = len(df_complete)
    print("\n--- FILTER LOG ---")
    print(f"Rows before filtering: {n0}")
    print(f"Rows after requiring EDSS_gt, EDSS_pred, confidence: {n1}")
    print(f"Rows lost due to missing required fields: {n0 - n1}")

    # Break down why rows were lost (overlap-aware)
    lost_mask = df_merged[["EDSS_gt", "EDSS_pred", "confidence"]].isna().any(axis=1)
    lost = df_merged.loc[lost_mask].copy()
    if len(lost) > 0:
        lost_reason = (
            (lost["EDSS_gt"].isna()).astype(int).map({1: "GT", 0: ""})
            + (lost["EDSS_pred"].isna()).astype(int).map({1: "+PRED", 0: ""})
            + (lost["confidence"].isna()).astype(int).map({1: "+CONF", 0: ""})
        )
        lost["loss_reason"] = lost_reason.str.replace(r"^\+", "", regex=True).replace("", "UNKNOWN")
        print("\nTop loss reasons (overlap-aware):")
        print(lost["loss_reason"].value_counts().head(10))

        print("\nFiles contributing most to lost rows:")
        print(lost["file"].value_counts().head(10))

    if len(df_complete) == 0:
        print("[ERROR] No complete rows left after filtering. Nothing to plot.")
        return

    # ------------------------------------------------------------------
    # 5) Abs error + binning
    # ------------------------------------------------------------------
    df_complete = df_complete.copy()
    df_complete["abs_error"] = (df_complete["EDSS_pred"] - df_complete["EDSS_gt"]).abs()

    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]

    df_complete["conf_bin"] = pd.cut(
        df_complete["confidence"],
        bins=bins,
        labels=labels,
        include_lowest=include_lowest,
    )

    conf_outside = df_complete["conf_bin"].isna().sum()
    print("\n--- BINNING LOG ---")
    print(f"Rows with confidence outside bin edges / invalid: {conf_outside}")
    if conf_outside > 0:
        print("Example out-of-bin confidences:")
        print(df_complete.loc[df_complete["conf_bin"].isna(), "confidence"].head(20).to_list())

    df_plot = df_complete.dropna(subset=["conf_bin"])
    print(f"Rows kept for bin stats/plot (after dropping out-of-bin): {len(df_plot)}")
    print(f"Rows lost due to out-of-bin confidence: {len(df_complete) - len(df_plot)}")

    stats = (
        df_plot.groupby("conf_bin", observed=True)["abs_error"]
        .agg(mean="mean", std="std", count="count")
        .reindex(labels)
        .reset_index()
    )

    print("\n--- BIN STATS ---")
    print(stats)

    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
    if not low_bins.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_count} rows (unstable SEM/trend):")
        print(low_bins)

    # ------------------------------------------------------------------
    # 6) Plot
    # ------------------------------------------------------------------
    plt.figure(figsize=(12, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    means = stats["mean"].to_numpy()
    counts = stats["count"].fillna(0).astype(int).to_numpy()
    stds = stats["std"].to_numpy()

    # Empty bins are drawn with height 0 but annotated as "empty" below.
    means_plot = np.nan_to_num(means, nan=0.0)
    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", linewidth=1.2)

    # SEM only where >1 observation makes a std meaningful.
    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=6, elinewidth=1.5)

    # Trend line only if >=2 non-empty bins
    valid_idx = np.where(~np.isnan(means))[0]
    if len(valid_idx) >= 2:
        x_idx = np.arange(len(labels))
        z = np.polyfit(valid_idx, means[valid_idx], 1)
        p = np.poly1d(z)
        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=2.5)
        trend_label = "Correlation Trend"
    else:
        trend_label = "Correlation Trend (insufficient bins)"
        print("\n[INFO] Not enough non-empty bins to fit a trend line.")

    # Bar annotations (MAE + n)
    for i, bar in enumerate(bars):
        n = int(counts[i])
        m = means[i]
        if n == 0 or np.isnan(m):
            txt = "empty"
            y = 0.02
        else:
            txt = f"MAE: {m:.2f}\nn={n}"
            y = bar.get_height() + 0.05
        plt.text(bar.get_x() + bar.get_width() / 2, y, txt, ha="center", fontweight="bold")

    legend_elements = [
        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Standard Error (SEM)"),
        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=2.5, label=trend_label),
        Patch(color="none", label="Metric: Mean Absolute Error (MAE)"),
    ]
    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, fontsize=10, title="Legend")

    plt.title("Validation: Inverse Correlation of Confidence vs. Error Magnitude", fontsize=15, pad=20)
    plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12)
    plt.xlabel("LLM Confidence Bracket", fontsize=12)
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
    plt.ylim(0, max(0.5, float(ymax) + 0.6))
    plt.tight_layout()
    plt.show()

    print("\n" + "=" * 90)
    print("DONE")
    print("=" * 90)
Error Magnitude", fontsize=15, pad=20) + plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12) + plt.xlabel("LLM Confidence Bracket", fontsize=12) + plt.grid(axis="y", linestyle=":", alpha=0.5) + + ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 + plt.ylim(0, max(0.5, float(ymax) + 0.6)) + plt.tight_layout() + plt.show() + + print("\n" + "="*90) + print("DONE") + print("="*90) + + +# Example run: +plot_confidence_vs_abs_error_with_log("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv") + + +## + + +# %% Empirical Confidence +# Empirical stability confidence (from 10 runs) + LLM certainty_percent as secondary signal +# - Reads all JSONs in a folder (your 10 iterations) +# - Aggregates by key = unique_id + MedDatum +# - Computes: +# * EDSS_mean, EDSS_std, EDSS_iqr, mode/share +# * empirical_conf_0_100 (based on stability) +# * llm_conf_mean_0_100 (mean certainty_percent) +# * combined_conf_0_100 (weighted blend) +# - Optional: merges GT EDSS and computes abs error on the aggregated prediction + +import os, glob, json +import numpy as np +import pandas as pd + +def build_empirical_confidence_table( + json_dir_path: str, + ground_truth_path: str | None = None, + gt_sep: str = ";", + gt_edss_col: str = "EDSS", + w_empirical: float = 0.7, # weight for empirical stability + w_llm: float = 0.3, # weight for LLM self-reported confidence + tol_mode: float = 0.5, # tolerance to treat EDSS as "same" (EDSS often in 0.5 steps) + min_runs_expected: int = 10, +): + # ----------------------------- + # Helpers + # ----------------------------- + def norm_str(x): + return str(x).strip().lower() + + def parse_number(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + def robust_iqr(x: pd.Series): + x = x.dropna() + if len(x) == 0: + return np.nan + 
return float(x.quantile(0.75) - x.quantile(0.25)) + + def stability_to_confidence(std_val: float) -> float: + """ + Map EDSS variability across runs to a 0..100 confidence. + EDSS is typically on 0.5 steps. A natural scale: + std ~= 0.0 -> ~100 + std ~= 0.25 -> ~75-90 + std ~= 0.5 -> ~50-70 + std >= 1.0 -> low + Use a smooth exponential mapping. + """ + if np.isnan(std_val): + return np.nan + # scale parameter: std=0.5 -> exp(-1)=0.367 -> ~36.7 + scale = 0.5 + conf = 100.0 * np.exp(-(std_val / scale)) + # clamp + return float(np.clip(conf, 0.0, 100.0)) + + def mode_share_with_tolerance(values: np.ndarray, tol: float) -> tuple[float, float]: + """ + Compute a 'mode' under tolerance: pick the cluster center (median) and count + how many values fall within +/- tol. Return (mode_center, share). + This is robust to tiny float differences. + """ + vals = values[~np.isnan(values)] + if len(vals) == 0: + return (np.nan, np.nan) + center = float(np.median(vals)) + share = float(np.mean(np.abs(vals - center) <= tol)) + return (center, share) + + # ----------------------------- + # Load predictions from all JSONs + # ----------------------------- + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + per_file = [] + total_entries_all = 0 + total_success_all = 0 + skipped_all = {"not_success": 0, "missing_uid_or_date": 0} + + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + + total_entries = len(data) + success_entries = sum(1 for e in data if e.get("success")) + total_entries_all += total_entries + total_success_all += success_entries + + skipped = {"not_success": 0, "missing_uid_or_date": 0} + loaded = 0 + + for entry in data: + if not entry.get("success"): + skipped["not_success"] += 1 + continue + + res = entry.get("result", {}) + uid = res.get("unique_id") + md = res.get("MedDatum") + + if uid is None or md 
is None or str(uid).strip() == "" or str(md).strip() == "": + skipped["missing_uid_or_date"] += 1 + continue + + edss = parse_number(res.get("EDSS")) + conf = parse_number(res.get("certainty_percent")) + it = res.get("iteration", None) + + rows.append({ + "file": os.path.basename(fp), + "iteration": it, + "unique_id": norm_str(uid), + "MedDatum": norm_str(md), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": edss, + "llm_conf": conf, + }) + loaded += 1 + + skipped_all["not_success"] += skipped["not_success"] + skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"] + + per_file.append({ + "file": os.path.basename(fp), + "entries_total": total_entries, + "entries_success": success_entries, + "rows_loaded": loaded, + "skipped_not_success": skipped["not_success"], + "skipped_missing_uid_or_date": skipped["missing_uid_or_date"], + }) + + df_pred = pd.DataFrame(rows) + df_file = pd.DataFrame(per_file) + + # ----------------------------- + # Logs: ingestion + # ----------------------------- + print("\n" + "="*90) + print("EMPIRICAL CONFIDENCE (10-RUN STABILITY) + LLM CONFIDENCE (SECONDARY)") + print("="*90) + print(f"JSON DIR: {json_dir_path}") + print(f"JSON files: {len(json_files)}") + print("\n--- INGEST LOG ---") + print(f"Total JSON entries: {total_entries_all}") + print(f"Total success entries:{total_success_all}") + print(f"Pred rows loaded: {len(df_pred)}") + print(f"Unique keys in preds: {df_pred['key'].nunique() if len(df_pred) else 0}") + print(f"Missing EDSS_pred: {df_pred['EDSS_pred'].isna().sum() if len(df_pred) else 0}") + print(f"Missing llm_conf: {df_pred['llm_conf'].isna().sum() if len(df_pred) else 0}") + print("Skipped totals:", skipped_all) + + print("\nPer-file summary (top 10 by name):") + print(df_file.sort_values("file").head(10)) + + # ----------------------------- + # Aggregate by key (empirical stability) + # ----------------------------- + if len(df_pred) == 0: + print("[ERROR] No usable prediction rows.") + return 
None + + # how many runs per key (expect ~10) + runs_per_key = df_pred.groupby("key")["EDSS_pred"].size().rename("n_rows").reset_index() + print("\n--- RUNS PER KEY LOG ---") + print(f"Keys with at least 1 row: {len(runs_per_key)}") + print("Distribution of rows per key (value_counts):") + print(runs_per_key["n_rows"].value_counts().sort_index()) + + # Aggregate stats + def agg_block(g: pd.DataFrame): + ed = g["EDSS_pred"].to_numpy(dtype=float) + ll = g["llm_conf"].to_numpy(dtype=float) + + n_rows = len(g) + n_edss = int(np.sum(~np.isnan(ed))) + n_llm = int(np.sum(~np.isnan(ll))) + + ed_mean = float(np.nanmean(ed)) if n_edss else np.nan + ed_std = float(np.nanstd(ed, ddof=1)) if n_edss >= 2 else (0.0 if n_edss == 1 else np.nan) + ed_iqr = robust_iqr(pd.Series(ed)) + mode_center, mode_share = mode_share_with_tolerance(ed, tol=tol_mode) + + llm_mean = float(np.nanmean(ll)) if n_llm else np.nan + llm_std = float(np.nanstd(ll, ddof=1)) if n_llm >= 2 else (0.0 if n_llm == 1 else np.nan) + + emp_conf = stability_to_confidence(ed_std) if not np.isnan(ed_std) else np.nan + + # Combined confidence (weighted). If one side missing, fall back to the other. 
+ if np.isnan(emp_conf) and np.isnan(llm_mean): + comb = np.nan + elif np.isnan(emp_conf): + comb = llm_mean + elif np.isnan(llm_mean): + comb = emp_conf + else: + comb = w_empirical * emp_conf + w_llm * llm_mean + + return pd.Series({ + "unique_id": g["unique_id"].iloc[0], + "MedDatum": g["MedDatum"].iloc[0], + "n_rows": n_rows, + "n_edss": n_edss, + "n_llm_conf":n_llm, + "EDSS_mean": ed_mean, + "EDSS_std": ed_std, + "EDSS_iqr": ed_iqr, + "EDSS_mode_center": mode_center, + "EDSS_mode_share": mode_share, # fraction within ±tol_mode of median center + "llm_conf_mean": llm_mean, + "llm_conf_std": llm_std, + "empirical_conf_0_100": emp_conf, + "combined_conf_0_100": float(np.clip(comb, 0.0, 100.0)) if not np.isnan(comb) else np.nan, + }) + + df_agg = df_pred.groupby("key", as_index=False).apply(agg_block) + # groupby+apply returns a multiindex sometimes depending on pandas version + if isinstance(df_agg.index, pd.MultiIndex): + df_agg = df_agg.reset_index(drop=True) + + # Logs: aggregation + losses + print("\n--- AGGREGATION LOG ---") + print(f"Aggregated keys: {len(df_agg)}") + print(f"Keys with EDSS in >=1 run: {(df_agg['n_edss'] >= 1).sum()}") + print(f"Keys with EDSS in >=2 runs (std meaningful): {(df_agg['n_edss'] >= 2).sum()}") + print(f"Keys missing EDSS in all runs: {(df_agg['n_edss'] == 0).sum()}") + print(f"Keys missing llm_conf in all runs: {(df_agg['n_llm_conf'] == 0).sum()}") + + # Expected runs check + if min_runs_expected is not None: + print(f"\nKeys with < {min_runs_expected} rows (potential missing iterations):") + print(df_agg.loc[df_agg["n_rows"] < min_runs_expected, ["key", "n_rows"]].sort_values("n_rows").head(20)) + + # ----------------------------- + # Optional: merge GT and compute error on aggregated EDSS_mean + # ----------------------------- + if ground_truth_path is not None: + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + need = {"unique_id", "MedDatum", gt_edss_col} + miss = need - set(df_gt.columns) + if miss: + raise 
def plot_exec_boxplots(df, min_bin_size_warn=10):
    """
    Two side-by-side boxplots:
      - Left: abs_error_mean grouped by empirical_conf_0_100 quantile bins
      - Right: abs_error_mean grouped by llm_conf_mean quantile bins

    Adds:
      - Robust qcut labeling (handles ties; bins may be < 4)
      - Data logs + per-bin summary table printed
      - Clear legend explaining each panel and what box elements mean

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'abs_error_mean', 'empirical_conf_0_100', 'llm_conf_mean'
        (the output of build_empirical_confidence_table with GT merged).
    min_bin_size_warn : int
        Bins smaller than this are flagged as unreliable.

    Returns
    -------
    None. Side effects: prints summaries and shows a matplotlib figure.
    """
    need_cols = ["abs_error_mean", "empirical_conf_0_100", "llm_conf_mean"]
    missing = [c for c in need_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in df: {missing}. Available: {df.columns.tolist()}")

    d = df[need_cols].copy()

    # -----------------------------
    # Data logs: survivorship
    # -----------------------------
    d_emp = d.dropna(subset=["abs_error_mean", "empirical_conf_0_100"]).copy()
    d_llm = d.dropna(subset=["abs_error_mean", "llm_conf_mean"]).copy()

    print("\n" + "=" * 90)
    print("EXECUTIVE BOXPLOTS — DATA LOG + SUMMARY")
    print("=" * 90)
    print(f"Total rows in df: {len(df)}")
    print(f"Rows for empirical plot: {len(d_emp)} (dropped {len(df) - len(d_emp)})")
    print(f"Rows for LLM plot: {len(d_llm)} (dropped {len(df) - len(d_llm)})")

    if len(d_emp) == 0 or len(d_llm) == 0:
        print("[ERROR] Not enough data after dropping NaNs to build both plots.")
        return

    # -----------------------------
    # Robust quantile binning (handles ties)
    # FIX: compute qcut once per panel and rename the interval categories,
    # instead of calling pd.qcut twice with identical arguments.
    # -----------------------------
    emp_binned = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop")
    emp_labels = [f"Q{i+1}" for i in range(emp_binned.cat.categories.size)]
    d_emp["emp_q"] = emp_binned.cat.rename_categories(emp_labels)

    llm_binned = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop")
    llm_labels = [f"Q{i+1}" for i in range(llm_binned.cat.categories.size)]
    d_llm["llm_q"] = llm_binned.cat.rename_categories(llm_labels)

    # Print bin edges (so you can discuss exact thresholds)
    print("\n--- BIN EDGES (actual ranges) ---")
    print("Empirical confidence bins:")
    for i, interval in enumerate(emp_binned.cat.categories):
        print(f"  {emp_labels[i]}: {interval}")
    print("LLM confidence bins:")
    for i, interval in enumerate(llm_binned.cat.categories):
        print(f"  {llm_labels[i]}: {interval}")

    # -----------------------------
    # Summary tables (per bin)
    # -----------------------------
    def summarize_bins(df_in, bin_col, conf_col, label):
        # One row per bin: size, MAE stats, and where the confidences sit.
        g = df_in.groupby(bin_col, observed=True).agg(
            n=("abs_error_mean", "size"),
            mae_mean=("abs_error_mean", "mean"),
            mae_median=("abs_error_mean", "median"),
            mae_q25=("abs_error_mean", lambda x: x.quantile(0.25)),
            mae_q75=("abs_error_mean", lambda x: x.quantile(0.75)),
            conf_mean=(conf_col, "mean"),
            conf_median=(conf_col, "median"),
        ).reset_index().rename(columns={bin_col: "bin"})
        g["panel"] = label
        return g[["panel", "bin", "n", "mae_mean", "mae_median", "mae_q25", "mae_q75", "conf_mean", "conf_median"]]

    summary_emp = summarize_bins(d_emp, "emp_q", "empirical_conf_0_100", "Empirical")
    summary_llm = summarize_bins(d_llm, "llm_q", "llm_conf_mean", "LLM")

    print("\n--- SUMMARY TABLE: Empirical confidence quartiles (or fewer if ties) ---")
    print(summary_emp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    print("\n--- SUMMARY TABLE: LLM confidence quartiles (or fewer if ties) ---")
    print(summary_llm.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    # Warn about small bins
    small_emp = summary_emp.loc[summary_emp["n"] < min_bin_size_warn, ["bin", "n"]]
    small_llm = summary_llm.loc[summary_llm["n"] < min_bin_size_warn, ["bin", "n"]]
    if not small_emp.empty or not small_llm.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_size_warn} points; compare them cautiously.")
        if not small_emp.empty:
            print("  Empirical small bins:")
            print(small_emp.to_string(index=False))
        if not small_llm.empty:
            print("  LLM small bins:")
            print(small_llm.to_string(index=False))

    # -----------------------------
    # Prepare data for boxplots
    # -----------------------------
    emp_cats = list(d_emp["emp_q"].cat.categories)
    llm_cats = list(d_llm["llm_q"].cat.categories)

    emp_groups = [d_emp.loc[d_emp["emp_q"] == q, "abs_error_mean"].values for q in emp_cats]
    llm_groups = [d_llm.loc[d_llm["llm_q"] == q, "abs_error_mean"].values for q in llm_cats]

    # -----------------------------
    # Plot
    # -----------------------------
    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

    bp0 = axes[0].boxplot(emp_groups, labels=emp_cats, showfliers=False, patch_artist=True)
    bp1 = axes[1].boxplot(llm_groups, labels=llm_cats, showfliers=False, patch_artist=True)

    # Make panels visually distinct but still simple (no extra clutter)
    for patch in bp0["boxes"]:
        patch.set_alpha(0.6)
    for patch in bp1["boxes"]:
        patch.set_alpha(0.6)

    axes[0].set_title("Error by Empirical Confidence (quantile bins)")
    axes[0].set_xlabel("Empirical confidence bin")
    axes[0].set_ylabel("Absolute Error (|EDSS_mean − EDSS_gt|)")

    axes[1].set_title("Error by LLM Confidence (quantile bins)")
    axes[1].set_xlabel("LLM confidence bin")

    for ax in axes:
        ax.grid(axis="y", linestyle=":", alpha=0.5)

    # -----------------------------
    # Legend (simple, but useful)
    # -----------------------------
    legend_elements = [
        Patch(facecolor="white", edgecolor="black", label="Box = IQR (25%–75%)"),
        Patch(facecolor="white", edgecolor="black", label="Center line = median"),
        Patch(facecolor="white", edgecolor="black", label="Whiskers = typical range (no outliers shown)"),
        Patch(facecolor="white", edgecolor="white", label="Left panel: empirical stability bins"),
        Patch(facecolor="white", edgecolor="white", label="Right panel: LLM self-reported bins"),
    ]
    fig.legend(handles=legend_elements, loc="upper center", ncol=3, frameon=True)

    plt.tight_layout(rect=[0, 0, 1, 0.90])
    plt.show()

    print("\n" + "=" * 90)
    print("DONE")
    print("=" * 90)
def scatter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
):
    """
    Jittered scatter of absolute EDSS error per LLM confidence category,
    for a single iteration JSON merged against the ground-truth CSV.

    Prints per-bin point counts, then shows the plot.
    """

    def _norm(value):
        return str(value).strip().lower()

    def _to_edss(value):
        # Numeric parse tolerant of comma decimals and empty tokens.
        if value is None:
            return np.nan
        token = str(value).strip()
        if not token or token.lower() in {"nan", "none", "null"}:
            return np.nan
        return pd.to_numeric(token.replace(",", "."), errors="coerce")

    # ---- Ground truth: normalized composite key + numeric EDSS
    gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    gt["unique_id"] = gt["unique_id"].map(_norm)
    gt["MedDatum"] = gt["MedDatum"].map(_norm)
    gt["key"] = gt["unique_id"] + "_" + gt["MedDatum"]
    gt["EDSS_gt"] = gt[gt_edss_col].map(_to_edss)

    # ---- Predictions: keep only successful entries carrying both key parts
    with open(json_file_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    records = []
    for item in entries:
        if not item.get("success"):
            continue
        payload = item.get("result", {})
        uid = payload.get("unique_id")
        med = payload.get("MedDatum")
        if uid is None or med is None or not str(uid).strip() or not str(med).strip():
            continue
        records.append({
            "key": _norm(uid) + "_" + _norm(med),
            "EDSS_pred": _to_edss(payload.get("EDSS")),
            "confidence": pd.to_numeric(payload.get("certainty_percent"), errors="coerce"),
        })

    # ---- Join, require complete rows, compute absolute error
    merged = pd.DataFrame(records).merge(
        gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one"
    )
    merged = merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    merged["abs_error"] = (merged["EDSS_pred"] - merged["EDSS_gt"]).abs()

    # ---- Four fixed confidence brackets
    edges = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    merged["conf_bin"] = pd.cut(merged["confidence"], bins=edges, labels=labels, include_lowest=True)
    merged = merged.dropna(subset=["conf_bin"]).copy()

    # ---- Logs
    print("\n--- BIN COUNTS (points plotted) ---")
    print(merged["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int))
    print(f"Total points plotted: {len(merged)}")

    # ---- Categorical x positions + uniform jitter to separate points
    positions = {label: idx for idx, label in enumerate(labels)}
    base_x = merged["conf_bin"].map(positions).astype(float).to_numpy()
    jittered_x = base_x + np.random.uniform(-0.12, 0.12, size=len(merged))

    plt.figure(figsize=(12, 6))
    plt.scatter(jittered_x, merged["abs_error"].to_numpy(), alpha=0.55)
    plt.xticks(range(len(labels)), labels)
    plt.xlabel("certainty_percent category (Iteration 1)")
    plt.ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    plt.title("Absolute Error vs LLM Confidence Category (Single JSON)")
    plt.grid(axis="y", linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
def boxplot_with_jitter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """
    Boxplot of absolute EDSS error per LLM confidence bin, overlaid with
    lightly jittered individual points, for a single iteration JSON.

    The legend lists n per bin; the same counts are printed afterwards.
    """

    def _norm(value):
        return str(value).strip().lower()

    def _to_edss(value):
        # Numeric parse tolerant of comma decimals and empty tokens.
        if value is None:
            return np.nan
        token = str(value).strip()
        if not token or token.lower() in {"nan", "none", "null"}:
            return np.nan
        return pd.to_numeric(token.replace(",", "."), errors="coerce")

    # ---- Ground truth (with explicit column validation)
    gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for required in ("unique_id", "MedDatum", gt_edss_col):
        if required not in gt.columns:
            raise ValueError(f"GT missing column '{required}'. Available: {gt.columns.tolist()}")

    gt["unique_id"] = gt["unique_id"].map(_norm)
    gt["MedDatum"] = gt["MedDatum"].map(_norm)
    gt["key"] = gt["unique_id"] + "_" + gt["MedDatum"]
    gt["EDSS_gt"] = gt[gt_edss_col].map(_to_edss)

    # ---- Predictions from the single JSON file
    with open(json_file_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    records = []
    for item in entries:
        if not item.get("success"):
            continue
        payload = item.get("result", {})
        uid = payload.get("unique_id")
        med = payload.get("MedDatum")
        if uid is None or med is None or not str(uid).strip() or not str(med).strip():
            continue
        records.append({
            "key": _norm(uid) + "_" + _norm(med),
            "EDSS_pred": _to_edss(payload.get("EDSS")),
            "confidence": pd.to_numeric(payload.get("certainty_percent"), errors="coerce"),
        })

    # ---- Join, keep complete rows, compute absolute error
    merged = pd.DataFrame(records).merge(
        gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one"
    )
    merged = merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    merged["abs_error"] = (merged["EDSS_pred"] - merged["EDSS_gt"]).abs()

    # ---- Fixed confidence brackets
    edges = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    merged["conf_bin"] = pd.cut(merged["confidence"], bins=edges, labels=labels, include_lowest=True)
    merged = merged.dropna(subset=["conf_bin"]).copy()

    # ---- One error array per bin, in label order
    groups = [merged.loc[merged["conf_bin"] == label, "abs_error"].to_numpy() for label in labels]
    counts = [len(g) for g in groups]

    # ---- Draw boxplot then jittered points on top
    fig, ax = plt.subplots(figsize=(12, 6))

    box = ax.boxplot(
        groups,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )
    for artist in box["boxes"]:
        artist.set_alpha(0.35)

    for position, values in enumerate(groups, start=1):
        if not len(values):
            continue
        xs = np.full(len(values), position, dtype=float)
        xs += np.random.uniform(-jitter_width, jitter_width, size=len(values))
        ax.scatter(xs, values, alpha=point_alpha, s=18)

    ax.set_title("Absolute Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # ---- Legend showing n per bin
    handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{label}: n={count}")
        for label, count in zip(labels, counts)
    ]
    ax.legend(handles=handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # ---- Print counts too (useful for discussion)
    print("\n--- BIN COUNTS (points plotted) ---")
    for label, count in zip(labels, counts):
        print(f"{label:>18}: n={count}")
    print(f"Total points plotted: {sum(counts)}")
def boxplot_with_jitter_signed_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """Boxplot (+ jittered points) of *signed* EDSS error per confidence bin.

    Same pipeline as the absolute-error variant above, but the y-axis is the
    signed error ``EDSS_pred - EDSS_gt`` (negative = underestimation,
    positive = overestimation), with a dashed zero line marking "unbiased".

    Side effects: shows a matplotlib figure and prints a per-bin summary
    (n, mean/median signed error, MAE).

    Parameters mirror the absolute-error function: one results JSON
    (list of ``{"success", "result"}`` dicts — confirm against the producer),
    a semicolon-separated GT CSV, and jitter/alpha/outlier display knobs.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from JSON
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        # Skip entries whose join key cannot be formed.
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": parse_edss(res.get("EDSS")),
            "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()

    # SIGNED ERROR (direction)
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()

    # ---- Bin confidence
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Prepare arrays
    bin_arrays = [df.loc[df["conf_bin"] == lab, "signed_error"].to_numpy() for lab in labels]
    n_counts = [len(a) for a in bin_arrays]

    # ---- Plot
    fig, ax = plt.subplots(figsize=(12, 6))

    bp = ax.boxplot(
        bin_arrays,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )

    for b in bp["boxes"]:
        b.set_alpha(0.35)

    # Jittered points
    for i, arr in enumerate(bin_arrays, start=1):
        if len(arr) == 0:
            continue
        x = np.full(len(arr), i, dtype=float)
        x += np.random.uniform(-jitter_width, jitter_width, size=len(arr))
        ax.scatter(x, arr, alpha=point_alpha, s=18)

    # Zero line to show over/under clearly
    ax.axhline(0, linewidth=1.5, linestyle="--")

    ax.set_title("Signed Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend with n per bin + zero-line meaning
    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}")
        for lab, n in zip(labels, n_counts)
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))
    ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # ---- Print per-bin summary to discuss
    print("\n--- PER-BIN SUMMARY (points plotted) ---")
    for lab in labels:
        sub = df.loc[df["conf_bin"] == lab]
        n = len(sub)
        if n == 0:
            print(f"{lab:>18}: n=0")
            continue
        print(
            f"{lab:>18}: n={n:3d} | "
            f"mean signed={sub['signed_error'].mean(): .3f} | "
            f"median signed={sub['signed_error'].median(): .3f} | "
            f"MAE={sub['abs_error'].mean(): .3f}"
        )
    print(f"Total points plotted: {len(df)}")
json_file_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json", + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" +) +## + + + +# %% jitter and violin 10x10 + +# Violin + jitter (all JSONs in folder), with signed error +# - X: confidence bins (<70, 70-80, 80-90, 90-100) +# - Y: signed error = EDSS_pred - EDSS_gt (direction) +# - Prints bin counts (n) and puts n into the legend + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def violin_jitter_signed_error_all_jsons( + json_dir_path, + ground_truth_path, + gt_sep=";", + gt_edss_col="EDSS", + jitter_width=0.12, + point_alpha=0.20, + point_size=10, + violin_inner="quartile", # 'quartile', 'box', 'stick', or None +): + def norm_str(x): + return str(x).strip().lower() + + def parse_edss(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + # ---- Load GT + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + for col in ["unique_id", "MedDatum", gt_edss_col]: + if col not in df_gt.columns: + raise ValueError(f"GT missing column '{col}'. 
Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) + + # ---- Load preds from ALL JSONs + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + continue + rows.append({ + "file": os.path.basename(fp), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": parse_edss(res.get("EDSS")), + "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), + }) + + df_pred = pd.DataFrame(rows) + + # ---- Merge + filter + df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") + df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() + df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] + + # ---- Bin confidence + bins = [0, 70, 80, 90, 100] + labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] + df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) + df = df.dropna(subset=["conf_bin"]).copy() + + # ---- Counts + log + counts = df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int) + print("\n--- BIN COUNTS (all JSONs) ---") + for lab in labels: + print(f"{lab:>18}: n={counts[lab]}") + print(f"Total points plotted: {len(df)}") + print(f"JSON files: {len(json_files)}") + + # Ensure ordering for seaborn + df["conf_bin"] = pd.Categorical(df["conf_bin"], categories=labels, 
ordered=True) + + # ---- Plot + plt.figure(figsize=(12, 6)) + + # Violin (density) + sns.violinplot( + data=df, + x="conf_bin", + y="signed_error", + order=labels, + inner=violin_inner, + cut=0 + ) + + # Jittered points (manual jitter to keep it consistent and fast) + x_map = {lab: i for i, lab in enumerate(labels)} + x = df["conf_bin"].map(x_map).astype(float).to_numpy() + xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df)) + plt.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size) + + # Zero line (over/under split) + plt.axhline(0, linestyle="--", linewidth=1.5) + + plt.xticks(range(len(labels)), labels) + plt.xlabel("certainty_percent category (all iterations)") + plt.ylabel("Signed Error (EDSS_pred − EDSS_gt)") + plt.title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)") + plt.grid(axis="y", linestyle=":", alpha=0.5) + + # Legend with n per bin + legend_handles = [ + Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={int(counts[lab])}") + for lab in labels + ] + legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)")) + plt.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True) + + plt.tight_layout() + plt.show() + + +# Example run: +violin_jitter_signed_error_all_jsons( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" +) + +## + + + +# %% jitter and violin 10x1 + + +# Adjusted: Violin + jitter (ALL JSONs for points) but X-bins come ONLY from JSON #1 (reference) +# Fixes: +# 1) Legend has colors matching bins +# 2) Legend placed OUTSIDE plot area +# 3) X-axis binning uses certainty_percent from JSON1 (by key), then all iterations' points inherit that bin + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches 
def violin_jitter_signed_error_all_jsons_xbins_from_json1(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.18,
    point_size=10,
    violin_inner="quartile",  # 'quartile', 'box', 'stick', or None
):
    """Violin + jitter of signed EDSS error, binned by JSON1's confidence.

    Points come from ALL ``*.json`` files in ``json_dir_path``, but the x-axis
    confidence bin of every point is inherited (via the normalized key) from
    the reference run ``json1_file_path`` — so all iterations of one case land
    in the same bin. Colored violins, a dashed zero line, and a legend placed
    outside the axes show the distribution per bin.

    Side effects: prints bin counts / provenance and shows a matplotlib figure.

    Raises
    ------
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    ValueError
        If the GT CSV is missing a required column.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 and build reference bins by KEY
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "confidence_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # If JSON1 has duplicates for a key (unlikely, but safe), take the first
    # non-null confidence in file order.
    # BUG FIX vs previous version: the old code sorted by confidence_ref first
    # (so "first" was actually the LOWEST confidence), and used
    # `s.dropna().any()`, which tests value *truthiness* — a key whose only
    # confidence was 0 was treated as missing. `groupby(...).first()` on the
    # NaN-filtered frame keeps file order, skips nulls, and returns a flat
    # frame (no MultiIndex patch-up needed). This matches the dedup idiom used
    # by the correlation functions in this file.
    df_ref = (df_ref.dropna(subset=["confidence_ref"])
              .groupby("key", as_index=False)["confidence_ref"]
              .first())

    # Confidence bins
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df_ref["conf_bin_ref"] = pd.cut(df_ref["confidence_ref"], bins=bins, labels=labels, include_lowest=True)
    df_ref = df_ref.dropna(subset=["conf_bin_ref"]).copy()

    # ----------------------------
    # Load ALL JSONs (all points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + reference bins (from JSON1)
    # ----------------------------
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "conf_bin_ref"]], on="key", how="inner", validate="many_to_one")

    # filter for plotting
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "conf_bin_ref"]).copy()
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]

    # ordering
    df["conf_bin_ref"] = pd.Categorical(df["conf_bin_ref"], categories=labels, ordered=True)

    # ----------------------------
    # Logs + counts
    # ----------------------------
    counts = df["conf_bin_ref"].value_counts().reindex(labels).fillna(0).astype(int)

    print("\n--- BIN COUNTS (ALL JSON points, binned by JSON1 confidence) ---")
    for lab in labels:
        print(f"{lab:>18}: n={int(counts[lab])}")
    print(f"Total points plotted: {len(df)}")
    print(f"JSON files used for points: {len(json_files)}")
    print(f"Reference JSON1 bins derived from: {os.path.basename(json1_file_path)}")
    print(f"Keys in reference (after binning & non-null): {df_ref['key'].nunique()}")

    # ----------------------------
    # Colors + legend patches
    # ----------------------------
    palette = sns.color_palette("Blues", n_colors=len(labels))
    bin_colors = {lab: palette[i] for i, lab in enumerate(labels)}

    legend_handles = [
        Patch(facecolor=bin_colors[lab], edgecolor="black", label=f"{lab}: n={int(counts[lab])}")
        for lab in labels
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))

    # ----------------------------
    # Plot (legend outside)
    # ----------------------------
    fig, ax = plt.subplots(figsize=(12.5, 6))

    sns.violinplot(
        data=df,
        x="conf_bin_ref",
        y="signed_error",
        order=labels,
        inner=violin_inner,
        cut=0,
        palette=[bin_colors[l] for l in labels],
        ax=ax,
    )

    # jittered points (manual jitter)
    x_map = {lab: i for i, lab in enumerate(labels)}
    x = df["conf_bin_ref"].map(x_map).astype(float).to_numpy()
    xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df))
    ax.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size)

    ax.axhline(0, linestyle="--", linewidth=1.5)

    ax.set_xlabel("certainty_percent category (from JSON 1 as reference)")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.set_title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)\nBinned by JSON 1 certainty_percent")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend outside (right)
    ax.legend(
        handles=legend_handles,
        title="Bin counts",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        frameon=True
    )

    plt.tight_layout()
    plt.show()
def correlation_scatter_raw_certainty_json1_reference(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    y_mode="abs",  # "abs" or "signed"
    point_alpha=0.18,
    point_size=12,
):
    """Scatter + trend line: raw JSON1 certainty_percent vs EDSS error.

    Points come from ALL ``*.json`` files in ``json_dir_path``; each point's
    x-value is the ``certainty_percent`` its key received in the reference run
    ``json1_file_path``. The y-value is absolute (``y_mode="abs"``) or signed
    (``y_mode="signed"``) error vs ground truth. Prints Pearson and Spearman
    correlations and draws a simple least-squares trend line.

    Side effects: prints a diagnostics header + correlations and shows a
    matplotlib figure.

    Raises
    ------
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    ValueError
        If the GT CSV is missing a required column.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    def rankdata(a):
        # Average-rank for ties (Spearman needs ranks)
        s = pd.Series(a)
        return s.rank(method="average").to_numpy()

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 reference certainty_percent (per key)
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # Deduplicate keys if needed: take first non-null certainty
    df_ref = (df_ref.dropna(subset=["certainty_ref"])
              .groupby("key", as_index=False)["certainty_ref"]
              .first())

    # ----------------------------
    # Load ALL JSON predictions (points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + JSON1 reference certainty
    # ----------------------------
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one")

    # Filter needed fields
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy()

    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()
    y_col = "abs_error" if y_mode == "abs" else "signed_error"

    # ----------------------------
    # Logs
    # ----------------------------
    print("\n" + "="*90)
    print("CORRELATION: RAW certainty_percent (JSON1 reference) vs ERROR (ALL JSON points)")
    print("="*90)
    print(f"JSON DIR (points): {json_dir_path} | files: {len(json_files)}")
    print(f"JSON1 reference: {os.path.basename(json1_file_path)}")
    print(f"Points available after merge+filter: {len(df)}")
    print(f"Unique keys in plot: {df['key'].nunique()}")
    print(f"Y mode: {y_mode} ({y_col})")

    # ----------------------------
    # Correlations (Pearson + Spearman)
    # ----------------------------
    x = df["certainty_ref"].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)

    # Pearson
    pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan

    # Spearman = Pearson corr of ranks
    rx = rankdata(x)
    ry = rankdata(y)
    spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan

    print(f"\nPearson r: {pearson:.4f}")
    print(f"Spearman ρ: {spearman:.4f}")

    # ----------------------------
    # Trend line (simple linear fit)
    # ----------------------------
    # Fit y = a*x + b
    if len(df) >= 2:
        a, b = np.polyfit(x, y, 1)
    else:
        a, b = np.nan, np.nan

    # ----------------------------
    # Plot
    # ----------------------------
    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, alpha=point_alpha, s=point_size)

    # trend line across full x-range
    if np.isfinite(a) and np.isfinite(b):
        xs = np.linspace(np.nanmin(x), np.nanmax(x), 200)
        plt.plot(xs, a * xs + b, linestyle="--", linewidth=2)

    plt.xlabel("certainty_percent (from JSON 1, per key)")
    ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)"
    plt.ylabel(ylabel)
    plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n"
              f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}")
    plt.grid(linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
linestyle="--", linewidth=2) + + plt.xlabel("certainty_percent (from JSON 1, per key)") + ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)" + plt.ylabel(ylabel) + plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n" + f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}") + plt.grid(linestyle=":", alpha=0.5) + plt.tight_layout() + plt.show() + + +# Example run: +json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" +correlation_scatter_raw_certainty_json1_reference( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + json1_file_path=json1_path, + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", + y_mode="abs" # or "signed" +) +## + +# %% Correlation adjusted + +# Correlation scatter (RAW certainty_percent from JSON1) vs error (all JSON points) +# Adds: +# 1) Legend (points, trend line) + Pearson/Spearman shown in legend and title +# 2) Trend line color set to high-contrast (black by default) +# 3) Density coloring: dots colored by local point density (bluer = more cases) + colorbar + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D +from matplotlib.colors import LogNorm, PowerNorm + +def correlation_scatter_raw_certainty_json1_reference( + json_dir_path, + json1_file_path, + ground_truth_path, + gt_sep=";", + gt_edss_col="EDSS", + y_mode="abs", # "abs" or "signed" + point_alpha=0.85, # higher alpha works better with density coloring + point_size=14, + trend_color="black", # high-contrast line + save_svg_path=None, + dpi=300 +): + def norm_str(x): + return str(x).strip().lower() + + def parse_edss(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = 
s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + def rankdata(a): + return pd.Series(a).rank(method="average").to_numpy() + + # ---------------------------- + # Load GT + # ---------------------------- + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + for col in ["unique_id", "MedDatum", gt_edss_col]: + if col not in df_gt.columns: + raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) + + # ---------------------------- + # Load JSON1 reference certainty_percent (per key) + # ---------------------------- + with open(json1_file_path, "r", encoding="utf-8") as f: + data1 = json.load(f) + + ref_rows = [] + for entry in data1: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + continue + ref_rows.append({ + "key": norm_str(uid) + "_" + norm_str(md), + "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), + }) + + df_ref = pd.DataFrame(ref_rows) + df_ref = (df_ref.dropna(subset=["certainty_ref"]) + .groupby("key", as_index=False)["certainty_ref"] + .first()) + + # ---------------------------- + # Load ALL JSON predictions (points) + # ---------------------------- + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or 
str(uid).strip() == "" or str(md).strip() == "": + continue + rows.append({ + "file": os.path.basename(fp), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": parse_edss(res.get("EDSS")), + }) + + df_pred = pd.DataFrame(rows) + + # ---------------------------- + # Merge: preds + GT + JSON1 reference certainty + # ---------------------------- + df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") + df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one") + df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy() + + df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] + df["abs_error"] = df["signed_error"].abs() + y_col = "abs_error" if y_mode == "abs" else "signed_error" + + # ---------------------------- + # Correlations + # ---------------------------- + x = df["certainty_ref"].to_numpy(dtype=float) + y = df[y_col].to_numpy(dtype=float) + + pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan + rx, ry = rankdata(x), rankdata(y) + spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan + + # ---------------------------- + # Trend line (linear fit) + # ---------------------------- + if len(df) >= 2: + a, b = np.polyfit(x, y, 1) + else: + a, b = np.nan, np.nan + + # ---------------------------- + # Density coloring (2D histogram bin counts) + # "how blue" = how many points are around that location + # ---------------------------- + # Choose binning resolution (balanced for ~thousands of points) + x_bins = 50 + y_bins = 50 + + # Compute bin index per point + x_edges = np.linspace(np.nanmin(x), np.nanmax(x), x_bins + 1) + y_edges = np.linspace(np.nanmin(y), np.nanmax(y), y_bins + 1) + + xi = np.clip(np.digitize(x, x_edges) - 1, 0, x_bins - 1) + yi = np.clip(np.digitize(y, y_edges) - 1, 0, y_bins - 1) + + # 2D counts + counts2d = np.zeros((x_bins, y_bins), dtype=int) + for i in range(len(x)): + counts2d[xi[i], yi[i]] += 1 + + # density per point = 
count of its bin + density = np.array([counts2d[xi[i], yi[i]] for i in range(len(x))], dtype=float) + + # Plot low density first, high density last (so dense points are visible) + order = np.argsort(density) + x_o, y_o, d_o = x[order], y[order], density[order] + + + + +# ... keep everything above the "Plot" section identical ... + + # ---------------------------- + # Plot (IMPROVED COLORS) + # ---------------------------- + fig, ax = plt.subplots(figsize=(12.5, 6)) + + # Option A (recommended): logarithmic color scaling + # Add +1 to avoid log(0) + d_plot = d_o + 1 + + # clip vmax so one extreme bin doesn't wash everything out + vmax = np.percentile(d_plot, 99) # try 95 or 99 depending on your data + norm = LogNorm(vmin=1, vmax=max(2, vmax)) + + sc = ax.scatter( + x_o, y_o, + c=d_plot, + cmap="Blues", + norm=norm, + s=point_size, + alpha=point_alpha, + linewidths=0 + ) + + # Trend line (black) + if np.isfinite(a) and np.isfinite(b): + xs = np.linspace(np.nanmin(x), np.nanmax(x), 200) + ax.plot(xs, a * xs + b, linestyle="--", linewidth=2.5, color=trend_color) + + ax.set_xlabel("certainty_percent (from JSON 1, per key)") + ax.set_ylabel("Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)") + ax.set_title( + f"Correlation: JSON1 certainty_percent vs {y_col} (All iterations)\n" + f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}" + ) + ax.grid(linestyle=":", alpha=0.5) + + # Colorbar + cbar = plt.colorbar(sc, ax=ax) + cbar.set_label("Local density (count of cases in bin, log-scaled)") + + # Legend + legend_items = [ + Line2D([0], [0], marker="o", linestyle="None", color="navy", + label=f"Data points (n={len(df)})"), + Line2D([0], [0], linestyle="--", color=trend_color, linewidth=2.5, + label=f"Linear trend (Pearson r={pearson:.3f})"), + ] + ax.legend(handles=legend_items, loc="upper right", frameon=True, title="Legend") + + plt.tight_layout() + # Save as SVG (optional) + if save_svg_path: + fig.savefig(save_svg_path, 
format="svg", bbox_inches="tight", dpi=dpi) + print(f"[SAVED] {save_svg_path}") + + plt.show() + +json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" + +correlation_scatter_raw_certainty_json1_reference( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + json1_file_path=json1_path, + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", + y_mode="abs", + save_svg_path="/home/shahin/Lab/Doktorarbeit/Barcelona/results/corr_json1_abs_error.svg" +) + +## + + + diff --git a/Data/certainty_show.py b/certainty_show.py similarity index 100% rename from Data/certainty_show.py rename to certainty_show.py diff --git a/figure1.py b/figure1.py index e9f1ee7..9f4706b 100644 --- a/figure1.py +++ b/figure1.py @@ -320,3 +320,63 @@ plt.tight_layout() plt.show() ## + +# %% Patientjourney Bubble chart +import matplotlib.pyplot as plt +import numpy as np + +import matplotlib as mpl + +mpl.rcParams["font.family"] = "DejaVu Sans" # or "Arial", "Calibri", "Times New Roman", ... 
+mpl.rcParams["font.size"] = 12 # default size for text +mpl.rcParams["axes.titlesize"] = 14 +mpl.rcParams["axes.titleweight"] = "bold" + + +# Data (your counts) +visits = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) +patient_count = np.array([32, 24, 28, 17, 13, 6, 3, 3, 2]) + +# "Remaining" = patients with >= that many visits (cumulative from the right) +remaining = np.array([patient_count[i:].sum() for i in range(len(patient_count))]) + +# --- Plot --- +fig, ax = plt.subplots(figsize=(12, 3)) + +y = 0.0 # all bubbles on one horizontal line + +# Horizontal line +ax.hlines(y, visits.min() - 0.4, visits.max() + 0.4, color="#1f77b4", linewidth=3) + +# Bubble sizes (scale as needed) +# (Matplotlib scatter uses area in points^2) +sizes = patient_count * 35 # tweak this multiplier if you want bigger/smaller bubbles + +ax.scatter(visits, np.full_like(visits, y), s=sizes, color="#1f77b4", zorder=3) + +# Title +#ax.set_title("Patient Journey by Visit Count", fontsize=14, pad=18) + +# Top labels: "1 visits", "2 visits", ... +for x in visits: + label = f"{x} visit" if x == 1 else f"{x} visits" + ax.text(x, y + 0.18, label, ha="center", va="bottom", fontsize=10) + +# Bottom labels: "X patients" and "Y remaining" +for x, pc, rem in zip(visits, patient_count, remaining): + ax.text(x, y - 0.20, f"{pc} patients", ha="center", va="top", fontsize=9) + ax.text(x, y - 0.32, f"{rem} remaining", ha="center", va="top", fontsize=9) + +# Cosmetics: remove axes, keep spacing nice +ax.set_xlim(visits.min() - 0.6, visits.max() + 0.6) +ax.set_ylim(-0.5, 0.35) +ax.set_xticks([]) +ax.set_yticks([]) +for spine in ax.spines.values(): + spine.set_visible(False) + +plt.tight_layout() +plt.show() +plt.savefig("patient_journey.svg", format="svg", bbox_inches="tight") +## + diff --git a/Data/show_plots.py b/show_plots.py similarity index 100% rename from Data/show_plots.py rename to show_plots.py