From 118e3e63b30d4822f182a6f1d75d354717024d9f Mon Sep 17 00:00:00 2001 From: Shahin Ramezanzadeh Date: Mon, 23 Feb 2026 15:06:54 +0100 Subject: [PATCH] refinement --- Data/audit.py | 415 ---- audit.py | 2371 +++++++++++++++++++ Data/certainty_show.py => certainty_show.py | 0 figure1.py | 60 + Data/show_plots.py => show_plots.py | 0 5 files changed, 2431 insertions(+), 415 deletions(-) delete mode 100644 Data/audit.py create mode 100644 audit.py rename Data/certainty_show.py => certainty_show.py (100%) rename Data/show_plots.py => show_plots.py (100%) diff --git a/Data/audit.py b/Data/audit.py deleted file mode 100644 index cbac7cb..0000000 --- a/Data/audit.py +++ /dev/null @@ -1,415 +0,0 @@ -# %% Confirm EDSS missing -import pandas as pd -import numpy as np - -def clean_series(s): - return s.astype(str).str.strip().str.lower() - -def gt_edss_audit(ground_truth_path, edss_col="EDSS"): - df_gt = pd.read_csv(ground_truth_path, sep=';') - - # normalize keys - df_gt['unique_id'] = clean_series(df_gt['unique_id']) - df_gt['MedDatum'] = clean_series(df_gt['MedDatum']) - df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum'] - - print("GT rows:", len(df_gt)) - print("GT unique keys:", df_gt['key'].nunique()) - - # IMPORTANT: parse EDSS robustly (German decimal commas etc.) - if edss_col in df_gt.columns: - edss_raw = df_gt[edss_col] - edss_num = pd.to_numeric( - edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(), - errors="coerce" - ) - df_gt["_edss_num"] = edss_num - - print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}") - print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}") - - # duplicates on key - dup = df_gt['key'].duplicated(keep=False) - print("GT duplicate-key rows:", dup.sum()) - if dup.any(): - # how many duplicate keys exist? - print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique()) - # of duplicate-key rows, how many have missing EDSS? 
- print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum()) - - # show the worst offenders - print("\nTop duplicate keys (by count):") - print(df_gt.loc[dup, 'key'].value_counts().head(10)) - else: - print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist()) - - return df_gt - -df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS") - -## - - - - -# %% trace missing ones - -import json, glob, os -import pandas as pd - -def load_preds(json_dir_path): - all_preds = [] - for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): - with open(file_path, 'r', encoding='utf-8') as f: - data = json.load(f) - file_name = os.path.basename(file_path) - for entry in data: - if entry.get("success"): - res = entry["result"] - all_preds.append({ - "unique_id": str(res.get("unique_id")).strip().lower(), - "MedDatum": str(res.get("MedDatum")).strip().lower(), - "file": file_name - }) - df_pred = pd.DataFrame(all_preds) - df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"] - return df_pred - -df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration") -print("Pred rows:", len(df_pred)) -print("Pred unique keys:", df_pred["key"].nunique()) - -# Suppose df_gt was returned from step 1 and has _edss_num + key -missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"]) - -df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys) - -print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum()) -print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique()) - -print("\nTop files contributing to missing-GT-EDSS rows:") -print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20)) - -print("\nTop keys replicated in predictions (why count inflates):") -print(df_pred.loc[df_pred["gt_key_missing_edss"], 
"key"].value_counts().head(20)) - - -## - - -# %% verify - -merged = df_pred.merge( - df_gt[["key", "_edss_num"]], # use the numeric-coerced GT EDSS - on="key", - how="left", - validate="many_to_one" # will ERROR if GT has duplicate keys (GOOD!) -) - -print("Merged rows:", len(merged)) -print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum()) - - -## - - -# %% 1json (rewritten with robust parsing + detailed data log) -import pandas as pd -import numpy as np -import json -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.patches import Patch -from matplotlib.lines import Line2D - -def plot_single_json_error_analysis_with_log( - json_file_path, - ground_truth_path, - edss_gt_col="EDSS", - min_bin_count=5, -): - def norm_str(x): - # normalize identifiers and dates consistently - return str(x).strip().lower() - - def parse_edss(x): - # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc. - if x is None: - return np.nan - s = str(x).strip() - if s == "" or s.lower() in {"nan", "none", "null"}: - return np.nan - s = s.replace(",", ".") - return pd.to_numeric(s, errors="coerce") - - print("\n" + "="*80) - print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)") - print("="*80) - print(f"JSON: {json_file_path}") - print(f"GT: {ground_truth_path}") - - # ------------------------------------------------------------------ - # 1) Load Ground Truth - # ------------------------------------------------------------------ - df_gt = pd.read_csv(ground_truth_path, sep=";") - - required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} - missing_cols = required_gt_cols - set(df_gt.columns) - if missing_cols: - raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}") - - df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) - df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) - df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] - - # Robust EDSS parsing (important!) 
- df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) - - # GT logs - print("\n--- GT LOG ---") - print(f"GT rows: {len(df_gt)}") - print(f"GT unique keys: {df_gt['key'].nunique()}") - gt_dup = df_gt["key"].duplicated(keep=False).sum() - print(f"GT duplicate-key rows: {gt_dup}") - print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") - print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") - - if gt_dup > 0: - print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:") - print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) - - # ------------------------------------------------------------------ - # 2) Load Predictions from the specific JSON - # ------------------------------------------------------------------ - with open(json_file_path, "r", encoding="utf-8") as f: - data = json.load(f) - - total_entries = len(data) - success_entries = sum(1 for e in data if e.get("success")) - - all_preds = [] - skipped = { - "not_success": 0, - "missing_uid_or_date": 0, - "missing_edss": 0, - "missing_conf": 0, - } - - for entry in data: - if not entry.get("success"): - skipped["not_success"] += 1 - continue - - res = entry.get("result", {}) - uid = res.get("unique_id") - md = res.get("MedDatum") - - if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": - skipped["missing_uid_or_date"] += 1 - continue - - edss_pred = parse_edss(res.get("EDSS")) - conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce") - - if pd.isna(edss_pred): - skipped["missing_edss"] += 1 - if pd.isna(conf): - skipped["missing_conf"] += 1 - - all_preds.append({ - "unique_id": norm_str(uid), - "MedDatum": norm_str(md), - "key": norm_str(uid) + "_" + norm_str(md), - "EDSS_pred": edss_pred, - "confidence": conf, - }) - - df_pred = pd.DataFrame(all_preds) - - # Pred logs - print("\n--- PRED LOG ---") - print(f"JSON total entries: {total_entries}") - 
print(f"JSON success entries: {success_entries}") - print(f"Pred rows loaded (success + has keys): {len(df_pred)}") - if len(df_pred) == 0: - print("[ERROR] No usable prediction rows found. Nothing to plot.") - return - - print(f"Pred unique keys: {df_pred['key'].nunique()}") - print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") - print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") - print("Skipped counts:", skipped) - - # Are keys duplicated within this JSON? (often yes if multiple notes map to same key) - key_counts = df_pred["key"].value_counts() - dup_pred_rows = (key_counts > 1).sum() - max_rep = int(key_counts.max()) - print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}") - print(f"Max repetitions of a single key in this JSON: {max_rep}") - if max_rep > 1: - print("Top repeated keys in this JSON:") - print(key_counts.head(10)) - - # ------------------------------------------------------------------ - # 3) Merge (and diagnose why rows drop) - # ------------------------------------------------------------------ - # Diagnose how many pred keys exist in GT - gt_key_set = set(df_gt["key"]) - df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) - not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] - - print("\n--- KEY MATCH LOG ---") - print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") - print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}") - if len(not_in_gt) > 0: - print("[WARNING] Some prediction keys are not present in GT. First 10:") - print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10)) - - # Now merge; we expect GT is one-to-many with pred (many_to_one) - # If GT had duplicates, validate would raise. 
- df_merged = df_pred.merge( - df_gt[["key", "EDSS_gt"]], - on="key", - how="inner", - validate="many_to_one" - ) - - print("\n--- MERGE LOG ---") - print(f"Merged rows (inner join): {len(df_merged)}") - print(f"Merged unique keys: {df_merged['key'].nunique()}") - print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}") - print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}") - print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}") - - # How many rows will be removed by dropna() in your old code? - # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too. - rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) - print("\n--- FILTER LOG (what will be used for stats/plot) ---") - print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}") - if len(rows_complete) == 0: - print("[ERROR] No complete rows after filtering. Nothing to plot.") - return - - # Compute abs error - rows_complete = rows_complete.copy() - rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs() - - # ------------------------------------------------------------------ - # 4) Binning + stats (with guardrails) - # ------------------------------------------------------------------ - bins = [0, 70, 80, 90, 100] - labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] - - # Confidence outside bins becomes NaN; log it - rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True) - conf_outside = rows_complete["conf_bin"].isna().sum() - print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}") - if conf_outside > 0: - print("Example confidences outside bins:") - print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) - - df_plot = rows_complete.dropna(subset=["conf_bin"]) - stats = ( - 
df_plot.groupby("conf_bin", observed=True)["abs_error"] - .agg(mean="mean", std="std", count="count") - .reindex(labels) - .reset_index() - ) - - print("\n--- BIN STATS ---") - print(stats) - - # Warn about low counts - low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] - if not low_bins.empty: - print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:") - print(low_bins) - - # ------------------------------------------------------------------ - # 5) Plot - # ------------------------------------------------------------------ - plt.figure(figsize=(13, 8)) - colors = sns.color_palette("Blues", n_colors=len(labels)) - - # Replace NaNs in mean for plotting bars (empty bins) - means = stats["mean"].to_numpy() - counts = stats["count"].fillna(0).astype(int).to_numpy() - stds = stats["std"].to_numpy() - - # For bins with no data, bar height 0 (and no errorbar) - means_plot = np.nan_to_num(means, nan=0.0) - - bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85) - - # Error bars only where count>1 and std is not NaN - sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) - plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5) - - # Trend line only if at least 2 non-empty bins - valid_idx = np.where(~np.isnan(means))[0] - if len(valid_idx) >= 2: - x_idx = np.arange(len(labels)) - z = np.polyfit(valid_idx, means[valid_idx], 1) - p = np.poly1d(z) - plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) - trend_label = "Trend Line" - else: - trend_label = "Trend Line (insufficient bins)" - print("\n[INFO] Not enough non-empty bins to fit a trend line.") - - # Data labels - for i, bar in enumerate(bars): - n_count = int(counts[i]) - mae_val = means[i] - if np.isnan(mae_val) or n_count == 0: - txt = "empty" - y = 0.02 - else: - txt = f"MAE: {mae_val:.2f}\nn={n_count}" - y = bar.get_height() + 
0.04 - plt.text( - bar.get_x() + bar.get_width()/2, - y, - txt, - ha="center", - va="bottom", - fontweight="bold", - fontsize=10 - ) - - # Legend - legend_elements = [ - Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), - Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), - Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), - Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), - Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label), - Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"), - Patch(color="none", label="Metric: Mean Absolute Error (MAE)") - ] - plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend") - - plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30) - plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12) - plt.xlabel("LLM Confidence Bracket", fontsize=12) - plt.grid(axis="y", linestyle=":", alpha=0.5) - - ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 - plt.ylim(0, max(0.5, float(ymax) + 0.6)) - plt.tight_layout() - plt.show() - - print("\n" + "="*80) - print("DONE") - print("="*80) - - -# --- RUN --- -json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" -gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" - -plot_single_json_error_analysis_with_log(json_path, gt_path) - - - -## diff --git a/audit.py b/audit.py new file mode 100644 index 0000000..7fa3039 --- /dev/null +++ b/audit.py @@ -0,0 +1,2371 @@ +# %% Confirm EDSS missing +import pandas as pd +import numpy as np + +def clean_series(s): + return s.astype(str).str.strip().str.lower() + +def gt_edss_audit(ground_truth_path, edss_col="EDSS"): + df_gt = pd.read_csv(ground_truth_path, sep=';') + + # 
normalize keys + df_gt['unique_id'] = clean_series(df_gt['unique_id']) + df_gt['MedDatum'] = clean_series(df_gt['MedDatum']) + df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum'] + + print("GT rows:", len(df_gt)) + print("GT unique keys:", df_gt['key'].nunique()) + + # IMPORTANT: parse EDSS robustly (German decimal commas etc.) + if edss_col in df_gt.columns: + edss_raw = df_gt[edss_col] + edss_num = pd.to_numeric( + edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(), + errors="coerce" + ) + df_gt["_edss_num"] = edss_num + + print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}") + print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}") + + # duplicates on key + dup = df_gt['key'].duplicated(keep=False) + print("GT duplicate-key rows:", dup.sum()) + if dup.any(): + # how many duplicate keys exist? + print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique()) + # of duplicate-key rows, how many have missing EDSS? 
+ print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum()) + + # show the worst offenders + print("\nTop duplicate keys (by count):") + print(df_gt.loc[dup, 'key'].value_counts().head(10)) + else: + print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist()) + + return df_gt + +df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS") + +## + + + + +# %% trace missing ones + +import json, glob, os +import pandas as pd + +def load_preds(json_dir_path): + all_preds = [] + for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + file_name = os.path.basename(file_path) + for entry in data: + if entry.get("success"): + res = entry["result"] + all_preds.append({ + "unique_id": str(res.get("unique_id")).strip().lower(), + "MedDatum": str(res.get("MedDatum")).strip().lower(), + "file": file_name + }) + df_pred = pd.DataFrame(all_preds) + df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"] + return df_pred + +df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration") +print("Pred rows:", len(df_pred)) +print("Pred unique keys:", df_pred["key"].nunique()) + +# Suppose df_gt was returned from step 1 and has _edss_num + key +missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"]) + +df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys) + +print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum()) +print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique()) + +print("\nTop files contributing to missing-GT-EDSS rows:") +print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20)) + +print("\nTop keys replicated in predictions (why count inflates):") +print(df_pred.loc[df_pred["gt_key_missing_edss"], 
"key"].value_counts().head(20)) + + +## + + +# %% verify + +merged = df_pred.merge( + df_gt[["key", "_edss_num"]], # use the numeric-coerced GT EDSS + on="key", + how="left", + validate="many_to_one" # will ERROR if GT has duplicate keys (GOOD!) +) + +print("Merged rows:", len(merged)) +print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum()) + + +## + + +# %% 1json (rewritten with robust parsing + detailed data log) +import pandas as pd +import numpy as np +import json +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def plot_single_json_error_analysis_with_log( + json_file_path, + ground_truth_path, + edss_gt_col="EDSS", + min_bin_count=5, +): + def norm_str(x): + # normalize identifiers and dates consistently + return str(x).strip().lower() + + def parse_edss(x): + # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc. + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + print("\n" + "="*80) + print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)") + print("="*80) + print(f"JSON: {json_file_path}") + print(f"GT: {ground_truth_path}") + + # ------------------------------------------------------------------ + # 1) Load Ground Truth + # ------------------------------------------------------------------ + df_gt = pd.read_csv(ground_truth_path, sep=";") + + required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} + missing_cols = required_gt_cols - set(df_gt.columns) + if missing_cols: + raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + + # Robust EDSS parsing (important!) 
+ df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) + + # GT logs + print("\n--- GT LOG ---") + print(f"GT rows: {len(df_gt)}") + print(f"GT unique keys: {df_gt['key'].nunique()}") + gt_dup = df_gt["key"].duplicated(keep=False).sum() + print(f"GT duplicate-key rows: {gt_dup}") + print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") + print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") + + if gt_dup > 0: + print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:") + print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) + + # ------------------------------------------------------------------ + # 2) Load Predictions from the specific JSON + # ------------------------------------------------------------------ + with open(json_file_path, "r", encoding="utf-8") as f: + data = json.load(f) + + total_entries = len(data) + success_entries = sum(1 for e in data if e.get("success")) + + all_preds = [] + skipped = { + "not_success": 0, + "missing_uid_or_date": 0, + "missing_edss": 0, + "missing_conf": 0, + } + + for entry in data: + if not entry.get("success"): + skipped["not_success"] += 1 + continue + + res = entry.get("result", {}) + uid = res.get("unique_id") + md = res.get("MedDatum") + + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + skipped["missing_uid_or_date"] += 1 + continue + + edss_pred = parse_edss(res.get("EDSS")) + conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce") + + if pd.isna(edss_pred): + skipped["missing_edss"] += 1 + if pd.isna(conf): + skipped["missing_conf"] += 1 + + all_preds.append({ + "unique_id": norm_str(uid), + "MedDatum": norm_str(md), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": edss_pred, + "confidence": conf, + }) + + df_pred = pd.DataFrame(all_preds) + + # Pred logs + print("\n--- PRED LOG ---") + print(f"JSON total entries: {total_entries}") + 
print(f"JSON success entries: {success_entries}") + print(f"Pred rows loaded (success + has keys): {len(df_pred)}") + if len(df_pred) == 0: + print("[ERROR] No usable prediction rows found. Nothing to plot.") + return + + print(f"Pred unique keys: {df_pred['key'].nunique()}") + print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") + print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") + print("Skipped counts:", skipped) + + # Are keys duplicated within this JSON? (often yes if multiple notes map to same key) + key_counts = df_pred["key"].value_counts() + dup_pred_rows = (key_counts > 1).sum() + max_rep = int(key_counts.max()) + print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}") + print(f"Max repetitions of a single key in this JSON: {max_rep}") + if max_rep > 1: + print("Top repeated keys in this JSON:") + print(key_counts.head(10)) + + # ------------------------------------------------------------------ + # 3) Merge (and diagnose why rows drop) + # ------------------------------------------------------------------ + # Diagnose how many pred keys exist in GT + gt_key_set = set(df_gt["key"]) + df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) + not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] + + print("\n--- KEY MATCH LOG ---") + print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") + print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}") + if len(not_in_gt) > 0: + print("[WARNING] Some prediction keys are not present in GT. First 10:") + print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10)) + + # Now merge; we expect GT is one-to-many with pred (many_to_one) + # If GT had duplicates, validate would raise. 
+ df_merged = df_pred.merge( + df_gt[["key", "EDSS_gt"]], + on="key", + how="inner", + validate="many_to_one" + ) + + print("\n--- MERGE LOG ---") + print(f"Merged rows (inner join): {len(df_merged)}") + print(f"Merged unique keys: {df_merged['key'].nunique()}") + print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}") + print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}") + print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}") + + # How many rows will be removed by dropna() in your old code? + # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too. + rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) + print("\n--- FILTER LOG (what will be used for stats/plot) ---") + print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}") + if len(rows_complete) == 0: + print("[ERROR] No complete rows after filtering. Nothing to plot.") + return + + # Compute abs error + rows_complete = rows_complete.copy() + rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs() + + # ------------------------------------------------------------------ + # 4) Binning + stats (with guardrails) + # ------------------------------------------------------------------ + bins = [0, 70, 80, 90, 100] + labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] + + # Confidence outside bins becomes NaN; log it + rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True) + conf_outside = rows_complete["conf_bin"].isna().sum() + print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}") + if conf_outside > 0: + print("Example confidences outside bins:") + print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) + + df_plot = rows_complete.dropna(subset=["conf_bin"]) + stats = ( + 
df_plot.groupby("conf_bin", observed=True)["abs_error"] + .agg(mean="mean", std="std", count="count") + .reindex(labels) + .reset_index() + ) + + print("\n--- BIN STATS ---") + print(stats) + + # Warn about low counts + low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] + if not low_bins.empty: + print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:") + print(low_bins) + + # ------------------------------------------------------------------ + # 5) Plot + # ------------------------------------------------------------------ + plt.figure(figsize=(13, 8)) + colors = sns.color_palette("Blues", n_colors=len(labels)) + + # Replace NaNs in mean for plotting bars (empty bins) + means = stats["mean"].to_numpy() + counts = stats["count"].fillna(0).astype(int).to_numpy() + stds = stats["std"].to_numpy() + + # For bins with no data, bar height 0 (and no errorbar) + means_plot = np.nan_to_num(means, nan=0.0) + + bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85) + + # Error bars only where count>1 and std is not NaN + sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) + plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5) + + # Trend line only if at least 2 non-empty bins + valid_idx = np.where(~np.isnan(means))[0] + if len(valid_idx) >= 2: + x_idx = np.arange(len(labels)) + z = np.polyfit(valid_idx, means[valid_idx], 1) + p = np.poly1d(z) + plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) + trend_label = "Trend Line" + else: + trend_label = "Trend Line (insufficient bins)" + print("\n[INFO] Not enough non-empty bins to fit a trend line.") + + # Data labels + for i, bar in enumerate(bars): + n_count = int(counts[i]) + mae_val = means[i] + if np.isnan(mae_val) or n_count == 0: + txt = "empty" + y = 0.02 + else: + txt = f"MAE: {mae_val:.2f}\nn={n_count}" + y = bar.get_height() + 
0.04 + plt.text( + bar.get_x() + bar.get_width()/2, + y, + txt, + ha="center", + va="bottom", + fontweight="bold", + fontsize=10 + ) + + # Legend + legend_elements = [ + Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), + Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), + Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), + Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), + Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label), + Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"), + Patch(color="none", label="Metric: Mean Absolute Error (MAE)") + ] + plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend") + + plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30) + plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12) + plt.xlabel("LLM Confidence Bracket", fontsize=12) + plt.grid(axis="y", linestyle=":", alpha=0.5) + + ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 + plt.ylim(0, max(0.5, float(ymax) + 0.6)) + plt.tight_layout() + plt.show() + + print("\n" + "="*80) + print("DONE") + print("="*80) + + +# --- RUN --- +json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" +gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" + +plot_single_json_error_analysis_with_log(json_path, gt_path) + + + +## + + +# %% Certainty vs Delta (rewritten with robust parsing + detailed data loss logs) +import pandas as pd +import numpy as np +import json +import glob +import os +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def plot_confidence_vs_abs_error_with_log( + json_dir_path, + ground_truth_path, + 
def plot_confidence_vs_abs_error_with_log(
    json_dir_path,
    ground_truth_path,
    edss_gt_col="EDSS",
    min_bin_count=5,
    include_lowest=True,
):
    """
    Compare LLM self-reported confidence against absolute EDSS error across
    ALL iteration JSONs in a folder, with verbose data-loss logging at every
    stage (ingest, key match, merge, filtering, binning), then plot a bar
    chart of MAE per confidence bracket.

    Parameters
    ----------
    json_dir_path : str
        Folder containing iteration result JSONs (``*.json``).
    ground_truth_path : str
        Semicolon-separated GT CSV with ``unique_id``, ``MedDatum`` and the
        EDSS column named by ``edss_gt_col``.
    edss_gt_col : str
        Name of the ground-truth EDSS column.
    min_bin_count : int
        Bins with fewer rows than this are flagged as unstable.
    include_lowest : bool
        Passed to ``pd.cut`` so that confidence == 0 lands in the first bin.

    Returns
    -------
    None. Side effects: prints logs and shows a matplotlib figure.
    """

    def norm_str(x):
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust numeric parse: handles German comma decimals and empty tokens.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    print("\n" + "=" * 90)
    print("CERTAINTY vs ABS ERROR (ALL JSONs) — WITH DATA LOSS LOG")
    print("=" * 90)
    print(f"JSON DIR: {json_dir_path}")
    print(f"GT FILE: {ground_truth_path}")

    # ------------------------------------------------------------------
    # 1) Load GT
    # ------------------------------------------------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=";")
    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
    missing_cols = required_gt_cols - set(df_gt.columns)
    if missing_cols:
        raise ValueError(f"GT missing columns: {missing_cols}. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)

    # GT logs
    print("\n--- GT LOG ---")
    print(f"GT rows: {len(df_gt)}")
    print(f"GT unique keys: {df_gt['key'].nunique()}")
    gt_dup_rows = df_gt["key"].duplicated(keep=False).sum()
    print(f"GT duplicate-key rows: {gt_dup_rows}")
    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")
    if gt_dup_rows > 0:
        print("\n[WARNING] GT has duplicate keys; merge can explode rows. Top duplicate keys:")
        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))
        # FIX: the merge below uses validate="many_to_one", which would raise
        # pandas.errors.MergeError on duplicate GT keys instead of merely
        # warning. Keep the first row per key so the merge stays valid.
        df_gt = df_gt.drop_duplicates(subset="key", keep="first")
        print(f"[FIX] Deduplicated GT on key (keep='first'); GT rows now: {len(df_gt)}")

    gt_key_set = set(df_gt["key"])

    # ------------------------------------------------------------------
    # 2) Load predictions from all JSON files (with per-file logs)
    # ------------------------------------------------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    all_preds = []
    per_file_summary = []

    total_entries_all = 0
    total_success_all = 0
    skipped_all = {"not_success": 0, "missing_uid_or_date": 0}

    for file_path in json_files:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        total_entries = len(data)
        success_entries = sum(1 for e in data if e.get("success"))

        total_entries_all += total_entries
        total_success_all += success_entries

        skipped = {"not_success": 0, "missing_uid_or_date": 0}
        loaded_rows = 0

        for entry in data:
            if not entry.get("success"):
                skipped["not_success"] += 1
                continue
            res = entry.get("result", {})
            uid = res.get("unique_id")
            md = res.get("MedDatum")

            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                skipped["missing_uid_or_date"] += 1
                continue

            all_preds.append({
                "file": os.path.basename(file_path),
                "unique_id": norm_str(uid),
                "MedDatum": norm_str(md),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
                "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
            })
            loaded_rows += 1

        skipped_all["not_success"] += skipped["not_success"]
        skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"]

        per_file_summary.append({
            "file": os.path.basename(file_path),
            "entries_total": total_entries,
            "entries_success": success_entries,
            "pred_rows_loaded": loaded_rows,
            "skipped_not_success": skipped["not_success"],
            "skipped_missing_uid_or_date": skipped["missing_uid_or_date"],
        })

    df_pred = pd.DataFrame(all_preds)
    df_file = pd.DataFrame(per_file_summary)

    # PRED logs
    print("\n--- PRED LOG (ALL FILES) ---")
    print(f"JSON files found: {len(json_files)}")
    print(f"Total JSON entries: {total_entries_all}")
    print(f"Total success entries:{total_success_all}")
    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows found. Nothing to plot.")
        return

    print(f"Pred unique keys (across all files): {df_pred['key'].nunique()}")
    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
    print("Skipped totals:", skipped_all)

    # Per-file quick check (useful when one iteration is broken).
    print("\nPer-file loaded rows (head):")
    print(df_file.sort_values("file").head(10))

    # ------------------------------------------------------------------
    # 3) Key match log (pred -> GT)
    # ------------------------------------------------------------------
    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]

    print("\n--- KEY MATCH LOG ---")
    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
    print(f"Pred rows with key NOT in GT: {len(not_in_gt)}")
    if len(not_in_gt) > 0:
        print("[WARNING] Example keys not found in GT (first 10):")
        print(not_in_gt[["file", "unique_id", "MedDatum", "key"]].head(10))
        print("\n[WARNING] Files contributing most to key-mismatch:")
        print(not_in_gt["file"].value_counts().head(10))

    # ------------------------------------------------------------------
    # 4) Merge (no dropna yet) + detailed data loss accounting
    # ------------------------------------------------------------------
    df_merged = df_pred.merge(
        df_gt[["key", "EDSS_gt"]],
        on="key",
        how="inner",
        validate="many_to_one",  # GT was deduplicated above, so this holds
    )

    print("\n--- MERGE LOG ---")
    print(f"Merged rows (inner join): {len(df_merged)}")
    print(f"Merged unique keys: {df_merged['key'].nunique()}")

    n0 = len(df_merged)

    miss_gt = df_merged["EDSS_gt"].isna()
    miss_pred = df_merged["EDSS_pred"].isna()
    miss_conf = df_merged["confidence"].isna()

    print("\n--- MISSINGNESS IN MERGED ---")
    print(f"Missing GT EDSS: {miss_gt.sum()}")
    print(f"Missing Pred EDSS: {miss_pred.sum()}")
    print(f"Missing Confidence: {miss_conf.sum()}")

    # NOTE: a bare .dropna() would drop a row if ANY column is NaN;
    # the subset makes the intended requirement explicit and logged.
    df_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
    n1 = len(df_complete)
    print("\n--- FILTER LOG ---")
    print(f"Rows before filtering: {n0}")
    print(f"Rows after requiring EDSS_gt, EDSS_pred, confidence: {n1}")
    print(f"Rows lost due to missing required fields: {n0 - n1}")

    # Break down why rows were lost (overlap-aware)
    lost_mask = df_merged[["EDSS_gt", "EDSS_pred", "confidence"]].isna().any(axis=1)
    lost = df_merged.loc[lost_mask].copy()
    if len(lost) > 0:
        lost_reason = (
            (lost["EDSS_gt"].isna()).astype(int).map({1: "GT", 0: ""})
            + (lost["EDSS_pred"].isna()).astype(int).map({1: "+PRED", 0: ""})
            + (lost["confidence"].isna()).astype(int).map({1: "+CONF", 0: ""})
        )
        lost["loss_reason"] = lost_reason.str.replace(r"^\+", "", regex=True).replace("", "UNKNOWN")
        print("\nTop loss reasons (overlap-aware):")
        print(lost["loss_reason"].value_counts().head(10))

        print("\nFiles contributing most to lost rows:")
        print(lost["file"].value_counts().head(10))

    if len(df_complete) == 0:
        print("[ERROR] No complete rows left after filtering. Nothing to plot.")
        return

    # ------------------------------------------------------------------
    # 5) Abs error + binning
    # ------------------------------------------------------------------
    df_complete = df_complete.copy()
    df_complete["abs_error"] = (df_complete["EDSS_pred"] - df_complete["EDSS_gt"]).abs()

    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]

    df_complete["conf_bin"] = pd.cut(
        df_complete["confidence"],
        bins=bins,
        labels=labels,
        include_lowest=include_lowest,
    )

    conf_outside = df_complete["conf_bin"].isna().sum()
    print("\n--- BINNING LOG ---")
    print(f"Rows with confidence outside bin edges / invalid: {conf_outside}")
    if conf_outside > 0:
        print("Example out-of-bin confidences:")
        print(df_complete.loc[df_complete["conf_bin"].isna(), "confidence"].head(20).to_list())

    df_plot = df_complete.dropna(subset=["conf_bin"])
    print(f"Rows kept for bin stats/plot (after dropping out-of-bin): {len(df_plot)}")
    print(f"Rows lost due to out-of-bin confidence: {len(df_complete) - len(df_plot)}")

    stats = (
        df_plot.groupby("conf_bin", observed=True)["abs_error"]
        .agg(mean="mean", std="std", count="count")
        .reindex(labels)
        .reset_index()
    )

    print("\n--- BIN STATS ---")
    print(stats)

    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
    if not low_bins.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_count} rows (unstable SEM/trend):")
        print(low_bins)

    # ------------------------------------------------------------------
    # 6) Plot
    # ------------------------------------------------------------------
    plt.figure(figsize=(12, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    means = stats["mean"].to_numpy()
    counts = stats["count"].fillna(0).astype(int).to_numpy()
    stds = stats["std"].to_numpy()

    # Empty bins are drawn with height 0 but annotated as "empty" below.
    means_plot = np.nan_to_num(means, nan=0.0)
    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", linewidth=1.2)

    # SEM only where >1 observation makes a std meaningful.
    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=6, elinewidth=1.5)

    # Trend line only if >=2 non-empty bins
    valid_idx = np.where(~np.isnan(means))[0]
    if len(valid_idx) >= 2:
        x_idx = np.arange(len(labels))
        z = np.polyfit(valid_idx, means[valid_idx], 1)
        p = np.poly1d(z)
        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=2.5)
        trend_label = "Correlation Trend"
    else:
        trend_label = "Correlation Trend (insufficient bins)"
        print("\n[INFO] Not enough non-empty bins to fit a trend line.")

    # Bar annotations (MAE + n)
    for i, bar in enumerate(bars):
        n = int(counts[i])
        m = means[i]
        if n == 0 or np.isnan(m):
            txt = "empty"
            y = 0.02
        else:
            txt = f"MAE: {m:.2f}\nn={n}"
            y = bar.get_height() + 0.05
        plt.text(bar.get_x() + bar.get_width() / 2, y, txt, ha="center", fontweight="bold")

    legend_elements = [
        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Standard Error (SEM)"),
        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=2.5, label=trend_label),
        Patch(color="none", label="Metric: Mean Absolute Error (MAE)"),
    ]
    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, fontsize=10, title="Legend")

    plt.title("Validation: Inverse Correlation of Confidence vs. Error Magnitude", fontsize=15, pad=20)
    plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12)
    plt.xlabel("LLM Confidence Bracket", fontsize=12)
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
    plt.ylim(0, max(0.5, float(ymax) + 0.6))
    plt.tight_layout()
    plt.show()

    print("\n" + "=" * 90)
    print("DONE")
    print("=" * 90)
Error Magnitude", fontsize=15, pad=20) + plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12) + plt.xlabel("LLM Confidence Bracket", fontsize=12) + plt.grid(axis="y", linestyle=":", alpha=0.5) + + ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 + plt.ylim(0, max(0.5, float(ymax) + 0.6)) + plt.tight_layout() + plt.show() + + print("\n" + "="*90) + print("DONE") + print("="*90) + + +# Example run: +plot_confidence_vs_abs_error_with_log("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv") + + +## + + +# %% Empirical Confidence +# Empirical stability confidence (from 10 runs) + LLM certainty_percent as secondary signal +# - Reads all JSONs in a folder (your 10 iterations) +# - Aggregates by key = unique_id + MedDatum +# - Computes: +# * EDSS_mean, EDSS_std, EDSS_iqr, mode/share +# * empirical_conf_0_100 (based on stability) +# * llm_conf_mean_0_100 (mean certainty_percent) +# * combined_conf_0_100 (weighted blend) +# - Optional: merges GT EDSS and computes abs error on the aggregated prediction + +import os, glob, json +import numpy as np +import pandas as pd + +def build_empirical_confidence_table( + json_dir_path: str, + ground_truth_path: str | None = None, + gt_sep: str = ";", + gt_edss_col: str = "EDSS", + w_empirical: float = 0.7, # weight for empirical stability + w_llm: float = 0.3, # weight for LLM self-reported confidence + tol_mode: float = 0.5, # tolerance to treat EDSS as "same" (EDSS often in 0.5 steps) + min_runs_expected: int = 10, +): + # ----------------------------- + # Helpers + # ----------------------------- + def norm_str(x): + return str(x).strip().lower() + + def parse_number(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + def robust_iqr(x: pd.Series): + x = x.dropna() + if len(x) == 0: + return np.nan + 
return float(x.quantile(0.75) - x.quantile(0.25)) + + def stability_to_confidence(std_val: float) -> float: + """ + Map EDSS variability across runs to a 0..100 confidence. + EDSS is typically on 0.5 steps. A natural scale: + std ~= 0.0 -> ~100 + std ~= 0.25 -> ~75-90 + std ~= 0.5 -> ~50-70 + std >= 1.0 -> low + Use a smooth exponential mapping. + """ + if np.isnan(std_val): + return np.nan + # scale parameter: std=0.5 -> exp(-1)=0.367 -> ~36.7 + scale = 0.5 + conf = 100.0 * np.exp(-(std_val / scale)) + # clamp + return float(np.clip(conf, 0.0, 100.0)) + + def mode_share_with_tolerance(values: np.ndarray, tol: float) -> tuple[float, float]: + """ + Compute a 'mode' under tolerance: pick the cluster center (median) and count + how many values fall within +/- tol. Return (mode_center, share). + This is robust to tiny float differences. + """ + vals = values[~np.isnan(values)] + if len(vals) == 0: + return (np.nan, np.nan) + center = float(np.median(vals)) + share = float(np.mean(np.abs(vals - center) <= tol)) + return (center, share) + + # ----------------------------- + # Load predictions from all JSONs + # ----------------------------- + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + per_file = [] + total_entries_all = 0 + total_success_all = 0 + skipped_all = {"not_success": 0, "missing_uid_or_date": 0} + + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + + total_entries = len(data) + success_entries = sum(1 for e in data if e.get("success")) + total_entries_all += total_entries + total_success_all += success_entries + + skipped = {"not_success": 0, "missing_uid_or_date": 0} + loaded = 0 + + for entry in data: + if not entry.get("success"): + skipped["not_success"] += 1 + continue + + res = entry.get("result", {}) + uid = res.get("unique_id") + md = res.get("MedDatum") + + if uid is None or md 
is None or str(uid).strip() == "" or str(md).strip() == "": + skipped["missing_uid_or_date"] += 1 + continue + + edss = parse_number(res.get("EDSS")) + conf = parse_number(res.get("certainty_percent")) + it = res.get("iteration", None) + + rows.append({ + "file": os.path.basename(fp), + "iteration": it, + "unique_id": norm_str(uid), + "MedDatum": norm_str(md), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": edss, + "llm_conf": conf, + }) + loaded += 1 + + skipped_all["not_success"] += skipped["not_success"] + skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"] + + per_file.append({ + "file": os.path.basename(fp), + "entries_total": total_entries, + "entries_success": success_entries, + "rows_loaded": loaded, + "skipped_not_success": skipped["not_success"], + "skipped_missing_uid_or_date": skipped["missing_uid_or_date"], + }) + + df_pred = pd.DataFrame(rows) + df_file = pd.DataFrame(per_file) + + # ----------------------------- + # Logs: ingestion + # ----------------------------- + print("\n" + "="*90) + print("EMPIRICAL CONFIDENCE (10-RUN STABILITY) + LLM CONFIDENCE (SECONDARY)") + print("="*90) + print(f"JSON DIR: {json_dir_path}") + print(f"JSON files: {len(json_files)}") + print("\n--- INGEST LOG ---") + print(f"Total JSON entries: {total_entries_all}") + print(f"Total success entries:{total_success_all}") + print(f"Pred rows loaded: {len(df_pred)}") + print(f"Unique keys in preds: {df_pred['key'].nunique() if len(df_pred) else 0}") + print(f"Missing EDSS_pred: {df_pred['EDSS_pred'].isna().sum() if len(df_pred) else 0}") + print(f"Missing llm_conf: {df_pred['llm_conf'].isna().sum() if len(df_pred) else 0}") + print("Skipped totals:", skipped_all) + + print("\nPer-file summary (top 10 by name):") + print(df_file.sort_values("file").head(10)) + + # ----------------------------- + # Aggregate by key (empirical stability) + # ----------------------------- + if len(df_pred) == 0: + print("[ERROR] No usable prediction rows.") + return 
None + + # how many runs per key (expect ~10) + runs_per_key = df_pred.groupby("key")["EDSS_pred"].size().rename("n_rows").reset_index() + print("\n--- RUNS PER KEY LOG ---") + print(f"Keys with at least 1 row: {len(runs_per_key)}") + print("Distribution of rows per key (value_counts):") + print(runs_per_key["n_rows"].value_counts().sort_index()) + + # Aggregate stats + def agg_block(g: pd.DataFrame): + ed = g["EDSS_pred"].to_numpy(dtype=float) + ll = g["llm_conf"].to_numpy(dtype=float) + + n_rows = len(g) + n_edss = int(np.sum(~np.isnan(ed))) + n_llm = int(np.sum(~np.isnan(ll))) + + ed_mean = float(np.nanmean(ed)) if n_edss else np.nan + ed_std = float(np.nanstd(ed, ddof=1)) if n_edss >= 2 else (0.0 if n_edss == 1 else np.nan) + ed_iqr = robust_iqr(pd.Series(ed)) + mode_center, mode_share = mode_share_with_tolerance(ed, tol=tol_mode) + + llm_mean = float(np.nanmean(ll)) if n_llm else np.nan + llm_std = float(np.nanstd(ll, ddof=1)) if n_llm >= 2 else (0.0 if n_llm == 1 else np.nan) + + emp_conf = stability_to_confidence(ed_std) if not np.isnan(ed_std) else np.nan + + # Combined confidence (weighted). If one side missing, fall back to the other. 
+ if np.isnan(emp_conf) and np.isnan(llm_mean): + comb = np.nan + elif np.isnan(emp_conf): + comb = llm_mean + elif np.isnan(llm_mean): + comb = emp_conf + else: + comb = w_empirical * emp_conf + w_llm * llm_mean + + return pd.Series({ + "unique_id": g["unique_id"].iloc[0], + "MedDatum": g["MedDatum"].iloc[0], + "n_rows": n_rows, + "n_edss": n_edss, + "n_llm_conf":n_llm, + "EDSS_mean": ed_mean, + "EDSS_std": ed_std, + "EDSS_iqr": ed_iqr, + "EDSS_mode_center": mode_center, + "EDSS_mode_share": mode_share, # fraction within ±tol_mode of median center + "llm_conf_mean": llm_mean, + "llm_conf_std": llm_std, + "empirical_conf_0_100": emp_conf, + "combined_conf_0_100": float(np.clip(comb, 0.0, 100.0)) if not np.isnan(comb) else np.nan, + }) + + df_agg = df_pred.groupby("key", as_index=False).apply(agg_block) + # groupby+apply returns a multiindex sometimes depending on pandas version + if isinstance(df_agg.index, pd.MultiIndex): + df_agg = df_agg.reset_index(drop=True) + + # Logs: aggregation + losses + print("\n--- AGGREGATION LOG ---") + print(f"Aggregated keys: {len(df_agg)}") + print(f"Keys with EDSS in >=1 run: {(df_agg['n_edss'] >= 1).sum()}") + print(f"Keys with EDSS in >=2 runs (std meaningful): {(df_agg['n_edss'] >= 2).sum()}") + print(f"Keys missing EDSS in all runs: {(df_agg['n_edss'] == 0).sum()}") + print(f"Keys missing llm_conf in all runs: {(df_agg['n_llm_conf'] == 0).sum()}") + + # Expected runs check + if min_runs_expected is not None: + print(f"\nKeys with < {min_runs_expected} rows (potential missing iterations):") + print(df_agg.loc[df_agg["n_rows"] < min_runs_expected, ["key", "n_rows"]].sort_values("n_rows").head(20)) + + # ----------------------------- + # Optional: merge GT and compute error on aggregated EDSS_mean + # ----------------------------- + if ground_truth_path is not None: + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + need = {"unique_id", "MedDatum", gt_edss_col} + miss = need - set(df_gt.columns) + if miss: + raise 
def plot_exec_boxplots(df, min_bin_size_warn=10):
    """
    Two side-by-side boxplots:
      - Left: abs_error_mean grouped by empirical_conf_0_100 quantile bins
      - Right: abs_error_mean grouped by llm_conf_mean quantile bins

    Adds:
      - Robust qcut labeling (handles ties; bins may be < 4)
      - Data logs + per-bin summary table printed
      - Clear legend explaining each panel and what box elements mean

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'abs_error_mean', 'empirical_conf_0_100', 'llm_conf_mean'
        (the output of build_empirical_confidence_table with GT merged).
    min_bin_size_warn : int
        Bins smaller than this are flagged as unreliable.

    Returns
    -------
    None. Side effects: prints summaries and shows a matplotlib figure.
    """
    need_cols = ["abs_error_mean", "empirical_conf_0_100", "llm_conf_mean"]
    missing = [c for c in need_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in df: {missing}. Available: {df.columns.tolist()}")

    d = df[need_cols].copy()

    # -----------------------------
    # Data logs: survivorship
    # -----------------------------
    d_emp = d.dropna(subset=["abs_error_mean", "empirical_conf_0_100"]).copy()
    d_llm = d.dropna(subset=["abs_error_mean", "llm_conf_mean"]).copy()

    print("\n" + "=" * 90)
    print("EXECUTIVE BOXPLOTS — DATA LOG + SUMMARY")
    print("=" * 90)
    print(f"Total rows in df: {len(df)}")
    print(f"Rows for empirical plot: {len(d_emp)} (dropped {len(df) - len(d_emp)})")
    print(f"Rows for LLM plot: {len(d_llm)} (dropped {len(df) - len(d_llm)})")

    if len(d_emp) == 0 or len(d_llm) == 0:
        print("[ERROR] Not enough data after dropping NaNs to build both plots.")
        return

    # -----------------------------
    # Robust quantile binning (handles ties)
    # FIX: compute qcut once per panel and rename the interval categories,
    # instead of calling pd.qcut twice with identical arguments.
    # -----------------------------
    emp_binned = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop")
    emp_labels = [f"Q{i+1}" for i in range(emp_binned.cat.categories.size)]
    d_emp["emp_q"] = emp_binned.cat.rename_categories(emp_labels)

    llm_binned = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop")
    llm_labels = [f"Q{i+1}" for i in range(llm_binned.cat.categories.size)]
    d_llm["llm_q"] = llm_binned.cat.rename_categories(llm_labels)

    # Print bin edges (so you can discuss exact thresholds)
    print("\n--- BIN EDGES (actual ranges) ---")
    print("Empirical confidence bins:")
    for i, interval in enumerate(emp_binned.cat.categories):
        print(f"  {emp_labels[i]}: {interval}")
    print("LLM confidence bins:")
    for i, interval in enumerate(llm_binned.cat.categories):
        print(f"  {llm_labels[i]}: {interval}")

    # -----------------------------
    # Summary tables (per bin)
    # -----------------------------
    def summarize_bins(df_in, bin_col, conf_col, label):
        # One row per bin: size, MAE stats, and where the confidences sit.
        g = df_in.groupby(bin_col, observed=True).agg(
            n=("abs_error_mean", "size"),
            mae_mean=("abs_error_mean", "mean"),
            mae_median=("abs_error_mean", "median"),
            mae_q25=("abs_error_mean", lambda x: x.quantile(0.25)),
            mae_q75=("abs_error_mean", lambda x: x.quantile(0.75)),
            conf_mean=(conf_col, "mean"),
            conf_median=(conf_col, "median"),
        ).reset_index().rename(columns={bin_col: "bin"})
        g["panel"] = label
        return g[["panel", "bin", "n", "mae_mean", "mae_median", "mae_q25", "mae_q75", "conf_mean", "conf_median"]]

    summary_emp = summarize_bins(d_emp, "emp_q", "empirical_conf_0_100", "Empirical")
    summary_llm = summarize_bins(d_llm, "llm_q", "llm_conf_mean", "LLM")

    print("\n--- SUMMARY TABLE: Empirical confidence quartiles (or fewer if ties) ---")
    print(summary_emp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    print("\n--- SUMMARY TABLE: LLM confidence quartiles (or fewer if ties) ---")
    print(summary_llm.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    # Warn about small bins
    small_emp = summary_emp.loc[summary_emp["n"] < min_bin_size_warn, ["bin", "n"]]
    small_llm = summary_llm.loc[summary_llm["n"] < min_bin_size_warn, ["bin", "n"]]
    if not small_emp.empty or not small_llm.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_size_warn} points; compare them cautiously.")
        if not small_emp.empty:
            print("  Empirical small bins:")
            print(small_emp.to_string(index=False))
        if not small_llm.empty:
            print("  LLM small bins:")
            print(small_llm.to_string(index=False))

    # -----------------------------
    # Prepare data for boxplots
    # -----------------------------
    emp_cats = list(d_emp["emp_q"].cat.categories)
    llm_cats = list(d_llm["llm_q"].cat.categories)

    emp_groups = [d_emp.loc[d_emp["emp_q"] == q, "abs_error_mean"].values for q in emp_cats]
    llm_groups = [d_llm.loc[d_llm["llm_q"] == q, "abs_error_mean"].values for q in llm_cats]

    # -----------------------------
    # Plot
    # -----------------------------
    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

    bp0 = axes[0].boxplot(emp_groups, labels=emp_cats, showfliers=False, patch_artist=True)
    bp1 = axes[1].boxplot(llm_groups, labels=llm_cats, showfliers=False, patch_artist=True)

    # Make panels visually distinct but still simple (no extra clutter)
    for patch in bp0["boxes"]:
        patch.set_alpha(0.6)
    for patch in bp1["boxes"]:
        patch.set_alpha(0.6)

    axes[0].set_title("Error by Empirical Confidence (quantile bins)")
    axes[0].set_xlabel("Empirical confidence bin")
    axes[0].set_ylabel("Absolute Error (|EDSS_mean − EDSS_gt|)")

    axes[1].set_title("Error by LLM Confidence (quantile bins)")
    axes[1].set_xlabel("LLM confidence bin")

    for ax in axes:
        ax.grid(axis="y", linestyle=":", alpha=0.5)

    # -----------------------------
    # Legend (simple, but useful)
    # -----------------------------
    legend_elements = [
        Patch(facecolor="white", edgecolor="black", label="Box = IQR (25%–75%)"),
        Patch(facecolor="white", edgecolor="black", label="Center line = median"),
        Patch(facecolor="white", edgecolor="black", label="Whiskers = typical range (no outliers shown)"),
        Patch(facecolor="white", edgecolor="white", label="Left panel: empirical stability bins"),
        Patch(facecolor="white", edgecolor="white", label="Right panel: LLM self-reported bins"),
    ]
    fig.legend(handles=legend_elements, loc="upper center", ncol=3, frameon=True)

    plt.tight_layout(rect=[0, 0, 1, 0.90])
    plt.show()

    print("\n" + "=" * 90)
    print("DONE")
    print("=" * 90)
def scatter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
):
    """
    Jittered scatter of absolute EDSS error per LLM confidence category,
    for a single iteration JSON merged against the ground-truth CSV.

    Prints per-bin point counts, then shows the plot.
    """

    def _norm(value):
        return str(value).strip().lower()

    def _to_edss(value):
        # Numeric parse tolerant of comma decimals and empty tokens.
        if value is None:
            return np.nan
        token = str(value).strip()
        if not token or token.lower() in {"nan", "none", "null"}:
            return np.nan
        return pd.to_numeric(token.replace(",", "."), errors="coerce")

    # ---- Ground truth: normalized composite key + numeric EDSS
    gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    gt["unique_id"] = gt["unique_id"].map(_norm)
    gt["MedDatum"] = gt["MedDatum"].map(_norm)
    gt["key"] = gt["unique_id"] + "_" + gt["MedDatum"]
    gt["EDSS_gt"] = gt[gt_edss_col].map(_to_edss)

    # ---- Predictions: keep only successful entries carrying both key parts
    with open(json_file_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    records = []
    for item in entries:
        if not item.get("success"):
            continue
        payload = item.get("result", {})
        uid = payload.get("unique_id")
        med = payload.get("MedDatum")
        if uid is None or med is None or not str(uid).strip() or not str(med).strip():
            continue
        records.append({
            "key": _norm(uid) + "_" + _norm(med),
            "EDSS_pred": _to_edss(payload.get("EDSS")),
            "confidence": pd.to_numeric(payload.get("certainty_percent"), errors="coerce"),
        })

    # ---- Join, require complete rows, compute absolute error
    merged = pd.DataFrame(records).merge(
        gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one"
    )
    merged = merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    merged["abs_error"] = (merged["EDSS_pred"] - merged["EDSS_gt"]).abs()

    # ---- Four fixed confidence brackets
    edges = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    merged["conf_bin"] = pd.cut(merged["confidence"], bins=edges, labels=labels, include_lowest=True)
    merged = merged.dropna(subset=["conf_bin"]).copy()

    # ---- Logs
    print("\n--- BIN COUNTS (points plotted) ---")
    print(merged["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int))
    print(f"Total points plotted: {len(merged)}")

    # ---- Categorical x positions + uniform jitter to separate points
    positions = {label: idx for idx, label in enumerate(labels)}
    base_x = merged["conf_bin"].map(positions).astype(float).to_numpy()
    jittered_x = base_x + np.random.uniform(-0.12, 0.12, size=len(merged))

    plt.figure(figsize=(12, 6))
    plt.scatter(jittered_x, merged["abs_error"].to_numpy(), alpha=0.55)
    plt.xticks(range(len(labels)), labels)
    plt.xlabel("certainty_percent category (Iteration 1)")
    plt.ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    plt.title("Absolute Error vs LLM Confidence Category (Single JSON)")
    plt.grid(axis="y", linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
def boxplot_with_jitter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """
    Boxplot of absolute EDSS error per LLM confidence bin, overlaid with
    lightly jittered individual points, for a single iteration JSON.

    The legend lists n per bin; the same counts are printed afterwards.
    """

    def _norm(value):
        return str(value).strip().lower()

    def _to_edss(value):
        # Numeric parse tolerant of comma decimals and empty tokens.
        if value is None:
            return np.nan
        token = str(value).strip()
        if not token or token.lower() in {"nan", "none", "null"}:
            return np.nan
        return pd.to_numeric(token.replace(",", "."), errors="coerce")

    # ---- Ground truth (with explicit column validation)
    gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for required in ("unique_id", "MedDatum", gt_edss_col):
        if required not in gt.columns:
            raise ValueError(f"GT missing column '{required}'. Available: {gt.columns.tolist()}")

    gt["unique_id"] = gt["unique_id"].map(_norm)
    gt["MedDatum"] = gt["MedDatum"].map(_norm)
    gt["key"] = gt["unique_id"] + "_" + gt["MedDatum"]
    gt["EDSS_gt"] = gt[gt_edss_col].map(_to_edss)

    # ---- Predictions from the single JSON file
    with open(json_file_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    records = []
    for item in entries:
        if not item.get("success"):
            continue
        payload = item.get("result", {})
        uid = payload.get("unique_id")
        med = payload.get("MedDatum")
        if uid is None or med is None or not str(uid).strip() or not str(med).strip():
            continue
        records.append({
            "key": _norm(uid) + "_" + _norm(med),
            "EDSS_pred": _to_edss(payload.get("EDSS")),
            "confidence": pd.to_numeric(payload.get("certainty_percent"), errors="coerce"),
        })

    # ---- Join, keep complete rows, compute absolute error
    merged = pd.DataFrame(records).merge(
        gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one"
    )
    merged = merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    merged["abs_error"] = (merged["EDSS_pred"] - merged["EDSS_gt"]).abs()

    # ---- Fixed confidence brackets
    edges = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    merged["conf_bin"] = pd.cut(merged["confidence"], bins=edges, labels=labels, include_lowest=True)
    merged = merged.dropna(subset=["conf_bin"]).copy()

    # ---- One error array per bin, in label order
    groups = [merged.loc[merged["conf_bin"] == label, "abs_error"].to_numpy() for label in labels]
    counts = [len(g) for g in groups]

    # ---- Draw boxplot then jittered points on top
    fig, ax = plt.subplots(figsize=(12, 6))

    box = ax.boxplot(
        groups,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )
    for artist in box["boxes"]:
        artist.set_alpha(0.35)

    for position, values in enumerate(groups, start=1):
        if not len(values):
            continue
        xs = np.full(len(values), position, dtype=float)
        xs += np.random.uniform(-jitter_width, jitter_width, size=len(values))
        ax.scatter(xs, values, alpha=point_alpha, s=18)

    ax.set_title("Absolute Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # ---- Legend showing n per bin
    handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{label}: n={count}")
        for label, count in zip(labels, counts)
    ]
    ax.legend(handles=handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # ---- Print counts too (useful for discussion)
    print("\n--- BIN COUNTS (points plotted) ---")
    for label, count in zip(labels, counts):
        print(f"{label:>18}: n={count}")
    print(f"Total points plotted: {sum(counts)}")
def boxplot_with_jitter_signed_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """Boxplot (+ jittered points) of *signed* EDSS error per confidence bin.

    Same pipeline as the absolute-error variant above, but the y-axis is the
    signed error ``EDSS_pred - EDSS_gt`` (negative = underestimation,
    positive = overestimation), with a dashed zero line marking "unbiased".

    Side effects: shows a matplotlib figure and prints a per-bin summary
    (n, mean/median signed error, MAE).

    Parameters mirror the absolute-error function: one results JSON
    (list of ``{"success", "result"}`` dicts — confirm against the producer),
    a semicolon-separated GT CSV, and jitter/alpha/outlier display knobs.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from JSON
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        # Skip entries whose join key cannot be formed.
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": parse_edss(res.get("EDSS")),
            "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()

    # SIGNED ERROR (direction)
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()

    # ---- Bin confidence
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Prepare arrays
    bin_arrays = [df.loc[df["conf_bin"] == lab, "signed_error"].to_numpy() for lab in labels]
    n_counts = [len(a) for a in bin_arrays]

    # ---- Plot
    fig, ax = plt.subplots(figsize=(12, 6))

    bp = ax.boxplot(
        bin_arrays,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )

    for b in bp["boxes"]:
        b.set_alpha(0.35)

    # Jittered points
    for i, arr in enumerate(bin_arrays, start=1):
        if len(arr) == 0:
            continue
        x = np.full(len(arr), i, dtype=float)
        x += np.random.uniform(-jitter_width, jitter_width, size=len(arr))
        ax.scatter(x, arr, alpha=point_alpha, s=18)

    # Zero line to show over/under clearly
    ax.axhline(0, linewidth=1.5, linestyle="--")

    ax.set_title("Signed Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend with n per bin + zero-line meaning
    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}")
        for lab, n in zip(labels, n_counts)
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))
    ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # ---- Print per-bin summary to discuss
    print("\n--- PER-BIN SUMMARY (points plotted) ---")
    for lab in labels:
        sub = df.loc[df["conf_bin"] == lab]
        n = len(sub)
        if n == 0:
            print(f"{lab:>18}: n=0")
            continue
        print(
            f"{lab:>18}: n={n:3d} | "
            f"mean signed={sub['signed_error'].mean(): .3f} | "
            f"median signed={sub['signed_error'].median(): .3f} | "
            f"MAE={sub['abs_error'].mean(): .3f}"
        )
    print(f"Total points plotted: {len(df)}")
json_file_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json", + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" +) +## + + + +# %% jitter and violin 10x10 + +# Violin + jitter (all JSONs in folder), with signed error +# - X: confidence bins (<70, 70-80, 80-90, 90-100) +# - Y: signed error = EDSS_pred - EDSS_gt (direction) +# - Prints bin counts (n) and puts n into the legend + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches import Patch +from matplotlib.lines import Line2D + +def violin_jitter_signed_error_all_jsons( + json_dir_path, + ground_truth_path, + gt_sep=";", + gt_edss_col="EDSS", + jitter_width=0.12, + point_alpha=0.20, + point_size=10, + violin_inner="quartile", # 'quartile', 'box', 'stick', or None +): + def norm_str(x): + return str(x).strip().lower() + + def parse_edss(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + # ---- Load GT + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + for col in ["unique_id", "MedDatum", gt_edss_col]: + if col not in df_gt.columns: + raise ValueError(f"GT missing column '{col}'. 
Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) + + # ---- Load preds from ALL JSONs + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + continue + rows.append({ + "file": os.path.basename(fp), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": parse_edss(res.get("EDSS")), + "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), + }) + + df_pred = pd.DataFrame(rows) + + # ---- Merge + filter + df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") + df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() + df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] + + # ---- Bin confidence + bins = [0, 70, 80, 90, 100] + labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] + df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) + df = df.dropna(subset=["conf_bin"]).copy() + + # ---- Counts + log + counts = df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int) + print("\n--- BIN COUNTS (all JSONs) ---") + for lab in labels: + print(f"{lab:>18}: n={counts[lab]}") + print(f"Total points plotted: {len(df)}") + print(f"JSON files: {len(json_files)}") + + # Ensure ordering for seaborn + df["conf_bin"] = pd.Categorical(df["conf_bin"], categories=labels, 
ordered=True) + + # ---- Plot + plt.figure(figsize=(12, 6)) + + # Violin (density) + sns.violinplot( + data=df, + x="conf_bin", + y="signed_error", + order=labels, + inner=violin_inner, + cut=0 + ) + + # Jittered points (manual jitter to keep it consistent and fast) + x_map = {lab: i for i, lab in enumerate(labels)} + x = df["conf_bin"].map(x_map).astype(float).to_numpy() + xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df)) + plt.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size) + + # Zero line (over/under split) + plt.axhline(0, linestyle="--", linewidth=1.5) + + plt.xticks(range(len(labels)), labels) + plt.xlabel("certainty_percent category (all iterations)") + plt.ylabel("Signed Error (EDSS_pred − EDSS_gt)") + plt.title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)") + plt.grid(axis="y", linestyle=":", alpha=0.5) + + # Legend with n per bin + legend_handles = [ + Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={int(counts[lab])}") + for lab in labels + ] + legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)")) + plt.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True) + + plt.tight_layout() + plt.show() + + +# Example run: +violin_jitter_signed_error_all_jsons( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" +) + +## + + + +# %% jitter and violin 10x1 + + +# Adjusted: Violin + jitter (ALL JSONs for points) but X-bins come ONLY from JSON #1 (reference) +# Fixes: +# 1) Legend has colors matching bins +# 2) Legend placed OUTSIDE plot area +# 3) X-axis binning uses certainty_percent from JSON1 (by key), then all iterations' points inherit that bin + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.patches 
def violin_jitter_signed_error_all_jsons_xbins_from_json1(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.18,
    point_size=10,
    violin_inner="quartile",  # 'quartile', 'box', 'stick', or None
):
    """Violin + jitter of signed EDSS error, binned by JSON1's confidence.

    Points come from ALL ``*.json`` files in ``json_dir_path``, but the x-axis
    confidence bin of every point is inherited (via the normalized key) from
    the reference run ``json1_file_path`` — so all iterations of one case land
    in the same bin. Colored violins, a dashed zero line, and a legend placed
    outside the axes show the distribution per bin.

    Side effects: prints bin counts / provenance and shows a matplotlib figure.

    Raises
    ------
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    ValueError
        If the GT CSV is missing a required column.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 and build reference bins by KEY
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "confidence_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # If JSON1 has duplicates for a key (unlikely, but safe), take the first
    # non-null confidence in file order.
    # BUG FIX vs previous version: the old code sorted by confidence_ref first
    # (so "first" was actually the LOWEST confidence), and used
    # `s.dropna().any()`, which tests value *truthiness* — a key whose only
    # confidence was 0 was treated as missing. `groupby(...).first()` on the
    # NaN-filtered frame keeps file order, skips nulls, and returns a flat
    # frame (no MultiIndex patch-up needed). This matches the dedup idiom used
    # by the correlation functions in this file.
    df_ref = (df_ref.dropna(subset=["confidence_ref"])
              .groupby("key", as_index=False)["confidence_ref"]
              .first())

    # Confidence bins
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df_ref["conf_bin_ref"] = pd.cut(df_ref["confidence_ref"], bins=bins, labels=labels, include_lowest=True)
    df_ref = df_ref.dropna(subset=["conf_bin_ref"]).copy()

    # ----------------------------
    # Load ALL JSONs (all points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + reference bins (from JSON1)
    # ----------------------------
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "conf_bin_ref"]], on="key", how="inner", validate="many_to_one")

    # filter for plotting
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "conf_bin_ref"]).copy()
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]

    # ordering
    df["conf_bin_ref"] = pd.Categorical(df["conf_bin_ref"], categories=labels, ordered=True)

    # ----------------------------
    # Logs + counts
    # ----------------------------
    counts = df["conf_bin_ref"].value_counts().reindex(labels).fillna(0).astype(int)

    print("\n--- BIN COUNTS (ALL JSON points, binned by JSON1 confidence) ---")
    for lab in labels:
        print(f"{lab:>18}: n={int(counts[lab])}")
    print(f"Total points plotted: {len(df)}")
    print(f"JSON files used for points: {len(json_files)}")
    print(f"Reference JSON1 bins derived from: {os.path.basename(json1_file_path)}")
    print(f"Keys in reference (after binning & non-null): {df_ref['key'].nunique()}")

    # ----------------------------
    # Colors + legend patches
    # ----------------------------
    palette = sns.color_palette("Blues", n_colors=len(labels))
    bin_colors = {lab: palette[i] for i, lab in enumerate(labels)}

    legend_handles = [
        Patch(facecolor=bin_colors[lab], edgecolor="black", label=f"{lab}: n={int(counts[lab])}")
        for lab in labels
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))

    # ----------------------------
    # Plot (legend outside)
    # ----------------------------
    fig, ax = plt.subplots(figsize=(12.5, 6))

    sns.violinplot(
        data=df,
        x="conf_bin_ref",
        y="signed_error",
        order=labels,
        inner=violin_inner,
        cut=0,
        palette=[bin_colors[l] for l in labels],
        ax=ax,
    )

    # jittered points (manual jitter)
    x_map = {lab: i for i, lab in enumerate(labels)}
    x = df["conf_bin_ref"].map(x_map).astype(float).to_numpy()
    xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df))
    ax.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size)

    ax.axhline(0, linestyle="--", linewidth=1.5)

    ax.set_xlabel("certainty_percent category (from JSON 1 as reference)")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.set_title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)\nBinned by JSON 1 certainty_percent")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend outside (right)
    ax.legend(
        handles=legend_handles,
        title="Bin counts",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        frameon=True
    )

    plt.tight_layout()
    plt.show()
def correlation_scatter_raw_certainty_json1_reference(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    y_mode="abs",  # "abs" or "signed"
    point_alpha=0.18,
    point_size=12,
):
    """Scatter + trend line: raw JSON1 certainty_percent vs EDSS error.

    Points come from ALL ``*.json`` files in ``json_dir_path``; each point's
    x-value is the ``certainty_percent`` its key received in the reference run
    ``json1_file_path``. The y-value is absolute (``y_mode="abs"``) or signed
    (``y_mode="signed"``) error vs ground truth. Prints Pearson and Spearman
    correlations and draws a simple least-squares trend line.

    Side effects: prints a diagnostics header + correlations and shows a
    matplotlib figure.

    Raises
    ------
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    ValueError
        If the GT CSV is missing a required column.
    """
    def norm_str(x):
        # Normalize join-key parts: string, trimmed, lowercased.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser; handles German decimal commas and common
        # missing-value spellings.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    def rankdata(a):
        # Average-rank for ties (Spearman needs ranks)
        s = pd.Series(a)
        return s.rank(method="average").to_numpy()

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 reference certainty_percent (per key)
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # Deduplicate keys if needed: take first non-null certainty
    df_ref = (df_ref.dropna(subset=["certainty_ref"])
              .groupby("key", as_index=False)["certainty_ref"]
              .first())

    # ----------------------------
    # Load ALL JSON predictions (points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + JSON1 reference certainty
    # ----------------------------
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one")

    # Filter needed fields
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy()

    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()
    y_col = "abs_error" if y_mode == "abs" else "signed_error"

    # ----------------------------
    # Logs
    # ----------------------------
    print("\n" + "="*90)
    print("CORRELATION: RAW certainty_percent (JSON1 reference) vs ERROR (ALL JSON points)")
    print("="*90)
    print(f"JSON DIR (points): {json_dir_path} | files: {len(json_files)}")
    print(f"JSON1 reference: {os.path.basename(json1_file_path)}")
    print(f"Points available after merge+filter: {len(df)}")
    print(f"Unique keys in plot: {df['key'].nunique()}")
    print(f"Y mode: {y_mode} ({y_col})")

    # ----------------------------
    # Correlations (Pearson + Spearman)
    # ----------------------------
    x = df["certainty_ref"].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)

    # Pearson
    pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan

    # Spearman = Pearson corr of ranks
    rx = rankdata(x)
    ry = rankdata(y)
    spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan

    print(f"\nPearson r: {pearson:.4f}")
    print(f"Spearman ρ: {spearman:.4f}")

    # ----------------------------
    # Trend line (simple linear fit)
    # ----------------------------
    # Fit y = a*x + b
    if len(df) >= 2:
        a, b = np.polyfit(x, y, 1)
    else:
        a, b = np.nan, np.nan

    # ----------------------------
    # Plot
    # ----------------------------
    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, alpha=point_alpha, s=point_size)

    # trend line across full x-range
    if np.isfinite(a) and np.isfinite(b):
        xs = np.linspace(np.nanmin(x), np.nanmax(x), 200)
        plt.plot(xs, a * xs + b, linestyle="--", linewidth=2)

    plt.xlabel("certainty_percent (from JSON 1, per key)")
    ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)"
    plt.ylabel(ylabel)
    plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n"
              f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}")
    plt.grid(linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
linestyle="--", linewidth=2) + + plt.xlabel("certainty_percent (from JSON 1, per key)") + ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)" + plt.ylabel(ylabel) + plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n" + f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}") + plt.grid(linestyle=":", alpha=0.5) + plt.tight_layout() + plt.show() + + +# Example run: +json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" +correlation_scatter_raw_certainty_json1_reference( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + json1_file_path=json1_path, + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", + y_mode="abs" # or "signed" +) +## + +# %% Correlation adjusted + +# Correlation scatter (RAW certainty_percent from JSON1) vs error (all JSON points) +# Adds: +# 1) Legend (points, trend line) + Pearson/Spearman shown in legend and title +# 2) Trend line color set to high-contrast (black by default) +# 3) Density coloring: dots colored by local point density (bluer = more cases) + colorbar + +import os, glob, json +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D +from matplotlib.colors import LogNorm, PowerNorm + +def correlation_scatter_raw_certainty_json1_reference( + json_dir_path, + json1_file_path, + ground_truth_path, + gt_sep=";", + gt_edss_col="EDSS", + y_mode="abs", # "abs" or "signed" + point_alpha=0.85, # higher alpha works better with density coloring + point_size=14, + trend_color="black", # high-contrast line + save_svg_path=None, + dpi=300 +): + def norm_str(x): + return str(x).strip().lower() + + def parse_edss(x): + if x is None: + return np.nan + s = str(x).strip() + if s == "" or s.lower() in {"nan", "none", "null"}: + return np.nan + s = 
s.replace(",", ".") + return pd.to_numeric(s, errors="coerce") + + def rankdata(a): + return pd.Series(a).rank(method="average").to_numpy() + + # ---------------------------- + # Load GT + # ---------------------------- + df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) + for col in ["unique_id", "MedDatum", gt_edss_col]: + if col not in df_gt.columns: + raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") + + df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) + df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) + df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] + df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) + + # ---------------------------- + # Load JSON1 reference certainty_percent (per key) + # ---------------------------- + with open(json1_file_path, "r", encoding="utf-8") as f: + data1 = json.load(f) + + ref_rows = [] + for entry in data1: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": + continue + ref_rows.append({ + "key": norm_str(uid) + "_" + norm_str(md), + "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), + }) + + df_ref = pd.DataFrame(ref_rows) + df_ref = (df_ref.dropna(subset=["certainty_ref"]) + .groupby("key", as_index=False)["certainty_ref"] + .first()) + + # ---------------------------- + # Load ALL JSON predictions (points) + # ---------------------------- + json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) + if not json_files: + raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") + + rows = [] + for fp in json_files: + with open(fp, "r", encoding="utf-8") as f: + data = json.load(f) + for entry in data: + if not entry.get("success"): + continue + res = entry.get("result", {}) + uid, md = res.get("unique_id"), res.get("MedDatum") + if uid is None or md is None or 
str(uid).strip() == "" or str(md).strip() == "": + continue + rows.append({ + "file": os.path.basename(fp), + "key": norm_str(uid) + "_" + norm_str(md), + "EDSS_pred": parse_edss(res.get("EDSS")), + }) + + df_pred = pd.DataFrame(rows) + + # ---------------------------- + # Merge: preds + GT + JSON1 reference certainty + # ---------------------------- + df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") + df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one") + df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy() + + df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] + df["abs_error"] = df["signed_error"].abs() + y_col = "abs_error" if y_mode == "abs" else "signed_error" + + # ---------------------------- + # Correlations + # ---------------------------- + x = df["certainty_ref"].to_numpy(dtype=float) + y = df[y_col].to_numpy(dtype=float) + + pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan + rx, ry = rankdata(x), rankdata(y) + spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan + + # ---------------------------- + # Trend line (linear fit) + # ---------------------------- + if len(df) >= 2: + a, b = np.polyfit(x, y, 1) + else: + a, b = np.nan, np.nan + + # ---------------------------- + # Density coloring (2D histogram bin counts) + # "how blue" = how many points are around that location + # ---------------------------- + # Choose binning resolution (balanced for ~thousands of points) + x_bins = 50 + y_bins = 50 + + # Compute bin index per point + x_edges = np.linspace(np.nanmin(x), np.nanmax(x), x_bins + 1) + y_edges = np.linspace(np.nanmin(y), np.nanmax(y), y_bins + 1) + + xi = np.clip(np.digitize(x, x_edges) - 1, 0, x_bins - 1) + yi = np.clip(np.digitize(y, y_edges) - 1, 0, y_bins - 1) + + # 2D counts + counts2d = np.zeros((x_bins, y_bins), dtype=int) + for i in range(len(x)): + counts2d[xi[i], yi[i]] += 1 + + # density per point = 
count of its bin + density = np.array([counts2d[xi[i], yi[i]] for i in range(len(x))], dtype=float) + + # Plot low density first, high density last (so dense points are visible) + order = np.argsort(density) + x_o, y_o, d_o = x[order], y[order], density[order] + + + + +# ... keep everything above the "Plot" section identical ... + + # ---------------------------- + # Plot (IMPROVED COLORS) + # ---------------------------- + fig, ax = plt.subplots(figsize=(12.5, 6)) + + # Option A (recommended): logarithmic color scaling + # Add +1 to avoid log(0) + d_plot = d_o + 1 + + # clip vmax so one extreme bin doesn't wash everything out + vmax = np.percentile(d_plot, 99) # try 95 or 99 depending on your data + norm = LogNorm(vmin=1, vmax=max(2, vmax)) + + sc = ax.scatter( + x_o, y_o, + c=d_plot, + cmap="Blues", + norm=norm, + s=point_size, + alpha=point_alpha, + linewidths=0 + ) + + # Trend line (black) + if np.isfinite(a) and np.isfinite(b): + xs = np.linspace(np.nanmin(x), np.nanmax(x), 200) + ax.plot(xs, a * xs + b, linestyle="--", linewidth=2.5, color=trend_color) + + ax.set_xlabel("certainty_percent (from JSON 1, per key)") + ax.set_ylabel("Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)") + ax.set_title( + f"Correlation: JSON1 certainty_percent vs {y_col} (All iterations)\n" + f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}" + ) + ax.grid(linestyle=":", alpha=0.5) + + # Colorbar + cbar = plt.colorbar(sc, ax=ax) + cbar.set_label("Local density (count of cases in bin, log-scaled)") + + # Legend + legend_items = [ + Line2D([0], [0], marker="o", linestyle="None", color="navy", + label=f"Data points (n={len(df)})"), + Line2D([0], [0], linestyle="--", color=trend_color, linewidth=2.5, + label=f"Linear trend (Pearson r={pearson:.3f})"), + ] + ax.legend(handles=legend_items, loc="upper right", frameon=True, title="Legend") + + plt.tight_layout() + # Save as SVG (optional) + if save_svg_path: + fig.savefig(save_svg_path, 
format="svg", bbox_inches="tight", dpi=dpi) + print(f"[SAVED] {save_svg_path}") + + plt.show() + +json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" + +correlation_scatter_raw_certainty_json1_reference( + json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", + json1_file_path=json1_path, + ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", + y_mode="abs", + save_svg_path="/home/shahin/Lab/Doktorarbeit/Barcelona/results/corr_json1_abs_error.svg" +) + +## + + + diff --git a/Data/certainty_show.py b/certainty_show.py similarity index 100% rename from Data/certainty_show.py rename to certainty_show.py diff --git a/figure1.py b/figure1.py index e9f1ee7..9f4706b 100644 --- a/figure1.py +++ b/figure1.py @@ -320,3 +320,63 @@ plt.tight_layout() plt.show() ## + +# %% Patientjourney Bubble chart +import matplotlib.pyplot as plt +import numpy as np + +import matplotlib as mpl + +mpl.rcParams["font.family"] = "DejaVu Sans" # or "Arial", "Calibri", "Times New Roman", ... 
+mpl.rcParams["font.size"] = 12 # default size for text +mpl.rcParams["axes.titlesize"] = 14 +mpl.rcParams["axes.titleweight"] = "bold" + + +# Data (your counts) +visits = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) +patient_count = np.array([32, 24, 28, 17, 13, 6, 3, 3, 2]) + +# "Remaining" = patients with >= that many visits (cumulative from the right) +remaining = np.array([patient_count[i:].sum() for i in range(len(patient_count))]) + +# --- Plot --- +fig, ax = plt.subplots(figsize=(12, 3)) + +y = 0.0 # all bubbles on one horizontal line + +# Horizontal line +ax.hlines(y, visits.min() - 0.4, visits.max() + 0.4, color="#1f77b4", linewidth=3) + +# Bubble sizes (scale as needed) +# (Matplotlib scatter uses area in points^2) +sizes = patient_count * 35 # tweak this multiplier if you want bigger/smaller bubbles + +ax.scatter(visits, np.full_like(visits, y), s=sizes, color="#1f77b4", zorder=3) + +# Title +#ax.set_title("Patient Journey by Visit Count", fontsize=14, pad=18) + +# Top labels: "1 visits", "2 visits", ... +for x in visits: + label = f"{x} visit" if x == 1 else f"{x} visits" + ax.text(x, y + 0.18, label, ha="center", va="bottom", fontsize=10) + +# Bottom labels: "X patients" and "Y remaining" +for x, pc, rem in zip(visits, patient_count, remaining): + ax.text(x, y - 0.20, f"{pc} patients", ha="center", va="top", fontsize=9) + ax.text(x, y - 0.32, f"{rem} remaining", ha="center", va="top", fontsize=9) + +# Cosmetics: remove axes, keep spacing nice +ax.set_xlim(visits.min() - 0.6, visits.max() + 0.6) +ax.set_ylim(-0.5, 0.35) +ax.set_xticks([]) +ax.set_yticks([]) +for spine in ax.spines.values(): + spine.set_visible(False) + +plt.tight_layout() +plt.show() +plt.savefig("patient_journey.svg", format="svg", bbox_inches="tight") +## + diff --git a/Data/show_plots.py b/show_plots.py similarity index 100% rename from Data/show_plots.py rename to show_plots.py