# %% Confirm EDSS missing import pandas as pd import numpy as np def clean_series(s): return s.astype(str).str.strip().str.lower() def gt_edss_audit(ground_truth_path, edss_col="EDSS"): df_gt = pd.read_csv(ground_truth_path, sep=';') # normalize keys df_gt['unique_id'] = clean_series(df_gt['unique_id']) df_gt['MedDatum'] = clean_series(df_gt['MedDatum']) df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum'] print("GT rows:", len(df_gt)) print("GT unique keys:", df_gt['key'].nunique()) # IMPORTANT: parse EDSS robustly (German decimal commas etc.) if edss_col in df_gt.columns: edss_raw = df_gt[edss_col] edss_num = pd.to_numeric( edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(), errors="coerce" ) df_gt["_edss_num"] = edss_num print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}") print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}") # duplicates on key dup = df_gt['key'].duplicated(keep=False) print("GT duplicate-key rows:", dup.sum()) if dup.any(): # how many duplicate keys exist? print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique()) # of duplicate-key rows, how many have missing EDSS? print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum()) # show the worst offenders print("\nTop duplicate keys (by count):") print(df_gt.loc[dup, 'key'].value_counts().head(10)) else: print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist()) return df_gt df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS") ## # %% trace missing ones import json, glob, os import pandas as pd def load_preds(json_dir_path): all_preds = [] for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) file_name = os.path.basename(file_path) for entry in data: if entry.get("success"): res = entry["result"] all_preds.append({ "unique_id": str(res.get("unique_id")).strip().lower(), "MedDatum": str(res.get("MedDatum")).strip().lower(), "file": file_name }) df_pred = pd.DataFrame(all_preds) df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"] return df_pred df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration") print("Pred rows:", len(df_pred)) print("Pred unique keys:", df_pred["key"].nunique()) # Suppose df_gt was returned from step 1 and has _edss_num + key missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"]) df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys) print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum()) print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique()) print("\nTop files contributing to missing-GT-EDSS rows:") print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20)) print("\nTop keys replicated in predictions (why count inflates):") print(df_pred.loc[df_pred["gt_key_missing_edss"], "key"].value_counts().head(20)) ## # %% verify merged = df_pred.merge( df_gt[["key", "_edss_num"]], # use the numeric-coerced GT EDSS on="key", how="left", validate="many_to_one" # will ERROR if GT has duplicate keys (GOOD!) ) print("Merged rows:", len(merged)) print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum()) ## # %% 1json (rewritten with robust parsing + detailed data log) import pandas as pd import numpy as np import json import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def plot_single_json_error_analysis_with_log( json_file_path, ground_truth_path, edss_gt_col="EDSS", min_bin_count=5, ): def norm_str(x): # normalize identifiers and dates consistently return str(x).strip().lower() def parse_edss(x): # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc. if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") print("\n" + "="*80) print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)") print("="*80) print(f"JSON: {json_file_path}") print(f"GT: {ground_truth_path}") # ------------------------------------------------------------------ # 1) Load Ground Truth # ------------------------------------------------------------------ df_gt = pd.read_csv(ground_truth_path, sep=";") required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} missing_cols = required_gt_cols - set(df_gt.columns) if missing_cols: raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] # Robust EDSS parsing (important!) df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) # GT logs print("\n--- GT LOG ---") print(f"GT rows: {len(df_gt)}") print(f"GT unique keys: {df_gt['key'].nunique()}") gt_dup = df_gt["key"].duplicated(keep=False).sum() print(f"GT duplicate-key rows: {gt_dup}") print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") if gt_dup > 0: print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:") print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) # ------------------------------------------------------------------ # 2) Load Predictions from the specific JSON # ------------------------------------------------------------------ with open(json_file_path, "r", encoding="utf-8") as f: data = json.load(f) total_entries = len(data) success_entries = sum(1 for e in data if e.get("success")) all_preds = [] skipped = { "not_success": 0, "missing_uid_or_date": 0, "missing_edss": 0, "missing_conf": 0, } for entry in data: if not entry.get("success"): skipped["not_success"] += 1 continue res = entry.get("result", {}) uid = res.get("unique_id") md = res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": skipped["missing_uid_or_date"] += 1 continue edss_pred = parse_edss(res.get("EDSS")) conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce") if pd.isna(edss_pred): skipped["missing_edss"] += 1 if pd.isna(conf): skipped["missing_conf"] += 1 all_preds.append({ "unique_id": norm_str(uid), "MedDatum": norm_str(md), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": edss_pred, "confidence": conf, }) df_pred = pd.DataFrame(all_preds) # Pred logs print("\n--- PRED LOG ---") print(f"JSON total entries: {total_entries}") print(f"JSON success entries: {success_entries}") print(f"Pred rows loaded (success + has keys): {len(df_pred)}") if len(df_pred) == 0: print("[ERROR] No usable prediction rows found. Nothing to plot.") return print(f"Pred unique keys: {df_pred['key'].nunique()}") print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") print("Skipped counts:", skipped) # Are keys duplicated within this JSON? (often yes if multiple notes map to same key) key_counts = df_pred["key"].value_counts() dup_pred_rows = (key_counts > 1).sum() max_rep = int(key_counts.max()) print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}") print(f"Max repetitions of a single key in this JSON: {max_rep}") if max_rep > 1: print("Top repeated keys in this JSON:") print(key_counts.head(10)) # ------------------------------------------------------------------ # 3) Merge (and diagnose why rows drop) # ------------------------------------------------------------------ # Diagnose how many pred keys exist in GT gt_key_set = set(df_gt["key"]) df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] print("\n--- KEY MATCH LOG ---") print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}") if len(not_in_gt) > 0: print("[WARNING] Some prediction keys are not present in GT. First 10:") print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10)) # Now merge; we expect GT is one-to-many with pred (many_to_one) # If GT had duplicates, validate would raise. df_merged = df_pred.merge( df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one" ) print("\n--- MERGE LOG ---") print(f"Merged rows (inner join): {len(df_merged)}") print(f"Merged unique keys: {df_merged['key'].nunique()}") print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}") print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}") print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}") # How many rows will be removed by dropna() in your old code? # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too. rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) print("\n--- FILTER LOG (what will be used for stats/plot) ---") print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}") if len(rows_complete) == 0: print("[ERROR] No complete rows after filtering. Nothing to plot.") return # Compute abs error rows_complete = rows_complete.copy() rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs() # ------------------------------------------------------------------ # 4) Binning + stats (with guardrails) # ------------------------------------------------------------------ bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] # Confidence outside bins becomes NaN; log it rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True) conf_outside = rows_complete["conf_bin"].isna().sum() print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}") if conf_outside > 0: print("Example confidences outside bins:") print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) df_plot = rows_complete.dropna(subset=["conf_bin"]) stats = ( df_plot.groupby("conf_bin", observed=True)["abs_error"] .agg(mean="mean", std="std", count="count") .reindex(labels) .reset_index() ) print("\n--- BIN STATS ---") print(stats) # Warn about low counts low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] if not low_bins.empty: print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:") print(low_bins) # ------------------------------------------------------------------ # 5) Plot # ------------------------------------------------------------------ plt.figure(figsize=(13, 8)) colors = sns.color_palette("Blues", n_colors=len(labels)) # Replace NaNs in mean for plotting bars (empty bins) means = stats["mean"].to_numpy() counts = stats["count"].fillna(0).astype(int).to_numpy() stds = stats["std"].to_numpy() # For bins with no data, bar height 0 (and no errorbar) means_plot = np.nan_to_num(means, nan=0.0) bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85) # Error bars only where count>1 and std is not NaN sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5) # Trend line only if at least 2 non-empty bins valid_idx = np.where(~np.isnan(means))[0] if len(valid_idx) >= 2: x_idx = np.arange(len(labels)) z = np.polyfit(valid_idx, means[valid_idx], 1) p = np.poly1d(z) plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) trend_label = "Trend Line" else: trend_label = "Trend Line (insufficient bins)" print("\n[INFO] Not enough non-empty bins to fit a trend line.") # Data labels for i, bar in enumerate(bars): n_count = int(counts[i]) mae_val = means[i] if np.isnan(mae_val) or n_count == 0: txt = "empty" y = 0.02 else: txt = f"MAE: {mae_val:.2f}\nn={n_count}" y = bar.get_height() + 0.04 plt.text( bar.get_x() + bar.get_width()/2, y, txt, ha="center", va="bottom", fontweight="bold", fontsize=10 ) # Legend legend_elements = [ Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label), Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"), Patch(color="none", label="Metric: Mean Absolute Error (MAE)") ] plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend") plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30) plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12) plt.xlabel("LLM Confidence Bracket", fontsize=12) plt.grid(axis="y", linestyle=":", alpha=0.5) ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 plt.ylim(0, max(0.5, float(ymax) + 0.6)) plt.tight_layout() plt.show() print("\n" + "="*80) print("DONE") print("="*80) # --- RUN --- json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" plot_single_json_error_analysis_with_log(json_path, gt_path) ## # %% Certainty vs Delta (rewritten with robust parsing + detailed data loss logs) import pandas as pd import numpy as np import json import glob import os import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def plot_confidence_vs_abs_error_with_log( json_dir_path, ground_truth_path, edss_gt_col="EDSS", min_bin_count=5, include_lowest=True, ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): # robust numeric parse: handles comma decimals and empty tokens if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") print("\n" + "="*90) print("CERTAINTY vs ABS ERROR (ALL JSONs) — WITH DATA LOSS LOG") print("="*90) print(f"JSON DIR: {json_dir_path}") print(f"GT FILE: {ground_truth_path}") # ------------------------------------------------------------------ # 1) Load GT # ------------------------------------------------------------------ df_gt = pd.read_csv(ground_truth_path, sep=";") required_gt_cols = {"unique_id", "MedDatum", edss_gt_col} missing_cols = required_gt_cols - set(df_gt.columns) if missing_cols: raise ValueError(f"GT missing columns: {missing_cols}. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss) # GT logs print("\n--- GT LOG ---") print(f"GT rows: {len(df_gt)}") print(f"GT unique keys: {df_gt['key'].nunique()}") gt_dup_rows = df_gt["key"].duplicated(keep=False).sum() print(f"GT duplicate-key rows: {gt_dup_rows}") print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}") if gt_dup_rows > 0: print("\n[WARNING] GT has duplicate keys; merge can explode rows. Top duplicate keys:") print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10)) gt_key_set = set(df_gt["key"]) # ------------------------------------------------------------------ # 2) Load predictions from all JSON files (with per-file logs) # ------------------------------------------------------------------ json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") all_preds = [] per_file_summary = [] total_entries_all = 0 total_success_all = 0 skipped_all = {"not_success": 0, "missing_uid_or_date": 0} for file_path in json_files: with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) total_entries = len(data) success_entries = sum(1 for e in data if e.get("success")) total_entries_all += total_entries total_success_all += success_entries skipped = {"not_success": 0, "missing_uid_or_date": 0} loaded_rows = 0 for entry in data: if not entry.get("success"): skipped["not_success"] += 1 continue res = entry.get("result", {}) uid = res.get("unique_id") md = res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": skipped["missing_uid_or_date"] += 1 continue all_preds.append({ "file": os.path.basename(file_path), "unique_id": norm_str(uid), "MedDatum": norm_str(md), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) loaded_rows += 1 skipped_all["not_success"] += skipped["not_success"] skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"] per_file_summary.append({ "file": os.path.basename(file_path), "entries_total": total_entries, "entries_success": success_entries, "pred_rows_loaded": loaded_rows, "skipped_not_success": skipped["not_success"], "skipped_missing_uid_or_date": skipped["missing_uid_or_date"], }) df_pred = pd.DataFrame(all_preds) df_file = pd.DataFrame(per_file_summary) # PRED logs print("\n--- PRED LOG (ALL FILES) ---") print(f"JSON files found: {len(json_files)}") print(f"Total JSON entries: {total_entries_all}") print(f"Total success entries:{total_success_all}") print(f"Pred rows loaded (success + has keys): {len(df_pred)}") if len(df_pred) == 0: print("[ERROR] No usable prediction rows found. Nothing to plot.") return print(f"Pred unique keys (across all files): {df_pred['key'].nunique()}") print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}") print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}") print("Skipped totals:", skipped_all) # show per-file quick check (useful when one iteration is broken) print("\nPer-file loaded rows (head):") print(df_file.sort_values("file").head(10)) # ------------------------------------------------------------------ # 3) Key match log (pred -> GT) # ------------------------------------------------------------------ df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set) not_in_gt = df_pred.loc[~df_pred["key_in_gt"]] print("\n--- KEY MATCH LOG ---") print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}") print(f"Pred rows with key NOT in GT: {len(not_in_gt)}") if len(not_in_gt) > 0: print("[WARNING] Example keys not found in GT (first 10):") print(not_in_gt[["file", "unique_id", "MedDatum", "key"]].head(10)) print("\n[WARNING] Files contributing most to key-mismatch:") print(not_in_gt["file"].value_counts().head(10)) # ------------------------------------------------------------------ # 4) Merge (no dropna yet) + detailed data loss accounting # ------------------------------------------------------------------ df_merged = df_pred.merge( df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one" # catches GT duplicates ) print("\n--- MERGE LOG ---") print(f"Merged rows (inner join): {len(df_merged)}") print(f"Merged unique keys: {df_merged['key'].nunique()}") # Now quantify what you lose at each filter stage n0 = len(df_merged) miss_gt = df_merged["EDSS_gt"].isna() miss_pred = df_merged["EDSS_pred"].isna() miss_conf = df_merged["confidence"].isna() print("\n--- MISSINGNESS IN MERGED ---") print(f"Missing GT EDSS: {miss_gt.sum()}") print(f"Missing Pred EDSS: {miss_pred.sum()}") print(f"Missing Confidence: {miss_conf.sum()}") # IMPORTANT: your old code used .dropna() with no subset => drops if ANY column is NaN. # We'll replicate the intended logic explicitly and log counts. df_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]) n1 = len(df_complete) print("\n--- FILTER LOG ---") print(f"Rows before filtering: {n0}") print(f"Rows after requiring EDSS_gt, EDSS_pred, confidence: {n1}") print(f"Rows lost due to missing required fields: {n0 - n1}") # Break down why rows were lost (overlap-aware) lost_mask = df_merged[["EDSS_gt", "EDSS_pred", "confidence"]].isna().any(axis=1) lost = df_merged.loc[lost_mask].copy() if len(lost) > 0: lost_reason = ( (lost["EDSS_gt"].isna()).astype(int).map({1:"GT",0:""}) + (lost["EDSS_pred"].isna()).astype(int).map({1:"+PRED",0:""}) + (lost["confidence"].isna()).astype(int).map({1:"+CONF",0:""}) ) lost["loss_reason"] = lost_reason.str.replace(r"^\+", "", regex=True).replace("", "UNKNOWN") print("\nTop loss reasons (overlap-aware):") print(lost["loss_reason"].value_counts().head(10)) print("\nFiles contributing most to lost rows:") print(lost["file"].value_counts().head(10)) if len(df_complete) == 0: print("[ERROR] No complete rows left after filtering. Nothing to plot.") return # ------------------------------------------------------------------ # 5) Abs error + binning # ------------------------------------------------------------------ df_complete = df_complete.copy() df_complete["abs_error"] = (df_complete["EDSS_pred"] - df_complete["EDSS_gt"]).abs() bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df_complete["conf_bin"] = pd.cut( df_complete["confidence"], bins=bins, labels=labels, include_lowest=include_lowest ) conf_outside = df_complete["conf_bin"].isna().sum() print("\n--- BINNING LOG ---") print(f"Rows with confidence outside bin edges / invalid: {conf_outside}") if conf_outside > 0: print("Example out-of-bin confidences:") print(df_complete.loc[df_complete["conf_bin"].isna(), "confidence"].head(20).to_list()) df_plot = df_complete.dropna(subset=["conf_bin"]) print(f"Rows kept for bin stats/plot (after dropping out-of-bin): {len(df_plot)}") print(f"Rows lost due to out-of-bin confidence: {len(df_complete) - len(df_plot)}") stats = ( df_plot.groupby("conf_bin", observed=True)["abs_error"] .agg(mean="mean", std="std", count="count") .reindex(labels) .reset_index() ) print("\n--- BIN STATS ---") print(stats) low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]] if not low_bins.empty: print(f"\n[WARNING] Some bins have < {min_bin_count} rows (unstable SEM/trend):") print(low_bins) # ------------------------------------------------------------------ # 6) Plot # ------------------------------------------------------------------ plt.figure(figsize=(12, 8)) colors = sns.color_palette("Blues", n_colors=len(labels)) means = stats["mean"].to_numpy() counts = stats["count"].fillna(0).astype(int).to_numpy() stds = stats["std"].to_numpy() means_plot = np.nan_to_num(means, nan=0.0) bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", linewidth=1.2) sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan) plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=6, elinewidth=1.5) # Trend line only if >=2 non-empty bins valid_idx = np.where(~np.isnan(means))[0] if len(valid_idx) >= 2: x_idx = np.arange(len(labels)) z = np.polyfit(valid_idx, means[valid_idx], 1) p = np.poly1d(z) plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=2.5) trend_label = "Correlation Trend" else: trend_label = "Correlation Trend (insufficient bins)" print("\n[INFO] Not enough non-empty bins to fit a trend line.") # Bar annotations (MAE + n) for i, bar in enumerate(bars): n = int(counts[i]) m = means[i] if n == 0 or np.isnan(m): txt = "empty" y = 0.02 else: txt = f"MAE: {m:.2f}\nn={n}" y = bar.get_height() + 0.05 plt.text(bar.get_x() + bar.get_width()/2, y, txt, ha="center", fontweight="bold") legend_elements = [ Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"), Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"), Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"), Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"), Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Standard Error (SEM)"), Line2D([0], [0], color="#e74c3c", linestyle="--", lw=2.5, label=trend_label), Patch(color="none", label="Metric: Mean Absolute Error (MAE)") ] plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, fontsize=10, title="Legend") plt.title("Validation: Inverse Correlation of Confidence vs. Error Magnitude", fontsize=15, pad=20) plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12) plt.xlabel("LLM Confidence Bracket", fontsize=12) plt.grid(axis="y", linestyle=":", alpha=0.5) ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0 plt.ylim(0, max(0.5, float(ymax) + 0.6)) plt.tight_layout() plt.show() print("\n" + "="*90) print("DONE") print("="*90) # Example run: plot_confidence_vs_abs_error_with_log("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv") ## # %% Empirical Confidence # Empirical stability confidence (from 10 runs) + LLM certainty_percent as secondary signal # - Reads all JSONs in a folder (your 10 iterations) # - Aggregates by key = unique_id + MedDatum # - Computes: # * EDSS_mean, EDSS_std, EDSS_iqr, mode/share # * empirical_conf_0_100 (based on stability) # * llm_conf_mean_0_100 (mean certainty_percent) # * combined_conf_0_100 (weighted blend) # - Optional: merges GT EDSS and computes abs error on the aggregated prediction import os, glob, json import numpy as np import pandas as pd def build_empirical_confidence_table( json_dir_path: str, ground_truth_path: str | None = None, gt_sep: str = ";", gt_edss_col: str = "EDSS", w_empirical: float = 0.7, # weight for empirical stability w_llm: float = 0.3, # weight for LLM self-reported confidence tol_mode: float = 0.5, # tolerance to treat EDSS as "same" (EDSS often in 0.5 steps) min_runs_expected: int = 10, ): # ----------------------------- # Helpers # ----------------------------- def norm_str(x): return str(x).strip().lower() def parse_number(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") def robust_iqr(x: pd.Series): x = x.dropna() if len(x) == 0: return np.nan return float(x.quantile(0.75) - x.quantile(0.25)) def stability_to_confidence(std_val: float) -> float: """ Map EDSS variability across runs to a 0..100 confidence. EDSS is typically on 0.5 steps. A natural scale: std ~= 0.0 -> ~100 std ~= 0.25 -> ~75-90 std ~= 0.5 -> ~50-70 std >= 1.0 -> low Use a smooth exponential mapping. """ if np.isnan(std_val): return np.nan # scale parameter: std=0.5 -> exp(-1)=0.367 -> ~36.7 scale = 0.5 conf = 100.0 * np.exp(-(std_val / scale)) # clamp return float(np.clip(conf, 0.0, 100.0)) def mode_share_with_tolerance(values: np.ndarray, tol: float) -> tuple[float, float]: """ Compute a 'mode' under tolerance: pick the cluster center (median) and count how many values fall within +/- tol. Return (mode_center, share). This is robust to tiny float differences. """ vals = values[~np.isnan(values)] if len(vals) == 0: return (np.nan, np.nan) center = float(np.median(vals)) share = float(np.mean(np.abs(vals - center) <= tol)) return (center, share) # ----------------------------- # Load predictions from all JSONs # ----------------------------- json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") rows = [] per_file = [] total_entries_all = 0 total_success_all = 0 skipped_all = {"not_success": 0, "missing_uid_or_date": 0} for fp in json_files: with open(fp, "r", encoding="utf-8") as f: data = json.load(f) total_entries = len(data) success_entries = sum(1 for e in data if e.get("success")) total_entries_all += total_entries total_success_all += success_entries skipped = {"not_success": 0, "missing_uid_or_date": 0} loaded = 0 for entry in data: if not entry.get("success"): skipped["not_success"] += 1 continue res = entry.get("result", {}) uid = res.get("unique_id") md = res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": skipped["missing_uid_or_date"] += 1 continue edss = parse_number(res.get("EDSS")) conf = parse_number(res.get("certainty_percent")) it = res.get("iteration", None) rows.append({ "file": os.path.basename(fp), "iteration": it, "unique_id": norm_str(uid), "MedDatum": norm_str(md), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": edss, "llm_conf": conf, }) loaded += 1 skipped_all["not_success"] += skipped["not_success"] skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"] per_file.append({ "file": os.path.basename(fp), "entries_total": total_entries, "entries_success": success_entries, "rows_loaded": loaded, "skipped_not_success": skipped["not_success"], "skipped_missing_uid_or_date": skipped["missing_uid_or_date"], }) df_pred = pd.DataFrame(rows) df_file = pd.DataFrame(per_file) # ----------------------------- # Logs: ingestion # ----------------------------- print("\n" + "="*90) print("EMPIRICAL CONFIDENCE (10-RUN STABILITY) + LLM CONFIDENCE (SECONDARY)") print("="*90) print(f"JSON DIR: {json_dir_path}") print(f"JSON files: {len(json_files)}") print("\n--- INGEST LOG ---") print(f"Total JSON entries: {total_entries_all}") print(f"Total success entries:{total_success_all}") print(f"Pred rows loaded: {len(df_pred)}") print(f"Unique keys in preds: {df_pred['key'].nunique() if len(df_pred) else 0}") print(f"Missing EDSS_pred: {df_pred['EDSS_pred'].isna().sum() if len(df_pred) else 0}") print(f"Missing llm_conf: {df_pred['llm_conf'].isna().sum() if len(df_pred) else 0}") print("Skipped totals:", skipped_all) print("\nPer-file summary (top 10 by name):") print(df_file.sort_values("file").head(10)) # ----------------------------- # Aggregate by key (empirical stability) # ----------------------------- if len(df_pred) == 0: print("[ERROR] No usable prediction rows.") return None # how many runs per key (expect ~10) runs_per_key = df_pred.groupby("key")["EDSS_pred"].size().rename("n_rows").reset_index() print("\n--- RUNS PER KEY LOG ---") print(f"Keys with at least 1 row: {len(runs_per_key)}") print("Distribution of rows per key (value_counts):") print(runs_per_key["n_rows"].value_counts().sort_index()) # Aggregate stats def agg_block(g: pd.DataFrame): ed = g["EDSS_pred"].to_numpy(dtype=float) ll = g["llm_conf"].to_numpy(dtype=float) n_rows = len(g) n_edss = int(np.sum(~np.isnan(ed))) n_llm = int(np.sum(~np.isnan(ll))) ed_mean = float(np.nanmean(ed)) if n_edss else np.nan ed_std = float(np.nanstd(ed, ddof=1)) if n_edss >= 2 else (0.0 if n_edss == 1 else np.nan) ed_iqr = robust_iqr(pd.Series(ed)) mode_center, mode_share = mode_share_with_tolerance(ed, tol=tol_mode) llm_mean = float(np.nanmean(ll)) if n_llm else np.nan llm_std = float(np.nanstd(ll, ddof=1)) if n_llm >= 2 else (0.0 if n_llm == 1 else np.nan) emp_conf = stability_to_confidence(ed_std) if not np.isnan(ed_std) else np.nan # Combined confidence (weighted). If one side missing, fall back to the other. if np.isnan(emp_conf) and np.isnan(llm_mean): comb = np.nan elif np.isnan(emp_conf): comb = llm_mean elif np.isnan(llm_mean): comb = emp_conf else: comb = w_empirical * emp_conf + w_llm * llm_mean return pd.Series({ "unique_id": g["unique_id"].iloc[0], "MedDatum": g["MedDatum"].iloc[0], "n_rows": n_rows, "n_edss": n_edss, "n_llm_conf":n_llm, "EDSS_mean": ed_mean, "EDSS_std": ed_std, "EDSS_iqr": ed_iqr, "EDSS_mode_center": mode_center, "EDSS_mode_share": mode_share, # fraction within ±tol_mode of median center "llm_conf_mean": llm_mean, "llm_conf_std": llm_std, "empirical_conf_0_100": emp_conf, "combined_conf_0_100": float(np.clip(comb, 0.0, 100.0)) if not np.isnan(comb) else np.nan, }) df_agg = df_pred.groupby("key", as_index=False).apply(agg_block) # groupby+apply returns a multiindex sometimes depending on pandas version if isinstance(df_agg.index, pd.MultiIndex): df_agg = df_agg.reset_index(drop=True) # Logs: aggregation + losses print("\n--- AGGREGATION LOG ---") print(f"Aggregated keys: {len(df_agg)}") print(f"Keys with EDSS in >=1 run: {(df_agg['n_edss'] >= 1).sum()}") print(f"Keys with EDSS in >=2 runs (std meaningful): {(df_agg['n_edss'] >= 2).sum()}") print(f"Keys missing EDSS in all runs: {(df_agg['n_edss'] == 0).sum()}") print(f"Keys missing llm_conf in all runs: {(df_agg['n_llm_conf'] == 0).sum()}") # Expected runs check if min_runs_expected is not None: print(f"\nKeys with < {min_runs_expected} rows (potential missing iterations):") print(df_agg.loc[df_agg["n_rows"] < min_runs_expected, ["key", "n_rows"]].sort_values("n_rows").head(20)) # ----------------------------- # Optional: merge GT and compute error on aggregated EDSS_mean # ----------------------------- if ground_truth_path is not None: df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) need = {"unique_id", "MedDatum", gt_edss_col} miss = need - set(df_gt.columns) if miss: raise ValueError(f"GT missing columns: {miss}. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].apply(parse_number) print("\n--- GT MERGE LOG ---") print(f"GT rows: {len(df_gt)} | GT unique keys: {df_gt['key'].nunique()}") print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}") df_final = df_agg.merge(df_gt[["key", "EDSS_gt"]], on="key", how="left", validate="one_to_one") print(f"Aggregated keys with GT match: {df_final['EDSS_gt'].notna().sum()} / {len(df_final)}") print(f"Aggregated keys missing GT EDSS: {df_final['EDSS_gt'].isna().sum()}") df_final["abs_error_mean"] = (df_final["EDSS_mean"] - df_final["EDSS_gt"]).abs() # How many keys usable for evaluation? usable = df_final.dropna(subset=["EDSS_mean", "EDSS_gt"]) print("\n--- EVAL LOG (AGGREGATED) ---") print(f"Keys with both EDSS_mean and EDSS_gt: {len(usable)}") if len(usable) > 0: print(f"MAE on EDSS_mean vs GT: {usable['abs_error_mean'].mean():.3f}") print(f"Median abs error: {usable['abs_error_mean'].median():.3f}") return df_final return df_agg # Example usage: df = build_empirical_confidence_table(json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", w_empirical=0.7, w_llm=0.3, tol_mode=0.5,min_runs_expected=10,) df.to_csv("empirical_confidence_table.csv", index=False) ## # %% Executive Boxplot import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.patches import Patch def plot_exec_boxplots(df, min_bin_size_warn=10): """ Two side-by-side boxplots: - Left: abs_error_mean grouped by empirical_conf_0_100 quantile bins - Right: abs_error_mean grouped by llm_conf_mean quantile bins Adds: - Robust qcut labeling (handles ties; bins may be < 4) - Data logs + per-bin summary table printed - Clear legend explaining each panel and what box elements mean """ need_cols = ["abs_error_mean", "empirical_conf_0_100", "llm_conf_mean"] missing = [c for c in need_cols if c not in df.columns] if missing: raise ValueError(f"Missing columns in df: {missing}. Available: {df.columns.tolist()}") d = df[need_cols].copy() # ----------------------------- # Data logs: survivorship # ----------------------------- d_emp = d.dropna(subset=["abs_error_mean", "empirical_conf_0_100"]).copy() d_llm = d.dropna(subset=["abs_error_mean", "llm_conf_mean"]).copy() print("\n" + "="*90) print("EXECUTIVE BOXPLOTS — DATA LOG + SUMMARY") print("="*90) print(f"Total rows in df: {len(df)}") print(f"Rows for empirical plot: {len(d_emp)} (dropped {len(df) - len(d_emp)})") print(f"Rows for LLM plot: {len(d_llm)} (dropped {len(df) - len(d_llm)})") if len(d_emp) == 0 or len(d_llm) == 0: print("[ERROR] Not enough data after dropping NaNs to build both plots.") return # ----------------------------- # Robust quantile binning (handles ties) # ----------------------------- # Empirical emp_bins = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop") k_emp = emp_bins.cat.categories.size emp_labels = [f"Q{i+1}" for i in range(k_emp)] d_emp["emp_q"] = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop", labels=emp_labels) # LLM llm_bins = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop") k_llm = llm_bins.cat.categories.size llm_labels = [f"Q{i+1}" for i in range(k_llm)] d_llm["llm_q"] = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop", labels=llm_labels) # Print bin edges (so you can discuss exact thresholds) print("\n--- BIN EDGES (actual ranges) ---") print("Empirical confidence bins:") for i, interval in enumerate(emp_bins.cat.categories): print(f" {emp_labels[i]}: {interval}") print("LLM confidence bins:") for i, interval in enumerate(llm_bins.cat.categories): print(f" {llm_labels[i]}: {interval}") # ----------------------------- # Summary tables (per bin) # ----------------------------- def summarize_bins(df_in, bin_col, conf_col, label): g = df_in.groupby(bin_col, observed=True).agg( n=("abs_error_mean", "size"), mae_mean=("abs_error_mean", "mean"), mae_median=("abs_error_mean", "median"), mae_q25=("abs_error_mean", lambda x: x.quantile(0.25)), mae_q75=("abs_error_mean", lambda x: x.quantile(0.75)), conf_mean=(conf_col, "mean"), conf_median=(conf_col, "median"), ).reset_index().rename(columns={bin_col: "bin"}) g["panel"] = label return g[["panel", "bin", "n", "mae_mean", "mae_median", "mae_q25", "mae_q75", "conf_mean", "conf_median"]] summary_emp = summarize_bins(d_emp, "emp_q", "empirical_conf_0_100", "Empirical") summary_llm = summarize_bins(d_llm, "llm_q", "llm_conf_mean", "LLM") print("\n--- SUMMARY TABLE: Empirical confidence quartiles (or fewer if ties) ---") print(summary_emp.to_string(index=False, float_format=lambda x: f"{x:.3f}")) print("\n--- SUMMARY TABLE: LLM confidence quartiles (or fewer if ties) ---") print(summary_llm.to_string(index=False, float_format=lambda x: f"{x:.3f}")) # Warn about small bins small_emp = summary_emp.loc[summary_emp["n"] < min_bin_size_warn, ["bin", "n"]] small_llm = summary_llm.loc[summary_llm["n"] < min_bin_size_warn, ["bin", "n"]] if not small_emp.empty or not small_llm.empty: print(f"\n[WARNING] Some bins have < {min_bin_size_warn} points; compare them cautiously.") if not small_emp.empty: print(" Empirical small bins:") print(small_emp.to_string(index=False)) if not small_llm.empty: print(" LLM small bins:") print(small_llm.to_string(index=False)) # ----------------------------- # Prepare data for boxplots # ----------------------------- emp_cats = list(d_emp["emp_q"].cat.categories) llm_cats = list(d_llm["llm_q"].cat.categories) emp_groups = [d_emp.loc[d_emp["emp_q"] == q, "abs_error_mean"].values for q in emp_cats] llm_groups = [d_llm.loc[d_llm["llm_q"] == q, "abs_error_mean"].values for q in llm_cats] # ----------------------------- # Plot # ----------------------------- fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True) bp0 = axes[0].boxplot(emp_groups, labels=emp_cats, showfliers=False, patch_artist=True) bp1 = axes[1].boxplot(llm_groups, labels=llm_cats, showfliers=False, patch_artist=True) # Make panels visually distinct but still simple (no extra clutter) for patch in bp0["boxes"]: patch.set_alpha(0.6) for patch in bp1["boxes"]: patch.set_alpha(0.6) axes[0].set_title("Error by Empirical Confidence (quantile bins)") axes[0].set_xlabel("Empirical confidence bin") axes[0].set_ylabel("Absolute Error (|EDSS_mean − EDSS_gt|)") axes[1].set_title("Error by LLM Confidence (quantile bins)") axes[1].set_xlabel("LLM confidence bin") for ax in axes: ax.grid(axis="y", linestyle=":", alpha=0.5) # ----------------------------- # Legend (simple, but useful) # ----------------------------- legend_elements = [ Patch(facecolor="white", edgecolor="black", label="Box = IQR (25%–75%)"), Patch(facecolor="white", edgecolor="black", label="Center line = median"), Patch(facecolor="white", edgecolor="black", label="Whiskers = typical range (no outliers shown)"), Patch(facecolor="white", edgecolor="white", label="Left panel: empirical stability bins"), Patch(facecolor="white", edgecolor="white", label="Right panel: LLM self-reported bins"), ] fig.legend(handles=legend_elements, loc="upper center", ncol=3, frameon=True) plt.tight_layout(rect=[0, 0, 1, 0.90]) plt.show() print("\n" + "="*90) print("DONE") print("="*90) # Example (complete): df_final = build_empirical_confidence_table( json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", w_empirical=0.7, w_llm=0.3, tol_mode=0.5, min_runs_expected=10, ) plot_exec_boxplots(df_final) ## # %% Scatter import os, json import numpy as np import pandas as pd import matplotlib.pyplot as plt def scatter_abs_error_by_conf_bins_single_json( json_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") # ---- Load GT df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---- Load preds from JSON with open(json_file_path, "r", encoding="utf-8") as f: data = json.load(f) rows = [] for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_pred = pd.DataFrame(rows) # ---- Merge + filter df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() df["abs_error"] = (df["EDSS_pred"] - df["EDSS_gt"]).abs() # ---- Bin confidence into 4 categories bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) df = df.dropna(subset=["conf_bin"]).copy() # ---- Logs print("\n--- BIN COUNTS (points plotted) ---") print(df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int)) print(f"Total points plotted: {len(df)}") # ---- Scatter (categorical x with jitter) x_map = {lab: i for i, lab in enumerate(labels)} x = df["conf_bin"].map(x_map).astype(float).to_numpy() jitter = np.random.uniform(-0.12, 0.12, size=len(df)) xj = x + jitter plt.figure(figsize=(12, 6)) plt.scatter(xj, df["abs_error"].to_numpy(), alpha=0.55) plt.xticks(range(len(labels)), labels) plt.xlabel("certainty_percent category (Iteration 1)") plt.ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)") plt.title("Absolute Error vs LLM Confidence Category (Single JSON)") plt.grid(axis="y", linestyle=":", alpha=0.5) plt.tight_layout() plt.show() # --- RUN --- scatter_abs_error_by_conf_bins_single_json( json_file_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", ) ## # %% Boxplot2 # Boxplot + light jittered points # - Single JSON (iteration 1) # - X: confidence bin (<70, 70-80, 80-90, 90-100) # - Y: absolute error # - Legend includes n per bin import json import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.patches import Patch def boxplot_with_jitter_abs_error_by_conf_bins_single_json( json_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", jitter_width=0.12, point_alpha=0.25, show_outliers=False, ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") # ---- Load GT df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---- Load preds from JSON with open(json_file_path, "r", encoding="utf-8") as f: data = json.load(f) rows = [] for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_pred = pd.DataFrame(rows) # ---- Merge + filter df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() df["abs_error"] = (df["EDSS_pred"] - df["EDSS_gt"]).abs() # ---- Bin confidence bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) df = df.dropna(subset=["conf_bin"]).copy() # ---- Prepare per-bin arrays bin_arrays = [df.loc[df["conf_bin"] == lab, "abs_error"].to_numpy() for lab in labels] n_counts = [len(a) for a in bin_arrays] # ---- Plot fig, ax = plt.subplots(figsize=(12, 6)) # Boxplot (no fliers by default to reduce clutter) bp = ax.boxplot( bin_arrays, labels=labels, showfliers=show_outliers, patch_artist=True, widths=0.55, ) # Light fill for boxes (no explicit color choices required) for b in bp["boxes"]: b.set_alpha(0.35) # Jittered points on top for i, arr in enumerate(bin_arrays, start=1): if len(arr) == 0: continue x = np.full(len(arr), i, dtype=float) x += np.random.uniform(-jitter_width, jitter_width, size=len(arr)) ax.scatter(x, arr, alpha=point_alpha, s=18) ax.set_title("Absolute Error by LLM Confidence Bin (Iteration 1)") ax.set_xlabel("certainty_percent category") ax.set_ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)") ax.grid(axis="y", linestyle=":", alpha=0.5) # Legend showing n per bin legend_handles = [ Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}") for lab, n in zip(labels, n_counts) ] ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True) plt.tight_layout() plt.show() # Print counts too (useful for discussion) print("\n--- BIN COUNTS (points plotted) ---") for lab, n in zip(labels, n_counts): print(f"{lab:>18}: n={n}") print(f"Total points plotted: {sum(n_counts)}") # Example run: boxplot_with_jitter_abs_error_by_conf_bins_single_json( json_file_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" ) ## # %% Boxplot3 # Boxplot + jitter with SIGNED error (direction) # - Y-axis: signed error = EDSS_pred - EDSS_gt (negative = underestimation, positive = overestimation) # - Also prints per-bin summary (n, mean signed error, median, MAE) import json import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.patches import Patch from matplotlib.lines import Line2D def boxplot_with_jitter_signed_error_by_conf_bins_single_json( json_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", jitter_width=0.12, point_alpha=0.25, show_outliers=False, ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") # ---- Load GT df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---- Load preds from JSON with open(json_file_path, "r", encoding="utf-8") as f: data = json.load(f) rows = [] for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_pred = pd.DataFrame(rows) # ---- Merge + filter df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() # SIGNED ERROR (direction) df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] df["abs_error"] = df["signed_error"].abs() # ---- Bin confidence bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) df = df.dropna(subset=["conf_bin"]).copy() # ---- Prepare arrays bin_arrays = [df.loc[df["conf_bin"] == lab, "signed_error"].to_numpy() for lab in labels] n_counts = [len(a) for a in bin_arrays] # ---- Plot fig, ax = plt.subplots(figsize=(12, 6)) bp = ax.boxplot( bin_arrays, labels=labels, showfliers=show_outliers, patch_artist=True, widths=0.55, ) for b in bp["boxes"]: b.set_alpha(0.35) # Jittered points for i, arr in enumerate(bin_arrays, start=1): if len(arr) == 0: continue x = np.full(len(arr), i, dtype=float) x += np.random.uniform(-jitter_width, jitter_width, size=len(arr)) ax.scatter(x, arr, alpha=point_alpha, s=18) # Zero line to show over/under clearly ax.axhline(0, linewidth=1.5, linestyle="--") ax.set_title("Signed Error by LLM Confidence Bin (Iteration 1)") ax.set_xlabel("certainty_percent category") ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)") ax.grid(axis="y", linestyle=":", alpha=0.5) # Legend with n per bin + zero-line meaning legend_handles = [ Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}") for lab, n in zip(labels, n_counts) ] legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)")) ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True) plt.tight_layout() plt.show() # ---- Print per-bin summary to discuss print("\n--- PER-BIN SUMMARY (points plotted) ---") for lab in labels: sub = df.loc[df["conf_bin"] == lab] n = len(sub) if n == 0: print(f"{lab:>18}: n=0") continue print( f"{lab:>18}: n={n:3d} | " f"mean signed={sub['signed_error'].mean(): .3f} | " f"median signed={sub['signed_error'].median(): .3f} | " f"MAE={sub['abs_error'].mean(): .3f}" ) print(f"Total points plotted: {len(df)}") # Example run: boxplot_with_jitter_signed_error_by_conf_bins_single_json( json_file_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" ) ## # %% jitter and violin 10x10 # Violin + jitter (all JSONs in folder), with signed error # - X: confidence bins (<70, 70-80, 80-90, 90-100) # - Y: signed error = EDSS_pred - EDSS_gt (direction) # - Prints bin counts (n) and puts n into the legend import os, glob, json import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def violin_jitter_signed_error_all_jsons( json_dir_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", jitter_width=0.12, point_alpha=0.20, point_size=10, violin_inner="quartile", # 'quartile', 'box', 'stick', or None ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") # ---- Load GT df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---- Load preds from ALL JSONs json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") rows = [] for fp in json_files: with open(fp, "r", encoding="utf-8") as f: data = json.load(f) for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "file": os.path.basename(fp), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_pred = pd.DataFrame(rows) # ---- Merge + filter df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy() df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] # ---- Bin confidence bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True) df = df.dropna(subset=["conf_bin"]).copy() # ---- Counts + log counts = df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int) print("\n--- BIN COUNTS (all JSONs) ---") for lab in labels: print(f"{lab:>18}: n={counts[lab]}") print(f"Total points plotted: {len(df)}") print(f"JSON files: {len(json_files)}") # Ensure ordering for seaborn df["conf_bin"] = pd.Categorical(df["conf_bin"], categories=labels, ordered=True) # ---- Plot plt.figure(figsize=(12, 6)) # Violin (density) sns.violinplot( data=df, x="conf_bin", y="signed_error", order=labels, inner=violin_inner, cut=0 ) # Jittered points (manual jitter to keep it consistent and fast) x_map = {lab: i for i, lab in enumerate(labels)} x = df["conf_bin"].map(x_map).astype(float).to_numpy() xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df)) plt.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size) # Zero line (over/under split) plt.axhline(0, linestyle="--", linewidth=1.5) plt.xticks(range(len(labels)), labels) plt.xlabel("certainty_percent category (all iterations)") plt.ylabel("Signed Error (EDSS_pred − EDSS_gt)") plt.title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)") plt.grid(axis="y", linestyle=":", alpha=0.5) # Legend with n per bin legend_handles = [ Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={int(counts[lab])}") for lab in labels ] legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)")) plt.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True) plt.tight_layout() plt.show() # Example run: violin_jitter_signed_error_all_jsons( json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" ) ## # %% jitter and violin 10x1 # Adjusted: Violin + jitter (ALL JSONs for points) but X-bins come ONLY from JSON #1 (reference) # Fixes: # 1) Legend has colors matching bins # 2) Legend placed OUTSIDE plot area # 3) X-axis binning uses certainty_percent from JSON1 (by key), then all iterations' points inherit that bin import os, glob, json import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def violin_jitter_signed_error_all_jsons_xbins_from_json1( json_dir_path, json1_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", jitter_width=0.12, point_alpha=0.18, point_size=10, violin_inner="quartile", # 'quartile', 'box', 'stick', or None ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") # ---------------------------- # Load GT # ---------------------------- df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---------------------------- # Load JSON1 and build reference bins by KEY # ---------------------------- with open(json1_file_path, "r", encoding="utf-8") as f: data1 = json.load(f) ref_rows = [] for entry in data1: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue ref_rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "confidence_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_ref = pd.DataFrame(ref_rows) # If JSON1 has duplicates for a key (unlikely, but safe), take the first non-null confidence df_ref = (df_ref.sort_values("confidence_ref") .groupby("key", as_index=False)["confidence_ref"] .apply(lambda s: s.dropna().iloc[0] if s.dropna().any() else np.nan)) if isinstance(df_ref.index, pd.MultiIndex): df_ref = df_ref.reset_index(drop=True) # Confidence bins bins = [0, 70, 80, 90, 100] labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"] df_ref["conf_bin_ref"] = pd.cut(df_ref["confidence_ref"], bins=bins, labels=labels, include_lowest=True) df_ref = df_ref.dropna(subset=["conf_bin_ref"]).copy() # ---------------------------- # Load ALL JSONs (all points) # ---------------------------- json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") rows = [] for fp in json_files: with open(fp, "r", encoding="utf-8") as f: data = json.load(f) for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "file": os.path.basename(fp), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), }) df_pred = pd.DataFrame(rows) # ---------------------------- # Merge: preds + GT + reference bins (from JSON1) # ---------------------------- df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.merge(df_ref[["key", "conf_bin_ref"]], on="key", how="inner", validate="many_to_one") # filter for plotting df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "conf_bin_ref"]).copy() df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] # ordering df["conf_bin_ref"] = pd.Categorical(df["conf_bin_ref"], categories=labels, ordered=True) # ---------------------------- # Logs + counts # ---------------------------- counts = df["conf_bin_ref"].value_counts().reindex(labels).fillna(0).astype(int) print("\n--- BIN COUNTS (ALL JSON points, binned by JSON1 confidence) ---") for lab in labels: print(f"{lab:>18}: n={int(counts[lab])}") print(f"Total points plotted: {len(df)}") print(f"JSON files used for points: {len(json_files)}") print(f"Reference JSON1 bins derived from: {os.path.basename(json1_file_path)}") print(f"Keys in reference (after binning & non-null): {df_ref['key'].nunique()}") # ---------------------------- # Colors + legend patches # ---------------------------- palette = sns.color_palette("Blues", n_colors=len(labels)) bin_colors = {lab: palette[i] for i, lab in enumerate(labels)} legend_handles = [ Patch(facecolor=bin_colors[lab], edgecolor="black", label=f"{lab}: n={int(counts[lab])}") for lab in labels ] legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)")) # ---------------------------- # Plot (legend outside) # ---------------------------- fig, ax = plt.subplots(figsize=(12.5, 6)) sns.violinplot( data=df, x="conf_bin_ref", y="signed_error", order=labels, inner=violin_inner, cut=0, palette=[bin_colors[l] for l in labels], ax=ax, ) # jittered points (manual jitter) x_map = {lab: i for i, lab in enumerate(labels)} x = df["conf_bin_ref"].map(x_map).astype(float).to_numpy() xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df)) ax.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size) ax.axhline(0, linestyle="--", linewidth=1.5) ax.set_xlabel("certainty_percent category (from JSON 1 as reference)") ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)") ax.set_title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)\nBinned by JSON 1 certainty_percent") ax.grid(axis="y", linestyle=":", alpha=0.5) # Legend outside (right) ax.legend( handles=legend_handles, title="Bin counts", loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=True ) plt.tight_layout() plt.show() # Example run: json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" violin_jitter_signed_error_all_jsons_xbins_from_json1( json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", json1_file_path=json1_path, ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv") ## # %% Coorelation # Correlation plot (RAW certainty_percent) vs error # - Uses ALL JSONs as points # - Uses JSON1 certainty_percent as the x-value reference (per key) # - Y can be abs_error or signed_error (choose with y_mode) # - Prints Spearman + Pearson correlations # - Adds a simple linear trend line import os, glob, json import numpy as np import pandas as pd import matplotlib.pyplot as plt def correlation_scatter_raw_certainty_json1_reference( json_dir_path, json1_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", y_mode="abs", # "abs" or "signed" point_alpha=0.18, point_size=12, ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") def rankdata(a): # Average-rank for ties (Spearman needs ranks) s = pd.Series(a) return s.rank(method="average").to_numpy() # ---------------------------- # Load GT # ---------------------------- df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---------------------------- # Load JSON1 reference certainty_percent (per key) # ---------------------------- with open(json1_file_path, "r", encoding="utf-8") as f: data1 = json.load(f) ref_rows = [] for entry in data1: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue ref_rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_ref = pd.DataFrame(ref_rows) # Deduplicate keys if needed: take first non-null certainty df_ref = (df_ref.dropna(subset=["certainty_ref"]) .groupby("key", as_index=False)["certainty_ref"] .first()) # ---------------------------- # Load ALL JSON predictions (points) # ---------------------------- json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") rows = [] for fp in json_files: with open(fp, "r", encoding="utf-8") as f: data = json.load(f) for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "file": os.path.basename(fp), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), }) df_pred = pd.DataFrame(rows) # ---------------------------- # Merge: preds + GT + JSON1 reference certainty # ---------------------------- df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one") # Filter needed fields df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy() df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] df["abs_error"] = df["signed_error"].abs() y_col = "abs_error" if y_mode == "abs" else "signed_error" # ---------------------------- # Logs # ---------------------------- print("\n" + "="*90) print("CORRELATION: RAW certainty_percent (JSON1 reference) vs ERROR (ALL JSON points)") print("="*90) print(f"JSON DIR (points): {json_dir_path} | files: {len(json_files)}") print(f"JSON1 reference: {os.path.basename(json1_file_path)}") print(f"Points available after merge+filter: {len(df)}") print(f"Unique keys in plot: {df['key'].nunique()}") print(f"Y mode: {y_mode} ({y_col})") # ---------------------------- # Correlations (Pearson + Spearman) # ---------------------------- x = df["certainty_ref"].to_numpy(dtype=float) y = df[y_col].to_numpy(dtype=float) # Pearson pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan # Spearman = Pearson corr of ranks rx = rankdata(x) ry = rankdata(y) spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan print(f"\nPearson r: {pearson:.4f}") print(f"Spearman ρ: {spearman:.4f}") # ---------------------------- # Trend line (simple linear fit) # ---------------------------- # Fit y = a*x + b if len(df) >= 2: a, b = np.polyfit(x, y, 1) else: a, b = np.nan, np.nan # ---------------------------- # Plot # ---------------------------- plt.figure(figsize=(12, 6)) plt.scatter(x, y, alpha=point_alpha, s=point_size) # trend line across full x-range if np.isfinite(a) and np.isfinite(b): xs = np.linspace(np.nanmin(x), np.nanmax(x), 200) plt.plot(xs, a * xs + b, linestyle="--", linewidth=2) plt.xlabel("certainty_percent (from JSON 1, per key)") ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)" plt.ylabel(ylabel) plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n" f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}") plt.grid(linestyle=":", alpha=0.5) plt.tight_layout() plt.show() # Example run: json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" correlation_scatter_raw_certainty_json1_reference( json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", json1_file_path=json1_path, ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", y_mode="abs" # or "signed" ) ## # %% Correlation adjusted # Correlation scatter (RAW certainty_percent from JSON1) vs error (all JSON points) # Adds: # 1) Legend (points, trend line) + Pearson/Spearman shown in legend and title # 2) Trend line color set to high-contrast (black by default) # 3) Density coloring: dots colored by local point density (bluer = more cases) + colorbar import os, glob, json import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.lines import Line2D from matplotlib.colors import LogNorm, PowerNorm def correlation_scatter_raw_certainty_json1_reference( json_dir_path, json1_file_path, ground_truth_path, gt_sep=";", gt_edss_col="EDSS", y_mode="abs", # "abs" or "signed" point_alpha=0.85, # higher alpha works better with density coloring point_size=14, trend_color="black", # high-contrast line save_svg_path=None, dpi=300 ): def norm_str(x): return str(x).strip().lower() def parse_edss(x): if x is None: return np.nan s = str(x).strip() if s == "" or s.lower() in {"nan", "none", "null"}: return np.nan s = s.replace(",", ".") return pd.to_numeric(s, errors="coerce") def rankdata(a): return pd.Series(a).rank(method="average").to_numpy() # ---------------------------- # Load GT # ---------------------------- df_gt = pd.read_csv(ground_truth_path, sep=gt_sep) for col in ["unique_id", "MedDatum", gt_edss_col]: if col not in df_gt.columns: raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}") df_gt["unique_id"] = df_gt["unique_id"].map(norm_str) df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str) df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"] df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss) # ---------------------------- # Load JSON1 reference certainty_percent (per key) # ---------------------------- with open(json1_file_path, "r", encoding="utf-8") as f: data1 = json.load(f) ref_rows = [] for entry in data1: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue ref_rows.append({ "key": norm_str(uid) + "_" + norm_str(md), "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"), }) df_ref = pd.DataFrame(ref_rows) df_ref = (df_ref.dropna(subset=["certainty_ref"]) .groupby("key", as_index=False)["certainty_ref"] .first()) # ---------------------------- # Load ALL JSON predictions (points) # ---------------------------- json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json"))) if not json_files: raise FileNotFoundError(f"No JSON files found in: {json_dir_path}") rows = [] for fp in json_files: with open(fp, "r", encoding="utf-8") as f: data = json.load(f) for entry in data: if not entry.get("success"): continue res = entry.get("result", {}) uid, md = res.get("unique_id"), res.get("MedDatum") if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "": continue rows.append({ "file": os.path.basename(fp), "key": norm_str(uid) + "_" + norm_str(md), "EDSS_pred": parse_edss(res.get("EDSS")), }) df_pred = pd.DataFrame(rows) # ---------------------------- # Merge: preds + GT + JSON1 reference certainty # ---------------------------- df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one") df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one") df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy() df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"] df["abs_error"] = df["signed_error"].abs() y_col = "abs_error" if y_mode == "abs" else "signed_error" # ---------------------------- # Correlations # ---------------------------- x = df["certainty_ref"].to_numpy(dtype=float) y = df[y_col].to_numpy(dtype=float) pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan rx, ry = rankdata(x), rankdata(y) spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan # ---------------------------- # Trend line (linear fit) # ---------------------------- if len(df) >= 2: a, b = np.polyfit(x, y, 1) else: a, b = np.nan, np.nan # ---------------------------- # Density coloring (2D histogram bin counts) # "how blue" = how many points are around that location # ---------------------------- # Choose binning resolution (balanced for ~thousands of points) x_bins = 50 y_bins = 50 # Compute bin index per point x_edges = np.linspace(np.nanmin(x), np.nanmax(x), x_bins + 1) y_edges = np.linspace(np.nanmin(y), np.nanmax(y), y_bins + 1) xi = np.clip(np.digitize(x, x_edges) - 1, 0, x_bins - 1) yi = np.clip(np.digitize(y, y_edges) - 1, 0, y_bins - 1) # 2D counts counts2d = np.zeros((x_bins, y_bins), dtype=int) for i in range(len(x)): counts2d[xi[i], yi[i]] += 1 # density per point = count of its bin density = np.array([counts2d[xi[i], yi[i]] for i in range(len(x))], dtype=float) # Plot low density first, high density last (so dense points are visible) order = np.argsort(density) x_o, y_o, d_o = x[order], y[order], density[order] # ... keep everything above the "Plot" section identical ... # ---------------------------- # Plot (IMPROVED COLORS) # ---------------------------- fig, ax = plt.subplots(figsize=(12.5, 6)) # Option A (recommended): logarithmic color scaling # Add +1 to avoid log(0) d_plot = d_o + 1 # clip vmax so one extreme bin doesn't wash everything out vmax = np.percentile(d_plot, 99) # try 95 or 99 depending on your data norm = LogNorm(vmin=1, vmax=max(2, vmax)) sc = ax.scatter( x_o, y_o, c=d_plot, cmap="Blues", norm=norm, s=point_size, alpha=point_alpha, linewidths=0 ) # Trend line (black) if np.isfinite(a) and np.isfinite(b): xs = np.linspace(np.nanmin(x), np.nanmax(x), 200) ax.plot(xs, a * xs + b, linestyle="--", linewidth=2.5, color=trend_color) ax.set_xlabel("certainty percent") ax.set_ylabel("Absolute Error" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)") # ax.set_title( # f"Correlation: JSON1 certainty_percent vs {y_col} (All iterations)\n" # f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}" # ) ax.grid(linestyle=":", alpha=0.5) # Colorbar cbar = plt.colorbar(sc, ax=ax) cbar.set_label("Local density (count of cases in bin, log-scaled)") # Legend legend_items = [ Line2D([0], [0], marker="o", linestyle="None", color="navy", label=f"Data points (n={len(df)})"), Line2D([0], [0], linestyle="--", color=trend_color, linewidth=2.5, label=f"Linear trend (Pearson r={pearson:.3f})"), ] ax.legend(handles=legend_items, loc="upper right", frameon=True, title="Legend") plt.tight_layout() # Save as SVG (optional) if save_svg_path: fig.savefig(save_svg_path, format="svg", bbox_inches="tight", dpi=dpi) print(f"[SAVED] {save_svg_path}") plt.show() json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" correlation_scatter_raw_certainty_json1_reference( json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", json1_file_path=json1_path, ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", y_mode="abs", # save_svg_path="/home/shahin/Lab/Doktorarbeit/Barcelona/results/corr_json1_abs_error.svg" ) ##