# EDSS extraction evaluation script (notebook-style cells, "# %%" markers)
# %% Confirm EDSS missing
|
||
import pandas as pd
|
||
import numpy as np
|
||
|
||
def clean_series(s):
    """Normalize a pandas Series for key building: cast to str, trim, lowercase."""
    as_text = s.astype(str)
    return as_text.str.strip().str.lower()
||
def gt_edss_audit(ground_truth_path, edss_col="EDSS"):
    """Load the ground-truth CSV, build unique_id+MedDatum keys, and print an
    audit of missing EDSS values and duplicated keys.

    Returns the ground-truth DataFrame augmented with a 'key' column and,
    when *edss_col* exists, a numeric-coerced '_edss_num' column.
    """
    df_gt = pd.read_csv(ground_truth_path, sep=';')

    # normalize keys
    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])
    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']

    print("GT rows:", len(df_gt))
    print("GT unique keys:", df_gt['key'].nunique())

    # Guard clause: nothing to audit without the EDSS column.
    if edss_col not in df_gt.columns:
        print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist())
        return df_gt

    # IMPORTANT: parse EDSS robustly (German decimal commas etc.)
    normalized_text = df_gt[edss_col].astype(str).str.replace(",", ".", regex=False).str.strip()
    df_gt["_edss_num"] = pd.to_numeric(normalized_text, errors="coerce")

    print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}")

    # duplicates on key
    dup_mask = df_gt['key'].duplicated(keep=False)
    print("GT duplicate-key rows:", dup_mask.sum())
    if dup_mask.any():
        # how many duplicate keys exist?
        print("GT duplicate keys:", df_gt.loc[dup_mask, 'key'].nunique())
        # of duplicate-key rows, how many have missing EDSS?
        print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup_mask, "_edss_num"].isna().sum())

        # show the worst offenders
        print("\nTop duplicate keys (by count):")
        print(df_gt.loc[dup_mask, 'key'].value_counts().head(10))

    return df_gt
||
# Run the audit on the local ground-truth export; later cells reuse df_gt.
df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS")


##
|
||
|
||
|
||
|
||
# %% trace missing ones
|
||
|
||
import json, glob, os
|
||
import pandas as pd
|
||
|
||
def load_preds(json_dir_path):
    """Collect (unique_id, MedDatum) keys from every successful entry of every
    JSON results file in *json_dir_path*.

    Returns a DataFrame with columns unique_id / MedDatum / file / key, where
    key = "<unique_id>_<MedDatum>" (both stripped and lowercased). The frame
    keeps its columns even when no usable entry exists.
    """
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        file_name = os.path.basename(file_path)
        for entry in data:
            # keep only entries the extraction pipeline flagged as successful
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    "unique_id": str(res.get("unique_id")).strip().lower(),
                    "MedDatum": str(res.get("MedDatum")).strip().lower(),
                    "file": file_name
                })
    # Fix: pass explicit columns so an empty result still has the expected
    # schema — otherwise df_pred["unique_id"] below raises KeyError when no
    # JSON file / successful entry was found.
    df_pred = pd.DataFrame(all_preds, columns=["unique_id", "MedDatum", "file"])
    df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"]
    return df_pred
||
# Load every iteration's predictions and flag rows whose GT EDSS is missing.
df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration")
print("Pred rows:", len(df_pred))
print("Pred unique keys:", df_pred["key"].nunique())

# Suppose df_gt was returned from step 1 and has _edss_num + key
missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"])

# mark prediction rows whose ground-truth EDSS could not be coerced to numeric
df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys)

print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum())
print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique())

print("\nTop files contributing to missing-GT-EDSS rows:")
print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20))

# the same key recurs once per iteration file, which inflates raw row counts
print("\nTop keys replicated in predictions (why count inflates):")
print(df_pred.loc[df_pred["gt_key_missing_edss"], "key"].value_counts().head(20))


##
||
# %% verify
|
||
|
||
# Cross-check: left-join predictions onto GT by key; validate= guarantees the
# join cannot silently duplicate rows if GT keys are not unique.
merged = df_pred.merge(
    df_gt[["key", "_edss_num"]],  # use the numeric-coerced GT EDSS
    on="key",
    how="left",
    validate="many_to_one"  # will ERROR if GT has duplicate keys (GOOD!)
)

print("Merged rows:", len(merged))
print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum())


##
||
# %% 1json (rewritten with robust parsing + detailed data log)
|
||
import pandas as pd
|
||
import numpy as np
|
||
import json
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from matplotlib.patches import Patch
|
||
from matplotlib.lines import Line2D
|
||
|
||
def plot_single_json_error_analysis_with_log(
    json_file_path,
    ground_truth_path,
    edss_gt_col="EDSS",
    min_bin_count=5,
):
    """Audit ONE prediction JSON against the ground-truth CSV and plot mean
    absolute EDSS error per LLM-confidence bracket.

    Every stage that can silently lose rows is logged: GT duplicates and
    unparseable EDSS, unusable JSON entries, prediction keys absent from GT,
    the inner merge, and the completeness filter. Produces a bar chart with
    SEM error bars and a linear trend line. Returns None (output is printed
    and plotted only; returns early if nothing usable remains).

    Parameters
    ----------
    json_file_path : path to one results JSON — a list of entries, each with
        a "success" flag and a "result" dict.
    ground_truth_path : semicolon-separated CSV with unique_id, MedDatum and
        the EDSS column.
    edss_gt_col : name of the EDSS column in the ground truth.
    min_bin_count : bins with fewer rows than this are flagged as unstable.
    """
    def norm_str(x):
        # normalize identifiers and dates consistently
        return str(x).strip().lower()

    def parse_edss(x):
        # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    print("\n" + "="*80)
    print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)")
    print("="*80)
    print(f"JSON: {json_file_path}")
    print(f"GT: {ground_truth_path}")

    # ------------------------------------------------------------------
    # 1) Load Ground Truth
    # ------------------------------------------------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=";")

    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
    missing_cols = required_gt_cols - set(df_gt.columns)
    if missing_cols:
        raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]

    # Robust EDSS parsing (important!)
    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)

    # GT logs
    print("\n--- GT LOG ---")
    print(f"GT rows: {len(df_gt)}")
    print(f"GT unique keys: {df_gt['key'].nunique()}")
    gt_dup = df_gt["key"].duplicated(keep=False).sum()
    print(f"GT duplicate-key rows: {gt_dup}")
    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")

    if gt_dup > 0:
        print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:")
        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))

    # ------------------------------------------------------------------
    # 2) Load Predictions from the specific JSON
    # ------------------------------------------------------------------
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_entries = len(data)
    success_entries = sum(1 for e in data if e.get("success"))

    all_preds = []
    # skip-reason counters; note missing_edss/missing_conf rows are still
    # appended below (only counted here), unlike the two hard skips
    skipped = {
        "not_success": 0,
        "missing_uid_or_date": 0,
        "missing_edss": 0,
        "missing_conf": 0,
    }

    for entry in data:
        if not entry.get("success"):
            skipped["not_success"] += 1
            continue

        res = entry.get("result", {})
        uid = res.get("unique_id")
        md = res.get("MedDatum")

        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            skipped["missing_uid_or_date"] += 1
            continue

        edss_pred = parse_edss(res.get("EDSS"))
        conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce")

        if pd.isna(edss_pred):
            skipped["missing_edss"] += 1
        if pd.isna(conf):
            skipped["missing_conf"] += 1

        all_preds.append({
            "unique_id": norm_str(uid),
            "MedDatum": norm_str(md),
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": edss_pred,
            "confidence": conf,
        })

    df_pred = pd.DataFrame(all_preds)

    # Pred logs
    print("\n--- PRED LOG ---")
    print(f"JSON total entries: {total_entries}")
    print(f"JSON success entries: {success_entries}")
    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows found. Nothing to plot.")
        return

    print(f"Pred unique keys: {df_pred['key'].nunique()}")
    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
    print("Skipped counts:", skipped)

    # Are keys duplicated within this JSON? (often yes if multiple notes map to same key)
    key_counts = df_pred["key"].value_counts()
    dup_pred_rows = (key_counts > 1).sum()
    max_rep = int(key_counts.max())
    print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}")
    print(f"Max repetitions of a single key in this JSON: {max_rep}")
    if max_rep > 1:
        print("Top repeated keys in this JSON:")
        print(key_counts.head(10))

    # ------------------------------------------------------------------
    # 3) Merge (and diagnose why rows drop)
    # ------------------------------------------------------------------
    # Diagnose how many pred keys exist in GT
    gt_key_set = set(df_gt["key"])
    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]

    print("\n--- KEY MATCH LOG ---")
    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
    print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}")
    if len(not_in_gt) > 0:
        print("[WARNING] Some prediction keys are not present in GT. First 10:")
        print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10))

    # Now merge; we expect GT is one-to-many with pred (many_to_one)
    # If GT had duplicates, validate would raise.
    df_merged = df_pred.merge(
        df_gt[["key", "EDSS_gt"]],
        on="key",
        how="inner",
        validate="many_to_one"
    )

    print("\n--- MERGE LOG ---")
    print(f"Merged rows (inner join): {len(df_merged)}")
    print(f"Merged unique keys: {df_merged['key'].nunique()}")
    print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}")
    print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}")
    print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}")

    # How many rows will be removed by dropna() in your old code?
    # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too.
    rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
    print("\n--- FILTER LOG (what will be used for stats/plot) ---")
    print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}")
    if len(rows_complete) == 0:
        print("[ERROR] No complete rows after filtering. Nothing to plot.")
        return

    # Compute abs error
    rows_complete = rows_complete.copy()
    rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs()

    # ------------------------------------------------------------------
    # 4) Binning + stats (with guardrails)
    # ------------------------------------------------------------------
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]

    # Confidence outside bins becomes NaN; log it
    rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True)
    conf_outside = rows_complete["conf_bin"].isna().sum()
    print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}")
    if conf_outside > 0:
        print("Example confidences outside bins:")
        print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list())

    df_plot = rows_complete.dropna(subset=["conf_bin"])
    # reindex(labels) keeps empty brackets visible as NaN rows in the stats
    stats = (
        df_plot.groupby("conf_bin", observed=True)["abs_error"]
        .agg(mean="mean", std="std", count="count")
        .reindex(labels)
        .reset_index()
    )

    print("\n--- BIN STATS ---")
    print(stats)

    # Warn about low counts
    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
    if not low_bins.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:")
        print(low_bins)

    # ------------------------------------------------------------------
    # 5) Plot
    # ------------------------------------------------------------------
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    # Replace NaNs in mean for plotting bars (empty bins)
    means = stats["mean"].to_numpy()
    counts = stats["count"].fillna(0).astype(int).to_numpy()
    stds = stats["std"].to_numpy()

    # For bins with no data, bar height 0 (and no errorbar)
    means_plot = np.nan_to_num(means, nan=0.0)

    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85)

    # Error bars only where count>1 and std is not NaN
    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5)

    # Trend line only if at least 2 non-empty bins
    valid_idx = np.where(~np.isnan(means))[0]
    if len(valid_idx) >= 2:
        x_idx = np.arange(len(labels))
        # least-squares line fitted over the non-empty bins only
        z = np.polyfit(valid_idx, means[valid_idx], 1)
        p = np.poly1d(z)
        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5)
        trend_label = "Trend Line"
    else:
        trend_label = "Trend Line (insufficient bins)"
        print("\n[INFO] Not enough non-empty bins to fit a trend line.")

    # Data labels
    for i, bar in enumerate(bars):
        n_count = int(counts[i])
        mae_val = means[i]
        if np.isnan(mae_val) or n_count == 0:
            txt = "empty"
            y = 0.02
        else:
            txt = f"MAE: {mae_val:.2f}\nn={n_count}"
            y = bar.get_height() + 0.04
        plt.text(
            bar.get_x() + bar.get_width()/2,
            y,
            txt,
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=10
        )

    # Legend
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label),
        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"),
        Patch(color="none", label="Metric: Mean Absolute Error (MAE)")
    ]
    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend")

    plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30)
    plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12)
    plt.xlabel("LLM Confidence Bracket", fontsize=12)
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
    plt.ylim(0, max(0.5, float(ymax) + 0.6))
    plt.tight_layout()
    plt.show()

    print("\n" + "="*80)
    print("DONE")
    print("="*80)
||
# --- RUN ---
# NOTE(review): hard-coded local paths — adjust before running elsewhere.
json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"

plot_single_json_error_analysis_with_log(json_path, gt_path)




##
|
||
|
||
# %% Certainty vs Delta (rewritten with robust parsing + detailed data loss logs)
|
||
import pandas as pd
|
||
import numpy as np
|
||
import json
|
||
import glob
|
||
import os
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from matplotlib.patches import Patch
|
||
from matplotlib.lines import Line2D
|
||
|
||
def plot_confidence_vs_abs_error_with_log(
    json_dir_path,
    ground_truth_path,
    edss_gt_col="EDSS",
    min_bin_count=5,
    include_lowest=True,
):
    """Pool predictions from ALL JSON files in *json_dir_path* (one per
    iteration), compare against the ground-truth CSV, and plot mean absolute
    EDSS error per LLM-confidence bracket.

    Every stage that can lose rows is logged: per-file load/skip counts,
    GT duplicates and unparseable EDSS, keys absent from GT, the inner
    merge, the completeness filter (with overlap-aware loss reasons), and
    out-of-bin confidences. Returns None; output is printed and plotted,
    with early returns when nothing usable remains.

    Parameters
    ----------
    json_dir_path : directory containing the iteration result JSONs.
    ground_truth_path : semicolon-separated CSV with unique_id, MedDatum
        and the EDSS column.
    edss_gt_col : name of the EDSS column in the ground truth.
    min_bin_count : bins with fewer rows than this are flagged as unstable.
    include_lowest : forwarded to pd.cut — whether confidence 0 falls into
        the first bracket.
    """
    def norm_str(x):
        return str(x).strip().lower()

    def parse_edss(x):
        # robust numeric parse: handles comma decimals and empty tokens
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    print("\n" + "="*90)
    print("CERTAINTY vs ABS ERROR (ALL JSONs) — WITH DATA LOSS LOG")
    print("="*90)
    print(f"JSON DIR: {json_dir_path}")
    print(f"GT FILE: {ground_truth_path}")

    # ------------------------------------------------------------------
    # 1) Load GT
    # ------------------------------------------------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=";")
    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
    missing_cols = required_gt_cols - set(df_gt.columns)
    if missing_cols:
        raise ValueError(f"GT missing columns: {missing_cols}. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)

    # GT logs
    print("\n--- GT LOG ---")
    print(f"GT rows: {len(df_gt)}")
    print(f"GT unique keys: {df_gt['key'].nunique()}")
    gt_dup_rows = df_gt["key"].duplicated(keep=False).sum()
    print(f"GT duplicate-key rows: {gt_dup_rows}")
    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")
    if gt_dup_rows > 0:
        print("\n[WARNING] GT has duplicate keys; merge can explode rows. Top duplicate keys:")
        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))

    gt_key_set = set(df_gt["key"])

    # ------------------------------------------------------------------
    # 2) Load predictions from all JSON files (with per-file logs)
    # ------------------------------------------------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    all_preds = []
    per_file_summary = []

    total_entries_all = 0
    total_success_all = 0
    skipped_all = {"not_success": 0, "missing_uid_or_date": 0}

    for file_path in json_files:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        total_entries = len(data)
        success_entries = sum(1 for e in data if e.get("success"))

        total_entries_all += total_entries
        total_success_all += success_entries

        # per-file skip counters (rolled into skipped_all below)
        skipped = {"not_success": 0, "missing_uid_or_date": 0}
        loaded_rows = 0

        for entry in data:
            if not entry.get("success"):
                skipped["not_success"] += 1
                continue
            res = entry.get("result", {})
            uid = res.get("unique_id")
            md = res.get("MedDatum")

            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                skipped["missing_uid_or_date"] += 1
                continue

            all_preds.append({
                "file": os.path.basename(file_path),
                "unique_id": norm_str(uid),
                "MedDatum": norm_str(md),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
                "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
            })
            loaded_rows += 1

        skipped_all["not_success"] += skipped["not_success"]
        skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"]

        per_file_summary.append({
            "file": os.path.basename(file_path),
            "entries_total": total_entries,
            "entries_success": success_entries,
            "pred_rows_loaded": loaded_rows,
            "skipped_not_success": skipped["not_success"],
            "skipped_missing_uid_or_date": skipped["missing_uid_or_date"],
        })

    df_pred = pd.DataFrame(all_preds)
    df_file = pd.DataFrame(per_file_summary)

    # PRED logs
    print("\n--- PRED LOG (ALL FILES) ---")
    print(f"JSON files found: {len(json_files)}")
    print(f"Total JSON entries: {total_entries_all}")
    print(f"Total success entries:{total_success_all}")
    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows found. Nothing to plot.")
        return

    print(f"Pred unique keys (across all files): {df_pred['key'].nunique()}")
    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
    print("Skipped totals:", skipped_all)

    # show per-file quick check (useful when one iteration is broken)
    print("\nPer-file loaded rows (head):")
    print(df_file.sort_values("file").head(10))

    # ------------------------------------------------------------------
    # 3) Key match log (pred -> GT)
    # ------------------------------------------------------------------
    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]

    print("\n--- KEY MATCH LOG ---")
    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
    print(f"Pred rows with key NOT in GT: {len(not_in_gt)}")
    if len(not_in_gt) > 0:
        print("[WARNING] Example keys not found in GT (first 10):")
        print(not_in_gt[["file", "unique_id", "MedDatum", "key"]].head(10))
        print("\n[WARNING] Files contributing most to key-mismatch:")
        print(not_in_gt["file"].value_counts().head(10))

    # ------------------------------------------------------------------
    # 4) Merge (no dropna yet) + detailed data loss accounting
    # ------------------------------------------------------------------
    df_merged = df_pred.merge(
        df_gt[["key", "EDSS_gt"]],
        on="key",
        how="inner",
        validate="many_to_one"  # catches GT duplicates
    )

    print("\n--- MERGE LOG ---")
    print(f"Merged rows (inner join): {len(df_merged)}")
    print(f"Merged unique keys: {df_merged['key'].nunique()}")

    # Now quantify what you lose at each filter stage
    n0 = len(df_merged)

    miss_gt = df_merged["EDSS_gt"].isna()
    miss_pred = df_merged["EDSS_pred"].isna()
    miss_conf = df_merged["confidence"].isna()

    print("\n--- MISSINGNESS IN MERGED ---")
    print(f"Missing GT EDSS: {miss_gt.sum()}")
    print(f"Missing Pred EDSS: {miss_pred.sum()}")
    print(f"Missing Confidence: {miss_conf.sum()}")

    # IMPORTANT: your old code used .dropna() with no subset => drops if ANY column is NaN.
    # We'll replicate the intended logic explicitly and log counts.
    df_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
    n1 = len(df_complete)
    print("\n--- FILTER LOG ---")
    print(f"Rows before filtering: {n0}")
    print(f"Rows after requiring EDSS_gt, EDSS_pred, confidence: {n1}")
    print(f"Rows lost due to missing required fields: {n0 - n1}")

    # Break down why rows were lost (overlap-aware)
    lost_mask = df_merged[["EDSS_gt", "EDSS_pred", "confidence"]].isna().any(axis=1)
    lost = df_merged.loc[lost_mask].copy()
    if len(lost) > 0:
        # build a reason tag like "GT", "PRED", "GT+CONF" per lost row
        lost_reason = (
            (lost["EDSS_gt"].isna()).astype(int).map({1:"GT",0:""}) +
            (lost["EDSS_pred"].isna()).astype(int).map({1:"+PRED",0:""}) +
            (lost["confidence"].isna()).astype(int).map({1:"+CONF",0:""})
        )
        # strip the leading "+" when GT was not part of the reason
        lost["loss_reason"] = lost_reason.str.replace(r"^\+", "", regex=True).replace("", "UNKNOWN")
        print("\nTop loss reasons (overlap-aware):")
        print(lost["loss_reason"].value_counts().head(10))

        print("\nFiles contributing most to lost rows:")
        print(lost["file"].value_counts().head(10))

    if len(df_complete) == 0:
        print("[ERROR] No complete rows left after filtering. Nothing to plot.")
        return

    # ------------------------------------------------------------------
    # 5) Abs error + binning
    # ------------------------------------------------------------------
    df_complete = df_complete.copy()
    df_complete["abs_error"] = (df_complete["EDSS_pred"] - df_complete["EDSS_gt"]).abs()

    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]

    df_complete["conf_bin"] = pd.cut(
        df_complete["confidence"],
        bins=bins,
        labels=labels,
        include_lowest=include_lowest
    )

    conf_outside = df_complete["conf_bin"].isna().sum()
    print("\n--- BINNING LOG ---")
    print(f"Rows with confidence outside bin edges / invalid: {conf_outside}")
    if conf_outside > 0:
        print("Example out-of-bin confidences:")
        print(df_complete.loc[df_complete["conf_bin"].isna(), "confidence"].head(20).to_list())

    df_plot = df_complete.dropna(subset=["conf_bin"])
    print(f"Rows kept for bin stats/plot (after dropping out-of-bin): {len(df_plot)}")
    print(f"Rows lost due to out-of-bin confidence: {len(df_complete) - len(df_plot)}")

    # reindex(labels) keeps empty brackets visible as NaN rows in the stats
    stats = (
        df_plot.groupby("conf_bin", observed=True)["abs_error"]
        .agg(mean="mean", std="std", count="count")
        .reindex(labels)
        .reset_index()
    )

    print("\n--- BIN STATS ---")
    print(stats)

    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
    if not low_bins.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_count} rows (unstable SEM/trend):")
        print(low_bins)

    # ------------------------------------------------------------------
    # 6) Plot
    # ------------------------------------------------------------------
    plt.figure(figsize=(12, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    means = stats["mean"].to_numpy()
    counts = stats["count"].fillna(0).astype(int).to_numpy()
    stds = stats["std"].to_numpy()

    # empty bins plot as height-0 bars without error bars
    means_plot = np.nan_to_num(means, nan=0.0)
    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", linewidth=1.2)

    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=6, elinewidth=1.5)

    # Trend line only if >=2 non-empty bins
    valid_idx = np.where(~np.isnan(means))[0]
    if len(valid_idx) >= 2:
        x_idx = np.arange(len(labels))
        z = np.polyfit(valid_idx, means[valid_idx], 1)
        p = np.poly1d(z)
        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=2.5)
        trend_label = "Correlation Trend"
    else:
        trend_label = "Correlation Trend (insufficient bins)"
        print("\n[INFO] Not enough non-empty bins to fit a trend line.")

    # Bar annotations (MAE + n)
    for i, bar in enumerate(bars):
        n = int(counts[i])
        m = means[i]
        if n == 0 or np.isnan(m):
            txt = "empty"
            y = 0.02
        else:
            txt = f"MAE: {m:.2f}\nn={n}"
            y = bar.get_height() + 0.05
        plt.text(bar.get_x() + bar.get_width()/2, y, txt, ha="center", fontweight="bold")

    legend_elements = [
        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Standard Error (SEM)"),
        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=2.5, label=trend_label),
        Patch(color="none", label="Metric: Mean Absolute Error (MAE)")
    ]
    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, fontsize=10, title="Legend")

    plt.title("Validation: Inverse Correlation of Confidence vs. Error Magnitude", fontsize=15, pad=20)
    plt.ylabel("Mean Absolute Error (Δ EDSS Points)", fontsize=12)
    plt.xlabel("LLM Confidence Bracket", fontsize=12)
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
    plt.ylim(0, max(0.5, float(ymax) + 0.6))
    plt.tight_layout()
    plt.show()

    print("\n" + "="*90)
    print("DONE")
    print("="*90)
|
||
# Example run:
# NOTE(review): hard-coded local paths — adjust before running elsewhere.
plot_confidence_vs_abs_error_with_log("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration", "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv")


##
||
|
||
|
||
# %% Empirical Confidence
|
||
# Empirical stability confidence (from 10 runs) + LLM certainty_percent as secondary signal
|
||
# - Reads all JSONs in a folder (your 10 iterations)
|
||
# - Aggregates by key = unique_id + MedDatum
|
||
# - Computes:
|
||
# * EDSS_mean, EDSS_std, EDSS_iqr, mode/share
|
||
# * empirical_conf_0_100 (based on stability)
|
||
# * llm_conf_mean_0_100 (mean certainty_percent)
|
||
# * combined_conf_0_100 (weighted blend)
|
||
# - Optional: merges GT EDSS and computes abs error on the aggregated prediction
|
||
|
||
import os, glob, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
def build_empirical_confidence_table(
    json_dir_path: str,
    ground_truth_path: str | None = None,
    gt_sep: str = ";",
    gt_edss_col: str = "EDSS",
    w_empirical: float = 0.7, # weight for empirical stability
    w_llm: float = 0.3, # weight for LLM self-reported confidence
    tol_mode: float = 0.5, # tolerance to treat EDSS as "same" (EDSS often in 0.5 steps)
    min_runs_expected: int = 10,
):
    """Aggregate per-run LLM EDSS extractions into one row per patient/date key.

    Reads every ``*.json`` file in ``json_dir_path`` (one file per iteration),
    keeps only entries with ``success`` truthy and a non-empty
    ``unique_id``/``MedDatum``, and aggregates across runs by
    ``key = unique_id + "_" + MedDatum`` (both lower-cased and stripped).

    Per key it computes EDSS mean/std/IQR, a tolerance-based mode + mode share,
    the mean/std of the LLM's ``certainty_percent``, an empirical stability
    confidence (exponential decay of the EDSS std), and a weighted blend of
    the two confidences. If ``ground_truth_path`` is given, ground-truth EDSS
    is merged in on ``key`` and the absolute error of ``EDSS_mean`` is added.

    Returns:
        ``df_final`` (aggregates + GT columns) when ``ground_truth_path`` is
        set, otherwise ``df_agg`` (aggregates only); ``None`` if no usable
        prediction rows were found. Progress/diagnostic logs are printed
        throughout.

    Raises:
        FileNotFoundError: no JSON files in ``json_dir_path``.
        ValueError: GT file lacks a required column.
    """
    # -----------------------------
    # Helpers
    # -----------------------------
    def norm_str(x):
        # Normalize join-key parts so int/str/case/whitespace variants match.
        return str(x).strip().lower()

    def parse_number(x):
        # Robust scalar parse: treats None/""/"nan"/"none"/"null" as missing
        # and accepts German decimal commas ("4,5" -> 4.5).
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    def robust_iqr(x: pd.Series):
        # Interquartile range over non-NaN values; NaN if nothing valid.
        x = x.dropna()
        if len(x) == 0:
            return np.nan
        return float(x.quantile(0.75) - x.quantile(0.25))

    def stability_to_confidence(std_val: float) -> float:
        """
        Map EDSS variability across runs to a 0..100 confidence.
        EDSS is typically on 0.5 steps. A natural scale:
          std ~= 0.0  -> ~100
          std ~= 0.25 -> ~75-90
          std ~= 0.5  -> ~50-70
          std >= 1.0  -> low
        Use a smooth exponential mapping.
        """
        if np.isnan(std_val):
            return np.nan
        # scale parameter: std=0.5 -> exp(-1)=0.367 -> ~36.7
        scale = 0.5
        conf = 100.0 * np.exp(-(std_val / scale))
        # clamp
        return float(np.clip(conf, 0.0, 100.0))

    def mode_share_with_tolerance(values: np.ndarray, tol: float) -> tuple[float, float]:
        """
        Compute a 'mode' under tolerance: pick the cluster center (median) and count
        how many values fall within +/- tol. Return (mode_center, share).
        This is robust to tiny float differences.
        """
        vals = values[~np.isnan(values)]
        if len(vals) == 0:
            return (np.nan, np.nan)
        center = float(np.median(vals))
        share = float(np.mean(np.abs(vals - center) <= tol))
        return (center, share)

    # -----------------------------
    # Load predictions from all JSONs
    # -----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []          # one dict per usable prediction row (all files)
    per_file = []      # per-file ingest stats for the log table
    total_entries_all = 0
    total_success_all = 0
    skipped_all = {"not_success": 0, "missing_uid_or_date": 0}

    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)

        total_entries = len(data)
        success_entries = sum(1 for e in data if e.get("success"))
        total_entries_all += total_entries
        total_success_all += success_entries

        skipped = {"not_success": 0, "missing_uid_or_date": 0}
        loaded = 0

        for entry in data:
            if not entry.get("success"):
                skipped["not_success"] += 1
                continue

            res = entry.get("result", {})
            uid = res.get("unique_id")
            md = res.get("MedDatum")

            # Skip rows that cannot form a join key.
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                skipped["missing_uid_or_date"] += 1
                continue

            edss = parse_number(res.get("EDSS"))
            conf = parse_number(res.get("certainty_percent"))
            it = res.get("iteration", None)

            rows.append({
                "file": os.path.basename(fp),
                "iteration": it,
                "unique_id": norm_str(uid),
                "MedDatum": norm_str(md),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": edss,
                "llm_conf": conf,
            })
            loaded += 1

        skipped_all["not_success"] += skipped["not_success"]
        skipped_all["missing_uid_or_date"] += skipped["missing_uid_or_date"]

        per_file.append({
            "file": os.path.basename(fp),
            "entries_total": total_entries,
            "entries_success": success_entries,
            "rows_loaded": loaded,
            "skipped_not_success": skipped["not_success"],
            "skipped_missing_uid_or_date": skipped["missing_uid_or_date"],
        })

    df_pred = pd.DataFrame(rows)
    df_file = pd.DataFrame(per_file)

    # -----------------------------
    # Logs: ingestion
    # -----------------------------
    print("\n" + "="*90)
    print("EMPIRICAL CONFIDENCE (10-RUN STABILITY) + LLM CONFIDENCE (SECONDARY)")
    print("="*90)
    print(f"JSON DIR: {json_dir_path}")
    print(f"JSON files: {len(json_files)}")
    print("\n--- INGEST LOG ---")
    print(f"Total JSON entries: {total_entries_all}")
    print(f"Total success entries:{total_success_all}")
    print(f"Pred rows loaded: {len(df_pred)}")
    print(f"Unique keys in preds: {df_pred['key'].nunique() if len(df_pred) else 0}")
    print(f"Missing EDSS_pred: {df_pred['EDSS_pred'].isna().sum() if len(df_pred) else 0}")
    print(f"Missing llm_conf: {df_pred['llm_conf'].isna().sum() if len(df_pred) else 0}")
    print("Skipped totals:", skipped_all)

    print("\nPer-file summary (top 10 by name):")
    print(df_file.sort_values("file").head(10))

    # -----------------------------
    # Aggregate by key (empirical stability)
    # -----------------------------
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows.")
        return None

    # how many runs per key (expect ~10)
    runs_per_key = df_pred.groupby("key")["EDSS_pred"].size().rename("n_rows").reset_index()
    print("\n--- RUNS PER KEY LOG ---")
    print(f"Keys with at least 1 row: {len(runs_per_key)}")
    print("Distribution of rows per key (value_counts):")
    print(runs_per_key["n_rows"].value_counts().sort_index())

    # Aggregate stats
    def agg_block(g: pd.DataFrame):
        # Reduce one key's runs to a single summary row (pd.Series).
        ed = g["EDSS_pred"].to_numpy(dtype=float)
        ll = g["llm_conf"].to_numpy(dtype=float)

        n_rows = len(g)
        n_edss = int(np.sum(~np.isnan(ed)))
        n_llm = int(np.sum(~np.isnan(ll)))

        ed_mean = float(np.nanmean(ed)) if n_edss else np.nan
        # std needs >=2 values; a single value is treated as perfectly stable (0.0).
        ed_std = float(np.nanstd(ed, ddof=1)) if n_edss >= 2 else (0.0 if n_edss == 1 else np.nan)
        ed_iqr = robust_iqr(pd.Series(ed))
        mode_center, mode_share = mode_share_with_tolerance(ed, tol=tol_mode)

        llm_mean = float(np.nanmean(ll)) if n_llm else np.nan
        llm_std = float(np.nanstd(ll, ddof=1)) if n_llm >= 2 else (0.0 if n_llm == 1 else np.nan)

        emp_conf = stability_to_confidence(ed_std) if not np.isnan(ed_std) else np.nan

        # Combined confidence (weighted). If one side missing, fall back to the other.
        if np.isnan(emp_conf) and np.isnan(llm_mean):
            comb = np.nan
        elif np.isnan(emp_conf):
            comb = llm_mean
        elif np.isnan(llm_mean):
            comb = emp_conf
        else:
            comb = w_empirical * emp_conf + w_llm * llm_mean

        return pd.Series({
            "unique_id": g["unique_id"].iloc[0],
            "MedDatum": g["MedDatum"].iloc[0],
            "n_rows": n_rows,
            "n_edss": n_edss,
            "n_llm_conf":n_llm,
            "EDSS_mean": ed_mean,
            "EDSS_std": ed_std,
            "EDSS_iqr": ed_iqr,
            "EDSS_mode_center": mode_center,
            "EDSS_mode_share": mode_share, # fraction within ±tol_mode of median center
            "llm_conf_mean": llm_mean,
            "llm_conf_std": llm_std,
            "empirical_conf_0_100": emp_conf,
            "combined_conf_0_100": float(np.clip(comb, 0.0, 100.0)) if not np.isnan(comb) else np.nan,
        })

    # NOTE(review): groupby(...).apply(fn returning Series) is pandas-version
    # sensitive; later code indexes df_agg["key"] and merges on "key", which
    # relies on as_index=False keeping the key as a column — verify on upgrade.
    df_agg = df_pred.groupby("key", as_index=False).apply(agg_block)
    # groupby+apply returns a multiindex sometimes depending on pandas version
    if isinstance(df_agg.index, pd.MultiIndex):
        df_agg = df_agg.reset_index(drop=True)

    # Logs: aggregation + losses
    print("\n--- AGGREGATION LOG ---")
    print(f"Aggregated keys: {len(df_agg)}")
    print(f"Keys with EDSS in >=1 run: {(df_agg['n_edss'] >= 1).sum()}")
    print(f"Keys with EDSS in >=2 runs (std meaningful): {(df_agg['n_edss'] >= 2).sum()}")
    print(f"Keys missing EDSS in all runs: {(df_agg['n_edss'] == 0).sum()}")
    print(f"Keys missing llm_conf in all runs: {(df_agg['n_llm_conf'] == 0).sum()}")

    # Expected runs check
    if min_runs_expected is not None:
        print(f"\nKeys with < {min_runs_expected} rows (potential missing iterations):")
        print(df_agg.loc[df_agg["n_rows"] < min_runs_expected, ["key", "n_rows"]].sort_values("n_rows").head(20))

    # -----------------------------
    # Optional: merge GT and compute error on aggregated EDSS_mean
    # -----------------------------
    if ground_truth_path is not None:
        df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
        need = {"unique_id", "MedDatum", gt_edss_col}
        miss = need - set(df_gt.columns)
        if miss:
            raise ValueError(f"GT missing columns: {miss}. Available: {df_gt.columns.tolist()}")

        df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
        df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
        df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
        df_gt["EDSS_gt"] = df_gt[gt_edss_col].apply(parse_number)

        print("\n--- GT MERGE LOG ---")
        print(f"GT rows: {len(df_gt)} | GT unique keys: {df_gt['key'].nunique()}")
        print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")

        # NOTE(review): validate="one_to_one" raises MergeError if the GT has
        # duplicate keys — the audit cell at the top of this file suggests it
        # can; dedupe GT (or relax validate) deliberately if that fires.
        df_final = df_agg.merge(df_gt[["key", "EDSS_gt"]], on="key", how="left", validate="one_to_one")

        print(f"Aggregated keys with GT match: {df_final['EDSS_gt'].notna().sum()} / {len(df_final)}")
        print(f"Aggregated keys missing GT EDSS: {df_final['EDSS_gt'].isna().sum()}")

        df_final["abs_error_mean"] = (df_final["EDSS_mean"] - df_final["EDSS_gt"]).abs()

        # How many keys usable for evaluation?
        usable = df_final.dropna(subset=["EDSS_mean", "EDSS_gt"])
        print("\n--- EVAL LOG (AGGREGATED) ---")
        print(f"Keys with both EDSS_mean and EDSS_gt: {len(usable)}")
        if len(usable) > 0:
            print(f"MAE on EDSS_mean vs GT: {usable['abs_error_mean'].mean():.3f}")
            print(f"Median abs error: {usable['abs_error_mean'].median():.3f}")

        return df_final

    return df_agg
|
||
|
||
|
||
# Example usage: aggregate every iteration JSON against the ground truth
# and persist the resulting per-key confidence table.
_build_kwargs = {
    "json_dir_path": "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration",
    "ground_truth_path": "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
    "w_empirical": 0.7,
    "w_llm": 0.3,
    "tol_mode": 0.5,
    "min_runs_expected": 10,
}
df = build_empirical_confidence_table(**_build_kwargs)
df.to_csv("empirical_confidence_table.csv", index=False)
|
||
|
||
##
|
||
|
||
|
||
|
||
|
||
# %% Executive Boxplot
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from matplotlib.patches import Patch
|
||
|
||
def plot_exec_boxplots(df, min_bin_size_warn=10):
    """
    Two side-by-side boxplots:
      - Left:  abs_error_mean grouped by empirical_conf_0_100 quantile bins
      - Right: abs_error_mean grouped by llm_conf_mean quantile bins

    Adds:
      - Robust qcut labeling (handles ties; bins may be < 4)
      - Data logs + per-bin summary table printed
      - Clear legend explaining each panel and what box elements mean

    Args:
        df: DataFrame holding at least the columns in ``need_cols`` below
            (typically the output of ``build_empirical_confidence_table``).
        min_bin_size_warn: bins with fewer rows than this are flagged in
            the printed warning (default 10).

    Returns:
        None; prints tables/logs and shows a matplotlib figure.

    Raises:
        ValueError: if any required column is missing from ``df``.
    """
    need_cols = ["abs_error_mean", "empirical_conf_0_100", "llm_conf_mean"]
    missing = [c for c in need_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in df: {missing}. Available: {df.columns.tolist()}")

    d = df[need_cols].copy()

    # -----------------------------
    # Data logs: survivorship
    # -----------------------------
    # Each panel drops rows lacking its own confidence column, so the two
    # panels can be built from different row subsets.
    d_emp = d.dropna(subset=["abs_error_mean", "empirical_conf_0_100"]).copy()
    d_llm = d.dropna(subset=["abs_error_mean", "llm_conf_mean"]).copy()

    print("\n" + "="*90)
    print("EXECUTIVE BOXPLOTS — DATA LOG + SUMMARY")
    print("="*90)
    print(f"Total rows in df: {len(df)}")
    print(f"Rows for empirical plot: {len(d_emp)} (dropped {len(df) - len(d_emp)})")
    print(f"Rows for LLM plot: {len(d_llm)} (dropped {len(df) - len(d_llm)})")

    if len(d_emp) == 0 or len(d_llm) == 0:
        print("[ERROR] Not enough data after dropping NaNs to build both plots.")
        return

    # -----------------------------
    # Robust quantile binning (handles ties)
    # -----------------------------
    # qcut is run once to discover how many bins survive duplicates="drop",
    # then re-run with exactly that many labels.
    # Empirical
    emp_bins = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop")
    k_emp = emp_bins.cat.categories.size
    emp_labels = [f"Q{i+1}" for i in range(k_emp)]
    d_emp["emp_q"] = pd.qcut(d_emp["empirical_conf_0_100"], q=4, duplicates="drop", labels=emp_labels)

    # LLM
    llm_bins = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop")
    k_llm = llm_bins.cat.categories.size
    llm_labels = [f"Q{i+1}" for i in range(k_llm)]
    d_llm["llm_q"] = pd.qcut(d_llm["llm_conf_mean"], q=4, duplicates="drop", labels=llm_labels)

    # Print bin edges (so you can discuss exact thresholds)
    print("\n--- BIN EDGES (actual ranges) ---")
    print("Empirical confidence bins:")
    for i, interval in enumerate(emp_bins.cat.categories):
        print(f"  {emp_labels[i]}: {interval}")
    print("LLM confidence bins:")
    for i, interval in enumerate(llm_bins.cat.categories):
        print(f"  {llm_labels[i]}: {interval}")

    # -----------------------------
    # Summary tables (per bin)
    # -----------------------------
    def summarize_bins(df_in, bin_col, conf_col, label):
        # One row per confidence bin: n, MAE stats, and confidence stats.
        g = df_in.groupby(bin_col, observed=True).agg(
            n=("abs_error_mean", "size"),
            mae_mean=("abs_error_mean", "mean"),
            mae_median=("abs_error_mean", "median"),
            mae_q25=("abs_error_mean", lambda x: x.quantile(0.25)),
            mae_q75=("abs_error_mean", lambda x: x.quantile(0.75)),
            conf_mean=(conf_col, "mean"),
            conf_median=(conf_col, "median"),
        ).reset_index().rename(columns={bin_col: "bin"})
        g["panel"] = label
        return g[["panel", "bin", "n", "mae_mean", "mae_median", "mae_q25", "mae_q75", "conf_mean", "conf_median"]]

    summary_emp = summarize_bins(d_emp, "emp_q", "empirical_conf_0_100", "Empirical")
    summary_llm = summarize_bins(d_llm, "llm_q", "llm_conf_mean", "LLM")

    print("\n--- SUMMARY TABLE: Empirical confidence quartiles (or fewer if ties) ---")
    print(summary_emp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    print("\n--- SUMMARY TABLE: LLM confidence quartiles (or fewer if ties) ---")
    print(summary_llm.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

    # Warn about small bins
    small_emp = summary_emp.loc[summary_emp["n"] < min_bin_size_warn, ["bin", "n"]]
    small_llm = summary_llm.loc[summary_llm["n"] < min_bin_size_warn, ["bin", "n"]]
    if not small_emp.empty or not small_llm.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_size_warn} points; compare them cautiously.")
        if not small_emp.empty:
            print("  Empirical small bins:")
            print(small_emp.to_string(index=False))
        if not small_llm.empty:
            print("  LLM small bins:")
            print(small_llm.to_string(index=False))

    # -----------------------------
    # Prepare data for boxplots
    # -----------------------------
    emp_cats = list(d_emp["emp_q"].cat.categories)
    llm_cats = list(d_llm["llm_q"].cat.categories)

    emp_groups = [d_emp.loc[d_emp["emp_q"] == q, "abs_error_mean"].values for q in emp_cats]
    llm_groups = [d_llm.loc[d_llm["llm_q"] == q, "abs_error_mean"].values for q in llm_cats]

    # -----------------------------
    # Plot
    # -----------------------------
    fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

    # NOTE(review): boxplot(labels=...) is deprecated in newer matplotlib
    # (renamed tick_labels in 3.9) — update when the environment moves on.
    bp0 = axes[0].boxplot(emp_groups, labels=emp_cats, showfliers=False, patch_artist=True)
    bp1 = axes[1].boxplot(llm_groups, labels=llm_cats, showfliers=False, patch_artist=True)

    # Make panels visually distinct but still simple (no extra clutter)
    for patch in bp0["boxes"]:
        patch.set_alpha(0.6)
    for patch in bp1["boxes"]:
        patch.set_alpha(0.6)

    axes[0].set_title("Error by Empirical Confidence (quantile bins)")
    axes[0].set_xlabel("Empirical confidence bin")
    axes[0].set_ylabel("Absolute Error (|EDSS_mean − EDSS_gt|)")

    axes[1].set_title("Error by LLM Confidence (quantile bins)")
    axes[1].set_xlabel("LLM confidence bin")

    for ax in axes:
        ax.grid(axis="y", linestyle=":", alpha=0.5)

    # -----------------------------
    # Legend (simple, but useful)
    # -----------------------------
    legend_elements = [
        Patch(facecolor="white", edgecolor="black", label="Box = IQR (25%–75%)"),
        Patch(facecolor="white", edgecolor="black", label="Center line = median"),
        Patch(facecolor="white", edgecolor="black", label="Whiskers = typical range (no outliers shown)"),
        Patch(facecolor="white", edgecolor="white", label="Left panel: empirical stability bins"),
        Patch(facecolor="white", edgecolor="white", label="Right panel: LLM self-reported bins"),
    ]
    fig.legend(handles=legend_elements, loc="upper center", ncol=3, frameon=True)

    # Leave headroom for the figure-level legend.
    plt.tight_layout(rect=[0, 0, 1, 0.90])
    plt.show()

    print("\n" + "="*90)
    print("DONE")
    print("="*90)
|
||
|
||
|
||
# Example (complete): rebuild the aggregated confidence table, then draw
# the paired executive boxplots from it.
df_final = build_empirical_confidence_table(
    "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration",
    ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
    w_empirical=0.7,
    w_llm=0.3,
    tol_mode=0.5,
    min_runs_expected=10,
)
plot_exec_boxplots(df_final)
|
||
|
||
|
||
##
|
||
|
||
|
||
|
||
# %% Scatter
|
||
|
||
import os, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
|
||
def scatter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
):
    """Scatter-plot absolute EDSS error against LLM confidence category.

    Loads one iteration's JSON predictions and the ground-truth CSV, joins
    them on ``key = unique_id + "_" + MedDatum`` (normalized), bins the
    LLM's ``certainty_percent`` into four fixed categories, and shows a
    jittered scatter of |EDSS_pred − EDSS_gt| per category. Bin counts are
    printed for discussion.

    Args:
        json_file_path: path to a single iteration's results JSON.
        ground_truth_path: path to the ground-truth CSV.
        gt_sep: CSV delimiter of the ground-truth file (default ";").
        gt_edss_col: name of the EDSS column in the ground truth.

    Returns:
        None; prints bin counts and shows a matplotlib figure.
    """
    def norm_str(x):
        # Normalize join-key parts (strip + lowercase).
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust numeric parse: missing markers -> NaN, comma decimals ok.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from JSON
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue

        rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": parse_edss(res.get("EDSS")),
            # FIX: use the same comma-aware parser as EDSS; bare pd.to_numeric
            # silently coerced German-style values like "87,5" to NaN.
            "confidence": parse_edss(res.get("certainty_percent")),
        })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    # NOTE(review): validate="many_to_one" raises if GT holds duplicate keys;
    # the GT audit cell at the top of this file suggests duplicates can occur.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    df["abs_error"] = (df["EDSS_pred"] - df["EDSS_gt"]).abs()

    # ---- Bin confidence into 4 categories
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Logs
    print("\n--- BIN COUNTS (points plotted) ---")
    print(df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int))
    print(f"Total points plotted: {len(df)}")

    # ---- Scatter (categorical x with jitter)
    x_map = {lab: i for i, lab in enumerate(labels)}
    x = df["conf_bin"].map(x_map).astype(float).to_numpy()
    # Horizontal jitter so overlapping points within a category stay visible.
    jitter = np.random.uniform(-0.12, 0.12, size=len(df))
    xj = x + jitter

    plt.figure(figsize=(12, 6))
    plt.scatter(xj, df["abs_error"].to_numpy(), alpha=0.55)
    plt.xticks(range(len(labels)), labels)
    plt.xlabel("certainty_percent category (Iteration 1)")
    plt.ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    plt.title("Absolute Error vs LLM Confidence Category (Single JSON)")
    plt.grid(axis="y", linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
|
||
|
||
# --- RUN ---
_iter1_json = (
    "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/"
    "MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
)
_gt_csv = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"
scatter_abs_error_by_conf_bins_single_json(
    json_file_path=_iter1_json,
    ground_truth_path=_gt_csv,
)
|
||
|
||
##
|
||
|
||
|
||
|
||
|
||
# %% Boxplot2
|
||
|
||
# Boxplot + light jittered points
|
||
# - Single JSON (iteration 1)
|
||
# - X: confidence bin (<70, 70-80, 80-90, 90-100)
|
||
# - Y: absolute error
|
||
# - Legend includes n per bin
|
||
|
||
import json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from matplotlib.patches import Patch
|
||
|
||
def boxplot_with_jitter_abs_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """Boxplot + jittered points of absolute EDSS error per confidence bin.

    Loads one iteration's JSON predictions and the ground-truth CSV, joins
    on ``key = unique_id + "_" + MedDatum``, bins ``certainty_percent`` into
    four fixed categories, and overlays raw jittered points on top of a
    per-bin boxplot. The legend and stdout report n per bin.

    Args:
        json_file_path: path to a single iteration's results JSON.
        ground_truth_path: path to the ground-truth CSV.
        gt_sep: CSV delimiter of the ground-truth file (default ";").
        gt_edss_col: EDSS column name in the ground truth.
        jitter_width: half-width of the uniform horizontal jitter.
        point_alpha: alpha of the overlaid scatter points.
        show_outliers: whether the boxplot draws fliers (default False).

    Returns:
        None; prints bin counts and shows a matplotlib figure.

    Raises:
        ValueError: if a required ground-truth column is missing.
    """
    def norm_str(x):
        # Normalize join-key parts (strip + lowercase).
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust numeric parse: missing markers -> NaN, comma decimals ok.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from JSON
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": parse_edss(res.get("EDSS")),
            # FIX: use the same comma-aware parser as EDSS; bare pd.to_numeric
            # silently coerced German-style values like "87,5" to NaN.
            "confidence": parse_edss(res.get("certainty_percent")),
        })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    # NOTE(review): validate="many_to_one" raises if GT holds duplicate keys.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    df["abs_error"] = (df["EDSS_pred"] - df["EDSS_gt"]).abs()

    # ---- Bin confidence
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Prepare per-bin arrays
    bin_arrays = [df.loc[df["conf_bin"] == lab, "abs_error"].to_numpy() for lab in labels]
    n_counts = [len(a) for a in bin_arrays]

    # ---- Plot
    fig, ax = plt.subplots(figsize=(12, 6))

    # Boxplot (no fliers by default to reduce clutter)
    bp = ax.boxplot(
        bin_arrays,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )

    # Light fill for boxes (no explicit color choices required)
    for b in bp["boxes"]:
        b.set_alpha(0.35)

    # Jittered points on top (boxplot positions are 1-based)
    for i, arr in enumerate(bin_arrays, start=1):
        if len(arr) == 0:
            continue
        x = np.full(len(arr), i, dtype=float)
        x += np.random.uniform(-jitter_width, jitter_width, size=len(arr))
        ax.scatter(x, arr, alpha=point_alpha, s=18)

    ax.set_title("Absolute Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Absolute Error (|EDSS_pred − EDSS_gt|)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend showing n per bin
    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}")
        for lab, n in zip(labels, n_counts)
    ]
    ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # Print counts too (useful for discussion)
    print("\n--- BIN COUNTS (points plotted) ---")
    for lab, n in zip(labels, n_counts):
        print(f"{lab:>18}: n={n}")
    print(f"Total points plotted: {sum(n_counts)}")
|
||
|
||
|
||
# Example run: absolute-error boxplot for the first iteration's JSON.
_iter1_json_bp = (
    "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/"
    "MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
)
boxplot_with_jitter_abs_error_by_conf_bins_single_json(
    json_file_path=_iter1_json_bp,
    ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
)
|
||
|
||
##
|
||
|
||
|
||
|
||
|
||
# %% Boxplot3
|
||
|
||
|
||
# Boxplot + jitter with SIGNED error (direction)
|
||
# - Y-axis: signed error = EDSS_pred - EDSS_gt (negative = underestimation, positive = overestimation)
|
||
# - Also prints per-bin summary (n, mean signed error, median, MAE)
|
||
|
||
import json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from matplotlib.patches import Patch
|
||
from matplotlib.lines import Line2D
|
||
|
||
def boxplot_with_jitter_signed_error_by_conf_bins_single_json(
    json_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.25,
    show_outliers=False,
):
    """Boxplot + jittered points of SIGNED EDSS error per confidence bin.

    Same pipeline as the absolute-error variant, but the y-axis is
    ``signed_error = EDSS_pred − EDSS_gt`` (negative = underestimation,
    positive = overestimation). A dashed zero line marks unbiasedness, and a
    per-bin summary (n, mean/median signed error, MAE) is printed.

    Args:
        json_file_path: path to a single iteration's results JSON.
        ground_truth_path: path to the ground-truth CSV.
        gt_sep: CSV delimiter of the ground-truth file (default ";").
        gt_edss_col: EDSS column name in the ground truth.
        jitter_width: half-width of the uniform horizontal jitter.
        point_alpha: alpha of the overlaid scatter points.
        show_outliers: whether the boxplot draws fliers (default False).

    Returns:
        None; prints a per-bin summary and shows a matplotlib figure.

    Raises:
        ValueError: if a required ground-truth column is missing.
    """
    def norm_str(x):
        # Normalize join-key parts (strip + lowercase).
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust numeric parse: missing markers -> NaN, comma decimals ok.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from JSON
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": parse_edss(res.get("EDSS")),
            # FIX: use the same comma-aware parser as EDSS; bare pd.to_numeric
            # silently coerced German-style values like "87,5" to NaN.
            "confidence": parse_edss(res.get("certainty_percent")),
        })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    # NOTE(review): validate="many_to_one" raises if GT holds duplicate keys.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()

    # SIGNED ERROR (direction)
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()

    # ---- Bin confidence
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Prepare arrays
    bin_arrays = [df.loc[df["conf_bin"] == lab, "signed_error"].to_numpy() for lab in labels]
    n_counts = [len(a) for a in bin_arrays]

    # ---- Plot
    fig, ax = plt.subplots(figsize=(12, 6))

    bp = ax.boxplot(
        bin_arrays,
        labels=labels,
        showfliers=show_outliers,
        patch_artist=True,
        widths=0.55,
    )

    for b in bp["boxes"]:
        b.set_alpha(0.35)

    # Jittered points (boxplot positions are 1-based)
    for i, arr in enumerate(bin_arrays, start=1):
        if len(arr) == 0:
            continue
        x = np.full(len(arr), i, dtype=float)
        x += np.random.uniform(-jitter_width, jitter_width, size=len(arr))
        ax.scatter(x, arr, alpha=point_alpha, s=18)

    # Zero line to show over/under clearly
    ax.axhline(0, linewidth=1.5, linestyle="--")

    ax.set_title("Signed Error by LLM Confidence Bin (Iteration 1)")
    ax.set_xlabel("certainty_percent category")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend with n per bin + zero-line meaning
    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={n}")
        for lab, n in zip(labels, n_counts)
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))
    ax.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()

    # ---- Print per-bin summary to discuss
    print("\n--- PER-BIN SUMMARY (points plotted) ---")
    for lab in labels:
        sub = df.loc[df["conf_bin"] == lab]
        n = len(sub)
        if n == 0:
            print(f"{lab:>18}: n=0")
            continue
        print(
            f"{lab:>18}: n={n:3d} | "
            f"mean signed={sub['signed_error'].mean(): .3f} | "
            f"median signed={sub['signed_error'].median(): .3f} | "
            f"MAE={sub['abs_error'].mean(): .3f}"
        )
    print(f"Total points plotted: {len(df)}")
|
||
|
||
|
||
# Example run: signed-error boxplot for the first iteration's JSON.
_iter1_json_signed = (
    "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/"
    "MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
)
boxplot_with_jitter_signed_error_by_conf_bins_single_json(
    json_file_path=_iter1_json_signed,
    ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
)
|
||
##
|
||
|
||
|
||
|
||
# %% jitter and violin 10x10
|
||
|
||
# Violin + jitter (all JSONs in folder), with signed error
|
||
# - X: confidence bins (<70, 70-80, 80-90, 90-100)
|
||
# - Y: signed error = EDSS_pred - EDSS_gt (direction)
|
||
# - Prints bin counts (n) and puts n into the legend
|
||
|
||
import os, glob, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from matplotlib.patches import Patch
|
||
from matplotlib.lines import Line2D
|
||
|
||
def violin_jitter_signed_error_all_jsons(
    json_dir_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.20,
    point_size=10,
    violin_inner="quartile",  # 'quartile', 'box', 'stick', or None
):
    """Violin + jittered-scatter plot of signed EDSS error, pooled over ALL JSONs.

    Every successful prediction in every ``*.json`` file under
    ``json_dir_path`` contributes one point. X-axis: the point's OWN
    ``certainty_percent`` binned into four categories; Y-axis: signed error
    ``EDSS_pred - EDSS_gt``. Per-bin counts are printed and shown in the legend.

    Parameters
    ----------
    json_dir_path : str
        Directory scanned (non-recursively) for ``*.json`` prediction files.
        Each file is expected to hold a list of entries shaped like
        ``{"success": ..., "result": {"unique_id", "MedDatum", "EDSS",
        "certainty_percent"}}`` (this is what the code reads below).
    ground_truth_path : str
        CSV with ``unique_id``, ``MedDatum`` and the EDSS column.
    gt_sep : str
        Ground-truth CSV separator (defaults to ';', German-style export).
    gt_edss_col : str
        Name of the EDSS column in the ground truth.
    jitter_width, point_alpha, point_size :
        Cosmetics of the jittered scatter overlay.
    violin_inner : str or None
        Forwarded to ``seaborn.violinplot(inner=...)``.

    Raises
    ------
    ValueError
        If a required ground-truth column is missing.
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    """
    def norm_str(x):
        # Canonical key component: trimmed, lowercased string.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser: tolerates None, empty/placeholder strings,
        # and German decimal commas ("4,5" -> 4.5). Returns NaN on failure.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ---- Load GT and build the join key (unique_id + "_" + MedDatum)
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ---- Load preds from ALL JSONs
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            # Skip failed extractions and entries without a usable key.
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
                "confidence": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
            })

    df_pred = pd.DataFrame(rows)

    # ---- Merge + filter
    # NOTE(review): validate="many_to_one" makes pandas raise MergeError if the
    # GT contains duplicate keys — the GT audit cell at the top of this script
    # suggests duplicates can occur; confirm that failing loudly is intended.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"]).copy()
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]

    # ---- Bin confidence (include_lowest keeps confidence == 0 in the first bin)
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df["conf_bin"] = pd.cut(df["confidence"], bins=bins, labels=labels, include_lowest=True)
    df = df.dropna(subset=["conf_bin"]).copy()

    # ---- Counts + log
    counts = df["conf_bin"].value_counts().reindex(labels).fillna(0).astype(int)
    print("\n--- BIN COUNTS (all JSONs) ---")
    for lab in labels:
        print(f"{lab:>18}: n={counts[lab]}")
    print(f"Total points plotted: {len(df)}")
    print(f"JSON files: {len(json_files)}")

    # Ensure ordering for seaborn
    df["conf_bin"] = pd.Categorical(df["conf_bin"], categories=labels, ordered=True)

    # ---- Plot
    plt.figure(figsize=(12, 6))

    # Violin (density)
    sns.violinplot(
        data=df,
        x="conf_bin",
        y="signed_error",
        order=labels,
        inner=violin_inner,
        cut=0
    )

    # Jittered points (manual jitter to keep it consistent and fast)
    x_map = {lab: i for i, lab in enumerate(labels)}
    x = df["conf_bin"].map(x_map).astype(float).to_numpy()
    xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df))
    plt.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size)

    # Zero line (over/under split)
    plt.axhline(0, linestyle="--", linewidth=1.5)

    plt.xticks(range(len(labels)), labels)
    plt.xlabel("certainty_percent category (all iterations)")
    plt.ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    plt.title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)")
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend with n per bin
    legend_handles = [
        Patch(facecolor="white", edgecolor="black", label=f"{lab}: n={int(counts[lab])}")
        for lab in labels
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))
    plt.legend(handles=legend_handles, title="Bin counts", loc="upper right", frameon=True)

    plt.tight_layout()
    plt.show()
|
||
|
||
|
||
# Example run: pool every iteration JSON in the folder into one plot.
_iter_dir = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration"
_gt_numbers_csv = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"
violin_jitter_signed_error_all_jsons(
    json_dir_path=_iter_dir,
    ground_truth_path=_gt_numbers_csv,
)
|
||
|
||
##
|
||
|
||
|
||
|
||
# %% jitter and violin 10x1
|
||
|
||
|
||
# Adjusted: Violin + jitter (ALL JSONs for points) but X-bins come ONLY from JSON #1 (reference)
|
||
# Fixes:
|
||
# 1) Legend has colors matching bins
|
||
# 2) Legend placed OUTSIDE plot area
|
||
# 3) X-axis binning uses certainty_percent from JSON1 (by key), then all iterations' points inherit that bin
|
||
|
||
import os, glob, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from matplotlib.patches import Patch
|
||
from matplotlib.lines import Line2D
|
||
|
||
def violin_jitter_signed_error_all_jsons_xbins_from_json1(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    jitter_width=0.12,
    point_alpha=0.18,
    point_size=10,
    violin_inner="quartile",  # 'quartile', 'box', 'stick', or None
):
    """Violin + jitter of signed EDSS error for ALL iteration JSONs, with the
    x-axis confidence bins taken from ONE reference file (JSON1).

    Every successful prediction in every ``*.json`` under ``json_dir_path``
    contributes a point; each point inherits the confidence bin of its key as
    computed from ``json1_file_path``'s ``certainty_percent``. Per-bin counts
    are printed and shown in a legend placed outside the axes.

    Parameters
    ----------
    json_dir_path : str
        Directory scanned (non-recursively) for ``*.json`` prediction files.
        Each file is expected to hold a list of entries shaped like
        ``{"success": ..., "result": {"unique_id", "MedDatum", "EDSS",
        "certainty_percent"}}`` (this is what the code reads below).
    json1_file_path : str
        The single reference JSON whose per-key ``certainty_percent`` defines
        the x-axis bin of every point.
    ground_truth_path : str
        CSV with ``unique_id``, ``MedDatum`` and the EDSS column.
    gt_sep, gt_edss_col : str
        Ground-truth CSV separator / EDSS column name.
    jitter_width, point_alpha, point_size :
        Cosmetics of the jittered scatter overlay.
    violin_inner : str or None
        Forwarded to ``seaborn.violinplot(inner=...)``.

    Raises
    ------
    ValueError
        If a required ground-truth column is missing.
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    """
    def norm_str(x):
        # Canonical key component: trimmed, lowercased string.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser: tolerates None, placeholder strings, and German
        # decimal commas ("4,5" -> 4.5). Returns NaN on failure.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 and build reference bins by KEY
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "confidence_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # If JSON1 has duplicates for a key (unlikely, but safe), take the first non-null confidence.
    # BUGFIX: the previous guard was `s.dropna().any()`, which tests value
    # TRUTHINESS, not presence — a key whose only confidence was 0 was wrongly
    # treated as missing and silently dropped. Check `.empty` instead.
    df_ref = (df_ref.sort_values("confidence_ref")
              .groupby("key", as_index=False)["confidence_ref"]
              .apply(lambda s: s.dropna().iloc[0] if not s.dropna().empty else np.nan))
    if isinstance(df_ref.index, pd.MultiIndex):
        df_ref = df_ref.reset_index(drop=True)

    # Confidence bins (include_lowest keeps confidence == 0 in the first bin)
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
    df_ref["conf_bin_ref"] = pd.cut(df_ref["confidence_ref"], bins=bins, labels=labels, include_lowest=True)
    df_ref = df_ref.dropna(subset=["conf_bin_ref"]).copy()

    # ----------------------------
    # Load ALL JSONs (all points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            # Skip failed extractions and entries without a usable key.
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + reference bins (from JSON1)
    # ----------------------------
    # NOTE(review): validate="many_to_one" raises MergeError on duplicate GT
    # keys — the GT audit cell suggests duplicates can occur; confirm intended.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "conf_bin_ref"]], on="key", how="inner", validate="many_to_one")

    # filter for plotting
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "conf_bin_ref"]).copy()
    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]

    # ordering
    df["conf_bin_ref"] = pd.Categorical(df["conf_bin_ref"], categories=labels, ordered=True)

    # ----------------------------
    # Logs + counts
    # ----------------------------
    counts = df["conf_bin_ref"].value_counts().reindex(labels).fillna(0).astype(int)

    print("\n--- BIN COUNTS (ALL JSON points, binned by JSON1 confidence) ---")
    for lab in labels:
        print(f"{lab:>18}: n={int(counts[lab])}")
    print(f"Total points plotted: {len(df)}")
    print(f"JSON files used for points: {len(json_files)}")
    print(f"Reference JSON1 bins derived from: {os.path.basename(json1_file_path)}")
    print(f"Keys in reference (after binning & non-null): {df_ref['key'].nunique()}")

    # ----------------------------
    # Colors + legend patches (one fixed color per bin, so legend matches plot)
    # ----------------------------
    palette = sns.color_palette("Blues", n_colors=len(labels))
    bin_colors = {lab: palette[i] for i, lab in enumerate(labels)}

    legend_handles = [
        Patch(facecolor=bin_colors[lab], edgecolor="black", label=f"{lab}: n={int(counts[lab])}")
        for lab in labels
    ]
    legend_handles.append(Line2D([0], [0], linestyle="--", color="black", label="0 = unbiased (over/under split)"))

    # ----------------------------
    # Plot (legend outside)
    # ----------------------------
    fig, ax = plt.subplots(figsize=(12.5, 6))

    sns.violinplot(
        data=df,
        x="conf_bin_ref",
        y="signed_error",
        order=labels,
        inner=violin_inner,
        cut=0,
        palette=[bin_colors[l] for l in labels],
        ax=ax,
    )

    # jittered points (manual jitter)
    x_map = {lab: i for i, lab in enumerate(labels)}
    x = df["conf_bin_ref"].map(x_map).astype(float).to_numpy()
    xj = x + np.random.uniform(-jitter_width, jitter_width, size=len(df))
    ax.scatter(xj, df["signed_error"].to_numpy(), alpha=point_alpha, s=point_size)

    ax.axhline(0, linestyle="--", linewidth=1.5)

    ax.set_xlabel("certainty_percent category (from JSON 1 as reference)")
    ax.set_ylabel("Signed Error (EDSS_pred − EDSS_gt)")
    ax.set_title("Signed Error vs LLM Confidence Category — Violin + Jitter (All JSONs)\nBinned by JSON 1 certainty_percent")
    ax.grid(axis="y", linestyle=":", alpha=0.5)

    # Legend outside (right)
    ax.legend(
        handles=legend_handles,
        title="Bin counts",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        frameon=True
    )

    plt.tight_layout()
    plt.show()
|
||
|
||
|
||
# Example run: all-iteration points, binned by JSON 1's confidence.
json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
violin_jitter_signed_error_all_jsons_xbins_from_json1(
    ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
    json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration",
    json1_file_path=json1_path,
)
|
||
|
||
##
|
||
|
||
|
||
# %% Coorelation
|
||
|
||
# Correlation plot (RAW certainty_percent) vs error
|
||
# - Uses ALL JSONs as points
|
||
# - Uses JSON1 certainty_percent as the x-value reference (per key)
|
||
# - Y can be abs_error or signed_error (choose with y_mode)
|
||
# - Prints Spearman + Pearson correlations
|
||
# - Adds a simple linear trend line
|
||
|
||
import os, glob, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
|
||
def correlation_scatter_raw_certainty_json1_reference(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    y_mode="abs",  # "abs" or "signed"
    point_alpha=0.18,
    point_size=12,
):
    """Scatter of raw JSON1 ``certainty_percent`` (x) vs prediction error (y).

    Points come from ALL iteration JSONs; the x-value for each point is the
    reference confidence of its key taken from ``json1_file_path``. Prints
    Pearson and Spearman correlations and overlays a linear trend line.

    NOTE(review): this function is redefined later in this script (the
    "Correlation adjusted" cell); in a top-to-bottom run only the later
    definition survives.

    Parameters
    ----------
    json_dir_path : str
        Directory scanned (non-recursively) for ``*.json`` prediction files.
    json1_file_path : str
        Reference JSON supplying per-key ``certainty_percent`` for the x-axis.
    ground_truth_path : str
        CSV with ``unique_id``, ``MedDatum`` and the EDSS column.
    gt_sep, gt_edss_col : str
        Ground-truth CSV separator / EDSS column name.
    y_mode : str
        "abs" -> plot |EDSS_pred - EDSS_gt|; anything else -> signed error.
    point_alpha, point_size :
        Scatter cosmetics.

    Raises
    ------
    ValueError
        If a required ground-truth column is missing.
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    """
    def norm_str(x):
        # Canonical key component: trimmed, lowercased string.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser: tolerates None, placeholder strings, and German
        # decimal commas. Returns NaN on failure.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    def rankdata(a):
        # Average-rank for ties (Spearman needs ranks)
        s = pd.Series(a)
        return s.rank(method="average").to_numpy()

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 reference certainty_percent (per key)
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)

    # Deduplicate keys if needed: take first non-null certainty
    df_ref = (df_ref.dropna(subset=["certainty_ref"])
              .groupby("key", as_index=False)["certainty_ref"]
              .first())

    # ----------------------------
    # Load ALL JSON predictions (points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            # Skip failed extractions and entries without a usable key.
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + JSON1 reference certainty
    # ----------------------------
    # NOTE(review): validate="many_to_one" raises MergeError on duplicate GT
    # keys — the GT audit cell suggests duplicates can occur; confirm intended.
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one")

    # Filter needed fields
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy()

    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()
    y_col = "abs_error" if y_mode == "abs" else "signed_error"

    # ----------------------------
    # Logs
    # ----------------------------
    print("\n" + "="*90)
    print("CORRELATION: RAW certainty_percent (JSON1 reference) vs ERROR (ALL JSON points)")
    print("="*90)
    print(f"JSON DIR (points): {json_dir_path} | files: {len(json_files)}")
    print(f"JSON1 reference: {os.path.basename(json1_file_path)}")
    print(f"Points available after merge+filter: {len(df)}")
    print(f"Unique keys in plot: {df['key'].nunique()}")
    print(f"Y mode: {y_mode} ({y_col})")

    # ----------------------------
    # Correlations (Pearson + Spearman)
    # ----------------------------
    x = df["certainty_ref"].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)

    # Pearson (needs at least 2 points; NaN otherwise)
    pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan

    # Spearman = Pearson corr of ranks
    rx = rankdata(x)
    ry = rankdata(y)
    spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan

    print(f"\nPearson r: {pearson:.4f}")
    print(f"Spearman ρ: {spearman:.4f}")

    # ----------------------------
    # Trend line (simple linear fit)
    # ----------------------------
    # Fit y = a*x + b
    if len(df) >= 2:
        a, b = np.polyfit(x, y, 1)
    else:
        a, b = np.nan, np.nan

    # ----------------------------
    # Plot
    # ----------------------------
    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, alpha=point_alpha, s=point_size)

    # trend line across full x-range
    if np.isfinite(a) and np.isfinite(b):
        xs = np.linspace(np.nanmin(x), np.nanmax(x), 200)
        plt.plot(xs, a * xs + b, linestyle="--", linewidth=2)

    plt.xlabel("certainty_percent (from JSON 1, per key)")
    ylabel = "Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)"
    plt.ylabel(ylabel)
    plt.title(f"Correlation of JSON1 certainty_percent vs {y_col} (All iterations)\n"
              f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}")
    plt.grid(linestyle=":", alpha=0.5)
    plt.tight_layout()
    plt.show()
|
||
|
||
|
||
# Example run: absolute error vs JSON 1 certainty across all iterations.
json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
_corr_kwargs = {
    "json_dir_path": "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration",
    "json1_file_path": json1_path,
    "ground_truth_path": "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
    "y_mode": "abs",  # or "signed"
}
correlation_scatter_raw_certainty_json1_reference(**_corr_kwargs)
|
||
##
|
||
|
||
# %% Correlation adjusted
|
||
|
||
# Correlation scatter (RAW certainty_percent from JSON1) vs error (all JSON points)
|
||
# Adds:
|
||
# 1) Legend (points, trend line) + Pearson/Spearman shown in legend and title
|
||
# 2) Trend line color set to high-contrast (black by default)
|
||
# 3) Density coloring: dots colored by local point density (bluer = more cases) + colorbar
|
||
|
||
import os, glob, json
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from matplotlib.lines import Line2D
|
||
from matplotlib.colors import LogNorm, PowerNorm
|
||
|
||
def correlation_scatter_raw_certainty_json1_reference(
    json_dir_path,
    json1_file_path,
    ground_truth_path,
    gt_sep=";",
    gt_edss_col="EDSS",
    y_mode="abs",  # "abs" or "signed"
    point_alpha=0.85,  # higher alpha works better with density coloring
    point_size=14,
    trend_color="black",  # high-contrast line
    save_svg_path=None,
    dpi=300
):
    """Density-colored scatter of raw JSON1 ``certainty_percent`` (x) vs error (y).

    Adjusted version of the correlation plot: points from ALL iteration JSONs,
    x-values are the per-key reference confidences from ``json1_file_path``,
    dots are colored by local 2-D point density (log-scaled, with colorbar),
    a black linear trend line is overlaid, and Pearson/Spearman correlations
    appear in the title and legend.

    Parameters
    ----------
    json_dir_path : str
        Directory scanned (non-recursively) for ``*.json`` prediction files.
    json1_file_path : str
        Reference JSON supplying per-key ``certainty_percent`` for the x-axis.
    ground_truth_path : str
        CSV with ``unique_id``, ``MedDatum`` and the EDSS column.
    gt_sep, gt_edss_col : str
        Ground-truth CSV separator / EDSS column name.
    y_mode : str
        "abs" -> |EDSS_pred - EDSS_gt|; anything else -> signed error.
    point_alpha, point_size :
        Scatter cosmetics.
    trend_color : str
        Color of the dashed linear-fit line.
    save_svg_path : str or None
        If given, the figure is also written to this SVG path.
    dpi : int
        DPI passed to ``savefig`` (only relevant when saving).

    Raises
    ------
    ValueError
        If a required ground-truth column is missing.
    FileNotFoundError
        If ``json_dir_path`` contains no ``*.json`` files.
    """
    def norm_str(x):
        # Canonical key component: trimmed, lowercased string.
        return str(x).strip().lower()

    def parse_edss(x):
        # Robust EDSS parser: tolerates None, placeholder strings, and German
        # decimal commas. Returns NaN on failure.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    def rankdata(a):
        # Average-rank for ties (Spearman = Pearson on ranks).
        return pd.Series(a).rank(method="average").to_numpy()

    # ----------------------------
    # Load GT
    # ----------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=gt_sep)
    for col in ["unique_id", "MedDatum", gt_edss_col]:
        if col not in df_gt.columns:
            raise ValueError(f"GT missing column '{col}'. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
    df_gt["EDSS_gt"] = df_gt[gt_edss_col].map(parse_edss)

    # ----------------------------
    # Load JSON1 reference certainty_percent (per key)
    # ----------------------------
    with open(json1_file_path, "r", encoding="utf-8") as f:
        data1 = json.load(f)

    ref_rows = []
    for entry in data1:
        if not entry.get("success"):
            continue
        res = entry.get("result", {})
        uid, md = res.get("unique_id"), res.get("MedDatum")
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            continue
        ref_rows.append({
            "key": norm_str(uid) + "_" + norm_str(md),
            "certainty_ref": pd.to_numeric(res.get("certainty_percent"), errors="coerce"),
        })

    df_ref = pd.DataFrame(ref_rows)
    # Deduplicate keys: keep the first non-null certainty per key.
    df_ref = (df_ref.dropna(subset=["certainty_ref"])
              .groupby("key", as_index=False)["certainty_ref"]
              .first())

    # ----------------------------
    # Load ALL JSON predictions (points)
    # ----------------------------
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in: {json_dir_path}")

    rows = []
    for fp in json_files:
        with open(fp, "r", encoding="utf-8") as f:
            data = json.load(f)
        for entry in data:
            # Skip failed extractions and entries without a usable key.
            if not entry.get("success"):
                continue
            res = entry.get("result", {})
            uid, md = res.get("unique_id"), res.get("MedDatum")
            if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
                continue
            rows.append({
                "file": os.path.basename(fp),
                "key": norm_str(uid) + "_" + norm_str(md),
                "EDSS_pred": parse_edss(res.get("EDSS")),
            })

    df_pred = pd.DataFrame(rows)

    # ----------------------------
    # Merge: preds + GT + JSON1 reference certainty
    # ----------------------------
    df = df_pred.merge(df_gt[["key", "EDSS_gt"]], on="key", how="inner", validate="many_to_one")
    df = df.merge(df_ref[["key", "certainty_ref"]], on="key", how="inner", validate="many_to_one")
    df = df.dropna(subset=["EDSS_gt", "EDSS_pred", "certainty_ref"]).copy()

    df["signed_error"] = df["EDSS_pred"] - df["EDSS_gt"]
    df["abs_error"] = df["signed_error"].abs()
    y_col = "abs_error" if y_mode == "abs" else "signed_error"

    # ----------------------------
    # Correlations
    # ----------------------------
    x = df["certainty_ref"].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)

    pearson = np.corrcoef(x, y)[0, 1] if len(df) >= 2 else np.nan
    rx, ry = rankdata(x), rankdata(y)
    spearman = np.corrcoef(rx, ry)[0, 1] if len(df) >= 2 else np.nan

    # ----------------------------
    # Trend line (linear fit y = a*x + b)
    # ----------------------------
    if len(df) >= 2:
        a, b = np.polyfit(x, y, 1)
    else:
        a, b = np.nan, np.nan

    # ----------------------------
    # Density coloring (2D histogram bin counts)
    # "how blue" = how many points are around that location
    # ----------------------------
    # Binning resolution (balanced for ~thousands of points)
    x_bins = 50
    y_bins = 50

    # Bin index per point; digitize can return 0 or len(edges) for values on
    # the extremes, so clip into the valid [0, bins-1] range.
    x_edges = np.linspace(np.nanmin(x), np.nanmax(x), x_bins + 1)
    y_edges = np.linspace(np.nanmin(y), np.nanmax(y), y_bins + 1)

    xi = np.clip(np.digitize(x, x_edges) - 1, 0, x_bins - 1)
    yi = np.clip(np.digitize(y, y_edges) - 1, 0, y_bins - 1)

    # 2D counts, accumulated in one vectorized pass. np.add.at performs an
    # unbuffered in-place add, so repeated (xi, yi) pairs are all counted —
    # this replaces the former Python-level per-point loops with C-speed ops.
    counts2d = np.zeros((x_bins, y_bins), dtype=int)
    np.add.at(counts2d, (xi, yi), 1)

    # density per point = count of its bin (fancy indexing, no Python loop)
    density = counts2d[xi, yi].astype(float)

    # Plot low density first, high density last (so dense points are visible)
    order = np.argsort(density)
    x_o, y_o, d_o = x[order], y[order], density[order]

    # ----------------------------
    # Plot
    # ----------------------------
    fig, ax = plt.subplots(figsize=(12.5, 6))

    # Logarithmic color scaling; +1 avoids log(0)
    d_plot = d_o + 1

    # clip vmax so one extreme bin doesn't wash everything out
    vmax = np.percentile(d_plot, 99)  # try 95 or 99 depending on your data
    norm = LogNorm(vmin=1, vmax=max(2, vmax))

    sc = ax.scatter(
        x_o, y_o,
        c=d_plot,
        cmap="Blues",
        norm=norm,
        s=point_size,
        alpha=point_alpha,
        linewidths=0
    )

    # Trend line (high-contrast)
    if np.isfinite(a) and np.isfinite(b):
        xs = np.linspace(np.nanmin(x), np.nanmax(x), 200)
        ax.plot(xs, a * xs + b, linestyle="--", linewidth=2.5, color=trend_color)

    ax.set_xlabel("certainty_percent (from JSON 1, per key)")
    ax.set_ylabel("Absolute Error |EDSS_pred − EDSS_gt|" if y_mode == "abs" else "Signed Error (EDSS_pred − EDSS_gt)")
    ax.set_title(
        f"Correlation: JSON1 certainty_percent vs {y_col} (All iterations)\n"
        f"Pearson r={pearson:.3f} | Spearman ρ={spearman:.3f}"
    )
    ax.grid(linestyle=":", alpha=0.5)

    # Colorbar
    cbar = plt.colorbar(sc, ax=ax)
    cbar.set_label("Local density (count of cases in bin, log-scaled)")

    # Legend
    legend_items = [
        Line2D([0], [0], marker="o", linestyle="None", color="navy",
               label=f"Data points (n={len(df)})"),
        Line2D([0], [0], linestyle="--", color=trend_color, linewidth=2.5,
               label=f"Linear trend (Pearson r={pearson:.3f})"),
    ]
    ax.legend(handles=legend_items, loc="upper right", frameon=True, title="Legend")

    plt.tight_layout()
    # Save as SVG (optional)
    if save_svg_path:
        fig.savefig(save_svg_path, format="svg", bbox_inches="tight", dpi=dpi)
        print(f"[SAVED] {save_svg_path}")

    plt.show()
|
||
|
||
json1_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"

# Run the adjusted (density-colored) correlation plot and export it as SVG.
_corr_svg_out = "/home/shahin/Lab/Doktorarbeit/Barcelona/results/corr_json1_abs_error.svg"
correlation_scatter_raw_certainty_json1_reference(
    json_dir_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration",
    json1_file_path=json1_path,
    ground_truth_path="/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv",
    y_mode="abs",
    save_svg_path=_corr_svg_out,
)
|
||
|
||
##
|
||
|
||
|
||
|