diff --git a/Data/audit.py b/Data/audit.py
new file mode 100644
index 0000000..cbac7cb
--- /dev/null
+++ b/Data/audit.py
@@ -0,0 +1,415 @@
+# %% Confirm EDSS missing
+import pandas as pd
+import numpy as np
+
+def clean_series(s):
+    return s.astype(str).str.strip().str.lower()
+
+def gt_edss_audit(ground_truth_path, edss_col="EDSS"):
+    df_gt = pd.read_csv(ground_truth_path, sep=';')
+
+    # normalize keys
+    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
+    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])
+    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']
+
+    print("GT rows:", len(df_gt))
+    print("GT unique keys:", df_gt['key'].nunique())
+
+    # IMPORTANT: parse EDSS robustly (German decimal commas etc.)
+    if edss_col in df_gt.columns:
+        edss_raw = df_gt[edss_col]
+        edss_num = pd.to_numeric(
+            edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(),
+            errors="coerce"
+        )
+        df_gt["_edss_num"] = edss_num
+
+        print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}")
+        print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}")
+
+        # duplicates on key
+        dup = df_gt['key'].duplicated(keep=False)
+        print("GT duplicate-key rows:", dup.sum())
+        if dup.any():
+            # how many duplicate keys exist?
+            print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique())
+            # of duplicate-key rows, how many have missing EDSS?
+            print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum())
+
+            # show the worst offenders
+            print("\nTop duplicate keys (by count):")
+            print(df_gt.loc[dup, 'key'].value_counts().head(10))
+    else:
+        print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist())
+
+    return df_gt
+
+df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS")
+
+##
+
+
+
+
+# %% trace missing ones
+
+import json, glob, os
+import pandas as pd
+
+def load_preds(json_dir_path):
+    all_preds = []
+    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        file_name = os.path.basename(file_path)
+        for entry in data:
+            if entry.get("success"):
+                res = entry["result"]
+                all_preds.append({
+                    "unique_id": str(res.get("unique_id")).strip().lower(),
+                    "MedDatum": str(res.get("MedDatum")).strip().lower(),
+                    "file": file_name
+                })
+    df_pred = pd.DataFrame(all_preds)
+    df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"]
+    return df_pred
+
+df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration")
+print("Pred rows:", len(df_pred))
+print("Pred unique keys:", df_pred["key"].nunique())
+
+# Suppose df_gt was returned from step 1 and has _edss_num + key
+missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"])
+
+df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys)
+
+print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum())
+print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique())
+
+print("\nTop files contributing to missing-GT-EDSS rows:")
+print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20))
+
+print("\nTop keys replicated in predictions (why count inflates):")
+print(df_pred.loc[df_pred["gt_key_missing_edss"], "key"].value_counts().head(20))
+
+
+##
+
+
+# %% verify
+
+merged = df_pred.merge(
+    df_gt[["key", "_edss_num"]],  # use the numeric-coerced GT EDSS
+    on="key",
+    how="left",
+    validate="many_to_one"  # will ERROR if GT has duplicate keys (GOOD!)
+)
+
+print("Merged rows:", len(merged))
+print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum())
+
+
+##
+
+
+# %% 1json (rewritten with robust parsing + detailed data log)
+import pandas as pd
+import numpy as np
+import json
+import matplotlib.pyplot as plt
+import seaborn as sns
+from matplotlib.patches import Patch
+from matplotlib.lines import Line2D
+
+def plot_single_json_error_analysis_with_log(
+    json_file_path,
+    ground_truth_path,
+    edss_gt_col="EDSS",
+    min_bin_count=5,
+):
+    def norm_str(x):
+        # normalize identifiers and dates consistently
+        return str(x).strip().lower()
+
+    def parse_edss(x):
+        # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc.
+        if x is None:
+            return np.nan
+        s = str(x).strip()
+        if s == "" or s.lower() in {"nan", "none", "null"}:
+            return np.nan
+        s = s.replace(",", ".")
+        return pd.to_numeric(s, errors="coerce")
+
+    print("\n" + "="*80)
+    print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)")
+    print("="*80)
+    print(f"JSON: {json_file_path}")
+    print(f"GT: {ground_truth_path}")
+
+    # ------------------------------------------------------------------
+    # 1) Load Ground Truth
+    # ------------------------------------------------------------------
+    df_gt = pd.read_csv(ground_truth_path, sep=";")
+
+    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
+    missing_cols = required_gt_cols - set(df_gt.columns)
+    if missing_cols:
+        raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}")
+
+    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
+    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
+    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]
+
+    # Robust EDSS parsing (important!)
+    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)
+
+    # GT logs
+    print("\n--- GT LOG ---")
+    print(f"GT rows: {len(df_gt)}")
+    print(f"GT unique keys: {df_gt['key'].nunique()}")
+    gt_dup = df_gt["key"].duplicated(keep=False).sum()
+    print(f"GT duplicate-key rows: {gt_dup}")
+    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
+    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")
+
+    if gt_dup > 0:
+        print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:")
+        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))
+
+    # ------------------------------------------------------------------
+    # 2) Load Predictions from the specific JSON
+    # ------------------------------------------------------------------
+    with open(json_file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    total_entries = len(data)
+    success_entries = sum(1 for e in data if e.get("success"))
+
+    all_preds = []
+    skipped = {
+        "not_success": 0,
+        "missing_uid_or_date": 0,
+        "missing_edss": 0,
+        "missing_conf": 0,
+    }
+
+    for entry in data:
+        if not entry.get("success"):
+            skipped["not_success"] += 1
+            continue
+
+        res = entry.get("result", {})
+        uid = res.get("unique_id")
+        md = res.get("MedDatum")
+
+        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
+            skipped["missing_uid_or_date"] += 1
+            continue
+
+        edss_pred = parse_edss(res.get("EDSS"))
+        conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce")
+
+        if pd.isna(edss_pred):
+            skipped["missing_edss"] += 1
+        if pd.isna(conf):
+            skipped["missing_conf"] += 1
+
+        all_preds.append({
+            "unique_id": norm_str(uid),
+            "MedDatum": norm_str(md),
+            "key": norm_str(uid) + "_" + norm_str(md),
+            "EDSS_pred": edss_pred,
+            "confidence": conf,
+        })
+
+    df_pred = pd.DataFrame(all_preds)
+
+    # Pred logs
+    print("\n--- PRED LOG ---")
+    print(f"JSON total entries: {total_entries}")
+    print(f"JSON success entries: {success_entries}")
+    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
+    if len(df_pred) == 0:
+        print("[ERROR] No usable prediction rows found. Nothing to plot.")
+        return
+
+    print(f"Pred unique keys: {df_pred['key'].nunique()}")
+    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
+    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
+    print("Skipped counts:", skipped)
+
+    # Are keys duplicated within this JSON? (often yes if multiple notes map to same key)
+    key_counts = df_pred["key"].value_counts()
+    dup_pred_rows = (key_counts > 1).sum()
+    max_rep = int(key_counts.max())
+    print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}")
+    print(f"Max repetitions of a single key in this JSON: {max_rep}")
+    if max_rep > 1:
+        print("Top repeated keys in this JSON:")
+        print(key_counts.head(10))
+
+    # ------------------------------------------------------------------
+    # 3) Merge (and diagnose why rows drop)
+    # ------------------------------------------------------------------
+    # Diagnose how many pred keys exist in GT
+    gt_key_set = set(df_gt["key"])
+    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
+    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]
+
+    print("\n--- KEY MATCH LOG ---")
+    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
+    print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}")
+    if len(not_in_gt) > 0:
+        print("[WARNING] Some prediction keys are not present in GT. First 10:")
+        print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10))
+
+    # Now merge; we expect GT is one-to-many with pred (many_to_one)
+    # If GT had duplicates, validate would raise.
+    df_merged = df_pred.merge(
+        df_gt[["key", "EDSS_gt"]],
+        on="key",
+        how="inner",
+        validate="many_to_one"
+    )
+
+    print("\n--- MERGE LOG ---")
+    print(f"Merged rows (inner join): {len(df_merged)}")
+    print(f"Merged unique keys: {df_merged['key'].nunique()}")
+    print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}")
+    print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}")
+    print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}")
+
+    # How many rows will be removed by dropna() in your old code?
+    # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too.
+    rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
+    print("\n--- FILTER LOG (what will be used for stats/plot) ---")
+    print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}")
+    if len(rows_complete) == 0:
+        print("[ERROR] No complete rows after filtering. Nothing to plot.")
+        return
+
+    # Compute abs error
+    rows_complete = rows_complete.copy()
+    rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs()
+
+    # ------------------------------------------------------------------
+    # 4) Binning + stats (with guardrails)
+    # ------------------------------------------------------------------
+    bins = [0, 70, 80, 90, 100]
+    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]
+
+    # Confidence outside bins becomes NaN; log it
+    rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True)
+    conf_outside = rows_complete["conf_bin"].isna().sum()
+    print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}")
+    if conf_outside > 0:
+        print("Example confidences outside bins:")
+        print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list())
+
+    df_plot = rows_complete.dropna(subset=["conf_bin"])
+    stats = (
+        df_plot.groupby("conf_bin", observed=True)["abs_error"]
+        .agg(mean="mean", std="std", count="count")
+        .reindex(labels)
+        .reset_index()
+    )
+
+    print("\n--- BIN STATS ---")
+    print(stats)
+
+    # Warn about low counts
+    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
+    if not low_bins.empty:
+        print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:")
+        print(low_bins)
+
+    # ------------------------------------------------------------------
+    # 5) Plot
+    # ------------------------------------------------------------------
+    plt.figure(figsize=(13, 8))
+    colors = sns.color_palette("Blues", n_colors=len(labels))
+
+    # Replace NaNs in mean for plotting bars (empty bins)
+    means = stats["mean"].to_numpy()
+    counts = stats["count"].fillna(0).astype(int).to_numpy()
+    stds = stats["std"].to_numpy()
+
+    # For bins with no data, bar height 0 (and no errorbar)
+    means_plot = np.nan_to_num(means, nan=0.0)
+
+    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85)
+
+    # Error bars only where count>1 and std is not NaN
+    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
+    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5)
+
+    # Trend line only if at least 2 non-empty bins
+    valid_idx = np.where(~np.isnan(means))[0]
+    if len(valid_idx) >= 2:
+        x_idx = np.arange(len(labels))
+        z = np.polyfit(valid_idx, means[valid_idx], 1)
+        p = np.poly1d(z)
+        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5)
+        trend_label = "Trend Line"
+    else:
+        trend_label = "Trend Line (insufficient bins)"
+        print("\n[INFO] Not enough non-empty bins to fit a trend line.")
+
+    # Data labels
+    for i, bar in enumerate(bars):
+        n_count = int(counts[i])
+        mae_val = means[i]
+        if np.isnan(mae_val) or n_count == 0:
+            txt = "empty"
+            y = 0.02
+        else:
+            txt = f"MAE: {mae_val:.2f}\nn={n_count}"
+            y = bar.get_height() + 0.04
+        plt.text(
+            bar.get_x() + bar.get_width()/2,
+            y,
+            txt,
+            ha="center",
+            va="bottom",
+            fontweight="bold",
+            fontsize=10
+        )
+
+    # Legend
+    legend_elements = [
+        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
+        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
+        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
+        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
+        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label),
+        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"),
+        Patch(color="none", label="Metric: Mean Absolute Error (MAE)")
+    ]
+    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend")
+
+    plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30)
+    plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12)
+    plt.xlabel("LLM Confidence Bracket", fontsize=12)
+    plt.grid(axis="y", linestyle=":", alpha=0.5)
+
+    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
+    plt.ylim(0, max(0.5, float(ymax) + 0.6))
+    plt.tight_layout()
+    plt.show()
+
+    print("\n" + "="*80)
+    print("DONE")
+    print("="*80)
+
+
+# --- RUN ---
+json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
+gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"
+
+plot_single_json_error_analysis_with_log(json_path, gt_path)
+
+
+
+##
diff --git a/Data/certainty_show.py b/Data/certainty_show.py
index a35ac75..24b199d 100644
--- a/Data/certainty_show.py
+++ b/Data/certainty_show.py
@@ -753,6 +753,7 @@ def audit_matches(json_dir_path, ground_truth_path):
     print("\nFailure count per JSON file:")
     print(unmatched_summary['file'].value_counts())
 
+audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 
 ##
 
@@ -1523,7 +1524,6 @@ plot_single_json_error_analysis(json_path, gt_path)
 
 #plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 #plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
-#audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 #plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 
 
@@ -1535,6 +1535,6 @@ plot_single_json_error_analysis(json_path, gt_path)
 #plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 #plot_confidence_vs_abs_error_refined('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 #plot_confidence_vs_abs_error_with_counts('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
-plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
+#plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
 
 ##