# EDSS ground-truth audit & prediction error analysis (notebook-style script).
# %% Confirm EDSS missing
import pandas as pd
import numpy as np
def clean_series(s):
    """Normalize a Series for key matching: cast to str, strip whitespace, lowercase."""
    as_text = s.astype(str)
    return as_text.str.strip().str.lower()
def gt_edss_audit(ground_truth_path, edss_col="EDSS"):
    """Load the ground-truth CSV and print an EDSS completeness/duplicate audit.

    Parameters
    ----------
    ground_truth_path : str
        Path to a ';'-separated CSV with 'unique_id', 'MedDatum' and
        (usually) an EDSS column.
    edss_col : str
        Name of the EDSS column in the CSV.

    Returns
    -------
    pandas.DataFrame
        The ground truth with normalized 'unique_id'/'MedDatum', a combined
        'key' column, and a numeric-coerced '_edss_num' column (all-NaN when
        `edss_col` is absent, so downstream cells can rely on it existing).
    """
    df_gt = pd.read_csv(ground_truth_path, sep=';')

    # normalize keys
    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])
    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']

    print("GT rows:", len(df_gt))
    print("GT unique keys:", df_gt['key'].nunique())

    # IMPORTANT: parse EDSS robustly (German decimal commas etc.)
    if edss_col in df_gt.columns:
        edss_raw = df_gt[edss_col]
        # "3,5" -> "3.5"; anything unparseable becomes NaN (errors="coerce")
        edss_num = pd.to_numeric(
            edss_raw.astype(str).str.replace(",", ".", regex=False).str.strip(),
            errors="coerce"
        )
        df_gt["_edss_num"] = edss_num

        print(f"GT missing EDSS look (numeric-coerce): {df_gt['_edss_num'].isna().sum()}")
        print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['_edss_num'].isna(), 'key'].nunique()}")

        # duplicates on key
        dup = df_gt['key'].duplicated(keep=False)
        print("GT duplicate-key rows:", dup.sum())
        if dup.any():
            # how many duplicate keys exist?
            print("GT duplicate keys:", df_gt.loc[dup, 'key'].nunique())
            # of duplicate-key rows, how many have missing EDSS?
            print("Duplicate-key rows with missing EDSS:", df_gt.loc[dup, "_edss_num"].isna().sum())

            # show the worst offenders
            print("\nTop duplicate keys (by count):")
            print(df_gt.loc[dup, 'key'].value_counts().head(10))
    else:
        print(f"EDSS column '{edss_col}' not found in GT columns:", df_gt.columns.tolist())
        # Robustness fix: later cells index '_edss_num' unconditionally, so
        # always provide the column (all-NaN) instead of returning without it.
        df_gt["_edss_num"] = np.nan

    return df_gt
# Run the GT audit once; later cells reuse df_gt (its 'key' and '_edss_num' columns).
# NOTE(review): path is machine-specific — confirm before running elsewhere.
df_gt = gt_edss_audit("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv", edss_col="EDSS")

##
# %% trace missing ones
import json, glob, os
import pandas as pd
def load_preds(json_dir_path):
    """Collect successful prediction entries from every *.json file in a directory.

    Each JSON file is expected to hold a list of entries; entries whose
    'success' field is truthy contribute one row with normalized
    'unique_id'/'MedDatum' plus the source file name.

    Parameters
    ----------
    json_dir_path : str
        Directory containing the prediction *.json files.

    Returns
    -------
    pandas.DataFrame
        Columns: unique_id, MedDatum, file, key. Empty — but with those
        columns present — when no usable entries are found.
    """
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        file_name = os.path.basename(file_path)
        for entry in data:
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    "unique_id": str(res.get("unique_id")).strip().lower(),
                    "MedDatum": str(res.get("MedDatum")).strip().lower(),
                    "file": file_name
                })
    # Robustness fix: an empty all_preds would yield a column-less DataFrame,
    # and the key construction below would raise KeyError. Pin the columns.
    df_pred = pd.DataFrame(all_preds, columns=["unique_id", "MedDatum", "file"])
    df_pred["key"] = df_pred["unique_id"] + "_" + df_pred["MedDatum"]
    return df_pred
# Cross-reference prediction keys against GT rows whose EDSS is missing.
# NOTE(review): path is machine-specific — confirm before running elsewhere.
df_pred = load_preds("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration")

print("Pred rows:", len(df_pred))
print("Pred unique keys:", df_pred["key"].nunique())

# Suppose df_gt was returned from step 1 and has _edss_num + key
missing_gt_keys = set(df_gt.loc[df_gt["_edss_num"].isna(), "key"])

# Flag every prediction row whose GT counterpart has no numeric EDSS.
df_pred["gt_key_missing_edss"] = df_pred["key"].isin(missing_gt_keys)

print("Pred rows whose GT key has missing EDSS:", df_pred["gt_key_missing_edss"].sum())
print("Unique keys (among preds) whose GT EDSS missing:", df_pred.loc[df_pred["gt_key_missing_edss"], "key"].nunique())

print("\nTop files contributing to missing-GT-EDSS rows:")
print(df_pred.loc[df_pred["gt_key_missing_edss"], "file"].value_counts().head(20))

# Repeated keys across JSON files inflate the row count relative to unique keys.
print("\nTop keys replicated in predictions (why count inflates):")
print(df_pred.loc[df_pred["gt_key_missing_edss"], "key"].value_counts().head(20))

##
# %% verify
# Left-join predictions onto GT by key; validate guards against GT duplicates.

merged = df_pred.merge(
    df_gt[["key", "_edss_num"]],  # use the numeric-coerced GT EDSS
    on="key",
    how="left",
    validate="many_to_one"  # will ERROR if GT has duplicate keys (GOOD!)
)

# Rows where _edss_num is NaN after the join: key missing in GT, or GT EDSS missing.
print("Merged rows:", len(merged))
print("Merged missing GT EDSS:", merged["_edss_num"].isna().sum())

##
# %% 1json (rewritten with robust parsing + detailed data log)
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_single_json_error_analysis_with_log(
    json_file_path,
    ground_truth_path,
    edss_gt_col="EDSS",
    min_bin_count=5,
):
    """Audit one predictions JSON against the GT and plot MAE per confidence bin.

    Loads the ';'-separated ground truth (must contain 'unique_id',
    'MedDatum' and `edss_gt_col`), loads one predictions JSON (a list of
    entries with 'success'/'result' fields), prints detailed diagnostics at
    every step (row counts, duplicates, missing values, key-match rates),
    inner-merges on the normalized unique_id+MedDatum key, bins rows by
    'certainty_percent', and draws a bar chart of mean absolute EDSS error
    per confidence bracket with SEM error bars and a trend line.

    Parameters
    ----------
    json_file_path : str
        Path to one predictions JSON file.
    ground_truth_path : str
        Path to the ground-truth CSV.
    edss_gt_col : str
        Name of the EDSS column in the GT file.
    min_bin_count : int
        Bins with fewer rows than this trigger a stability warning.

    Returns
    -------
    None. Returns early (after logging an [ERROR]) when no usable or no
    complete rows remain.

    Raises
    ------
    ValueError
        If the GT file lacks any of the required columns.
    pandas.errors.MergeError
        From validate="many_to_one" if GT contains duplicate keys.
    """
    def norm_str(x):
        # normalize identifiers and dates consistently
        return str(x).strip().lower()

    def parse_edss(x):
        # robust numeric parse: handles "3,5" as 3.5, blanks, "nan", etc.
        if x is None:
            return np.nan
        s = str(x).strip()
        if s == "" or s.lower() in {"nan", "none", "null"}:
            return np.nan
        s = s.replace(",", ".")
        return pd.to_numeric(s, errors="coerce")

    print("\n" + "="*80)
    print("SINGLE-JSON ERROR ANALYSIS (WITH LOG)")
    print("="*80)
    print(f"JSON: {json_file_path}")
    print(f"GT: {ground_truth_path}")

    # ------------------------------------------------------------------
    # 1) Load Ground Truth
    # ------------------------------------------------------------------
    df_gt = pd.read_csv(ground_truth_path, sep=";")

    # Fail fast with a clear message if the GT schema is wrong.
    required_gt_cols = {"unique_id", "MedDatum", edss_gt_col}
    missing_cols = required_gt_cols - set(df_gt.columns)
    if missing_cols:
        raise ValueError(f"GT is missing required columns: {missing_cols}. Available: {df_gt.columns.tolist()}")

    df_gt["unique_id"] = df_gt["unique_id"].map(norm_str)
    df_gt["MedDatum"] = df_gt["MedDatum"].map(norm_str)
    df_gt["key"] = df_gt["unique_id"] + "_" + df_gt["MedDatum"]

    # Robust EDSS parsing (important!)
    df_gt["EDSS_gt"] = df_gt[edss_gt_col].map(parse_edss)

    # GT logs
    print("\n--- GT LOG ---")
    print(f"GT rows: {len(df_gt)}")
    print(f"GT unique keys: {df_gt['key'].nunique()}")
    gt_dup = df_gt["key"].duplicated(keep=False).sum()
    print(f"GT duplicate-key rows: {gt_dup}")
    print(f"GT missing EDSS (numeric): {df_gt['EDSS_gt'].isna().sum()}")
    print(f"GT missing EDSS unique keys: {df_gt.loc[df_gt['EDSS_gt'].isna(), 'key'].nunique()}")

    if gt_dup > 0:
        # NOTE(review): with GT duplicates present, the validate="many_to_one"
        # merge below will raise rather than silently duplicate rows.
        print("\n[WARNING] GT has duplicate keys. Merge can duplicate rows. Example duplicate keys:")
        print(df_gt.loc[df_gt["key"].duplicated(keep=False), "key"].value_counts().head(10))

    # ------------------------------------------------------------------
    # 2) Load Predictions from the specific JSON
    # ------------------------------------------------------------------
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    total_entries = len(data)
    success_entries = sum(1 for e in data if e.get("success"))

    all_preds = []
    # Tally of why entries were skipped or are partially unusable.
    skipped = {
        "not_success": 0,
        "missing_uid_or_date": 0,
        "missing_edss": 0,
        "missing_conf": 0,
    }

    for entry in data:
        if not entry.get("success"):
            skipped["not_success"] += 1
            continue

        res = entry.get("result", {})
        uid = res.get("unique_id")
        md = res.get("MedDatum")

        # Without both key parts the row cannot be matched to GT at all.
        if uid is None or md is None or str(uid).strip() == "" or str(md).strip() == "":
            skipped["missing_uid_or_date"] += 1
            continue

        edss_pred = parse_edss(res.get("EDSS"))
        conf = pd.to_numeric(res.get("certainty_percent"), errors="coerce")

        # Rows with missing EDSS/confidence are counted but still kept;
        # they are filtered later by the dropna(subset=...) step.
        if pd.isna(edss_pred):
            skipped["missing_edss"] += 1
        if pd.isna(conf):
            skipped["missing_conf"] += 1

        all_preds.append({
            "unique_id": norm_str(uid),
            "MedDatum": norm_str(md),
            "key": norm_str(uid) + "_" + norm_str(md),
            "EDSS_pred": edss_pred,
            "confidence": conf,
        })

    df_pred = pd.DataFrame(all_preds)

    # Pred logs
    print("\n--- PRED LOG ---")
    print(f"JSON total entries: {total_entries}")
    print(f"JSON success entries: {success_entries}")
    print(f"Pred rows loaded (success + has keys): {len(df_pred)}")
    if len(df_pred) == 0:
        print("[ERROR] No usable prediction rows found. Nothing to plot.")
        return

    print(f"Pred unique keys: {df_pred['key'].nunique()}")
    print(f"Pred missing EDSS (numeric): {df_pred['EDSS_pred'].isna().sum()}")
    print(f"Pred missing confidence: {df_pred['confidence'].isna().sum()}")
    print("Skipped counts:", skipped)

    # Are keys duplicated within this JSON? (often yes if multiple notes map to same key)
    key_counts = df_pred["key"].value_counts()
    dup_pred_rows = (key_counts > 1).sum()
    max_rep = int(key_counts.max())
    print(f"Keys with >1 prediction in this JSON: {dup_pred_rows}")
    print(f"Max repetitions of a single key in this JSON: {max_rep}")
    if max_rep > 1:
        print("Top repeated keys in this JSON:")
        print(key_counts.head(10))

    # ------------------------------------------------------------------
    # 3) Merge (and diagnose why rows drop)
    # ------------------------------------------------------------------
    # Diagnose how many pred keys exist in GT
    gt_key_set = set(df_gt["key"])
    df_pred["key_in_gt"] = df_pred["key"].isin(gt_key_set)
    not_in_gt = df_pred.loc[~df_pred["key_in_gt"]]

    print("\n--- KEY MATCH LOG ---")
    print(f"Pred rows with key found in GT: {df_pred['key_in_gt'].sum()} / {len(df_pred)}")
    print(f"Pred rows with key NOT found in GT: {len(not_in_gt)}")
    if len(not_in_gt) > 0:
        print("[WARNING] Some prediction keys are not present in GT. First 10:")
        print(not_in_gt[["unique_id", "MedDatum", "key"]].head(10))

    # Now merge; we expect GT is one-to-many with pred (many_to_one)
    # If GT had duplicates, validate would raise.
    df_merged = df_pred.merge(
        df_gt[["key", "EDSS_gt"]],
        on="key",
        how="inner",
        validate="many_to_one"
    )

    print("\n--- MERGE LOG ---")
    print(f"Merged rows (inner join): {len(df_merged)}")
    print(f"Merged unique keys: {df_merged['key'].nunique()}")
    print(f"Merged missing GT EDSS: {df_merged['EDSS_gt'].isna().sum()}")
    print(f"Merged missing pred EDSS: {df_merged['EDSS_pred'].isna().sum()}")
    print(f"Merged missing confidence:{df_merged['confidence'].isna().sum()}")

    # How many rows will be removed by dropna() in your old code?
    # Old code did .dropna() on ALL columns, which can remove rows for missing confidence too.
    rows_complete = df_merged.dropna(subset=["EDSS_gt", "EDSS_pred", "confidence"])
    print("\n--- FILTER LOG (what will be used for stats/plot) ---")
    print(f"Rows with all required fields (EDSS_gt, EDSS_pred, confidence): {len(rows_complete)}")
    if len(rows_complete) == 0:
        print("[ERROR] No complete rows after filtering. Nothing to plot.")
        return

    # Compute abs error
    # copy() avoids SettingWithCopy warnings on the filtered view.
    rows_complete = rows_complete.copy()
    rows_complete["abs_error"] = (rows_complete["EDSS_pred"] - rows_complete["EDSS_gt"]).abs()

    # ------------------------------------------------------------------
    # 4) Binning + stats (with guardrails)
    # ------------------------------------------------------------------
    # assumes confidence is a 0-100 percentage — TODO confirm with producer
    bins = [0, 70, 80, 90, 100]
    labels = ["Low (<70%)", "Moderate (70-80%)", "High (80-90%)", "Very High (90-100%)"]

    # Confidence outside bins becomes NaN; log it
    rows_complete["conf_bin"] = pd.cut(rows_complete["confidence"], bins=bins, labels=labels, include_lowest=True)
    conf_outside = rows_complete["conf_bin"].isna().sum()
    print(f"Rows with confidence outside [0,100] or outside bin edges: {conf_outside}")
    if conf_outside > 0:
        print("Example confidences outside bins:")
        print(rows_complete.loc[rows_complete["conf_bin"].isna(), "confidence"].head(20).to_list())

    df_plot = rows_complete.dropna(subset=["conf_bin"])
    # reindex(labels) keeps all four bins in order, empty ones as NaN rows.
    stats = (
        df_plot.groupby("conf_bin", observed=True)["abs_error"]
        .agg(mean="mean", std="std", count="count")
        .reindex(labels)
        .reset_index()
    )

    print("\n--- BIN STATS ---")
    print(stats)

    # Warn about low counts
    low_bins = stats.loc[stats["count"].fillna(0) < min_bin_count, ["conf_bin", "count"]]
    if not low_bins.empty:
        print(f"\n[WARNING] Some bins have < {min_bin_count} rows; error bars/trend may be unstable:")
        print(low_bins)

    # ------------------------------------------------------------------
    # 5) Plot
    # ------------------------------------------------------------------
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    # Replace NaNs in mean for plotting bars (empty bins)
    means = stats["mean"].to_numpy()
    counts = stats["count"].fillna(0).astype(int).to_numpy()
    stds = stats["std"].to_numpy()

    # For bins with no data, bar height 0 (and no errorbar)
    means_plot = np.nan_to_num(means, nan=0.0)

    bars = plt.bar(labels, means_plot, color=colors, edgecolor="black", alpha=0.85)

    # Error bars only where count>1 and std is not NaN
    sem = np.where((counts > 1) & (~np.isnan(stds)), stds / np.sqrt(counts), np.nan)
    plt.errorbar(labels, means_plot, yerr=sem, fmt="none", c="black", capsize=8, elinewidth=1.5)

    # Trend line only if at least 2 non-empty bins
    valid_idx = np.where(~np.isnan(means))[0]
    if len(valid_idx) >= 2:
        x_idx = np.arange(len(labels))
        # Degree-1 least-squares fit over the non-empty bin indices.
        z = np.polyfit(valid_idx, means[valid_idx], 1)
        p = np.poly1d(z)
        plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5)
        trend_label = "Trend Line"
    else:
        trend_label = "Trend Line (insufficient bins)"
        print("\n[INFO] Not enough non-empty bins to fit a trend line.")

    # Data labels
    for i, bar in enumerate(bars):
        n_count = int(counts[i])
        mae_val = means[i]
        if np.isnan(mae_val) or n_count == 0:
            txt = "empty"
            y = 0.02
        else:
            txt = f"MAE: {mae_val:.2f}\nn={n_count}"
            y = bar.get_height() + 0.04
        plt.text(
            bar.get_x() + bar.get_width()/2,
            y,
            txt,
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=10
        )

    # Legend
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor="black", label=f"Bin 1: {labels[0]}"),
        Patch(facecolor=colors[1], edgecolor="black", label=f"Bin 2: {labels[1]}"),
        Patch(facecolor=colors[2], edgecolor="black", label=f"Bin 3: {labels[2]}"),
        Patch(facecolor=colors[3], edgecolor="black", label=f"Bin 4: {labels[3]}"),
        Line2D([0], [0], color="#e74c3c", linestyle="--", lw=3, label=trend_label),
        Line2D([0], [0], color="black", marker="_", linestyle="None", markersize=10, label="Std Error (SEM)"),
        Patch(color="none", label="Metric: Mean Absolute Error (MAE)")
    ]
    plt.legend(handles=legend_elements, loc="upper right", frameon=True, shadow=True, title="Legend")

    plt.title("Validation: Confidence vs. Error Magnitude (Single JSON)", fontsize=15, pad=30)
    plt.ylabel("Mean Absolute Error (EDSS Points)", fontsize=12)
    plt.xlabel("LLM Confidence Bracket", fontsize=12)
    plt.grid(axis="y", linestyle=":", alpha=0.5)

    # Headroom above the tallest bar; minimum ceiling of 0.5 EDSS points.
    ymax = np.nanmax(means) if np.any(~np.isnan(means)) else 0.0
    plt.ylim(0, max(0.5, float(ymax) + 0.6))
    plt.tight_layout()
    plt.show()

    print("\n" + "="*80)
    print("DONE")
    print("="*80)
# --- RUN ---
# NOTE(review): both paths are machine-specific; confirm before running elsewhere.
json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"

plot_single_json_error_analysis_with_log(json_path, gt_path)

##