isabella box and Error disagreement plot

2026-05-04 16:41:42 +02:00
parent c9db7b5163
commit f6ec60e685
1 changed files with 803 additions and 3 deletions
@@ -2152,8 +2152,6 @@ plt.savefig(figure_save_path, format='svg', bbox_inches='tight')
 plt.show()
 ##
 <<<<<<< Updated upstream
 =======
 # %% Functional System + EDSS Error Boxplots
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -2302,7 +2300,6 @@ plt.savefig(figure_save_path, format='svg', bbox_inches='tight')
 plt.show()
 ##
 >>>>>>> Stashed changes
 # %% test
 # Diagnose: what are the actual differences?
@@ -2318,3 +2315,806 @@ for gt_col, res_col in functional_systems_to_plot:
    print(f"{gt_col.split('.')[1]:25s}: non-zero = {non_zero:3d}, tiny = {tiny.sum():3d}, max abs diff = {abs_diff.max():.12f}")
 ##
 # %% Functional System Continuous Accuracy Boxplot
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 import numpy as np
 from matplotlib.patches import Patch
 from matplotlib.lines import Line2D
 # --- Configuration ---
 # Where the rendered figure is written (SVG).
 figure_save_path = 'project/visuals/functional_systems_continuous_accuracy_boxplot.svg'
 # Default font family for everything matplotlib draws in this cell.
 plt.rcParams.update({'font.family': 'Arial'})
 # --- Functional systems: ground-truth column -> LLM result column ---
 # Kept as a list of (GT column, result column) tuples for the loops below.
 functional_systems_to_plot = list({
    'GT.VISUAL_OPTIC_FUNCTIONS': 'result.VISUAL OPTIC FUNCTIONS',
    'GT.CEREBELLAR_FUNCTIONS': 'result.CEREBELLAR FUNCTIONS',
    'GT.BRAINSTEM_FUNCTIONS': 'result.BRAINSTEM FUNCTIONS',
    'GT.SENSORY_FUNCTIONS': 'result.SENSORY FUNCTIONS',
    'GT.PYRAMIDAL_FUNCTIONS': 'result.PYRAMIDAL FUNCTIONS',
    'GT.AMBULATION': 'result.AMBULATION',
    'GT.CEREBRAL_FUNCTIONS': 'result.CEREBRAL FUNCTIONS',
    'GT.BOWEL_AND_BLADDER_FUNCTIONS': 'result.BOWEL AND BLADDER FUNCTIONS',
 }.items())
 # --- Robust parser ---
 def safe_parse(s):
    """Coerce a raw cell value to ``float``, returning NaN when impossible.

    Missing values (as detected by ``pd.isna``) map to NaN, numeric inputs
    are converted directly, and anything else is treated as text in which a
    decimal comma is accepted (``'3,5'`` -> ``3.5``). Blank or unparseable
    text yields NaN instead of raising.
    """
    if pd.isna(s):
        return np.nan
    numeric_types = (int, float, np.integer, np.floating)
    if isinstance(s, numeric_types):
        return float(s)
    # Normalise the decimal separator before handing the text to float().
    text = str(s).replace(',', '.').strip()
    try:
        return float(text) if text else np.nan
    except ValueError:
        return np.nan
 # --- Build accuracy data ---
 # Parallel lists with one entry per functional system that could be
 # evaluated; the plotting code reads them position by position.
 boxplot_data, system_labels = [], []
 predicted_counts, missing_prediction_counts = [], []
 total_gt_counts, mean_accuracies = [], []
 for gt_col, res_col in functional_systems_to_plot:
    # Both columns have to exist in df, otherwise the system is skipped.
    if gt_col not in df.columns:
        print(f"Skipping {gt_col}: GT column not found")
        continue
    if res_col not in df.columns:
        print(f"Skipping {res_col}: result column not found")
        continue
    system_name = gt_col.split('.')[1]
    truth = df[gt_col].apply(safe_parse)
    prediction = df[res_col].apply(safe_parse)
    # Rows without a ground-truth value never take part in the evaluation.
    has_truth = truth.notna()
    total_gt = has_truth.sum()
    if total_gt == 0:
        print(f"Skipping {system_name}: no ground-truth values")
        continue
    # Ground truth present, but no usable LLM prediction.
    missing_count = (has_truth & prediction.isna()).sum()
    # Only rows carrying both values contribute to the boxplot.
    scorable = has_truth & prediction.notna()
    if not scorable.any():
        print(f"Skipping {system_name}: no predicted values")
        continue
    gt_eval = truth[scorable]
    res_eval = prediction[scorable]
    # Functional system score range.
    # Adjust if your functional systems use another scale.
    score_range = 5
    # Continuous accuracy = 1 - |error| / score_range, limited to [0, 1]:
    # exact match = 1.0, off by 1 point = 0.8, off by 2 points = 0.6, etc.
    abs_error = (res_eval - gt_eval).abs()
    accuracy = (1 - abs_error / score_range).clip(lower=0, upper=1)
    mean_accuracy = accuracy.mean()
    clean_name = system_name.replace('_', ' ').title()
    n_predicted = len(gt_eval)
    boxplot_data.append(accuracy.values)
    system_labels.append(clean_name)
    predicted_counts.append(n_predicted)
    missing_prediction_counts.append(missing_count)
    total_gt_counts.append(total_gt)
    mean_accuracies.append(mean_accuracy)
    print(
        f"{clean_name}: GT={total_gt}, predicted={n_predicted}, "
        f"missing={missing_count}, mean accuracy={mean_accuracy:.1%}"
    )
 if not boxplot_data:
    raise ValueError("No valid accuracy data available for plotting.")
 # X-axis labels: system name plus its GT / predicted / missing counts
 xtick_labels = [
    f"{name}\nGT={n_gt}, predicted={n_pred}, missing={n_miss}"
    for name, n_gt, n_pred, n_miss in zip(
        system_labels, total_gt_counts, predicted_counts, missing_prediction_counts
    )
 ]
 # --- Plot ---
 # One box per functional system showing the distribution of per-case
 # continuous accuracy, with the mean printed above each box.
 fig, ax = plt.subplots(figsize=(16, 8))
 # Tick labels are attached through the axis below instead of boxplot's
 # `labels=` keyword (deprecated since Matplotlib 3.9 in favour of
 # `tick_labels=`), so this runs unchanged on old and new releases.
 # `vert=True` is the default orientation and is omitted for the same reason.
 bp = ax.boxplot(
    boxplot_data,
    patch_artist=True,
    showmeans=True,
    meanline=False,
    widths=0.55
 )
 # boxplot places box i at x = i (1-based), so the labels go on 1..N.
 ax.set_xticks(np.arange(1, len(xtick_labels) + 1))
 ax.set_xticklabels(xtick_labels, rotation=45, ha='right')
 # --- Styling ---
 box_face = '#D6EAF8'
 box_edge = '#2980B9'
 whisker_col = '#7F8C8D'
 median_col = '#C0392B'
 mean_col = '#1ABC9C'
 flier_face = '#95A5A6'
 flier_edge = '#7F8C8D'
 for box in bp['boxes']:
    box.set(facecolor=box_face, edgecolor=box_edge, linewidth=1.5)
 for whisker in bp['whiskers']:
    whisker.set(color=whisker_col, linewidth=1.2)
 for cap in bp['caps']:
    cap.set(color=whisker_col, linewidth=1.2)
 for median in bp['medians']:
    median.set(color=median_col, linewidth=2)
 for mean in bp['means']:
    mean.set(
        marker='o',
        markerfacecolor=mean_col,
        markeredgecolor='black',
        markersize=6
    )
 for flier in bp['fliers']:
    flier.set(
        marker='o',
        markerfacecolor=flier_face,
        markeredgecolor=flier_edge,
        alpha=0.6,
        markersize=4
    )
 # Mean accuracy label above each box (x positions are 1-based)
 for i, acc in enumerate(mean_accuracies, start=1):
    ax.text(
        i,
        1.03,
        f"{acc:.1%}",
        ha='center',
        va='bottom',
        fontsize=9,
        fontweight='bold'
    )
 # Perfect accuracy reference line
 ax.axhline(1, color='black', linewidth=1.2, linestyle='--', alpha=0.7)
 # Labels and formatting
 ax.set_xlabel('Functional System', fontsize=11, fontweight='bold')
 ax.set_ylabel('Continuous Accuracy', fontsize=11, fontweight='bold')
 ax.set_ylim(-0.05, 1.10)
 yticks = np.arange(0, 1.01, 0.1)
 ax.set_yticks(yticks)
 # Percent formatting rounds; int(y * 100) would truncate and is only right
 # as long as every float tick happens to sit at or above its nominal value.
 ax.set_yticklabels([f"{y:.0%}" for y in yticks])
 ax.yaxis.grid(True, linestyle='--', alpha=0.3)
 # Keep the grid behind the boxes, as the other plots in this file do.
 ax.set_axisbelow(True)
 for spine in ['top', 'right']:
    ax.spines[spine].set_visible(False)
 # Legend (built from proxy artists so it matches the custom styling)
 legend_handles = [
    Patch(facecolor=box_face, edgecolor=box_edge, label='IQR of continuous accuracy'),
    Line2D([0], [0], color=median_col, lw=2, label='Median'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor=mean_col,
           markeredgecolor='black', markersize=7, label='Mean'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor=flier_face,
           markeredgecolor=flier_edge, alpha=0.8, markersize=6, label='Outlier'),
    Line2D([0], [0], color='black', lw=1.2, linestyle='--', label='Perfect accuracy')
 ]
 ax.legend(
    handles=legend_handles,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.06),
    ncol=5,
    frameon=False
 )
 # Leave the top 12% of the figure free for the legend placed above the axes.
 plt.tight_layout(rect=[0, 0, 1, 0.88])
 os.makedirs(os.path.dirname(figure_save_path), exist_ok=True)
 plt.savefig(figure_save_path, format='svg', bbox_inches='tight')
 plt.show()
 ##
 # %% Functional Systems + EDSS Continuous Accuracy Boxplot
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 import numpy as np
 from matplotlib.patches import Patch
 from matplotlib.lines import Line2D
 # --- Configuration ---
 # Where the rendered figure is written (SVG).
 figure_save_path = 'project/visuals/functional_systems_edss_continuous_accuracy_boxplot.svg'
 # Default font family for everything matplotlib draws in this cell.
 plt.rcParams.update({'font.family': 'Arial'})
 # --- Functional systems + EDSS: ground-truth column -> LLM result column ---
 # Kept as a list of (GT column, result column) tuples for the loops below.
 functional_systems_to_plot = list({
    'GT.VISUAL_OPTIC_FUNCTIONS': 'result.VISUAL OPTIC FUNCTIONS',
    'GT.CEREBELLAR_FUNCTIONS': 'result.CEREBELLAR FUNCTIONS',
    'GT.BRAINSTEM_FUNCTIONS': 'result.BRAINSTEM FUNCTIONS',
    'GT.SENSORY_FUNCTIONS': 'result.SENSORY FUNCTIONS',
    'GT.PYRAMIDAL_FUNCTIONS': 'result.PYRAMIDAL FUNCTIONS',
    'GT.AMBULATION': 'result.AMBULATION',
    'GT.CEREBRAL_FUNCTIONS': 'result.CEREBRAL FUNCTIONS',
    'GT.BOWEL_AND_BLADDER_FUNCTIONS': 'result.BOWEL AND BLADDER FUNCTIONS',
    # EDSS
    'GT.EDSS': 'result.EDSS',
 }.items())
 # --- Robust parser ---
 def safe_parse(s):
    """Coerce a raw cell value to ``float``, returning NaN when impossible.

    Missing values (as detected by ``pd.isna``) map to NaN, numeric inputs
    are converted directly, and anything else is treated as text in which a
    decimal comma is accepted (``'3,5'`` -> ``3.5``). Blank or unparseable
    text yields NaN instead of raising.
    """
    if pd.isna(s):
        return np.nan
    numeric_types = (int, float, np.integer, np.floating)
    if isinstance(s, numeric_types):
        return float(s)
    # Normalise the decimal separator before handing the text to float().
    text = str(s).replace(',', '.').strip()
    try:
        return float(text) if text else np.nan
    except ValueError:
        return np.nan
 # --- Build accuracy data ---
 # Parallel lists with one entry per system (functional systems and EDSS)
 # that could be evaluated; the plotting code reads them position by position.
 boxplot_data, system_labels = [], []
 predicted_counts, missing_prediction_counts = [], []
 total_gt_counts, mean_accuracies = [], []
 for gt_col, res_col in functional_systems_to_plot:
    # Both columns have to exist in df, otherwise the system is skipped.
    if gt_col not in df.columns:
        print(f"Skipping {gt_col}: GT column not found")
        continue
    if res_col not in df.columns:
        print(f"Skipping {res_col}: result column not found")
        continue
    system_name = gt_col.split('.')[1]
    truth = df[gt_col].apply(safe_parse)
    prediction = df[res_col].apply(safe_parse)
    # Rows without a ground-truth value never take part in the evaluation.
    has_truth = truth.notna()
    total_gt = has_truth.sum()
    if total_gt == 0:
        print(f"Skipping {system_name}: no ground-truth values")
        continue
    # Count cases where GT exists but LLM prediction is missing
    missing_count = (has_truth & prediction.isna()).sum()
    # Only rows carrying both values contribute to the boxplot.
    scorable = has_truth & prediction.notna()
    if not scorable.any():
        print(f"Skipping {system_name}: no predicted values")
        continue
    gt_eval = truth[scorable]
    res_eval = prediction[scorable]
    # Functional systems are usually scored 0-5.
    # EDSS is usually scored 0-10.
    is_edss = system_name == "EDSS"
    score_range = 10 if is_edss else 5
    clean_name = "EDSS" if is_edss else system_name.replace('_', ' ').title()
    # Continuous accuracy = 1 - |error| / score_range, kept within [0, 1]:
    # exact match = 1.0, off by 1 point in FS = 0.8, off by 1 point in EDSS = 0.9
    abs_error = (res_eval - gt_eval).abs()
    accuracy = (1 - abs_error / score_range).clip(lower=0, upper=1)
    mean_accuracy = accuracy.mean()
    n_predicted = len(gt_eval)
    boxplot_data.append(accuracy.values)
    system_labels.append(clean_name)
    predicted_counts.append(n_predicted)
    missing_prediction_counts.append(missing_count)
    total_gt_counts.append(total_gt)
    mean_accuracies.append(mean_accuracy)
    print(
        f"{clean_name}: GT={total_gt}, predicted={n_predicted}, "
        f"missing={missing_count}, mean accuracy={mean_accuracy:.1%}"
    )
 if not boxplot_data:
    raise ValueError("No valid accuracy data available for plotting.")
 # --- X-axis labels: system name plus its GT / predicted / missing counts ---
 xtick_labels = [
    f"{name}\nGT={n_gt}, predicted={n_pred}, missing={n_miss}"
    for name, n_gt, n_pred, n_miss in zip(
        system_labels, total_gt_counts, predicted_counts, missing_prediction_counts
    )
 ]
 # --- Plot ---
 # One box per functional system / EDSS showing the distribution of per-case
 # continuous accuracy, with the mean printed above each box.
 fig, ax = plt.subplots(figsize=(17, 8))
 # Tick labels are attached through the axis below instead of boxplot's
 # `labels=` keyword (deprecated since Matplotlib 3.9 in favour of
 # `tick_labels=`), so this runs unchanged on old and new releases.
 # `vert=True` is the default orientation and is omitted for the same reason.
 bp = ax.boxplot(
    boxplot_data,
    patch_artist=True,
    showmeans=True,
    meanline=False,
    widths=0.55
 )
 # boxplot places box i at x = i (1-based), so the labels go on 1..N.
 ax.set_xticks(np.arange(1, len(xtick_labels) + 1))
 ax.set_xticklabels(xtick_labels, rotation=45, ha='right')
 # --- Styling ---
 box_face = '#D6EAF8'
 box_edge = '#2980B9'
 whisker_col = '#7F8C8D'
 median_col = '#C0392B'
 mean_col = '#1ABC9C'
 flier_face = '#95A5A6'
 flier_edge = '#7F8C8D'
 for box in bp['boxes']:
    box.set(
        facecolor=box_face,
        edgecolor=box_edge,
        linewidth=1.5
    )
 for whisker in bp['whiskers']:
    whisker.set(
        color=whisker_col,
        linewidth=1.2
    )
 for cap in bp['caps']:
    cap.set(
        color=whisker_col,
        linewidth=1.2
    )
 for median in bp['medians']:
    median.set(
        color=median_col,
        linewidth=2
    )
 for mean in bp['means']:
    mean.set(
        marker='o',
        markerfacecolor=mean_col,
        markeredgecolor='black',
        markersize=6
    )
 for flier in bp['fliers']:
    flier.set(
        marker='o',
        markerfacecolor=flier_face,
        markeredgecolor=flier_edge,
        alpha=0.6,
        markersize=4
    )
 # --- Mean accuracy labels above each box (x positions are 1-based) ---
 for i, acc in enumerate(mean_accuracies, start=1):
    ax.text(
        i,
        1.03,
        f"{acc:.1%}",
        ha='center',
        va='bottom',
        fontsize=9,
        fontweight='bold'
    )
 # --- Perfect accuracy reference line ---
 ax.axhline(
    1,
    color='black',
    linewidth=1.2,
    linestyle='--',
    alpha=0.7
 )
 # --- Labels and formatting ---
 ax.set_xlabel(
    'Functional System / EDSS',
    fontsize=11,
    fontweight='bold'
 )
 ax.set_ylabel(
    'Continuous Accuracy',
    fontsize=11,
    fontweight='bold'
 )
 #ax.set_title(
 #    'Continuous Accuracy of Functional Systems and EDSS',
 #    fontsize=14,
 #    fontweight='bold',
 #    pad=35
 #)
 ax.set_ylim(-0.05, 1.10)
 yticks = np.arange(0, 1.01, 0.1)
 ax.set_yticks(yticks)
 # Percent formatting rounds; int(y * 100) would truncate and is only right
 # as long as every float tick happens to sit at or above its nominal value.
 ax.set_yticklabels([f"{y:.0%}" for y in yticks])
 ax.yaxis.grid(True, linestyle='--', alpha=0.3)
 ax.set_axisbelow(True)
 for spine in ['top', 'right']:
    ax.spines[spine].set_visible(False)
 # --- Legend (proxy artists so it matches the custom styling) ---
 legend_handles = [
    Patch(
        facecolor=box_face,
        edgecolor=box_edge,
        label='IQR of continuous accuracy'
    ),
    Line2D(
        [0], [0],
        color=median_col,
        lw=2,
        label='Median'
    ),
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=mean_col,
        markeredgecolor='black',
        markersize=7,
        label='Mean'
    ),
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=flier_face,
        markeredgecolor=flier_edge,
        alpha=0.8,
        markersize=6,
        label='Outlier'
    ),
    Line2D(
        [0], [0],
        color='black',
        lw=1.2,
        linestyle='--',
        label='Perfect accuracy'
    )
 ]
 ax.legend(
    handles=legend_handles,
    loc='lower center',
    bbox_to_anchor=(0.5, 1.08),
    ncol=5,
    frameon=False
 )
 # --- Save and show ---
 # Leave the top 14% of the figure free for the legend placed above the axes.
 plt.tight_layout(rect=[0, 0, 1, 0.86])
 os.makedirs(os.path.dirname(figure_save_path), exist_ok=True)
 plt.savefig(figure_save_path, format='svg', bbox_inches='tight')
 plt.show()
 ##
 # %% Functional Systems + EDSS Error Category Stacked Bar Plot
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 import numpy as np
 from matplotlib.patches import Patch
 # --- Configuration ---
 # Where the rendered figure is written (SVG).
 figure_save_path = 'project/visuals/functional_systems_edss_error_categories.svg'
 # Default font family for everything matplotlib draws in this cell.
 plt.rcParams.update({'font.family': 'Arial'})
 # --- Functional systems + EDSS: ground-truth column -> LLM result column ---
 # Kept as a list of (GT column, result column) tuples for the loop below.
 systems_to_plot = list({
    'GT.VISUAL_OPTIC_FUNCTIONS': 'result.VISUAL OPTIC FUNCTIONS',
    'GT.CEREBELLAR_FUNCTIONS': 'result.CEREBELLAR FUNCTIONS',
    'GT.BRAINSTEM_FUNCTIONS': 'result.BRAINSTEM FUNCTIONS',
    'GT.SENSORY_FUNCTIONS': 'result.SENSORY FUNCTIONS',
    'GT.PYRAMIDAL_FUNCTIONS': 'result.PYRAMIDAL FUNCTIONS',
    'GT.AMBULATION': 'result.AMBULATION',
    'GT.CEREBRAL_FUNCTIONS': 'result.CEREBRAL FUNCTIONS',
    'GT.BOWEL_AND_BLADDER_FUNCTIONS': 'result.BOWEL AND BLADDER FUNCTIONS',
    'GT.EDSS': 'result.EDSS',
 }.items())
 # --- Robust parser ---
 def safe_parse(s):
    """Coerce a raw cell value to ``float``, returning NaN when impossible.

    Missing values (as detected by ``pd.isna``) map to NaN, numeric inputs
    are converted directly, and anything else is treated as text in which a
    decimal comma is accepted (``'3,5'`` -> ``3.5``). Blank or unparseable
    text yields NaN instead of raising.
    """
    if pd.isna(s):
        return np.nan
    numeric_types = (int, float, np.integer, np.floating)
    if isinstance(s, numeric_types):
        return float(s)
    # Normalise the decimal separator before handing the text to float().
    text = str(s).replace(',', '.').strip()
    try:
        return float(text) if text else np.nan
    except ValueError:
        return np.nan
 # --- Categorize absolute error ---
 def categorize_error(abs_error, tol=1e-9):
    """Map an absolute prediction error to its error-category label.

    Args:
        abs_error: Absolute difference between prediction and ground truth;
            expected to be a non-negative number.
        tol: Slack applied to every boundary so that floating-point residue
            (an error of 1e-12, or 0.5000000000001) is binned with the value
            it was meant to be instead of slipping into the next category.

    Returns:
        One of "Exact", "≤0.5 error", "≤1 error" or ">1 error".
    """
    # An exact `== 0` would label a numerically-zero difference as an error.
    if abs(abs_error) <= tol:
        return "Exact"
    if abs_error <= 0.5 + tol:
        return "≤0.5 error"
    if abs_error <= 1 + tol:
        return "≤1 error"
    return ">1 error"
 # --- Prepare data ---
 # One record per case that has a ground-truth value: the system it belongs
 # to and the error category of the LLM prediction ("Missing" if absent).
 rows = []
 for gt_col, res_col in systems_to_plot:
    if gt_col not in df.columns:
        print(f"Skipping {gt_col}: GT column not found")
        continue
    if res_col not in df.columns:
        print(f"Skipping {res_col}: result column not found")
        continue
    system_name = gt_col.split('.')[1]
    if system_name == "EDSS":
        clean_name = "EDSS"
    else:
        clean_name = system_name.replace("_", " ").title()
    gt = df[gt_col].apply(safe_parse)
    res = df[res_col].apply(safe_parse)
    # Evaluate only cases where ground truth exists
    gt_exists = gt.notna()
    gt_valid = gt[gt_exists]
    res_valid = res[gt_exists]
    if gt_valid.empty:
        continue
    rows.extend(
        {
            "system": clean_name,
            "category": (
                "Missing" if pd.isna(res_value)
                else categorize_error(abs(res_value - gt_value))
            ),
        }
        for gt_value, res_value in zip(gt_valid, res_valid)
    )
 plot_df = pd.DataFrame(rows)
 if plot_df.empty:
    raise ValueError("No valid data available for plotting.")
 category_order = [
    "Exact",
    "≤0.5 error",
    "≤1 error",
    ">1 error",
    "Missing"
 ]
 system_order = [
    "Visual Optic Functions",
    "Cerebellar Functions",
    "Brainstem Functions",
    "Sensory Functions",
    "Pyramidal Functions",
    "Ambulation",
    "Cerebral Functions",
    "Bowel And Bladder Functions",
    "EDSS"
 ]
 # Keep only systems that actually produced records, in display order.
 # Reindexing the rows to the full system_order first and then adding absent
 # category columns with fill_value=0 would leave skipped systems with a
 # NaN/0 mix that dropna(how="all") cannot remove, which ends up as NaN
 # percentages and bogus "n=0" rows in the figure.
 systems_with_data = set(plot_df["system"])
 available_systems = [name for name in system_order if name in systems_with_data]
 counts = (
    plot_df
    .groupby(["system", "category"])
    .size()
    .unstack(fill_value=0)
    .reindex(index=available_systems, columns=category_order, fill_value=0)
 )
 # Convert to percentages for easier comparison; every remaining row has at
 # least one record, so the row totals are never zero.
 percentages = counts.div(counts.sum(axis=1), axis=0) * 100
 # --- Plot ---
 # Horizontal 100%-stacked bars: one bar per system, one segment per category.
 fig, ax = plt.subplots(figsize=(13, 7))
 # Fill colour of each error category's segment.
 colors = {
    "Exact": "#2ECC71",
    "≤0.5 error": "#A9DFBF",
    "≤1 error": "#F9E79F",
    ">1 error": "#E67E22",
    "Missing": "#E74C3C"
 }
 # Running left edge (in percent) of the next segment, per bar.
 left = np.zeros(len(percentages))
 for category in category_order:
    values = percentages[category].values
    ax.barh(
        percentages.index, values,
        left=left, color=colors[category],
        edgecolor="white", linewidth=0.8, label=category
    )
    # Write the percentage into the segment, but only when it is wide enough
    for row, (start, width) in enumerate(zip(left, values)):
        if width >= 4:
            ax.text(
                start + width / 2, row, f"{width:.1f}%",
                ha="center", va="center", fontsize=8, fontweight="bold"
            )
    left = left + values
 # Total n and missing count to the right of every bar
 for row, system in enumerate(percentages.index):
    system_counts = counts.loc[system]
    total_n = int(system_counts.sum())
    missing_n = int(system_counts["Missing"])
    ax.text(
        101, row, f"n={total_n}, missing={missing_n}",
        va="center", ha="left", fontsize=9
    )
 # --- Formatting ---
 # The x-axis runs past 100 to make room for the "n=..." annotations.
 ax.set_xlim(0, 115)
 ax.set_xlabel("Percentage of Cases", fontsize=11, fontweight="bold")
 ax.set_ylabel("Functional System / EDSS", fontsize=11, fontweight="bold")
 #ax.set_title(
 #    "Prediction Error Categories by Functional System and EDSS",
 #    fontsize=14,
 #    fontweight="bold",
 #    pad=20
 #)
 percent_ticks = np.arange(0, 101, 10)
 ax.set_xticks(percent_ticks)
 ax.set_xticklabels([f"{tick}%" for tick in percent_ticks])
 ax.xaxis.grid(True, linestyle="--", alpha=0.3)
 ax.set_axisbelow(True)
 for side in ("top", "right", "left"):
    ax.spines[side].set_visible(False)
 ax.legend(
    loc="lower center", bbox_to_anchor=(0.5, 1.02),
    ncol=5, frameon=False
 )
 plt.tight_layout(rect=[0, 0, 1, 0.92])
 os.makedirs(os.path.dirname(figure_save_path), exist_ok=True)
 plt.savefig(figure_save_path, format="svg", bbox_inches="tight")
 plt.show()
 ##