# NOTE(review): This file arrived as a whitespace-mangled `git diff` of
# Data/certainty_show.py (added hunks at old lines 610 and 674). The patch
# scaffolding was unrecoverable, so the added Python content is reconstructed
# below in properly formatted form. Functions referenced only by hunk context
# (plot_certainty_vs_accuracy_by_category, audit_matches,
# plot_subcategory_analysis) live elsewhere in the full file and are not part
# of this patch.

# %% Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

def natural_key(string_):
    """Sort key that orders embedded digit runs numerically (so iter_10 follows iter_9)."""
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Draw one box for the ground-truth EDSS scores plus one box per LLM iteration JSON.

    json_dir_path     -- directory containing one *.json result file per iteration
    ground_truth_path -- semicolon-separated CSV with an 'EDSS' column
    """
    # 1. Load Ground Truth; coerce non-numeric EDSS entries to NaN and drop them
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(df_gt['EDSS'], errors='coerce').dropna().tolist()

    # 2. Load Iterations (natural sort keeps "…_10.json" after "…_9.json")
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))
    json_files.sort(key=natural_key)

    plot_data = [gt_values]
    labels = ['Ground Truth']

    for i, file_path in enumerate(json_files):
        iteration_values = []
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    val = entry["result"].get("EDSS")
                    if val is not None:
                        iteration_values.append(float(val))
        plot_data.append(iteration_values)
        labels.append(f"Iter {i+1}")

    # 3. Plotting Configuration
    plt.figure(figsize=(14, 8))

    # Define colors
    gt_color = '#ff9999'    # Soft Red
    iter_color = '#66b3ff'  # Soft Blue

    # Create the boxplot
    bplot = plt.boxplot(plot_data, labels=labels, patch_artist=True,
                        notch=False,
                        medianprops={'color': 'black', 'linewidth': 2},
                        flierprops={'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5, 'alpha': 0.5},
                        showmeans=True,
                        meanprops={"marker": "D", "markerfacecolor": "white", "markeredgecolor": "black", "markersize": 6})

    # 4. Fill boxes with colors (first box is ground truth, the rest are iterations)
    colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

    # 5. CONSTRUCT THE COMPLETE LEGEND
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label='LLM Iterations (1-10)'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]

    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Formatting
    plt.title('Distribution of EDSS Scores: Ground Truth vs. 10 LLM Iterations', fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()
##

# %% Audit
# audit_matches(json_dir_path, ground_truth_path) is defined at this point in
# the full file; it was unchanged by this patch and is not reproduced here.
##


# %% Confidence accuracy correlation
# (cell name typo fixed: was "Cinfidence")

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt

def categorize_edss(value):
    """Map a numeric EDSS score to its clinical bin label.

    BUGFIX: this copy previously skipped the '5-6' label — values in (5, 6]
    were tagged '6-7', (6, 7] were tagged '7-8', and so on up the scale, and
    the <=10 branch was missing. Now identical to the other categorize_edss
    copies in this file.
    """
    if pd.isna(value): return np.nan
    elif value <= 1.0: return '0-1'
    elif value <= 2.0: return '1-2'
    elif value <= 3.0: return '2-3'
    elif value <= 4.0: return '3-4'
    elif value <= 5.0: return '4-5'
    elif value <= 6.0: return '5-6'
    elif value <= 7.0: return '6-7'
    elif value <= 8.0: return '7-8'
    elif value <= 9.0: return '8-9'
    elif value <= 10.0: return '9-10'
    else: return '10+'

def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Bar chart of actual category accuracy per LLM-confidence bin vs. an ideal calibration line."""
    # 1. Load Ground Truth; normalize join keys for the merge below
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions from every iteration JSON (pooled)
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(res.get('EDSS')),
                        'confidence': res.get('certainty_percent')
                    })

    df_pred = pd.DataFrame(all_preds)
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner')

    # Define correctness: exact category match
    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)

    # 3. Create Confidence Bins (e.g., 0-60, 60-70, 70-80, 80-90, 90-100)
    # NOTE(review): pd.cut uses left-open intervals, so confidence == 0 falls
    # outside the first bin — confirm whether 0% confidences can occur.
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)

    # Calculate average accuracy per bin
    calibration_stats = df_merged.groupby('conf_bin')['is_correct'].agg(['mean', 'count']).reset_index()

    # 4. Plotting
    plt.figure(figsize=(10, 6))

    # Bar chart for actual accuracy
    bars = plt.bar(calibration_stats['conf_bin'], calibration_stats['mean'],
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')

    # Add the "Perfect Calibration" line
    # (If confidence is 95%, accuracy should be 0.95)
    expected_x = np.arange(len(labels))
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]  # Midpoints of the bins for visual reference
    plt.plot(expected_x, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')

    # 5. Add text labels on top of bars to show sample size (how many cases in that bin)
    for i, bar in enumerate(bars):
        yval = bar.get_height()
        count = calibration_stats.loc[i, 'count']
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 f'Acc: {yval:.1%}\n(n={count})', ha='center', va='bottom', fontsize=9)

    # Legend and Labels
    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    # Adding a clean, informative legend
    plt.legend(loc='upper left', frameon=True, shadow=True)

    plt.tight_layout()
    plt.show()
##


# %% Confidence comparison

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Boxplots of predicted EDSS per confidence bin, overlaid with ground-truth dots."""
    # 1. Load Ground Truth; normalize join keys
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions from all JSONs (malformed files are skipped, not fatal)
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                        })
            except Exception as e:
                print(f"Skipping {file_path}: {e}")

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Clean
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner')
    df_plot = df_merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()

    # 4. Bin Confidence (X-Axis Categories)
    # We group confidence into bins to create a readable boxplot
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bins, labels=labels)

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # A. Boxplot: Shows the distribution of LLM PREDICTIONS
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred',
                color='#3498db', width=0.5, showfliers=False,
                boxprops=dict(alpha=0.4, edgecolor='navy'))

    # B. Stripplot (Dots): Shows individual GROUND TRUTH scores
    # We add jitter so dots don't hide each other
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt',
                  color='#e74c3c', alpha=0.4, jitter=0.2, size=5)

    # 6. Create a CLEAR Legend
    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w', label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend")

    # Final Labels
    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
##


# %% EDSS vs Boxplot

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

def categorize_edss(value):
    """Map a numeric EDSS score to its clinical bin label (same mapping as the other copies)."""
    if pd.isna(value): return np.nan
    elif value <= 1.0: return '0-1'
    elif value <= 2.0: return '1-2'
    elif value <= 3.0: return '2-3'
    elif value <= 4.0: return '3-4'
    elif value <= 5.0: return '4-5'
    elif value <= 6.0: return '5-6'
    elif value <= 7.0: return '6-7'
    elif value <= 8.0: return '7-8'
    elif value <= 9.0: return '8-9'
    elif value <= 10.0: return '9-10'
    else: return '10+'

def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Boxplot of LLM confidence per predicted EDSS category (no ground truth needed)."""
    # 1. Load all Predictions; keep only rows where both EDSS and confidence parse
    all_preds = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    edss_val = pd.to_numeric(res.get('EDSS'), errors='coerce')
                    conf_val = pd.to_numeric(res.get('certainty_percent'), errors='coerce')

                    if not pd.isna(edss_val) and not pd.isna(conf_val):
                        all_preds.append({
                            'edss_cat': categorize_edss(edss_val),
                            'confidence': conf_val
                        })

    df = pd.DataFrame(all_preds)

    # 2. Sort categories correctly for the x-axis
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    df['edss_cat'] = pd.Categorical(df['edss_cat'], categories=cat_order, ordered=True)

    # 3. Plotting
    plt.figure(figsize=(14, 8))

    # Create Boxplot
    sns.boxplot(data=df, x='edss_cat', y='confidence',
                palette="Blues", width=0.6, showfliers=False)

    # Add Stripplot (Dots) to show density of cases
    sns.stripplot(data=df, x='edss_cat', y='confidence',
                  color='black', alpha=0.15, jitter=0.2, size=3)

    # 4. Legend and Labels
    # Since boxplot color is clear, we add a legend for the components
    legend_elements = [
        Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                   markersize=4, alpha=0.4, label='Individual Predictions')
    ]
    plt.legend(handles=legend_elements, loc='lower left', frameon=True)

    plt.title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    plt.xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    plt.ylabel('Confidence Score (%)', fontsize=12)
    plt.ylim(0, 105)
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
##


# %% Correlation Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import cohen_kappa_score

def categorize_edss(value):
    """Standardized clinical categorization."""
    if pd.isna(value): return np.nan
    elif value <= 1.0: return '0-1'
    elif value <= 2.0: return '1-2'
    elif value <= 3.0: return '2-3'
    elif value <= 4.0: return '3-4'
    elif value <= 5.0: return '4-5'
    elif value <= 6.0: return '5-6'
    elif value <= 7.0: return '6-7'
    elif value <= 8.0: return '7-8'
    elif value <= 9.0: return '8-9'
    elif value <= 10.0: return '9-10'
    else: return '10+'

def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Boxplot of predicted vs. ground-truth EDSS categories, annotated with linear weighted kappa."""
    # 1. Load Ground Truth; normalize join keys
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
                    })

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 4. Set Order and Numeric Mapping for Plotting
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}

    df_merged['gt_idx'] = df_merged['gt_cat'].map(cat_map)
    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map)

    # Calculate Cohen's Kappa (Standard for categorical agreement)
    kappa = cohen_kappa_score(df_merged['gt_cat'], df_merged['pred_cat'], weights='linear')

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # BOXPLOT: Distribution of Predicted Categories relative to Ground Truth
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx',
                palette="rocket", width=0.6, showfliers=False, boxprops=dict(alpha=0.5))

    # STRIPPLOT: Individual counts
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx',
                  color='black', alpha=0.1, jitter=0.3, size=4)

    # DIAGONAL REFERENCE: Perfect category match
    plt.plot([0, 9], [0, 9], color='red', linestyle='--', linewidth=2)

    # 6. Formatting Legend & Axes
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)

    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=4, alpha=0.3, label='Iteration Matches'),
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Agreement Metrics")

    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)

    plt.tight_layout()
    plt.show()
##


# %% rainplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    """Violin plot of the prediction delta (EDSS_pred - EDSS_gt) per confidence bin."""
    # 1. Load Ground Truth; normalize join keys
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 3. Calculate Error (positive = LLM overestimates)
    df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt']

    # 4. Bin Confidence
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)

    # Calculate counts for labels.
    # BUGFIX: reindex() leaves NaN for an empty bin, and int(NaN) raised a
    # ValueError — show n=0 for empty bins instead of crashing.
    counts = df_merged['conf_bin'].value_counts().reindex(labels)
    new_labels = [f"{l}\n(n={0 if pd.isna(counts[l]) else int(counts[l])})" for l in labels]

    # 5. Plotting
    plt.figure(figsize=(13, 8))

    # Using a sequential color palette (Light blue to Dark blue)
    palette_colors = sns.color_palette("Blues", n_colors=len(labels))

    vplot = sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile",
                           palette=palette_colors, cut=0)

    # Reference line at 0
    plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5)

    # 6. Legend with the same colors as the violin gradient
    legend_elements = [
        # Legend items for the color gradient
        Patch(facecolor=palette_colors[0], label='Confidence: <70%'),
        Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'),
        Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'),
        Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'),
        # Legend items for the symbols
        Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'),
        Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)')
    ]

    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient")

    # Formatting
    plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Group', fontsize=12)
    plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12)
    plt.xticks(ticks=range(len(labels)), labels=new_labels)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    plt.tight_layout()
    plt.show()

# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
##

# %% Usage
# --- Usage ---
#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

#plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
# NOTE(review): plot_gt_vs_llm_boxplot is not defined anywhere in this patch —
# confirm it exists in the full file before uncommenting.
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##