updated plot certainty

This commit is contained in:
2026-02-13 09:22:53 +01:00
parent 8e4a43c557
commit f1d22b28ad

View File

@@ -610,6 +610,88 @@ def plot_certainty_vs_accuracy_by_category(json_dir_path, ground_truth_path):
# %% Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
def natural_key(string_):
    """Sort key for natural ordering: embedded digit runs compare numerically,
    so 'iter10' sorts after 'iter9' instead of after 'iter1'."""
    pieces = re.split(r'(\d+)', string_)
    return [int(piece) if piece.isdigit() else piece for piece in pieces]
def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Boxplot comparing the ground-truth EDSS distribution against each
    LLM iteration's predicted distribution.

    Parameters
    ----------
    json_dir_path : str
        Directory with one JSON file per iteration; each entry is expected
        to carry "success" and a "result" dict with an "EDSS" value.
    ground_truth_path : str
        Semicolon-separated CSV with an 'EDSS' column.
    """
    # 1. Ground truth: coerce to numeric, drop unparseable/missing scores.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(df_gt['EDSS'], errors='coerce').dropna().tolist()
    # 2. Iterations, in natural filename order so "10" sorts after "9".
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))
    json_files.sort(key=natural_key)
    plot_data = [gt_values]
    labels = ['Ground Truth']
    for i, file_path in enumerate(json_files):
        iteration_values = []
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if entry.get("success"):
                raw = entry["result"].get("EDSS")
                if raw is None:
                    continue
                # FIX: coerce instead of calling float() directly — a
                # non-numeric EDSS string previously raised ValueError here.
                # This matches the pd.to_numeric(errors='coerce') pattern
                # used by the other loaders in this file.
                val = pd.to_numeric(raw, errors='coerce')
                if pd.notna(val):
                    iteration_values.append(float(val))
        plot_data.append(iteration_values)
        labels.append(f"Iter {i+1}")
    # 3. Plotting configuration
    plt.figure(figsize=(14, 8))
    gt_color = '#ff9999'    # Soft Red
    iter_color = '#66b3ff'  # Soft Blue
    bplot = plt.boxplot(plot_data, labels=labels, patch_artist=True,
                        notch=False,
                        medianprops={'color': 'black', 'linewidth': 2},
                        flierprops={'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5, 'alpha': 0.5},
                        showmeans=True,
                        meanprops={"marker": "D", "markerfacecolor": "white", "markeredgecolor": "black", "markersize": 6})
    # 4. Fill boxes: ground truth red, every iteration blue.
    colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)
    # 5. Manual legend — boxplot artists don't register legend entries
    #    on their own.
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label='LLM Iterations (1-10)'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")
    # Formatting
    plt.title('Distribution of EDSS Scores: Ground Truth vs. 10 LLM Iterations', fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
##
# %% Audit
@@ -674,6 +756,464 @@ def audit_matches(json_dir_path, ground_truth_path):
##
# %% Confidence accuracy correlation
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
def categorize_edss(value):
    """Map a numeric EDSS score onto one-point clinical bands.

    Returns '0-1' ... '9-10' for scores up to 10, '10+' above that, and
    NaN for a missing input. Fixed to agree with the identical helpers
    defined later in this file: the original chain skipped the '5-6' band
    and shifted every label above 5 by one (e.g. 5.5 -> '6-7',
    10.0 -> '10+'), which broke the category orderings used by the plots.
    """
    if pd.isna(value): return np.nan
    elif value <= 1.0: return '0-1'
    elif value <= 2.0: return '1-2'
    elif value <= 3.0: return '2-3'
    elif value <= 4.0: return '3-4'
    elif value <= 5.0: return '4-5'
    elif value <= 6.0: return '5-6'
    elif value <= 7.0: return '6-7'
    elif value <= 8.0: return '7-8'
    elif value <= 9.0: return '8-9'
    elif value <= 10.0: return '9-10'
    else: return '10+'
def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Calibration plot: actual category accuracy per LLM-confidence bin,
    with a reference line at each bin's midpoint confidence.

    Parameters
    ----------
    json_dir_path : str
        Directory of per-iteration JSON prediction files.
    ground_truth_path : str
        Semicolon-separated CSV with 'unique_id', 'MedDatum' and 'EDSS'
        columns.
    """
    # 1. Load ground truth; merge keys are normalized (strip + lowercase)
    #    so the join below tolerates case/whitespace differences.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)
    # 2. Load predictions. FIX: coerce EDSS and certainty to numeric like
    #    the other loaders in this file — a string value previously leaked
    #    through and could break categorize_edss (str <= float comparison)
    #    or pd.cut.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    'unique_id': str(res.get('unique_id')).strip().lower(),
                    'MedDatum': str(res.get('MedDatum')).strip().lower(),
                    'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce')),
                    'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                })
    df_pred = pd.DataFrame(all_preds)
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner')
    # Correctness = exact category match between prediction and ground truth.
    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)
    # 3. Confidence bins (0-60, 60-70, 70-80, 80-90, 90-100)
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)
    # Average accuracy per bin. observed=False keeps empty bins in the
    # table (and pins the behavior against pandas' changing default for
    # categorical groupers).
    calibration_stats = (df_merged.groupby('conf_bin', observed=False)['is_correct']
                         .agg(['mean', 'count']).reset_index())
    # 4. Plotting
    plt.figure(figsize=(10, 6))
    # Bar chart for actual accuracy (NaN heights of empty bins draw nothing)
    bars = plt.bar(calibration_stats['conf_bin'], calibration_stats['mean'],
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')
    # "Perfect calibration" reference: if confidence is 95%, accuracy
    # should be 0.95; values are the bin midpoints.
    expected_x = np.arange(len(labels))
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]
    plt.plot(expected_x, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')
    # 5. Annotate each bar with accuracy and sample size. FIX: skip empty
    #    bins — their mean is NaN and the label previously rendered as
    #    "Acc: nan%".
    for i, bar in enumerate(bars):
        count = calibration_stats.loc[i, 'count']
        if count == 0:
            continue
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 f'Acc: {yval:.1%}\n(n={count})', ha='center', va='bottom', fontsize=9)
    # Legend and Labels
    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.legend(loc='upper left', frameon=True, shadow=True)
    plt.tight_layout()
    plt.show()
##
# %% Confidence comparison
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Overlay LLM-predicted EDSS distributions (boxes) and clinician
    ground-truth scores (dots) for each confidence bin.

    Parameters
    ----------
    json_dir_path : str
        Directory of per-iteration JSON prediction files.
    ground_truth_path : str
        Semicolon-separated CSV with 'unique_id', 'MedDatum' and 'EDSS'
        columns.
    """
    # 1. Load Ground Truth; merge keys are normalized (strip + lowercase)
    #    so the join below tolerates case/whitespace differences.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')
    # 2. Load Predictions from all JSONs. Best-effort: a malformed file is
    #    reported and skipped; entries appended before the failure are kept.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            # coerce: unparseable numbers become NaN and are
                            # dropped below rather than raising here
                            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                        })
            except Exception as e:
                print(f"Skipping {file_path}: {e}")
    df_pred = pd.DataFrame(all_preds)
    # 3. Merge and Clean: inner join keeps only predictions with a matching
    #    ground-truth row; rows missing any plotted value are removed.
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner')
    df_plot = df_merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()
    # 4. Bin Confidence (X-Axis Categories)
    # We group confidence into bins to create a readable boxplot
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bins, labels=labels)
    # 5. Plotting (draw order matters: boxes first, dots layered on top)
    plt.figure(figsize=(14, 8))
    # A. Boxplot: Shows the distribution of LLM PREDICTIONS
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred',
                color='#3498db', width=0.5, showfliers=False,
                boxprops=dict(alpha=0.4, edgecolor='navy'))
    # B. Stripplot (Dots): Shows individual GROUND TRUTH scores
    # We add jitter so dots don't hide each other
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt',
                  color='#e74c3c', alpha=0.4, jitter=0.2, size=5)
    # 6. Manual legend — the two seaborn layers don't register their own
    #    legend entries.
    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w', label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend")
    # Final Labels
    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()
##
# %% EDSS vs Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
def categorize_edss(value):
    """Bucket a numeric EDSS score into one-point clinical bands.

    Scores up to 10 map to '0-1' ... '9-10'; anything above 10 maps to
    '10+'; a missing value is passed through as NaN.
    """
    if pd.isna(value):
        return np.nan
    bands = [(1.0, '0-1'), (2.0, '1-2'), (3.0, '2-3'), (4.0, '3-4'),
             (5.0, '4-5'), (6.0, '5-6'), (7.0, '6-7'), (8.0, '7-8'),
             (9.0, '8-9'), (10.0, '9-10')]
    for upper_bound, label in bands:
        if value <= upper_bound:
            return label
    return '10+'
def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Boxplot of LLM confidence scores grouped by predicted EDSS category,
    with every individual prediction overlaid as a jittered dot.

    Parameters
    ----------
    json_dir_path : str
        Directory of per-iteration JSON prediction files.
    """
    # Gather (category, confidence) pairs from every iteration file,
    # silently skipping entries with unparseable numbers.
    records = []
    for path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        for item in payload:
            if not item.get("success"):
                continue
            result = item["result"]
            score = pd.to_numeric(result.get('EDSS'), errors='coerce')
            certainty = pd.to_numeric(result.get('certainty_percent'), errors='coerce')
            if pd.isna(score) or pd.isna(certainty):
                continue
            records.append({'edss_cat': categorize_edss(score),
                            'confidence': certainty})
    df = pd.DataFrame(records)
    # Pin the clinical category order on the x-axis.
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    df['edss_cat'] = pd.Categorical(df['edss_cat'], categories=cat_order, ordered=True)
    # Draw the boxes first, then layer the raw points on top.
    plt.figure(figsize=(14, 8))
    sns.boxplot(data=df, x='edss_cat', y='confidence',
                palette="Blues", width=0.6, showfliers=False)
    sns.stripplot(data=df, x='edss_cat', y='confidence',
                  color='black', alpha=0.15, jitter=0.2, size=3)
    # Manual legend describing the two layers.
    legend_elements = [
        Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                   markersize=4, alpha=0.4, label='Individual Predictions')
    ]
    plt.legend(handles=legend_elements, loc='lower left', frameon=True)
    plt.title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    plt.xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    plt.ylabel('Confidence Score (%)', fontsize=12)
    plt.ylim(0, 105)
    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()
##
# %% Correlation Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import cohen_kappa_score
def categorize_edss(value):
    """Standardized clinical categorization of an EDSS score.

    Values up to 10 fall into one-point bands '0-1' ... '9-10'; values
    above 10 return '10+'; a missing value is passed through as NaN.
    """
    if pd.isna(value):
        return np.nan
    if value > 10.0:
        return '10+'
    bands = ['0-1', '1-2', '2-3', '3-4', '4-5',
             '5-6', '6-7', '7-8', '8-9', '9-10']
    # Values in (i, i+1] map to bands[i]; everything <= 1.0 (including 0
    # and negatives) lands in the first band, exactly as the original
    # if/elif chain behaved.
    index = max(int(np.ceil(value)) - 1, 0)
    return bands[index]
def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Agreement plot: distribution of LLM-predicted EDSS categories per
    ground-truth category, annotated with a linearly weighted Cohen's kappa.

    Parameters
    ----------
    json_dir_path : str
        Directory of per-iteration JSON prediction files.
    ground_truth_path : str
        Semicolon-separated CSV with 'unique_id', 'MedDatum' and 'EDSS'
        columns.
    """
    # 1. Load Ground Truth; merge keys are normalized (strip + lowercase)
    #    so the join below tolerates case/whitespace differences.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)
    # 2. Load Predictions; unparseable EDSS values coerce to NaN and are
    #    removed by the dropna() after the merge.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
                    })
    df_pred = pd.DataFrame(all_preds)
    # 3. Merge; dropna() also removes rows whose category could not be mapped
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()
    # 4. Set Order and Numeric Mapping for Plotting: the y-axis needs
    #    numeric positions, the x-axis uses the category strings directly
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}
    df_merged['gt_idx'] = df_merged['gt_cat'].map(cat_map)
    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map)
    # Cohen's Kappa with linear weights: near-miss categories are penalized
    # less than distant disagreements
    kappa = cohen_kappa_score(df_merged['gt_cat'], df_merged['pred_cat'], weights='linear')
    # 5. Plotting
    plt.figure(figsize=(14, 8))
    # BOXPLOT: Distribution of Predicted Categories relative to Ground Truth
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx',
                palette="rocket", width=0.6, showfliers=False, boxprops=dict(alpha=0.5))
    # STRIPPLOT: Individual counts
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx',
                  color='black', alpha=0.1, jitter=0.3, size=4)
    # DIAGONAL REFERENCE: Perfect category match.
    # NOTE(review): assumes all 10 categories appear on the x-axis; if some
    # are absent from the data, x positions and this diagonal may not line
    # up — confirm against real data.
    plt.plot([0, 9], [0, 9], color='red', linestyle='--', linewidth=2)
    # 6. Formatting Legend & Axes
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)
    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=4, alpha=0.3, label='Iteration Matches'),
        # invisible patch used to surface the kappa value inside the legend
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Agreement Metrics")
    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)
    plt.tight_layout()
    plt.show()
##
# %% rainplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    """Violin plot of the signed prediction error (predicted minus
    ground-truth EDSS) for each LLM-confidence group.

    Parameters
    ----------
    json_dir_path : str
        Directory of per-iteration JSON prediction files.
    ground_truth_path : str
        Semicolon-separated CSV with 'unique_id', 'MedDatum' and 'EDSS'
        columns.
    """
    # 1. Load Ground Truth; merge keys are normalized (strip + lowercase)
    #    so the join below tolerates case/whitespace differences.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')
    # 2. Load Predictions; unparseable numbers coerce to NaN and are
    #    removed by the dropna() after the merge.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })
    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()
    # 3. Signed error: positive = LLM overestimates, negative = underestimates
    df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt']
    # 4. Bin Confidence into four ordered groups
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)
    # Per-group sample sizes, appended to the x-tick labels below
    counts = df_merged['conf_bin'].value_counts().reindex(labels)
    new_labels = [f"{l}\n(n={int(counts[l])})" for l in labels]
    # 5. Plotting
    plt.figure(figsize=(13, 8))
    # Sequential palette: darker blue = higher confidence group
    palette_colors = sns.color_palette("Blues", n_colors=len(labels))
    # cut=0 clips each violin's density estimate at the observed data range
    vplot = sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile",
                           palette=palette_colors, cut=0)
    # Reference line at 0 (perfect prediction)
    plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5)
    # 6. Manual legend mirroring the palette plus the symbol conventions
    legend_elements = [
        # Legend items for the color gradient
        Patch(facecolor=palette_colors[0], label='Confidence: <70%'),
        Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'),
        Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'),
        Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'),
        # Legend items for the symbols
        Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'),
        Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient")
    # Formatting
    plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Group', fontsize=12)
    plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12)
    plt.xticks(ticks=range(len(labels)), labels=new_labels)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.tight_layout()
    plt.show()
# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
##
# %% Usage
# --- Usage ---
@@ -681,7 +1221,15 @@ def audit_matches(json_dir_path, ground_truth_path):
# NOTE(review): absolute, machine-specific paths below — adjust before
# running elsewhere. Only the two uncommented calls execute; the commented
# lines are kept as ready-made usage examples for the other plots.
# '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##