certainty Delta show

This commit is contained in:
2026-02-18 17:12:31 +01:00
parent f1d22b28ad
commit 424d38ad1c

View File

@@ -1214,6 +1214,307 @@ def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
##
# %% Certainty vs Delta
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_confidence_vs_abs_error_refined(json_dir_path, ground_truth_path):
    """Plot the mean absolute EDSS error per LLM confidence bracket.

    Reads the ``;``-separated ground-truth CSV (columns ``unique_id``,
    ``MedDatum`` and ``EDSS``) and every ``*.json`` file in *json_dir_path*,
    joins the successful predictions to the ground truth on
    ``(unique_id, MedDatum)``, bins them by ``certainty_percent`` and shows a
    bar chart of the mean absolute error per bin with SEM error bars and a
    linear trend line.

    Args:
        json_dir_path: Directory holding the prediction ``*.json`` files. Each
            file is read as a list of entries carrying a ``success`` flag and
            a ``result`` dict.
        ground_truth_path: Path to the ground-truth CSV.

    Raises:
        ValueError: If no successful prediction is found, or if no prediction
            can be matched to the ground truth and assigned to a bin.
    """
    # 1. Load and Merge Data
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    # Normalise the join keys so whitespace/case differences do not break the merge.
    for key in ('unique_id', 'MedDatum'):
        df_gt[key] = df_gt[key].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce'),
            })
    # An empty list would give a column-less DataFrame and an opaque KeyError in merge.
    if not all_preds:
        raise ValueError(f"No successful predictions found in {json_dir_path!r}")

    df_merged = pd.merge(
        pd.DataFrame(all_preds),
        df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
        on=['unique_id', 'MedDatum'],
        how='inner',
    ).dropna()
    if df_merged.empty:
        raise ValueError("No prediction could be matched to a ground-truth row")

    # 2. Calculate Absolute Delta
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # include_lowest=True keeps a confidence of exactly 0 in the lowest bin.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so positions, colours and legend entries stay aligned.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count'])
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    sems = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    populated = counts > 0
    if not populated.any():
        raise ValueError("No prediction has a confidence inside the 0-100 range")

    # 4. Plotting
    plt.figure(figsize=(12, 8))
    # Sequential palette: light to dark
    colors = sns.color_palette("Blues", n_colors=len(labels))
    x_idx = np.arange(len(labels))
    x_used = x_idx[populated]
    bars = plt.bar(x_used, means[populated],
                   color=[colors[i] for i in x_used],
                   edgecolor='black', linewidth=1.2)
    # Standard Error Bars
    plt.errorbar(x_used, means[populated], yerr=sems[populated],
                 fmt='none', c='black', capsize=6, elinewidth=1.5)
    # Label every bin, including empty ones, at its fixed position.
    plt.xticks(x_idx, labels)
    plt.xlim(-0.6, len(labels) - 0.4)

    # Trend Line (Linear Fit) - needs at least two populated bins.
    trend_drawn = x_used.size >= 2
    if trend_drawn:
        trend = np.poly1d(np.polyfit(x_used, means[populated], 1))
        plt.plot(x_used, trend(x_used), color="#e74c3c", linestyle="--", linewidth=2.5)

    # 5. THE COMPLETE LEGEND
    # One handle per element on the chart: bins, statistics, metric.
    legend_elements = [
        Patch(facecolor=colors[i], edgecolor='black', label=f'Bin {i + 1}: {label}')
        for i, label in enumerate(labels)
    ]
    legend_elements.append(
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10,
               label='Standard Error (SEM)'))
    if trend_drawn:
        legend_elements.append(
            Line2D([0], [0], color='#e74c3c', linestyle='--', lw=2.5,
                   label='Correlation Trend (Inverse Rel.)'))
    legend_elements.append(Patch(color='none', label='Metric: Mean Absolute Error (MAE)'))
    plt.legend(handles=legend_elements, loc='upper right', frameon=True,
               shadow=True, fontsize=10, title="Legend")

    # Final Labels & Clean-up
    plt.title('Validation: Inverse Correlation of Confidence vs. Error Magnitude', fontsize=15, pad=20)
    plt.ylabel('Mean Absolute Error (Δ EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    # Text annotations for MAE on bars
    for bar, mae_val in zip(bars, means[populated]):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05,
                 f'MAE: {mae_val:.2f}', ha='center', fontweight='bold')
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.tight_layout()
    plt.show()
##
# %% Certainty vs Delta (final thesis chart)
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_final_thesis_error_chart(json_dir_path, ground_truth_path):
    """Plot MAE per LLM confidence bracket, annotated with MAE and sample size.

    Reads the ``;``-separated ground-truth CSV (columns ``unique_id``,
    ``MedDatum`` and ``EDSS``) and every ``*.json`` file in *json_dir_path*,
    joins the successful predictions to the ground truth on
    ``(unique_id, MedDatum)``, bins them by ``certainty_percent`` and shows a
    bar chart of the mean absolute error per bin. Each bar carries its MAE and
    ``n``; SEM error bars and a linear trend line are drawn on top.

    Args:
        json_dir_path: Directory holding the prediction ``*.json`` files. Each
            file is read as a list of entries carrying a ``success`` flag and
            a ``result`` dict.
        ground_truth_path: Path to the ground-truth CSV.

    Raises:
        ValueError: If no successful prediction is found, or if no prediction
            can be matched to the ground truth and assigned to a bin.
    """
    # 1. Load Ground Truth & Predictions
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    # Normalise the join keys so whitespace/case differences do not break the merge.
    for key in ('unique_id', 'MedDatum'):
        df_gt[key] = df_gt[key].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce'),
            })
    # An empty list would give a column-less DataFrame and an opaque KeyError in merge.
    if not all_preds:
        raise ValueError(f"No successful predictions found in {json_dir_path!r}")

    df_merged = pd.merge(
        pd.DataFrame(all_preds),
        df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
        on=['unique_id', 'MedDatum'],
        how='inner',
    ).dropna()
    if df_merged.empty:
        raise ValueError("No prediction could be matched to a ground-truth row")

    # 2. Metric Calculation
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning & Stats
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # include_lowest=True keeps a confidence of exactly 0 in the lowest bin.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so positions, colours and legend entries stay aligned.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count'])
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    sems = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    populated = counts > 0
    if not populated.any():
        raise ValueError("No prediction has a confidence inside the 0-100 range")

    # 4. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))
    x_idx = np.arange(len(labels))
    x_used = x_idx[populated]
    # BARS (MAE)
    bars = plt.bar(x_used, means[populated],
                   color=[colors[i] for i in x_used],
                   edgecolor='black', alpha=0.85)
    # ERROR BARS (Standard Error of the Mean)
    plt.errorbar(x_used, means[populated], yerr=sems[populated],
                 fmt='none', c='black', capsize=8, elinewidth=1.5)
    # Label every bin, including empty ones, at its fixed position.
    plt.xticks(x_idx, labels)
    plt.xlim(-0.6, len(labels) - 0.4)

    # CORRELATION TREND LINE - needs at least two populated bins.
    trend_drawn = x_used.size >= 2
    if trend_drawn:
        trend = np.poly1d(np.polyfit(x_used, means[populated], 1))
        plt.plot(x_used, trend(x_used), color="#e74c3c", linestyle="--", linewidth=3, zorder=5)

    # 5. DATA LABELS (n and MAE)
    for bar, mae_val, n_count in zip(bars, means[populated], counts[populated]):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.04,
                 f'MAE: {mae_val:.2f}\nn={int(n_count)}',
                 ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 6. THE COMPLETE LEGEND
    legend_elements = [
        Patch(facecolor=colors[i], edgecolor='black', label=f'Bin {i + 1}: {label}')
        for i, label in enumerate(labels)
    ]
    if trend_drawn:
        legend_elements.append(
            Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3,
                   label='Correlation Trend (Inverse Relationship)'))
    legend_elements.append(
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10,
               label='Standard Error (SEM)'))
    legend_elements.append(Patch(color='none', label='Metric: Mean Absolute Error (MAE)'))
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True,
               title="Chart Components")

    # Formatting
    plt.title('Clinical Validation: LLM Certainty vs. Prediction Accuracy', fontsize=16, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, means[populated].max() + 0.6)  # Add room for labels
    plt.tight_layout()
    plt.show()
# plot_final_thesis_error_chart('jsons_folder/', 'gt.csv')
##
# %% Certainty vs Delta (single JSON file)
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_single_json_error_analysis(json_file_path, ground_truth_path):
    """Plot MAE per LLM confidence bracket for one prediction JSON file.

    Reads the ``;``-separated ground-truth CSV (columns ``unique_id``,
    ``MedDatum`` and ``EDSS``) and the single file *json_file_path*, joins the
    successful predictions to the ground truth on ``(unique_id, MedDatum)``,
    bins them by ``certainty_percent`` and shows a bar chart of the mean
    absolute error per bin. Each bar carries its MAE and ``n``; SEM error bars
    and a linear trend line are drawn on top.

    Args:
        json_file_path: Path to one prediction JSON file, read as a list of
            entries carrying a ``success`` flag and a ``result`` dict.
        ground_truth_path: Path to the ground-truth CSV.

    Raises:
        ValueError: If no successful prediction is found, or if no prediction
            can be matched to the ground truth and assigned to a bin.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    # Normalise the join keys so whitespace/case differences do not break the merge.
    for key in ('unique_id', 'MedDatum'):
        df_gt[key] = df_gt[key].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load the Specific JSON
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    all_preds = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry["result"]
        all_preds.append({
            'unique_id': str(res.get('unique_id')).strip().lower(),
            'MedDatum': str(res.get('MedDatum')).strip().lower(),
            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce'),
        })
    # An empty list would give a column-less DataFrame and an opaque KeyError in merge.
    if not all_preds:
        raise ValueError(f"No successful predictions found in {json_file_path!r}")
    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Calculate Absolute Error
    df_merged = pd.merge(
        df_pred,
        df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
        on=['unique_id', 'MedDatum'],
        how='inner',
    ).dropna()
    if df_merged.empty:
        raise ValueError("No prediction could be matched to a ground-truth row")
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 4. Binning and Statistics
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # include_lowest=True keeps a confidence of exactly 0 in the lowest bin.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so positions, colours and legend entries stay aligned.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count'])
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    sems = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    populated = counts > 0
    if not populated.any():
        raise ValueError("No prediction has a confidence inside the 0-100 range")

    # 5. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))
    x_idx = np.arange(len(labels))
    x_used = x_idx[populated]
    # BARS (MAE)
    bars = plt.bar(x_used, means[populated],
                   color=[colors[i] for i in x_used],
                   edgecolor='black', alpha=0.85)
    # ERROR BARS (SEM)
    plt.errorbar(x_used, means[populated], yerr=sems[populated],
                 fmt='none', c='black', capsize=8, elinewidth=1.5)
    # Label every bin, including empty ones, at its fixed position.
    plt.xticks(x_idx, labels)
    plt.xlim(-0.6, len(labels) - 0.4)

    # CORRELATION TREND LINE - needs at least two populated bins.
    trend_drawn = x_used.size >= 2
    if trend_drawn:
        trend = np.poly1d(np.polyfit(x_used, means[populated], 1))
        plt.plot(x_used, trend(x_used), color="#e74c3c", linestyle="--", linewidth=3, zorder=5)

    # 6. DATA LABELS (n and MAE)
    for bar, mae_val, n_count in zip(bars, means[populated], counts[populated]):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.04,
                 f'MAE: {mae_val:.2f}\nn={int(n_count)}',
                 ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 7. COMPREHENSIVE LEGEND
    legend_elements = [
        Patch(facecolor=colors[i], edgecolor='black', label=f'Bin {i + 1}: {label}')
        for i, label in enumerate(labels)
    ]
    if trend_drawn:
        legend_elements.append(
            Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Inverse Trend Line'))
    legend_elements.append(
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10,
               label='Std Error (SEM)'))
    legend_elements.append(Patch(color='none', label='Metric: Mean Absolute Error (MAE)'))
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Final Styling
    plt.title('Validation: Confidence vs. Error Magnitude (Iteration 1 Only)', fontsize=15, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, means[populated].max() + 0.6)
    plt.tight_layout()
    plt.show()
# --- RUN THE PLOT ---
# Runs at module/cell level: plots the single-JSON analysis for the
# iteration-1 results file against the ground-truth CSV.
# NOTE(review): both paths are absolute and machine-specific - adjust them
# before running on another machine.
json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"
plot_single_json_error_analysis(json_path, gt_path)
##
# %% Usage
# --- Usage ---
@@ -1231,5 +1532,9 @@ def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
# Plot selection: each call receives the iteration directory with the
# prediction JSON files and, where a second argument is given, the
# ground-truth CSV. Only the uncommented calls run.
#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
# NOTE(review): the next two lines look like the removed/added pair of a diff
# hunk (same call, active and commented out) - confirm which one should remain.
plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_refined('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_with_counts('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##