Files
EDSS-calc/Data/certainty_show.py
2026-02-23 00:42:41 +01:00

1541 lines
58 KiB
Python

# %% Explore Dist Plot
import pandas as pd
import json
import glob
import os
import re
import matplotlib.pyplot as plt
def plot_edss_distribution_per_iteration(json_dir_path):
    """Plot a stacked bar chart of predicted EDSS categories per iteration file.

    Every ``*.json`` file in ``json_dir_path`` is treated as one iteration.
    Files are ordered with a natural (digit-aware) sort and labelled
    ``Iter 1`` .. ``Iter N`` by position. Only entries whose ``success`` flag
    is truthy are counted.

    Args:
        json_dir_path: Directory containing the iteration JSON files.

    Returns:
        A DataFrame of integer counts (rows: iterations, columns: the EDSS
        categories that actually occur, in ascending order). The table is
        empty, and nothing is plotted, when no entry has a usable EDSS value.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    def categorize_edss(value):
        # Missing scores get their own bucket; it is not part of fixed_labels
        # and is therefore left out of the table and the plot.
        if pd.isna(value):
            return 'Unknown'
        for upper, label in zip(range(1, 11), fixed_labels):
            if value <= upper:
                return label
        return '10+'

    def to_number(raw):
        # A malformed score must not abort the rest of the file it came from.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return float('nan')

    def natural_key(string_):
        # Splits digit runs out so that "2" sorts before "10".
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")), key=natural_key)
    iter_order = [f"Iter {i + 1}" for i in range(len(json_files))]

    all_records = []
    for iter_label, file_path in zip(iter_order, json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and decoding errors.
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a list of entries")
            continue
        for entry in data:
            if not isinstance(entry, dict) or not entry.get("success"):
                continue
            result = entry.get("result")
            val = result.get("EDSS") if isinstance(result, dict) else None
            all_records.append({
                'Iteration': iter_label,
                'Category': categorize_edss(to_number(val)),
            })

    # Frequency table: iterations as rows, categories as columns.
    if all_records:
        df = pd.DataFrame(all_records)
        dist_table = pd.crosstab(df['Iteration'], df['Category']).reindex(iter_order)
    else:
        dist_table = pd.DataFrame(index=iter_order)

    # Keep only the known categories, in ascending order; iterations without
    # any successful entry become rows of zeros instead of NaN.
    available_labels = [l for l in fixed_labels if l in dist_table.columns]
    dist_table = dist_table[available_labels].fillna(0).astype(int)

    if dist_table.empty:
        print(f"No categorizable EDSS values found in {json_dir_path}; nothing to plot.")
        return dist_table

    dist_table.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis', edgecolor='white')
    plt.title('Distribution of Predicted EDSS Categories per Iteration', fontsize=15, pad=20)
    plt.xlabel('JSON Iteration File', fontsize=12)
    plt.ylabel('Number of Cases (Count)', fontsize=12)
    plt.xticks(rotation=0)
    # Legend sits outside the axes, to the right.
    plt.legend(title="EDSS Category", bbox_to_anchor=(1.05, 1), loc='upper left')

    # Total count above each stacked bar.
    for i, (name, row) in enumerate(dist_table.iterrows()):
        total = row.sum()
        if total > 0:
            plt.text(i, total + 2, f'Total: {int(total)}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()
    return dist_table
# Usage: runs on cell execution / import. Plots the distribution for the
# hard-coded, machine-specific iteration directory and prints the count table.
counts_table = plot_edss_distribution_per_iteration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(counts_table)
##
# %% Explore Table
import pandas as pd
import json
import glob
import os
import re
def generate_edss_distribution_csv(json_dir_path, output_filename='edss_distribution_summary.csv'):
    """Write a per-iteration count table of predicted EDSS categories to CSV.

    Every ``*.json`` file in ``json_dir_path`` is treated as one iteration.
    Files are ordered with a natural (digit-aware) sort and labelled
    ``Iter 1`` .. ``Iter N`` by position. Only entries whose ``success`` flag
    is truthy are counted.

    Args:
        json_dir_path: Directory containing the iteration JSON files.
        output_filename: Path of the CSV file to write.

    Returns:
        A DataFrame of integer counts (rows: iterations plus a final
        ``Total Sum`` row, columns: the EDSS categories that actually occur,
        in ascending order). When no entry has a usable EDSS value the table
        is empty and no file is written, so an existing summary is not
        replaced by an empty one.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    def categorize_edss(value):
        # Missing scores get their own bucket; it is not part of fixed_labels
        # and is therefore left out of the exported table.
        if pd.isna(value):
            return 'Unknown'
        for upper, label in zip(range(1, 11), fixed_labels):
            if value <= upper:
                return label
        return '10+'

    def to_number(raw):
        # A malformed score must not abort the rest of the file it came from.
        try:
            return float(raw)
        except (TypeError, ValueError):
            return float('nan')

    def natural_key(string_):
        # Splits digit runs out so that "2" sorts before "10".
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")), key=natural_key)
    iter_order = [f"Iter {i + 1}" for i in range(len(json_files))]

    all_records = []
    for iter_label, file_path in zip(iter_order, json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and decoding errors.
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a list of entries")
            continue
        for entry in data:
            if not isinstance(entry, dict) or not entry.get("success"):
                continue
            result = entry.get("result")
            val = result.get("EDSS") if isinstance(result, dict) else None
            all_records.append({
                'Iteration': iter_label,
                'Category': categorize_edss(to_number(val)),
            })

    # Frequency table: iterations as rows, categories as columns.
    if all_records:
        df = pd.DataFrame(all_records)
        dist_table = pd.crosstab(df['Iteration'], df['Category']).reindex(iter_order)
    else:
        dist_table = pd.DataFrame(index=iter_order)

    # Keep only the known categories, in ascending order; iterations without
    # any successful entry become rows of zeros instead of NaN.
    available_labels = [l for l in fixed_labels if l in dist_table.columns]
    dist_table = dist_table[available_labels].fillna(0).astype(int)

    if dist_table.empty:
        print(f"No categorizable EDSS values found in {json_dir_path}; no CSV written.")
        return dist_table

    # Column-wise totals across all iterations, appended as the last row.
    dist_table.loc['Total Sum'] = dist_table.sum()

    dist_table.to_csv(output_filename)
    print(f"Table successfully saved to: {output_filename}")
    return dist_table
# Usage: runs on cell execution / import. Builds the summary for the
# hard-coded, machine-specific iteration directory, writes it to the default
# CSV filename, and prints the resulting table.
final_table = generate_edss_distribution_csv('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(final_table)
##
# %% EDSS Confusion Matrix
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def categorize_edss(value):
    """Return the one-point-wide EDSS band label for a numeric score.

    Bands are closed on the right: a score ``<= 1.0`` is ``'0-1'``, a score in
    ``(1.0, 2.0]`` is ``'1-2'``, and so on up to ``'9-10'``. Anything above
    10 is ``'10+'``. Missing values (as judged by ``pd.isna``) yield ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    band_labels = ('0-1', '1-2', '2-3', '3-4', '4-5',
                   '5-6', '6-7', '7-8', '8-9', '9-10')
    # The upper bound of the k-th band (1-based) is simply k.
    for upper_bound, label in enumerate(band_labels, start=1):
        if value <= upper_bound:
            return label
    return '10+'
def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Show a confusion matrix of binned ground-truth vs. predicted EDSS.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag and a
            ``result`` key contribute one prediction.
        ground_truth_path: Semicolon-separated CSV providing the
            ``unique_id``, ``MedDatum`` and ``EDSS`` columns.

    Predictions from all files are pooled and inner-joined to the ground truth
    on (``unique_id``, ``MedDatum``). Rows lacking either EDSS value are
    dropped before both sides are binned with ``categorize_edss``.
    """
    join_keys = ['unique_id', 'MedDatum']

    # Ground truth: keys as strings, score as a number (unparseable -> NaN).
    truth = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        truth[key] = truth[key].astype(str)
    truth['EDSS'] = pd.to_numeric(truth['EDSS'], errors='coerce')

    # Pool the successful predictions of every JSON file.
    collected = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as handle:
            try:
                for item in json.load(handle):
                    if not (item.get("success") and "result" in item):
                        continue
                    payload = item["result"]
                    collected.append(dict(
                        unique_id=str(payload.get('unique_id')),
                        MedDatum=str(payload.get('MedDatum')),
                        edss_pred=payload.get('EDSS'),
                    ))
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    predictions = pd.DataFrame(collected)
    predictions['edss_pred'] = pd.to_numeric(predictions['edss_pred'], errors='coerce')

    # Strip stray whitespace from the join keys on both sides.
    for frame in (truth, predictions):
        for key in join_keys:
            frame[key] = frame[key].astype(str).str.strip()

    paired = truth[join_keys + ['EDSS']].merge(predictions, on=join_keys, how='inner')
    paired = paired.dropna(subset=['EDSS', 'edss_pred'])

    y_true = paired['EDSS'].apply(categorize_edss)
    y_pred = paired['edss_pred'].apply(categorize_edss)
    print(f"Verification: Total matches in Confusion Matrix: {len(paired)}")

    # A fixed label list keeps bands that never occur visible in the matrix.
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    # Rows are ground truth, columns are the LLM predictions.
    fig, ax = plt.subplots(figsize=(10, 8))
    display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)
    display.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
    plt.title('Categorized EDSS: Ground Truth vs LLM Prediction')
    plt.ylabel('Ground Truth EDSS')
    plt.xlabel('LLM Prediction')
    plt.show()
##
# %% Confusion Matrix adjusted
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
def categorize_edss(value):
    """Bin an EDSS score into its one-point category label.

    Scores up to and including each whole number fall into the band ending at
    that number (``<= 1.0`` -> ``'0-1'``, ..., ``<= 10.0`` -> ``'9-10'``);
    larger scores give ``'10+'`` and missing values give ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    for ceiling, label in zip(range(1, 11), labels):
        if value <= ceiling:
            return label
    return '10+'
def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Print merge diagnostics and show the binned EDSS confusion matrix.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag and a
            ``result`` key contribute one prediction.
        ground_truth_path: Semicolon-separated CSV providing the
            ``unique_id``, ``MedDatum`` and ``EDSS`` columns.

    Join keys are stripped and lower-cased on both sides before an inner
    merge. The function prints how many merged rows lack a ground-truth or a
    predicted EDSS, drops those rows, prints the raw matrix, then plots it.
    """
    join_keys = ['unique_id', 'MedDatum']

    # Ground truth with normalised keys; unparseable scores become NaN.
    truth = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        truth[key] = truth[key].astype(str).str.strip().str.lower()
    truth['EDSS'] = pd.to_numeric(truth['EDSS'], errors='coerce')

    # Pool the successful predictions of every JSON file.
    collected = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as handle:
            try:
                for item in json.load(handle):
                    if not (item.get("success") and "result" in item):
                        continue
                    payload = item["result"]
                    collected.append(dict(
                        unique_id=str(payload.get('unique_id')).strip().lower(),
                        MedDatum=str(payload.get('MedDatum')).strip().lower(),
                        edss_pred=payload.get('EDSS'),
                    ))
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    predictions = pd.DataFrame(collected)
    predictions['edss_pred'] = pd.to_numeric(predictions['edss_pred'], errors='coerce')

    merged = truth[join_keys + ['EDSS']].merge(predictions, on=join_keys, how='inner')

    # Report how many merged rows are unusable before discarding them.
    missing_truth = merged['EDSS'].isna().sum()
    missing_pred = merged['edss_pred'].isna().sum()
    rule = "-" * 40
    print(rule)
    print(f"TOTAL MERGED ROWS: {len(merged)}")
    print(f"Rows with missing Ground Truth EDSS: {missing_truth}")
    print(f"Rows with missing Prediction EDSS: {missing_pred}")
    print(rule)

    usable = merged.dropna(subset=['EDSS', 'edss_pred']).copy()
    print(f"FINAL ROWS FOR CONFUSION MATRIX: {len(usable)}")
    print(rule)

    y_true = usable['EDSS'].apply(categorize_edss)
    y_pred = usable['edss_pred'].apply(categorize_edss)
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    # Labelled copy of the raw counts for the terminal.
    row_names = [f"True_{l}" for l in fixed_labels]
    col_names = [f"Pred_{l}" for l in fixed_labels]
    cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
    print("\nRAW CONFUSION MATRIX (Rows=True, Cols=Pred):")
    print(cm_df)

    fig, ax = plt.subplots(figsize=(12, 10))
    display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)
    # 'd' renders the cells as whole numbers rather than scientific notation.
    display.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)
    plt.title(f'EDSS Confusion Matrix\n(n={len(usable)} iterations across ~400 cases)', fontsize=14)
    plt.ylabel('Ground Truth (Clinician)')
    plt.xlabel('LLM Prediction')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
##
# %% Subcategories
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
def plot_subcategory_analysis(json_dir_path, ground_truth_path):
    """Plot exact-match accuracy and mean absolute deviation per subcategory.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute their ``result`` (keys and ``subcategories`` scores).
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and the subcategory columns named in ``mapping``.

    For each subcategory the rows where both values are numeric are compared:
    accuracy is the share of exact matches (in percent, drawn as bars) and the
    deviation is the mean absolute difference (drawn as a line on a second
    y-axis).
    """
    # JSON subcategory key -> ground-truth CSV column.
    mapping = {
        "VISUAL_OPTIC_FUNCTIONS": "Sehvermögen",
        "BRAINSTEM_FUNCTIONS": "Hirnstamm",
        "PYRAMIDAL_FUNCTIONS": "Pyramidalmotorik",
        "CEREBELLAR_FUNCTIONS": "Cerebellum",
        "SENSORY_FUNCTIONS": "Sensibiliät",
        "BOWEL_AND_BLADDER_FUNCTIONS": "Blasen-_und_Mastdarmfunktion",
        "CEREBRAL_FUNCTIONS": "Cerebrale_Funktion",
        "AMBULATION": "Ambulation"
    }
    join_keys = ['unique_id', 'MedDatum']

    truth = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        truth[key] = truth[key].astype(str)

    # One row per successful entry: join keys plus every subcategory score.
    prediction_rows = []
    for json_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
            for item in entries:
                if not item.get("success"):
                    continue
                payload = item["result"]
                record = {
                    'unique_id': str(payload.get('unique_id')),
                    'MedDatum': str(payload.get('MedDatum'))
                }
                for json_key in mapping:
                    record[json_key] = payload.get('subcategories', {}).get(json_key)
                prediction_rows.append(record)
    predictions = pd.DataFrame(prediction_rows)

    merged = pd.merge(truth, predictions, on=join_keys, suffixes=('_gt', '_llm'))

    # Per-subcategory metrics over the rows where both sides are numeric.
    metrics = []
    for json_key, csv_col in mapping.items():
        expected = pd.to_numeric(merged[csv_col], errors='coerce')
        predicted = pd.to_numeric(merged[json_key], errors='coerce')
        both_present = expected.notna() & predicted.notna()
        y_t, y_p = expected[both_present], predicted[both_present]
        if len(y_t) == 0:
            continue
        metrics.append({
            'Subcategory': csv_col,
            'Accuracy': (y_t == y_p).mean() * 100,
            'Deviation': np.abs(y_t - y_p).mean()
        })
    stats_df = pd.DataFrame(metrics).sort_values('Accuracy', ascending=False)

    fig, ax1 = plt.subplots(figsize=(14, 7))

    # Accuracy bars on the left axis; the upper limit leaves room for labels.
    bars = ax1.bar(stats_df['Subcategory'], stats_df['Accuracy'],
                   color='#3498db', alpha=0.8, label='Accuracy (%)')
    ax1.set_ylabel('Accuracy (%)', color='#2980b9', fontsize=12, fontweight='bold')
    ax1.set_ylim(0, 115)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=30, ha='right', fontsize=10)

    # Deviation line on a twin right axis.
    ax2 = ax1.twinx()
    ax2.plot(stats_df['Subcategory'], stats_df['Deviation'],
             color='#e74c3c', marker='o', linewidth=2.5, markersize=8,
             label='Mean Abs. Deviation (Score Points)')
    ax2.set_ylabel('Mean Absolute Deviation', color='#c0392b', fontsize=12, fontweight='bold')
    if stats_df['Deviation'].empty:
        deviation_top = 1
    else:
        deviation_top = max(stats_df['Deviation']) * 1.5
    ax2.set_ylim(0, deviation_top)

    # One combined legend for both axes, placed above the plot area.
    bar_handles, bar_labels = ax1.get_legend_handles_labels()
    line_handles, line_labels = ax2.get_legend_handles_labels()
    ax1.legend(bar_handles + line_handles, bar_labels + line_labels,
               loc='upper center', bbox_to_anchor=(0.5, 1.12),
               ncol=2, frameon=False, fontsize=11)

    # Percentage printed just above every bar.
    for bar in bars:
        top = bar.get_height()
        ax1.annotate(f'{top:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, top),
                     xytext=(0, 5), textcoords="offset points",
                     ha='center', va='bottom', fontweight='bold', color='#2c3e50')

    plt.tight_layout()
    plt.show()
##
# %% Certainty
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
def categorize_edss(value):
    """Translate an EDSS score into its band label (``'0-1'`` .. ``'9-10'``).

    Each band includes its upper bound. Scores above 10 map to ``'10+'`` and
    missing values (``pd.isna``) map to ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    upper_to_label = {
        1.0: '0-1', 2.0: '1-2', 3.0: '2-3', 4.0: '3-4', 5.0: '4-5',
        6.0: '5-6', 7.0: '6-7', 8.0: '7-8', 9.0: '8-9', 10.0: '9-10',
    }
    # Dict order is ascending, so the first bound not exceeded wins.
    for upper, label in upper_to_label.items():
        if value <= upper:
            return label
    return '10+'
def plot_certainty_vs_accuracy_by_category(json_dir_path, ground_truth_path):
    """Plot exact-match accuracy next to mean reported certainty per EDSS band.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute ``EDSS`` and ``certainty_percent`` from ``result``.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Rows are grouped by the band of the ground-truth score. A prediction
    counts as correct when it equals the ground truth after rounding both to
    one decimal. Merged rows containing any missing value are discarded.
    """
    join_keys = ['unique_id', 'MedDatum']

    truth = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        truth[key] = truth[key].astype(str)
    truth['EDSS_true'] = pd.to_numeric(truth['EDSS'], errors='coerce')

    prediction_rows = []
    for json_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
            for item in entries:
                if not item.get("success"):
                    continue
                payload = item["result"]
                prediction_rows.append({
                    'unique_id': str(payload.get('unique_id')),
                    'MedDatum': str(payload.get('MedDatum')),
                    'EDSS_pred': payload.get('EDSS'),
                    'certainty': payload.get('certainty_percent')
                })
    predictions = pd.DataFrame(prediction_rows)
    predictions['EDSS_pred'] = pd.to_numeric(predictions['EDSS_pred'], errors='coerce')

    merged = pd.merge(truth[join_keys + ['EDSS_true']], predictions, on=join_keys)
    df = merged.dropna()

    df['gt_category'] = df['EDSS_true'].apply(categorize_edss)
    df['is_correct'] = (df['EDSS_true'].round(1) == df['EDSS_pred'].round(1))

    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # Per band: share correct, mean certainty, and row count; bands without
    # data are added by the reindex and then zero-filled.
    per_band = df.groupby('gt_category').agg({
        'is_correct': 'mean',
        'certainty': 'mean',
        'unique_id': 'count'
    })
    stats = per_band.reindex(fixed_labels)
    stats['accuracy_percent'] = stats['is_correct'] * 100
    stats = stats.fillna(0)

    positions = np.arange(len(fixed_labels))
    width = 0.35
    fig, ax = plt.subplots(figsize=(14, 8))

    # Two bars per band, side by side.
    rects1 = ax.bar(positions - width/2, stats['accuracy_percent'], width,
                    label='Actual Accuracy (%)', color='#2ecc71', alpha=0.8)
    rects2 = ax.bar(positions + width/2, stats['certainty'], width,
                    label='LLM Avg. Certainty (%)', color='#e67e22', alpha=0.8)

    ax.set_ylabel('Percentage (%)', fontsize=12)
    ax.set_xlabel('Ground Truth EDSS Category', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(fixed_labels)
    ax.set_ylim(0, 115)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.08), ncol=2, frameon=False)
    ax.grid(axis='y', linestyle=':', alpha=0.5)

    # Value label above every non-empty bar, accuracy bars first.
    for group in (rects1, rects2):
        for rect in group:
            top = rect.get_height()
            if top <= 0:
                continue
            ax.annotate(f'{top:.0f}%',
                        xy=(rect.get_x() + rect.get_width() / 2, top),
                        xytext=(0, 3), textcoords="offset points",
                        ha='center', va='bottom', fontsize=9, fontweight='bold')

    # Sample size near the bottom of each band.
    for i, count in enumerate(stats['unique_id']):
        ax.text(i, 2, f'n={int(count)}', ha='center', va='bottom', fontsize=10, color='white', fontweight='bold')

    plt.tight_layout()
    plt.show()
##
# %% Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
def natural_key(string_):
    """Build a sort key that orders embedded digit runs numerically.

    The string is split around runs of digits; digit-only pieces become
    ``int`` and everything else stays ``str``, so ``"iter2"`` sorts before
    ``"iter10"``.
    """
    key = []
    for piece in re.split(r'(\d+)', string_):
        key.append(int(piece) if piece.isdigit() else piece)
    return key
def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Draw box plots of EDSS scores: ground truth next to each iteration.

    Args:
        json_dir_path: Directory of ``*.json`` iteration files, ordered with
            ``natural_key`` and labelled ``Iter 1`` .. ``Iter N`` by position.
        ground_truth_path: Semicolon-separated CSV with an ``EDSS`` column.

    The first box holds all numeric ground-truth scores; every following box
    holds the non-null ``EDSS`` values of one file's successful entries.
    """
    truth = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(truth['EDSS'], errors='coerce').dropna().tolist()

    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")), key=natural_key)

    plot_data = [gt_values]
    labels = ['Ground Truth']
    for position, json_path in enumerate(json_files, start=1):
        scores = []
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
            for item in entries:
                if not item.get("success"):
                    continue
                raw = item["result"].get("EDSS")
                if raw is None:
                    continue
                scores.append(float(raw))
        plot_data.append(scores)
        labels.append(f"Iter {position}")

    plt.figure(figsize=(14, 8))
    gt_color = '#ff9999'    # soft red
    iter_color = '#66b3ff'  # soft blue

    median_style = {'color': 'black', 'linewidth': 2}
    outlier_style = {'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5, 'alpha': 0.5}
    mean_style = {"marker":"D", "markerfacecolor":"white", "markeredgecolor":"black", "markersize": 6}
    bplot = plt.boxplot(plot_data, labels=labels, patch_artist=True,
                        notch=False,
                        medianprops=median_style,
                        flierprops=outlier_style,
                        showmeans=True,
                        meanprops=mean_style)

    # First box (ground truth) in red, all iteration boxes in blue.
    box_colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for box, shade in zip(bplot['boxes'], box_colors):
        box.set_facecolor(shade)

    # Hand-built legend explaining every visual element.
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label='LLM Iterations (1-10)'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    plt.title('Distribution of EDSS Scores: Ground Truth vs. 10 LLM Iterations', fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
##
# %% Audit
import pandas as pd
import numpy as np
import json
import glob
import os
def audit_matches(json_dir_path, ground_truth_path):
    """Print how many JSON predictions can be matched to a ground-truth row.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute the ``unique_id`` / ``MedDatum`` of their ``result``.
        ground_truth_path: Semicolon-separated CSV with ``unique_id`` and
            ``MedDatum`` columns.

    Both sides' keys are stripped and lower-cased before comparison. The
    report lists totals, the first ten unmatched predictions, and the number
    of unmatched predictions per JSON file. A directory without any
    successful prediction produces a report of zeros instead of an error.
    """
    def clean_series(s):
        return s.astype(str).str.strip().str.lower()

    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        file_name = os.path.basename(file_path)
        for entry in data:
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    'unique_id': str(res.get('unique_id')).strip().lower(),
                    'MedDatum': str(res.get('MedDatum')).strip().lower(),
                    'file': file_name
                })
    # Naming the columns keeps the frame usable when nothing was collected.
    df_pred = pd.DataFrame(all_preds, columns=['unique_id', 'MedDatum', 'file'])

    # Combined key per row, used to find predictions absent from the ground truth.
    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']
    df_pred['key'] = df_pred['unique_id'] + "_" + df_pred['MedDatum']
    gt_keys = set(df_gt['key'])
    df_pred['is_matched'] = df_pred['key'].isin(gt_keys)
    unmatched_summary = df_pred[~df_pred['is_matched']]

    print("--- AUDIT RESULTS ---")
    print(f"Total rows in JSON: {len(df_pred)}")
    print(f"Rows that matched GT: {df_pred['is_matched'].sum()}")
    print(f"Rows that FAILED to match: {len(unmatched_summary)}")
    if not unmatched_summary.empty:
        print("\nFirst 10 Unmatched Entries (check these against your CSV):")
        print(unmatched_summary[['unique_id', 'MedDatum', 'file']].head(10))
        # A per-file breakdown shows whether specific JSON files are affected.
        print("\nFailure count per JSON file:")
        print(unmatched_summary['file'].value_counts())
# Runs the audit on cell execution / import, using hard-coded, machine-specific
# paths for the iteration directory and the ground-truth CSV.
audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##
# %% Confidence accuracy correlation
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
def categorize_edss(value):
    """Return the one-point-wide EDSS band label for a numeric score.

    Bands are closed on the right: a score ``<= 1.0`` is ``'0-1'``, a score in
    ``(1.0, 2.0]`` is ``'1-2'``, and so on up to ``'9-10'``. Anything above
    10 is ``'10+'``. Missing values (as judged by ``pd.isna``) yield ``np.nan``.

    The previous version of this copy skipped the ``'5-6'`` band, which
    labelled every score above 5 one band too high and turned scores in
    (9, 10] into ``'10+'``; the bands now match the other copies in this file.
    """
    if pd.isna(value):
        return np.nan
    band_labels = ('0-1', '1-2', '2-3', '3-4', '4-5',
                   '5-6', '6-7', '7-8', '8-9', '9-10')
    # The upper bound of the k-th band (1-based) is simply k.
    for upper_bound, label in enumerate(band_labels, start=1):
        if value <= upper_bound:
            return label
    return '10+'
def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Plot category-level accuracy per confidence bin against a reference line.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute ``EDSS`` and ``certainty_percent`` from ``result``.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    A prediction is correct when its EDSS band (``categorize_edss``) equals
    the ground-truth band. Rows where either band or the confidence is
    missing are excluded, so missing data is not scored as a wrong answer.
    Confidence is binned into 0-60, 60-70, 70-80, 80-90 and 90-100 percent.
    """
    join_keys = ['unique_id', 'MedDatum']

    # Ground truth with normalised keys and binned score.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        df_gt[key] = df_gt[key].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    'unique_id': str(res.get('unique_id')).strip().lower(),
                    'MedDatum': str(res.get('MedDatum')).strip().lower(),
                    'edss_raw': res.get('EDSS'),
                    'confidence': res.get('certainty_percent')
                })
    # Naming the columns keeps the frame usable when nothing was collected.
    df_pred = pd.DataFrame(all_preds, columns=['unique_id', 'MedDatum', 'edss_raw', 'confidence'])
    # Coerce before binning so that numeric strings work and junk becomes NaN
    # instead of raising inside the comparison chain.
    df_pred['pred_cat'] = pd.to_numeric(df_pred['edss_raw'], errors='coerce').apply(categorize_edss)
    df_pred['confidence'] = pd.to_numeric(df_pred['confidence'], errors='coerce')

    df_merged = pd.merge(df_pred[join_keys + ['pred_cat', 'confidence']],
                         df_gt[join_keys + ['gt_cat']],
                         on=join_keys, how='inner')
    # NaN never equals NaN, so incomplete rows would otherwise count as errors.
    df_merged = df_merged.dropna(subset=['pred_cat', 'gt_cat', 'confidence']).copy()

    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)

    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    # include_lowest puts a confidence of exactly 0 into the first bin.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # observed=False keeps all five bins, aligned with the reference line.
    calibration_stats = (df_merged.groupby('conf_bin', observed=False)['is_correct']
                         .agg(['mean', 'count']).reset_index())

    plt.figure(figsize=(10, 6))
    bars = plt.bar(calibration_stats['conf_bin'], calibration_stats['mean'],
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')

    # Reference line at the midpoint of each confidence bin.
    expected_x = np.arange(len(labels))
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]
    plt.plot(expected_x, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')

    # Accuracy and sample size above each bar; empty bins have no accuracy.
    for i, bar in enumerate(bars):
        count = calibration_stats.loc[i, 'count']
        if count == 0:
            continue
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 f'Acc: {yval:.1%}\n(n={count})', ha='center', va='bottom', fontsize=9)

    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.legend(loc='upper left', frameon=True, shadow=True)
    plt.tight_layout()
    plt.show()
##
# %% Confidence comparison
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Compare predicted and ground-truth EDSS within each confidence bin.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute ``EDSS`` and ``certainty_percent`` from ``result``.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Per confidence bin, a box plot shows the predicted scores and jittered
    dots show the ground-truth scores. Rows missing either score or the
    confidence are dropped after the inner merge.
    """
    join_keys = ['unique_id', 'MedDatum']

    truth = pd.read_csv(ground_truth_path, sep=';')
    for key in join_keys:
        truth[key] = truth[key].astype(str).str.strip().str.lower()
    truth['EDSS_gt'] = pd.to_numeric(truth['EDSS'], errors='coerce')

    prediction_rows = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as handle:
            try:
                for item in json.load(handle):
                    if not item.get("success"):
                        continue
                    payload = item["result"]
                    prediction_rows.append({
                        'unique_id': str(payload.get('unique_id')).strip().lower(),
                        'MedDatum': str(payload.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(payload.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(payload.get('certainty_percent'), errors='coerce')
                    })
            except Exception as e:
                print(f"Skipping {file_path}: {e}")
    predictions = pd.DataFrame(prediction_rows)

    merged = pd.merge(predictions, truth[join_keys + ['EDSS_gt']],
                      on=join_keys, how='inner')
    df_plot = merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()

    # Confidence bins form the x-axis categories.
    bin_edges = [0, 60, 70, 80, 90, 100]
    bin_names = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bin_edges, labels=bin_names)

    plt.figure(figsize=(14, 8))

    # Boxes: distribution of the LLM predictions.
    box_style = dict(alpha=0.4, edgecolor='navy')
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred',
                color='#3498db', width=0.5, showfliers=False,
                boxprops=box_style)

    # Dots: individual ground-truth scores, jittered to reduce overlap.
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt',
                  color='#e74c3c', alpha=0.4, jitter=0.2, size=5)

    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w', label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend")

    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()
##
# %% EDSS vs Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
def categorize_edss(value):
    """Label an EDSS score with its one-point band.

    ``<= 1.0`` gives ``'0-1'``, ``(1.0, 2.0]`` gives ``'1-2'``, ... and
    ``(9.0, 10.0]`` gives ``'9-10'``. Scores above 10 give ``'10+'``; missing
    values (``pd.isna``) give ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    bands = ((1.0, '0-1'), (2.0, '1-2'), (3.0, '2-3'), (4.0, '3-4'), (5.0, '4-5'),
             (6.0, '5-6'), (7.0, '6-7'), (8.0, '7-8'), (9.0, '8-9'), (10.0, '9-10'))
    # First band whose upper bound is not exceeded, else the overflow label.
    return next((label for upper, label in bands if value <= upper), '10+')
def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Plot the reported confidence per predicted EDSS band.

    Args:
        json_dir_path: Directory whose ``*.json`` files are each iterated as a
            sequence of entries; entries with a truthy ``success`` flag
            contribute ``EDSS`` and ``certainty_percent`` from ``result``.

    Entries where either value is not numeric are skipped. Each band gets a
    box plot of the confidence values, overlaid with the individual points.
    """
    rows = []
    for json_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)
            for item in entries:
                if not item.get("success"):
                    continue
                payload = item["result"]
                edss_val = pd.to_numeric(payload.get('EDSS'), errors='coerce')
                conf_val = pd.to_numeric(payload.get('certainty_percent'), errors='coerce')
                if pd.isna(edss_val) or pd.isna(conf_val):
                    continue
                rows.append({
                    'edss_cat': categorize_edss(edss_val),
                    'confidence': conf_val
                })
    df = pd.DataFrame(rows)

    # An ordered categorical fixes the left-to-right band order on the x-axis.
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    df['edss_cat'] = pd.Categorical(df['edss_cat'], categories=cat_order, ordered=True)

    plt.figure(figsize=(14, 8))
    sns.boxplot(data=df, x='edss_cat', y='confidence',
                palette="Blues", width=0.6, showfliers=False)
    # Individual predictions on top, to show how many cases each band has.
    sns.stripplot(data=df, x='edss_cat', y='confidence',
                  color='black', alpha=0.15, jitter=0.2, size=3)

    # Manual legend entries for the box and the dots.
    box_entry = Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)')
    dot_entry = plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                           markersize=4, alpha=0.4, label='Individual Predictions')
    plt.legend(handles=[box_entry, dot_entry], loc='lower left', frameon=True)

    plt.title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    plt.xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    plt.ylabel('Confidence Score (%)', fontsize=12)
    plt.ylim(0, 105)
    plt.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()
##
# %% Correlation Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import cohen_kappa_score
def categorize_edss(value):
    """Map an EDSS score to its category label.

    Categories are one point wide and include their upper bound (``'0-1'``
    for scores ``<= 1.0`` up to ``'9-10'`` for scores ``<= 10.0``). Larger
    scores return ``'10+'`` and missing values (``pd.isna``) return ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    # Walk up the bands until the score no longer exceeds the upper bound.
    band = 0
    while band < len(labels) and value > band + 1:
        band += 1
    return labels[band] if band < len(labels) else '10+'
def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Plot ground-truth vs. LLM-predicted EDSS categories with weighted kappa.

    Reads the ground truth from a ';'-separated CSV (columns ``unique_id``,
    ``MedDatum``, ``EDSS``) and the predictions from every ``*.json`` file in
    ``json_dir_path`` (a list of entries with ``success`` and a ``result``
    dict holding ``unique_id``, ``MedDatum`` and ``EDSS``). Both sides are
    categorized with ``categorize_edss``, inner-joined on
    (``unique_id``, ``MedDatum``) and drawn as a box + strip plot.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed instead.
    """
    # Clinical order of the categories; the position in this list is the
    # ordinal index used for the y-axis, the x-axis order and the kappa weights.
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}

    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
            })
    if not all_preds:
        # Without this guard the merge below fails with a KeyError on 'unique_id'.
        print(f"No successful predictions found in {json_dir_path}")
        return
    df_pred = pd.DataFrame(all_preds)

    # 3. Merge (dropna removes pairs where either EDSS value was missing)
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # Categories outside the plotted scale (i.e. '10+') have no ordinal
    # position, so they can neither be drawn nor weighted; exclude them explicitly.
    on_scale = df_merged['gt_cat'].isin(cat_order) & df_merged['pred_cat'].isin(cat_order)
    n_off_scale = int((~on_scale).sum())
    if n_off_scale:
        print(f"Excluding {n_off_scale} pairs with an EDSS category outside the 0-10 scale")
    df_merged = df_merged[on_scale].copy()
    if df_merged.empty:
        print("No prediction could be matched to a ground-truth entry.")
        return

    # 4. Numeric Mapping for Plotting
    df_merged['gt_idx'] = df_merged['gt_cat'].map(cat_map).astype(int)
    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map).astype(int)

    # Calculate Cohen's Kappa (Standard for categorical agreement).
    # Computed on the ordinal indices with the full label set so the linear
    # weights equal the true category distance even if a category is absent.
    kappa = cohen_kappa_score(df_merged['gt_idx'], df_merged['pred_idx'],
                              labels=list(cat_map.values()), weights='linear')

    # 5. Plotting
    plt.figure(figsize=(14, 8))
    # BOXPLOT: Distribution of Predicted Categories relative to Ground Truth.
    # order=cat_order pins x position i to cat_order[i]; otherwise seaborn
    # orders string categories by first appearance and the diagonal is wrong.
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx', order=cat_order,
                palette="rocket", width=0.6, showfliers=False, boxprops=dict(alpha=0.5))
    # STRIPPLOT: Individual counts
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx', order=cat_order,
                  color='black', alpha=0.1, jitter=0.3, size=4)
    # DIAGONAL REFERENCE: Perfect category match
    last_idx = len(cat_order) - 1
    plt.plot([0, last_idx], [0, last_idx], color='red', linestyle='--', linewidth=2)

    # 6. Formatting Legend & Axes
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)
    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=4, alpha=0.3, label='Iteration Matches'),
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Agreement Metrics")
    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)
    plt.tight_layout()
    plt.show()
##
# %% Violin plot: signed error per confidence bin
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    """Draw a violin plot of the signed EDSS error per LLM confidence bin.

    The ground truth is read from a ';'-separated CSV (columns ``unique_id``,
    ``MedDatum``, ``EDSS``); predictions come from every ``*.json`` file in
    ``json_dir_path`` (entries with ``success`` and a ``result`` dict holding
    ``unique_id``, ``MedDatum``, ``EDSS`` and ``certainty_percent``). Both are
    inner-joined on (``unique_id``, ``MedDatum``); rows with any missing value
    are dropped.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed instead.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
            })
    if not all_preds:
        # Without this guard the merge below fails with a KeyError on 'unique_id'.
        print(f"No successful predictions found in {json_dir_path}")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()
    if df_merged.empty:
        print("No prediction could be matched to a ground-truth entry.")
        return

    # 3. Calculate Error (positive = LLM overestimates)
    df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt']

    # 4. Bin Confidence
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # Bins are right-closed: (0,70], (70,80], (80,90], (90,100].
    # include_lowest=True keeps a confidence of exactly 0 in the first bin
    # instead of silently turning it into NaN and dropping it from the plot.
    # NOTE(review): a confidence of exactly 70 lands in 'Low (<70%)' - confirm
    # whether the label or the bin edge is the intended one.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)

    # Calculate counts for labels (0 for bins without any observation)
    counts = df_merged['conf_bin'].value_counts().reindex(labels, fill_value=0)
    new_labels = [f"{l}\n(n={int(counts[l])})" for l in labels]

    # 5. Plotting
    plt.figure(figsize=(13, 8))
    # Using a sequential color palette (Light blue to Dark blue)
    palette_colors = sns.color_palette("Blues", n_colors=len(labels))
    sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile",
                   palette=palette_colors, cut=0)
    # Reference line at 0
    plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5)

    # 6. Legend: one patch per confidence bin plus the line symbols
    legend_elements = [
        # Legend items for the color gradient
        Patch(facecolor=palette_colors[0], label='Confidence: <70%'),
        Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'),
        Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'),
        Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'),
        # Legend items for the symbols
        Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'),
        Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient")

    # Formatting
    plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Group', fontsize=12)
    plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12)
    plt.xticks(ticks=range(len(labels)), labels=new_labels)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.tight_layout()
    plt.show()
# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
##
# %% Certainty vs Delta
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_confidence_vs_abs_error_refined(json_dir_path, ground_truth_path):
    """Bar chart of the mean absolute EDSS error per LLM confidence bin.

    The ground truth is read from a ';'-separated CSV (columns ``unique_id``,
    ``MedDatum``, ``EDSS``); predictions come from every ``*.json`` file in
    ``json_dir_path`` (entries with ``success`` and a ``result`` dict holding
    ``unique_id``, ``MedDatum``, ``EDSS`` and ``certainty_percent``). Both are
    inner-joined on (``unique_id``, ``MedDatum``); rows with any missing value
    are dropped. Each bar shows the MAE of one confidence bin with its
    standard error, and a linear fit across the populated bins is overlaid.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed instead.
    """
    # 1. Load and Merge Data
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
            })
    if not all_preds:
        # Without this guard the merge below fails with a KeyError on 'unique_id'.
        print(f"No successful predictions found in {json_dir_path}")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 2. Calculate Absolute Delta
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # Right-closed bins; include_lowest=True keeps a confidence of exactly 0.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so bar i, colour i and legend entry i always refer to labels[i].
    stats = (df_merged.groupby('conf_bin', observed=False)['abs_error']
             .agg(['mean', 'std', 'count'])
             .reset_index(drop=True))
    if stats['count'].sum() == 0:
        print("No matched prediction has a confidence inside the 0-100 range.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    has_data = counts > 0
    # Standard error of the mean; NaN where it is undefined (n < 2).
    sem = stats['std'].to_numpy(dtype=float) / np.sqrt(np.where(has_data, counts, 1))
    has_sem = np.isfinite(sem)

    # 4. Plotting
    plt.figure(figsize=(12, 8))
    # Sequential palette: light to dark
    colors = sns.color_palette("Blues", n_colors=len(labels))
    bars = plt.bar(x_idx, np.where(has_data, means, 0.0), color=colors,
                   edgecolor='black', linewidth=1.2)
    plt.xticks(x_idx, labels)

    # Standard Error Bars (only where the SEM is defined)
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], means[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=6, elinewidth=1.5)

    # Trend Line (Linear Fit) over the populated bins; needs at least two points
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx, slope * x_idx + intercept, color="#e74c3c", linestyle="--", linewidth=2.5)

    # 5. THE COMPLETE LEGEND
    # We create a specific handle for every single thing on the chart
    legend_elements = [
        # Explicit color mapping for bins
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        # Statistical components
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=2.5, label='Correlation Trend (Inverse Rel.)'),
        # Metric definition
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True,
               shadow=True, fontsize=10, title="Legend")

    # Final Labels & Clean-up
    plt.title('Validation: Inverse Correlation of Confidence vs. Error Magnitude', fontsize=15, pad=20)
    plt.ylabel('Mean Absolute Error (Δ EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)

    # Text annotations for MAE on bars (empty bins have no MAE to show)
    for i, bar in enumerate(bars):
        if not has_data[i]:
            continue
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                 f'MAE: {means[i]:.2f}', ha='center', fontweight='bold')
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.tight_layout()
    plt.show()
##
# %% Final thesis error chart (MAE per confidence bin)
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_final_thesis_error_chart(json_dir_path, ground_truth_path):
    """Bar chart of MAE and sample size per LLM confidence bin.

    The ground truth is read from a ';'-separated CSV (columns ``unique_id``,
    ``MedDatum``, ``EDSS``); predictions come from every ``*.json`` file in
    ``json_dir_path`` (entries with ``success`` and a ``result`` dict holding
    ``unique_id``, ``MedDatum``, ``EDSS`` and ``certainty_percent``). Both are
    inner-joined on (``unique_id``, ``MedDatum``); rows with any missing value
    are dropped. Each bar is annotated with its MAE and n, carries a
    standard-error bar, and a linear fit across the populated bins is overlaid.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed instead.
    """
    # 1. Load Ground Truth & Predictions
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for entry in data:
            if not entry.get("success"):
                continue
            res = entry["result"]
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
            })
    if not all_preds:
        # Without this guard the merge below fails with a KeyError on 'unique_id'.
        print(f"No successful predictions found in {json_dir_path}")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 2. Metric Calculation
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning & Stats
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # Right-closed bins; include_lowest=True keeps a confidence of exactly 0.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so bar i, colour i and legend entry i always refer to labels[i].
    stats = (df_merged.groupby('conf_bin', observed=False)['abs_error']
             .agg(['mean', 'std', 'count'])
             .reset_index(drop=True))
    if stats['count'].sum() == 0:
        print("No matched prediction has a confidence inside the 0-100 range.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    has_data = counts > 0
    # Standard error of the mean; NaN where it is undefined (n < 2).
    sem = stats['std'].to_numpy(dtype=float) / np.sqrt(np.where(has_data, counts, 1))
    has_sem = np.isfinite(sem)

    # 4. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))
    # BARS (MAE)
    bars = plt.bar(x_idx, np.where(has_data, means, 0.0), color=colors,
                   edgecolor='black', alpha=0.85)
    plt.xticks(x_idx, labels)
    # ERROR BARS (Standard Error of the Mean), only where defined
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], means[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=8, elinewidth=1.5)
    # CORRELATION TREND LINE over the populated bins; needs at least two points
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx, slope * x_idx + intercept, color="#e74c3c",
                 linestyle="--", linewidth=3, zorder=5)

    # 5. DATA LABELS (n and MAE)
    for i, bar in enumerate(bars):
        n_count = int(counts[i])
        # An empty bin has no MAE; only report that it holds no samples.
        text = f'MAE: {means[i]:.2f}\nn={n_count}' if has_data[i] else f'n={n_count}'
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04,
                 text, ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 6. THE COMPLETE LEGEND
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Correlation Trend (Inverse Relationship)'),
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'),
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Chart Components")

    # Formatting
    plt.title('Clinical Validation: LLM Certainty vs. Prediction Accuracy', fontsize=16, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, np.nanmax(means) + 0.6)  # Add room for labels
    plt.tight_layout()
    plt.show()
# plot_final_thesis_error_chart('jsons_folder/', 'gt.csv')
##
# %% Single-JSON error analysis
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
def plot_single_json_error_analysis(json_file_path, ground_truth_path, iteration_label="Iteration 1"):
    """Bar chart of MAE and sample size per confidence bin for one results file.

    The ground truth is read from a ';'-separated CSV (columns ``unique_id``,
    ``MedDatum``, ``EDSS``); predictions come from the single JSON file
    ``json_file_path`` (entries with ``success`` and a ``result`` dict holding
    ``unique_id``, ``MedDatum``, ``EDSS`` and ``certainty_percent``). Both are
    inner-joined on (``unique_id``, ``MedDatum``); rows with any missing value
    are dropped. Each bar is annotated with its MAE and n, carries a
    standard-error bar, and a linear fit across the populated bins is overlaid.

    Args:
        json_file_path: Path to one prediction JSON file.
        ground_truth_path: Path to the ground-truth CSV.
        iteration_label: Text inserted into the figure title to name the run
            being shown. The default reproduces the original fixed title.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed instead.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load the Specific JSON
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    all_preds = []
    for entry in data:
        if not entry.get("success"):
            continue
        res = entry["result"]
        all_preds.append({
            'unique_id': str(res.get('unique_id')).strip().lower(),
            'MedDatum': str(res.get('MedDatum')).strip().lower(),
            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
        })
    if not all_preds:
        # Without this guard the merge below fails with a KeyError on 'unique_id'.
        print(f"No successful predictions found in {json_file_path}")
        return
    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Calculate Absolute Error
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 4. Binning and Statistics
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    # Right-closed bins; include_lowest=True keeps a confidence of exactly 0.
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins,
                                   labels=labels, include_lowest=True)
    # observed=False keeps one row per bin (in label order) even when a bin is
    # empty, so bar i, colour i and legend entry i always refer to labels[i].
    stats = (df_merged.groupby('conf_bin', observed=False)['abs_error']
             .agg(['mean', 'std', 'count'])
             .reset_index(drop=True))
    if stats['count'].sum() == 0:
        print("No matched prediction has a confidence inside the 0-100 range.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    counts = stats['count'].to_numpy()
    has_data = counts > 0
    # Standard error of the mean; NaN where it is undefined (n < 2).
    sem = stats['std'].to_numpy(dtype=float) / np.sqrt(np.where(has_data, counts, 1))
    has_sem = np.isfinite(sem)

    # 5. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))
    # BARS (MAE)
    bars = plt.bar(x_idx, np.where(has_data, means, 0.0), color=colors,
                   edgecolor='black', alpha=0.85)
    plt.xticks(x_idx, labels)
    # ERROR BARS (SEM), only where defined
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], means[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=8, elinewidth=1.5)
    # CORRELATION TREND LINE over the populated bins; needs at least two points
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx, slope * x_idx + intercept, color="#e74c3c",
                 linestyle="--", linewidth=3, zorder=5)

    # 6. DATA LABELS (n and MAE)
    for i, bar in enumerate(bars):
        n_count = int(counts[i])
        # An empty bin has no MAE; only report that it holds no samples.
        text = f'MAE: {means[i]:.2f}\nn={n_count}' if has_data[i] else f'n={n_count}'
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04,
                 text, ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 7. COMPREHENSIVE LEGEND
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Inverse Trend Line'),
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Std Error (SEM)'),
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Final Styling
    plt.title(f'Validation: Confidence vs. Error Magnitude ({iteration_label} Only)', fontsize=15, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, np.nanmax(means) + 0.6)
    plt.tight_layout()
    plt.show()
# --- RUN THE PLOT ---
# NOTE: these statements sit at module level, so they execute (and call
# plt.show() via the plot function) whenever this cell or the whole file is run.
# Absolute, machine-specific path to one results JSON file.
json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
# Absolute, machine-specific path to the ground-truth CSV.
gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"
# Draw the confidence-vs-MAE bar chart for this single results file.
plot_single_json_error_analysis(json_path, gt_path)
##
# %% Usage
# --- Usage ---
#plot_categorized_edss('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration',
# '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_refined('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_with_counts('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##