# %% Explore Dist Plot
import pandas as pd
import json
import glob
import os
import re
import matplotlib.pyplot as plt


def plot_edss_distribution_per_iteration(json_dir_path):
    """Plot stacked bars of predicted EDSS category counts per iteration
    JSON file found in *json_dir_path*; return the frequency table."""

    # 1. Reuse your categorization logic
    def categorize_edss(value):
        if pd.isna(value):
            return 'Unknown'
        elif value <= 1.0:
            return '0-1'
        elif value <= 2.0:
            return '1-2'
        elif value <= 3.0:
            return '2-3'
        elif value <= 4.0:
            return '3-4'
        elif value <= 5.0:
            return '4-5'
        elif value <= 6.0:
            return '5-6'
        elif value <= 7.0:
            return '6-7'
        elif value <= 8.0:
            return '7-8'
        elif value <= 9.0:
            return '8-9'
        elif value <= 10.0:
            return '9-10'
        else:
            return '10+'

    # 2. Extract data from all files with Numerical Sorting
    all_records = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    # Natural sort function to handle Iter 1, Iter 2 ... Iter 10
    def natural_key(string_):
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    json_files.sort(key=natural_key)

    for i, file_path in enumerate(json_files):
        # We use the index + 1 for the label to ensure Iter 1 to Iter 10 order
        iter_label = f"Iter {i+1}"
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        val = entry["result"].get("EDSS")
                        all_records.append({
                            'Iteration': iter_label,
                            'Category': categorize_edss(val),
                            'Order': i  # Used to maintain sort order in the table
                        })
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(all_records)

    # 3. Create a Frequency Table (Crosstab)
    # Pivot so iterations are on the X-axis
    dist_table = pd.crosstab(df['Iteration'], df['Category'])

    # Ensure the rows (Iterations) stay in the 1-10 order
    iter_order = [f"Iter {i+1}" for i in range(len(json_files))]
    dist_table = dist_table.reindex(iter_order)

    # Ensure columns follow clinical order
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    available_labels = [l for l in fixed_labels if l in dist_table.columns]
    dist_table = dist_table[available_labels]

    # 4. Plotting
    ax = dist_table.plot(kind='bar', stacked=True, figsize=(14, 8),
                         colormap='viridis', edgecolor='white')

    plt.title('Distribution of Predicted EDSS Categories per Iteration', fontsize=15, pad=20)
    plt.xlabel('JSON Iteration File', fontsize=12)
    plt.ylabel('Number of Cases (Count)', fontsize=12)
    plt.xticks(rotation=0)

    # Move legend outside to the right
    plt.legend(title="EDSS Category", bbox_to_anchor=(1.05, 1), loc='upper left')

    # Add total count labels on top of bars
    for i, (name, row) in enumerate(dist_table.iterrows()):
        total = row.sum()
        if total > 0:
            plt.text(i, total + 2, f'Total: {int(total)}',
                     ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    return dist_table


# Usage:
counts_table = plot_edss_distribution_per_iteration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(counts_table)
##
# %% Explore Table
import pandas as pd
import json
import glob
import os
import re


def generate_edss_distribution_csv(json_dir_path, output_filename='edss_distribution_summary.csv'):
    """Build the per-iteration EDSS category frequency table (plus a
    'Total Sum' row) and write it to *output_filename* as CSV."""

    # 1. Categorization logic
    def categorize_edss(value):
        if pd.isna(value):
            return 'Unknown'
        elif value <= 1.0:
            return '0-1'
        elif value <= 2.0:
            return '1-2'
        elif value <= 3.0:
            return '2-3'
        elif value <= 4.0:
            return '3-4'
        elif value <= 5.0:
            return '4-5'
        elif value <= 6.0:
            return '5-6'
        elif value <= 7.0:
            return '6-7'
        elif value <= 8.0:
            return '7-8'
        elif value <= 9.0:
            return '8-9'
        elif value <= 10.0:
            return '9-10'
        else:
            return '10+'

    # 2. Extract data from files with Natural Sorting
    all_records = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    def natural_key(string_):
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    json_files.sort(key=natural_key)

    for i, file_path in enumerate(json_files):
        iter_label = f"Iter {i+1}"
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        val = entry["result"].get("EDSS")
                        all_records.append({
                            'Iteration': iter_label,
                            'Category': categorize_edss(val)
                        })
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(all_records)

    # 3. Create Frequency Table (Crosstab)
    dist_table = pd.crosstab(df['Iteration'], df['Category'])

    # 4. Reindex Rows (Numerical order) and Columns (Clinical order)
    iter_order = [f"Iter {i+1}" for i in range(len(json_files))]
    dist_table = dist_table.reindex(iter_order)

    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    available_labels = [l for l in fixed_labels if l in dist_table.columns]
    dist_table = dist_table[available_labels]

    # Fill missing categories with 0 and convert to integers
    dist_table = dist_table.fillna(0).astype(int)

    # 5. Add "Total" row at the end
    # This sums the counts for each category across all iterations
    dist_table.loc['Total Sum'] = dist_table.sum()

    # 6. Save to CSV
    dist_table.to_csv(output_filename)
    print(f"Table successfully saved to: {output_filename}")

    return dist_table


# Usage:
final_table = generate_edss_distribution_csv('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(final_table)
##
# %% EDSS Confusion Matrix
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def categorize_edss(value):
    """Bin an EDSS value into a half-open clinical category label."""
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Plot a confusion matrix of categorized ground-truth vs predicted EDSS.

    NOTE(review): redefined with normalization tweaks in the
    'Confusion Matrix adjusted' cell below; in a single module the later
    definition wins.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str)
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str)
    df_gt['EDSS'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Iterate through JSON files
    all_preds = []
    json_pattern = os.path.join(json_dir_path, "*.json")

    for file_path in glob.glob(json_pattern):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success") and "result" in entry:
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')),
                            'MedDatum': str(res.get('MedDatum')),
                            'edss_pred': res.get('EDSS')
                        })
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    df_pred = pd.DataFrame(all_preds)
    df_pred['edss_pred'] = pd.to_numeric(df_pred['edss_pred'], errors='coerce')

    # 3. Merge and Categorize
    # Clean keys to ensure 100% match rate
    for df in [df_gt, df_pred]:
        df['unique_id'] = df['unique_id'].astype(str).str.strip()
        df['MedDatum'] = df['MedDatum'].astype(str).str.strip()

    df_merged = pd.merge(
        df_gt[['unique_id', 'MedDatum', 'EDSS']],
        df_pred,
        on=['unique_id', 'MedDatum'],
        how='inner'
    )

    df_merged = df_merged.dropna(subset=['EDSS', 'edss_pred'])

    # --- ADDED THESE LINES TO FIX THE NAMEERROR ---
    y_true = df_merged['EDSS'].apply(categorize_edss)
    y_pred = df_merged['edss_pred'].apply(categorize_edss)
    # ----------------------------------------------

    print(f"Verification: Total matches in Confusion Matrix: {len(df_merged)}")

    # 4. Define fixed labels to handle data gaps
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 5. Generate Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    # 6. Plotting
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)

    # Plotting (y_axis is Ground Truth, x_axis is LLM Prediction)
    disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)

    plt.title('Categorized EDSS: Ground Truth vs LLM Prediction')
    plt.ylabel('Ground Truth EDSS')
    plt.xlabel('LLM Prediction')
    plt.show()
##
# %% Confusion Matrix adjusted
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


def categorize_edss(value):
    """Bins EDSS values into clinical categories."""
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Confusion matrix with key normalization and NaN accounting printed
    before the plot (adjusted version of the cell above)."""
    # 1. Load Ground Truth with Normalization
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    # Standardize keys to ensure 1:N matching works
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load All Predictions from JSONs
    all_preds = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    # We only take 'success': true entries
                    if entry.get("success") and "result" in entry:
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            'edss_pred': res.get('EDSS')
                        })
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    df_pred = pd.DataFrame(all_preds)
    df_pred['edss_pred'] = pd.to_numeric(df_pred['edss_pred'], errors='coerce')

    # 3. Merge (This should give you ~3934 rows based on your audit)
    df_merged = pd.merge(
        df_gt[['unique_id', 'MedDatum', 'EDSS']],
        df_pred,
        on=['unique_id', 'MedDatum'],
        how='inner'
    )

    # --- THE BIG REVEAL: Count the NaNs ---
    nan_in_gt = df_merged['EDSS'].isna().sum()
    nan_in_pred = df_merged['edss_pred'].isna().sum()

    print("-" * 40)
    print(f"TOTAL MERGED ROWS: {len(df_merged)}")
    print(f"Rows with missing Ground Truth EDSS: {nan_in_gt}")
    print(f"Rows with missing Prediction EDSS: {nan_in_pred}")
    print("-" * 40)

    # Now drop rows that have NO values in either side for the matrix
    df_final = df_merged.dropna(subset=['EDSS', 'edss_pred']).copy()
    print(f"FINAL ROWS FOR CONFUSION MATRIX: {len(df_final)}")
    print("-" * 40)

    # 4. Categorize for the Matrix
    y_true = df_final['EDSS'].apply(categorize_edss)
    y_pred = df_final['edss_pred'].apply(categorize_edss)

    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 5. Generate and Print Raw Matrix
    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    # Print the Raw Matrix to terminal
    cm_df = pd.DataFrame(cm,
                         index=[f"True_{l}" for l in fixed_labels],
                         columns=[f"Pred_{l}" for l in fixed_labels])
    print("\nRAW CONFUSION MATRIX (Rows=True, Cols=Pred):")
    print(cm_df)

    # 6. Plotting
    fig, ax = plt.subplots(figsize=(12, 10))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)

    # Values_format='d' ensures we see whole numbers, not scientific notation
    disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)

    plt.title(f'EDSS Confusion Matrix\n(n={len(df_final)} iterations across ~400 cases)', fontsize=14)
    plt.ylabel('Ground Truth (Clinician)')
    plt.xlabel('LLM Prediction')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
##
# %% Subcategories
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt


def plot_subcategory_analysis(json_dir_path, ground_truth_path):
    """Per-functional-system accuracy (exact match %) and mean absolute
    deviation of LLM subcategory scores vs the ground-truth CSV columns."""
    # 1. Column Mapping (JSON Key : CSV Column)
    mapping = {
        "VISUAL_OPTIC_FUNCTIONS": "Sehvermögen",
        "BRAINSTEM_FUNCTIONS": "Hirnstamm",
        "PYRAMIDAL_FUNCTIONS": "Pyramidalmotorik",
        "CEREBELLAR_FUNCTIONS": "Cerebellum",
        "SENSORY_FUNCTIONS": "Sensibiliät",
        "BOWEL_AND_BLADDER_FUNCTIONS": "Blasen-_und_Mastdarmfunktion",
        "CEREBRAL_FUNCTIONS": "Cerebrale_Funktion",
        "AMBULATION": "Ambulation"
    }

    # 2. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str)
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str)

    # 3. Load Predictions including Subcategories
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    row = {
                        'unique_id': str(res.get('unique_id')),
                        'MedDatum': str(res.get('MedDatum'))
                    }
                    # Add subcategory scores
                    for json_key in mapping.keys():
                        row[json_key] = res.get('subcategories', {}).get(json_key)
                    all_preds.append(row)

    df_pred = pd.DataFrame(all_preds)

    # 4. Merge
    df_merged = pd.merge(df_gt, df_pred, on=['unique_id', 'MedDatum'],
                         suffixes=('_gt', '_llm'))

    # 5. Calculate Metrics
    results = []
    for json_key, csv_col in mapping.items():
        # Ensure numeric
        true_vals = pd.to_numeric(df_merged[csv_col], errors='coerce')
        pred_vals = pd.to_numeric(df_merged[json_key], errors='coerce')

        # Drop NaNs for this specific subcategory
        mask = true_vals.notna() & pred_vals.notna()
        y_t = true_vals[mask]
        y_p = pred_vals[mask]

        if len(y_t) > 0:
            accuracy = (y_t == y_p).mean() * 100
            mae = np.abs(y_t - y_p).mean()  # Mean Absolute Error (Deviation)
            results.append({
                'Subcategory': csv_col,
                'Accuracy': accuracy,
                'Deviation': mae
            })

    stats_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

    # 6. Plotting
    fig, ax1 = plt.subplots(figsize=(14, 7))

    # Bar chart for Accuracy
    bars = ax1.bar(stats_df['Subcategory'], stats_df['Accuracy'],
                   color='#3498db', alpha=0.8, label='Accuracy (%)')
    ax1.set_ylabel('Accuracy (%)', color='#2980b9', fontsize=12, fontweight='bold')
    ax1.set_ylim(0, 115)  # Extra head room for labels
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Rotate labels
    plt.xticks(rotation=30, ha='right', fontsize=10)

    # Line chart for Deviation
    ax2 = ax1.twinx()
    ax2.plot(stats_df['Subcategory'], stats_df['Deviation'],
             color='#e74c3c', marker='o', linewidth=2.5, markersize=8,
             label='Mean Abs. Deviation (Score Points)')
    ax2.set_ylabel('Mean Absolute Deviation', color='#c0392b', fontsize=12, fontweight='bold')

    # Adjust ax2 limit to avoid overlap with accuracy text
    ax2.set_ylim(0, max(stats_df['Deviation']) * 1.5 if not stats_df['Deviation'].empty else 1)

    # plt.title('Subcategory Performance: Accuracy vs. Mean Deviation', fontsize=14, pad=20)

    # --- THE FIX: Better Legend Placement ---
    # Combine legends from both axes and place them above the plot
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2,
               loc='upper center', bbox_to_anchor=(0.5, 1.12),
               ncol=2, frameon=False, fontsize=11)

    # Add percentage labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 5), textcoords="offset points",
                     ha='center', va='bottom', fontweight='bold', color='#2c3e50')

    plt.tight_layout()
    plt.show()
##
# %% Certainty
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt


def categorize_edss(value):
    """Bin an EDSS value into a half-open clinical category label."""
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_certainty_vs_accuracy_by_category(json_dir_path, ground_truth_path):
    """Grouped bars per ground-truth EDSS category: exact-score accuracy (%)
    next to the LLM's mean self-reported certainty (%)."""
    # 1. Data Loading & Merging
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str)
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str)
    df_gt['EDSS_true'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')),
                        'MedDatum': str(res.get('MedDatum')),
                        'EDSS_pred': res.get('EDSS'),
                        'certainty': res.get('certainty_percent')
                    })

    df_pred = pd.DataFrame(all_preds)
    df_pred['EDSS_pred'] = pd.to_numeric(df_pred['EDSS_pred'], errors='coerce')

    df = pd.merge(df_gt[['unique_id', 'MedDatum', 'EDSS_true']], df_pred,
                  on=['unique_id', 'MedDatum']).dropna()

    # 2. Process Metrics
    df['gt_category'] = df['EDSS_true'].apply(categorize_edss)
    df['is_correct'] = (df['EDSS_true'].round(1) == df['EDSS_pred'].round(1))

    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # Calculate Mean Certainty and Mean Accuracy per category
    stats = df.groupby('gt_category').agg({
        'is_correct': 'mean',
        'certainty': 'mean',
        'unique_id': 'count'
    }).reindex(fixed_labels)

    stats['accuracy_percent'] = stats['is_correct'] * 100
    stats = stats.fillna(0)

    # 3. Plotting
    x = np.arange(len(fixed_labels))
    width = 0.35  # Width of the bars

    fig, ax = plt.subplots(figsize=(14, 8))

    # Plotting both bars side-by-side
    rects1 = ax.bar(x - width/2, stats['accuracy_percent'], width,
                    label='Actual Accuracy (%)', color='#2ecc71', alpha=0.8)
    rects2 = ax.bar(x + width/2, stats['certainty'], width,
                    label='LLM Avg. Certainty (%)', color='#e67e22', alpha=0.8)

    # Add text labels, titles and custom x-axis tick labels, etc.
    ax.set_ylabel('Percentage (%)', fontsize=12)
    ax.set_xlabel('Ground Truth EDSS Category', fontsize=12)
    # ax.set_title('Comparison: LLM Confidence (Certainty) vs. Real Accuracy per EDSS Range', fontsize=15, pad=25)
    ax.set_xticks(x)
    ax.set_xticklabels(fixed_labels)
    ax.set_ylim(0, 115)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.08), ncol=2, frameon=False)
    ax.grid(axis='y', linestyle=':', alpha=0.5)

    # Helper function to label bar heights
    def autolabel(rects):
        for rect in rects:
            height = rect.get_height()
            if height > 0:
                ax.annotate(f'{height:.0f}%',
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points",
                            ha='center', va='bottom', fontsize=9, fontweight='bold')

    autolabel(rects1)
    autolabel(rects2)

    # Add sample size (n) at the bottom
    for i, count in enumerate(stats['unique_id']):
        ax.text(i, 2, f'n={int(count)}', ha='center', va='bottom',
                fontsize=10, color='white', fontweight='bold')

    plt.tight_layout()
    plt.show()
##
# %% Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch


def natural_key(string_):
    """Sort key that orders embedded integers numerically (Iter 2 < Iter 10)."""
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]


def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Side-by-side boxplots of the ground-truth EDSS distribution and each
    iteration's predicted EDSS distribution."""
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(df_gt['EDSS'], errors='coerce').dropna().tolist()

    # 2. Load Iterations
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))
    json_files.sort(key=natural_key)

    plot_data = [gt_values]
    labels = ['Ground Truth']

    for i, file_path in enumerate(json_files):
        iteration_values = []
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    val = entry["result"].get("EDSS")
                    if val is not None:
                        iteration_values.append(float(val))
        plot_data.append(iteration_values)
        labels.append(f"Iter {i+1}")

    # 3. Plotting Configuration
    plt.figure(figsize=(14, 8))

    # Define colors
    gt_color = '#ff9999'    # Soft Red
    iter_color = '#66b3ff'  # Soft Blue

    # Create the boxplot
    bplot = plt.boxplot(plot_data, labels=labels, patch_artist=True, notch=False,
                        medianprops={'color': 'black', 'linewidth': 2},
                        flierprops={'marker': 'o', 'markerfacecolor': 'gray',
                                    'markersize': 5, 'alpha': 0.5},
                        showmeans=True,
                        meanprops={"marker": "D", "markerfacecolor": "white",
                                   "markeredgecolor": "black", "markersize": 6})

    # 4. Fill boxes with colors
    colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

    # 5. CONSTRUCT THE COMPLETE LEGEND
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label='LLM Iterations (1-10)'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]

    plt.legend(handles=legend_elements, loc='upper right', frameon=True,
               shadow=True, title="Legend")

    # Formatting
    plt.title('Distribution of EDSS Scores: Ground Truth vs. 10 LLM Iterations',
              fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()
##
# %% Audit
import pandas as pd
import numpy as np
import json
import glob
import os


def audit_matches(json_dir_path, ground_truth_path):
    """Report prediction rows whose (unique_id, MedDatum) key has no
    counterpart in the ground-truth CSV, broken down per JSON file."""
    # 1. Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=';')

    # 2. Advanced Normalization
    def clean_series(s):
        return s.astype(str).str.strip().str.lower()

    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])

    # 3. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            file_name = os.path.basename(file_path)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'file': file_name
                    })

    df_pred = pd.DataFrame(all_preds)

    # 4. Find the "Ghost" entries (In JSON but not in GT)
    # Create a 'key' column for easy comparison
    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']
    df_pred['key'] = df_pred['unique_id'] + "_" + df_pred['MedDatum']

    gt_keys = set(df_gt['key'])
    df_pred['is_matched'] = df_pred['key'].isin(gt_keys)

    unmatched_summary = df_pred[df_pred['is_matched'] == False]

    print("--- AUDIT RESULTS ---")
    print(f"Total rows in JSON: {len(df_pred)}")
    print(f"Rows that matched GT: {df_pred['is_matched'].sum()}")
    print(f"Rows that FAILED to match: {len(unmatched_summary)}")

    if not unmatched_summary.empty:
        print("\nFirst 10 Unmatched Entries (check these against your CSV):")
        print(unmatched_summary[['unique_id', 'MedDatum', 'file']].head(10))

        # Breakdown by file - see if specific JSON files are broken
        print("\nFailure count per JSON file:")
        print(unmatched_summary['file'].value_counts())


audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration',
              '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
##
# %% Confidence accuracy correlation
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt


def categorize_edss(value):
    """Bin an EDSS value into a half-open clinical category label.

    FIXED: this cell's copy had its labels shifted by one bin
    (value <= 6.0 returned '6-7', '5-6' was missing, and 9-10 values fell
    into '10+'), diverging from every other categorize_edss in this file
    and from the fixed_labels used below. Now matches the canonical bins.
    """
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Calibration bar chart: actual category accuracy per LLM-confidence
    bin, against a theoretical perfect-calibration reference line."""
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(res.get('EDSS')),
                        'confidence': res.get('certainty_percent')
                    })

    df_pred = pd.DataFrame(all_preds)

    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner')

    # Define correctness
    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)

    # 3. Create Confidence Bins (e.g., 0-60, 60-70, 70-80, 80-90, 90-100)
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)

    # Calculate average accuracy per bin
    calibration_stats = df_merged.groupby('conf_bin')['is_correct'].agg(['mean', 'count']).reset_index()

    # 4. Plotting
    plt.figure(figsize=(10, 6))

    # Bar chart for actual accuracy
    bars = plt.bar(calibration_stats['conf_bin'], calibration_stats['mean'],
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')

    # Add the "Perfect Calibration" line
    # (If confidence is 95%, accuracy should be 0.95)
    expected_x = np.arange(len(labels))
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]  # Midpoints of the bins for visual reference
    plt.plot(expected_x, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')

    # 5. Add text labels on top of bars to show sample size (how many cases in that bin)
    for i, bar in enumerate(bars):
        yval = bar.get_height()
        count = calibration_stats.loc[i, 'count']
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 f'Acc: {yval:.1%}\n(n={count})', ha='center', va='bottom', fontsize=9)

    # Legend and Labels
    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    # Adding a clean, informative legend
    plt.legend(loc='upper left', frameon=True, shadow=True)

    plt.tight_layout()
    plt.show()
##
# %% Confidence comparison
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch


def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Boxplots of predicted EDSS per confidence bin, with ground-truth
    scores overlaid as jittered dots."""
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions from all JSONs
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                        })
            except Exception as e:
                print(f"Skipping {file_path}: {e}")

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Clean
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner')
    df_plot = df_merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()

    # 4. Bin Confidence (X-Axis Categories)
    # We group confidence into bins to create a readable boxplot
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bins, labels=labels)

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # A. Boxplot: Shows the distribution of LLM PREDICTIONS
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred', color='#3498db',
                width=0.5, showfliers=False,
                boxprops=dict(alpha=0.4, edgecolor='navy'))

    # B. Stripplot (Dots): Shows individual GROUND TRUTH scores
    # We add jitter so dots don't hide each other
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt', color='#e74c3c',
                  alpha=0.4, jitter=0.2, size=5)

    # 6. Create a CLEAR Legend
    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w',
               label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]

    plt.legend(handles=legend_elements, loc='upper left', frameon=True,
               shadow=True, title="Legend")

    # Final Labels
    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
##
# %% EDSS vs Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch


def categorize_edss(value):
    """Bin an EDSS value into a half-open clinical category label."""
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Boxplot of LLM confidence scores per predicted EDSS category
    (predictions only — no ground truth involved)."""
    # 1. Load all Predictions
    all_preds = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    edss_val = pd.to_numeric(res.get('EDSS'), errors='coerce')
                    conf_val = pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    if not pd.isna(edss_val) and not pd.isna(conf_val):
                        all_preds.append({
                            'edss_cat': categorize_edss(edss_val),
                            'confidence': conf_val
                        })

    df = pd.DataFrame(all_preds)

    # 2. Sort categories correctly for the x-axis
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    df['edss_cat'] = pd.Categorical(df['edss_cat'], categories=cat_order, ordered=True)

    # 3. Plotting
    plt.figure(figsize=(14, 8))

    # Create Boxplot
    sns.boxplot(data=df, x='edss_cat', y='confidence', palette="Blues",
                width=0.6, showfliers=False)

    # Add Stripplot (Dots) to show density of cases
    sns.stripplot(data=df, x='edss_cat', y='confidence', color='black',
                  alpha=0.15, jitter=0.2, size=3)

    # 4. Legend and Labels
    # Since boxplot color is clear, we add a legend for the components
    legend_elements = [
        Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                   markersize=4, alpha=0.4, label='Individual Predictions')
    ]
    plt.legend(handles=legend_elements, loc='lower left', frameon=True)

    plt.title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    plt.xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    plt.ylabel('Confidence Score (%)', fontsize=12)
    plt.ylim(0, 105)
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
##
# %% Correlation Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import cohen_kappa_score


def categorize_edss(value):
    """Standardized clinical categorization."""
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'


def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Boxplot of predicted category index per ground-truth category with a
    perfect-agreement diagonal; reports linear-weighted Cohen's kappa."""
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
                    })

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 4. Set Order and Numeric Mapping for Plotting
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}

    df_merged['gt_idx'] = df_merged['gt_cat'].map(cat_map)
    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map)

    # Calculate Cohen's Kappa (Standard for categorical agreement)
    kappa = cohen_kappa_score(df_merged['gt_cat'], df_merged['pred_cat'], weights='linear')

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # BOXPLOT: Distribution of Predicted Categories relative to Ground Truth
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx', palette="rocket",
                width=0.6, showfliers=False, boxprops=dict(alpha=0.5))

    # STRIPPLOT: Individual counts
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx', color='black',
                  alpha=0.1, jitter=0.3, size=4)

    # DIAGONAL REFERENCE: Perfect category match
    plt.plot([0, 9], [0, 9], color='red', linestyle='--', linewidth=2)

    # 6. Formatting Legend & Axes
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)

    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                   markersize=4, alpha=0.3, label='Iteration Matches'),
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]

    plt.legend(handles=legend_elements, loc='upper left', frameon=True,
               shadow=True, title="Agreement Metrics")

    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)

    plt.tight_layout()
    plt.show()
##
# %% rainplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D


def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    # NOTE(review): this function continues beyond the visible part of the
    # file; only the ground-truth loading step is shown here.
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')
    # 2.
Load Predictions all_preds = [] for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) for entry in data: if entry.get("success"): res = entry["result"] all_preds.append({ 'unique_id': str(res.get('unique_id')).strip().lower(), 'MedDatum': str(res.get('MedDatum')).strip().lower(), 'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'), 'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce') }) df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']], on=['unique_id', 'MedDatum'], how='inner').dropna() # 3. Calculate Error df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt'] # 4. Bin Confidence bins = [0, 70, 80, 90, 100] labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)'] df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels) # Calculate counts for labels counts = df_merged['conf_bin'].value_counts().reindex(labels) new_labels = [f"{l}\n(n={int(counts[l])})" for l in labels] # 5. Plotting plt.figure(figsize=(13, 8)) # Using a sequential color palette (Light blue to Dark blue) palette_colors = sns.color_palette("Blues", n_colors=len(labels)) vplot = sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile", palette=palette_colors, cut=0) # Reference line at 0 plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5) # 6. 
UPDATED LEGEND WITH CORRECT COLORS legend_elements = [ # Legend items for the color gradient Patch(facecolor=palette_colors[0], label='Confidence: <70%'), Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'), Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'), Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'), # Legend items for the symbols Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'), Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)') ] plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient") # Formatting plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20) plt.xlabel('LLM Certainty Group', fontsize=12) plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12) plt.xticks(ticks=range(len(labels)), labels=new_labels) plt.grid(axis='y', linestyle=':', alpha=0.5) plt.tight_layout() plt.show() # plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv') ## # %% Certainty vs Delta import pandas as pd import numpy as np import json import glob import os import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def plot_confidence_vs_abs_error_refined(json_dir_path, ground_truth_path): # 1. 
Load and Merge Data df_gt = pd.read_csv(ground_truth_path, sep=';') df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower() df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower() df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce') all_preds = [] for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) for entry in data: if entry.get("success"): res = entry["result"] all_preds.append({ 'unique_id': str(res.get('unique_id')).strip().lower(), 'MedDatum': str(res.get('MedDatum')).strip().lower(), 'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'), 'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce') }) df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']], on=['unique_id', 'MedDatum'], how='inner').dropna() # 2. Calculate Absolute Delta df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs() # 3. Binning bins = [0, 70, 80, 90, 100] labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)'] df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels) stats = df_merged.groupby('conf_bin', observed=True)['abs_error'].agg(['mean', 'std', 'count']).reset_index() # 4. Plotting plt.figure(figsize=(12, 8)) # Sequential palette: light to dark colors = sns.color_palette("Blues", n_colors=len(labels)) bars = plt.bar(stats['conf_bin'], stats['mean'], color=colors, edgecolor='black', linewidth=1.2) # Standard Error Bars plt.errorbar(stats['conf_bin'], stats['mean'], yerr=stats['std']/np.sqrt(stats['count']), fmt='none', c='black', capsize=6, elinewidth=1.5) # Trend Line (Linear Fit) x_idx = np.arange(len(labels)) z = np.polyfit(x_idx, stats['mean'], 1) p = np.poly1d(z) plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=2.5) # 5. 
THE COMPLETE LEGEND # We create a specific handle for every single thing on the chart legend_elements = [ # Explicit color mapping for bins Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'), Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'), Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'), Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'), # Statistical components Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'), Line2D([0], [0], color='#e74c3c', linestyle='--', lw=2.5, label='Correlation Trend (Inverse Rel.)'), # Metric definition Patch(color='none', label='Metric: Mean Absolute Error (MAE)') ] plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, fontsize=10, title="Legend") # Final Labels & Clean-up plt.title('Validation: Inverse Correlation of Confidence vs. Error Magnitude', fontsize=15, pad=20) plt.ylabel('Mean Absolute Error (Δ EDSS Points)', fontsize=12) plt.xlabel('LLM Confidence Bracket', fontsize=12) # Text annotations for MAE on bars for i, bar in enumerate(bars): plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05, f'MAE: {stats.loc[i, "mean"]:.2f}', ha='center', fontweight='bold') plt.grid(axis='y', linestyle=':', alpha=0.5) plt.tight_layout() plt.show() ## # %% name import pandas as pd import numpy as np import json import glob import os import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def plot_final_thesis_error_chart(json_dir_path, ground_truth_path): # 1. 
Load Ground Truth & Predictions df_gt = pd.read_csv(ground_truth_path, sep=';') df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower() df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower() df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce') all_preds = [] for file_path in glob.glob(os.path.join(json_dir_path, "*.json")): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) for entry in data: if entry.get("success"): res = entry["result"] all_preds.append({ 'unique_id': str(res.get('unique_id')).strip().lower(), 'MedDatum': str(res.get('MedDatum')).strip().lower(), 'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'), 'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce') }) df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']], on=['unique_id', 'MedDatum'], how='inner').dropna() # 2. Metric Calculation df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs() # 3. Binning & Stats bins = [0, 70, 80, 90, 100] labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)'] df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels) stats = df_merged.groupby('conf_bin', observed=True)['abs_error'].agg(['mean', 'std', 'count']).reset_index() # 4. Plotting plt.figure(figsize=(13, 8)) colors = sns.color_palette("Blues", n_colors=len(labels)) # BARS (MAE) bars = plt.bar(stats['conf_bin'], stats['mean'], color=colors, edgecolor='black', alpha=0.85) # ERROR BARS (Standard Error of the Mean) plt.errorbar(stats['conf_bin'], stats['mean'], yerr=stats['std']/np.sqrt(stats['count']), fmt='none', c='black', capsize=8, elinewidth=1.5) # CORRELATION TREND LINE x_idx = np.arange(len(labels)) z = np.polyfit(x_idx, stats['mean'], 1) p = np.poly1d(z) plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) # 5. 
DATA LABELS (n and MAE) for i, bar in enumerate(bars): n_count = int(stats.loc[i, 'count']) mae_val = stats.loc[i, 'mean'] plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04, f'MAE: {mae_val:.2f}\nn={n_count}', ha='center', va='bottom', fontweight='bold', fontsize=10) # 6. THE COMPLETE LEGEND legend_elements = [ Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'), Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'), Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'), Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'), Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Correlation Trend (Inverse Relationship)'), Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'), Patch(color='none', label='Metric: Mean Absolute Error (MAE)') ] plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Chart Components") # Formatting plt.title('Clinical Validation: LLM Certainty vs. Prediction Accuracy', fontsize=16, pad=30) plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12) plt.xlabel('LLM Confidence Bracket', fontsize=12) plt.grid(axis='y', linestyle=':', alpha=0.5) plt.ylim(0, stats['mean'].max() + 0.6) # Add room for labels plt.tight_layout() plt.show() # plot_final_thesis_error_chart('jsons_folder/', 'gt.csv') ## # %% 1json import pandas as pd import numpy as np import json import matplotlib.pyplot as plt import seaborn as sns from matplotlib.patches import Patch from matplotlib.lines import Line2D def plot_single_json_error_analysis(json_file_path, ground_truth_path): # 1. Load Ground Truth df_gt = pd.read_csv(ground_truth_path, sep=';') df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower() df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower() df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce') # 2. 
Load the Specific JSON all_preds = [] with open(json_file_path, 'r', encoding='utf-8') as f: data = json.load(f) for entry in data: if entry.get("success"): res = entry["result"] all_preds.append({ 'unique_id': str(res.get('unique_id')).strip().lower(), 'MedDatum': str(res.get('MedDatum')).strip().lower(), 'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'), 'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce') }) df_pred = pd.DataFrame(all_preds) # 3. Merge and Calculate Absolute Error df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']], on=['unique_id', 'MedDatum'], how='inner').dropna() df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs() # 4. Binning and Statistics bins = [0, 70, 80, 90, 100] labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)'] df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels) stats = df_merged.groupby('conf_bin', observed=True)['abs_error'].agg(['mean', 'std', 'count']).reset_index() # 5. Plotting plt.figure(figsize=(13, 8)) colors = sns.color_palette("Blues", n_colors=len(labels)) # BARS (MAE) bars = plt.bar(stats['conf_bin'], stats['mean'], color=colors, edgecolor='black', alpha=0.85) # ERROR BARS (SEM) plt.errorbar(stats['conf_bin'], stats['mean'], yerr=stats['std']/np.sqrt(stats['count']), fmt='none', c='black', capsize=8, elinewidth=1.5) # CORRELATION TREND LINE x_idx = np.arange(len(labels)) z = np.polyfit(x_idx, stats['mean'], 1) p = np.poly1d(z) plt.plot(x_idx, p(x_idx), color="#e74c3c", linestyle="--", linewidth=3, zorder=5) # 6. DATA LABELS (n and MAE) for i, bar in enumerate(bars): n_count = int(stats.loc[i, 'count']) mae_val = stats.loc[i, 'mean'] plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04, f'MAE: {mae_val:.2f}\nn={n_count}', ha='center', va='bottom', fontweight='bold', fontsize=10) # 7. 
COMPREHENSIVE LEGEND legend_elements = [ Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'), Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'), Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'), Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'), Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Inverse Trend Line'), Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Std Error (SEM)'), Patch(color='none', label='Metric: Mean Absolute Error (MAE)') ] plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend") # Final Styling plt.title('Validation: Confidence vs. Error Magnitude (Iteration 1 Only)', fontsize=15, pad=30) plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12) plt.xlabel('LLM Confidence Bracket', fontsize=12) plt.grid(axis='y', linestyle=':', alpha=0.5) plt.ylim(0, stats['mean'].max() + 0.6) plt.tight_layout() plt.show() # --- RUN THE PLOT --- json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json" gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv" plot_single_json_error_analysis(json_path, gt_path) ## # %% Usage # --- Usage --- #plot_categorized_edss('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', # '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') 
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration') #plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_confidence_vs_abs_error_refined('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_confidence_vs_abs_error_with_counts('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') #plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv') ##