# EDSS-calc/Data/certainty_show.py

# %% Explore Dist Plot
import pandas as pd
import json
import glob
import os
import re
import matplotlib.pyplot as plt

def plot_edss_distribution_per_iteration(json_dir_path):
    """Plot a stacked bar chart of predicted EDSS categories per iteration file.

    Every ``*.json`` file in ``json_dir_path`` is treated as one iteration.
    Files are ordered with a natural sort on their path and labelled
    ``Iter 1`` ... ``Iter N`` by position. Only entries with a truthy
    ``"success"`` value are counted; their ``result["EDSS"]`` value is binned
    into one-point-wide categories.

    Args:
        json_dir_path: Directory containing the iteration JSON files.

    Returns:
        pandas.DataFrame: Counts with one row per iteration and one column per
        category between '0-1' and '9-10' that occurs in the data. Scores
        binned as 'Unknown' or '10+' are not part of the table. When no
        usable entry is found an empty table is returned and nothing is
        plotted.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 1. Categorization logic
    def categorize_edss(value):
        # Missing or non-numeric scores are reported as 'Unknown' instead of
        # raising, so one bad entry cannot abort the rest of its file.
        try:
            value = float(value)
        except (TypeError, ValueError):
            return 'Unknown'
        if pd.isna(value):
            return 'Unknown'
        for upper, label in enumerate(fixed_labels, start=1):
            if value <= upper:
                return label
        return '10+'

    # Natural sort so that "...2.json" is ordered before "...10.json"
    def natural_key(string_):
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    # 2. Extract data from all files
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")), key=natural_key)
    iter_order = [f"Iter {pos}" for pos in range(1, len(json_files) + 1)]

    all_records = []
    for iter_label, file_path in zip(iter_order, json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            result = entry.get("result")
            if not isinstance(result, dict):
                continue
            all_records.append({
                'Iteration': iter_label,
                'Category': categorize_edss(result.get("EDSS")),
            })

    if not all_records:
        print(f"No successful entries found in {json_dir_path}; nothing to plot.")
        return pd.DataFrame(index=iter_order)

    df = pd.DataFrame(all_records)

    # 3. Frequency table: iterations as rows, categories as columns
    dist_table = pd.crosstab(df['Iteration'], df['Category'])

    # Keep the rows in iteration order; iterations without any entry get 0
    dist_table = dist_table.reindex(iter_order, fill_value=0)

    # Keep only the regular categories, in ascending order
    available_labels = [label for label in fixed_labels if label in dist_table.columns]
    dist_table = dist_table[available_labels]

    if not available_labels:
        print("No EDSS value fell into a category between 0 and 10; nothing to plot.")
        return dist_table

    # 4. Plotting
    dist_table.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis', edgecolor='white')

    plt.title('Distribution of Predicted EDSS Categories per Iteration', fontsize=15, pad=20)
    plt.xlabel('JSON Iteration File', fontsize=12)
    plt.ylabel('Number of Cases (Count)', fontsize=12)
    plt.xticks(rotation=0)

    # Move legend outside to the right
    plt.legend(title="EDSS Category", bbox_to_anchor=(1.05, 1), loc='upper left')

    # Add total count labels on top of bars
    for position, (_, row) in enumerate(dist_table.iterrows()):
        total = row.sum()
        if total > 0:
            plt.text(position, total + 2, f'Total: {int(total)}',
                     ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    return dist_table
# Usage: executed as soon as this cell/module runs. Builds the per-iteration
# category table from a hard-coded local directory, shows the plot and prints
# the returned table.
counts_table = plot_edss_distribution_per_iteration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(counts_table)
##


# %% Explore Table
import pandas as pd
import json
import glob
import os
import re

def generate_edss_distribution_csv(json_dir_path, output_filename='edss_distribution_summary.csv'):
    """Write a per-iteration frequency table of predicted EDSS categories to CSV.

    Every ``*.json`` file in ``json_dir_path`` is treated as one iteration.
    Files are ordered with a natural sort on their path and labelled
    ``Iter 1`` ... ``Iter N`` by position. Only entries with a truthy
    ``"success"`` value are counted; their ``result["EDSS"]`` value is binned
    into one-point-wide categories.

    Args:
        json_dir_path: Directory containing the iteration JSON files.
        output_filename: Path of the CSV file to write.

    Returns:
        pandas.DataFrame: Integer counts with one row per iteration plus a
        final 'Total Sum' row, and one column per category between '0-1' and
        '9-10' that occurs in the data. Scores binned as 'Unknown' or '10+'
        are not part of the table. When no usable entry is found an empty
        table is returned and no file is written.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 1. Categorization logic
    def categorize_edss(value):
        # Missing or non-numeric scores are reported as 'Unknown' instead of
        # raising, so one bad entry cannot abort the rest of its file.
        try:
            value = float(value)
        except (TypeError, ValueError):
            return 'Unknown'
        if pd.isna(value):
            return 'Unknown'
        for upper, label in enumerate(fixed_labels, start=1):
            if value <= upper:
                return label
        return '10+'

    # Natural sort so that "...2.json" is ordered before "...10.json"
    def natural_key(string_):
        return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

    # 2. Extract data from all files
    json_files = sorted(glob.glob(os.path.join(json_dir_path, "*.json")), key=natural_key)
    iter_order = [f"Iter {pos}" for pos in range(1, len(json_files) + 1)]

    all_records = []
    for iter_label, file_path in zip(iter_order, json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            result = entry.get("result")
            if not isinstance(result, dict):
                continue
            all_records.append({
                'Iteration': iter_label,
                'Category': categorize_edss(result.get("EDSS")),
            })

    if not all_records:
        print(f"No successful entries found in {json_dir_path}; no CSV written.")
        return pd.DataFrame(index=iter_order)

    df = pd.DataFrame(all_records)

    # 3. Frequency table: iterations as rows, categories as columns
    dist_table = pd.crosstab(df['Iteration'], df['Category'])

    # 4. Rows in iteration order (missing iterations count as 0), columns in
    #    ascending category order
    dist_table = dist_table.reindex(iter_order, fill_value=0)
    available_labels = [label for label in fixed_labels if label in dist_table.columns]
    dist_table = dist_table[available_labels].astype(int)

    # 5. Append the per-category totals across all iterations
    dist_table.loc['Total Sum'] = dist_table.sum()

    # 6. Save to CSV
    dist_table.to_csv(output_filename)
    print(f"Table successfully saved to: {output_filename}")

    return dist_table

# Usage: executed as soon as this cell/module runs. Reads the iteration files
# from a hard-coded local directory, writes the summary CSV under the default
# file name (relative to the current working directory) and prints the table.
final_table = generate_edss_distribution_csv('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
print(final_table)
##

# %% EDSS Confusion Matrix
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def categorize_edss(value):
    """Return the one-point-wide EDSS category label for ``value``.

    Missing values (anything ``pd.isna`` reports as missing) give ``np.nan``.
    Each category includes its upper bound, e.g. 1.0 -> '0-1' and
    1.5 -> '1-2'. Anything above 10.0 is labelled '10+'.
    """
    if pd.isna(value):
        return np.nan

    # Walk the upper bounds 1..10 and stop at the first one that fits.
    for upper in range(1, 11):
        if value <= upper:
            return f'{upper - 1}-{upper}'

    return '10+'

def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Plot a confusion matrix of binned EDSS scores: ground truth vs. LLM.

    The ground truth is read from a ';'-separated CSV with the columns
    ``unique_id``, ``MedDatum`` and ``EDSS``. Predictions are read from every
    ``*.json`` file in ``json_dir_path``; only entries with a truthy
    ``"success"`` value and a ``"result"`` object are used. Both sides are
    joined on (``unique_id``, ``MedDatum``) after stripping whitespace from
    the keys, rows without a numeric EDSS on either side are dropped, and
    both scores are binned with ``categorize_edss``.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Shows the figure; prints a message and returns early when there
        is nothing to compare.
    """
    # Fixed label order so the matrix keeps its shape even with data gaps
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['EDSS'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Collect predictions from all JSON files
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            all_preds.append({
                'unique_id': str(res.get('unique_id')),
                'MedDatum': str(res.get('MedDatum')),
                'edss_pred': res.get('EDSS')
            })

    if not all_preds:
        # An empty frame has no 'edss_pred' column; stop before it is accessed.
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)
    df_pred['edss_pred'] = pd.to_numeric(df_pred['edss_pred'], errors='coerce')

    # 3. Merge and Categorize
    # Normalize the join keys on both sides the same way before merging
    for df in (df_gt, df_pred):
        df['unique_id'] = df['unique_id'].astype(str).str.strip()
        df['MedDatum'] = df['MedDatum'].astype(str).str.strip()

    df_merged = pd.merge(
        df_gt[['unique_id', 'MedDatum', 'EDSS']],
        df_pred,
        on=['unique_id', 'MedDatum'],
        how='inner'
    )
    df_merged = df_merged.dropna(subset=['EDSS', 'edss_pred'])

    if df_merged.empty:
        print("No prediction could be matched to a ground-truth EDSS; nothing to plot.")
        return

    y_true = df_merged['EDSS'].apply(categorize_edss)
    y_pred = df_merged['edss_pred'].apply(categorize_edss)

    print(f"Verification: Total matches in Confusion Matrix: {len(df_merged)}")

    # 4. Generate Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    # 5. Plotting (y-axis is Ground Truth, x-axis is LLM Prediction)
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)
    disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)

    plt.title('Categorized EDSS: Ground Truth vs LLM Prediction')
    plt.ylabel('Ground Truth EDSS')
    plt.xlabel('LLM Prediction')
    plt.show()
##

# %% Confusion Matrix adjusted
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def categorize_edss(value):
    """Translate an EDSS score into its one-point-wide category label.

    Missing values yield ``np.nan``. A category includes its upper bound
    (2.0 -> '1-2'); scores above 10.0 are labelled '10+'.
    """
    if pd.isna(value):
        return np.nan

    # (inclusive upper bound, label) pairs in ascending order
    bins = (
        (1.0, '0-1'),
        (2.0, '1-2'),
        (3.0, '2-3'),
        (4.0, '3-4'),
        (5.0, '4-5'),
        (6.0, '5-6'),
        (7.0, '6-7'),
        (8.0, '7-8'),
        (9.0, '8-9'),
        (10.0, '9-10'),
    )
    return next((label for upper, label in bins if value <= upper), '10+')

def plot_categorized_edss(json_dir_path, ground_truth_path):
    """Print and plot the confusion matrix of binned EDSS: ground truth vs. LLM.

    The ground truth is read from a ';'-separated CSV with the columns
    ``unique_id``, ``MedDatum`` and ``EDSS``. Predictions are read from every
    ``*.json`` file in ``json_dir_path``; only entries with a truthy
    ``"success"`` value and a ``"result"`` object are used. Join keys are
    stripped and lower-cased on both sides before an inner merge. The number
    of merged rows and of rows with a missing EDSS on either side is printed
    before those rows are dropped.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Prints the raw matrix and shows the figure; prints a message
        and returns early when there is nothing to compare.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 1. Load Ground Truth with normalized join keys
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load all predictions from the JSON files
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            # Only 'success': true entries that carry a result are used
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'edss_pred': res.get('EDSS')
            })

    if not all_preds:
        # An empty frame has no 'edss_pred' column; stop before it is accessed.
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)
    df_pred['edss_pred'] = pd.to_numeric(df_pred['edss_pred'], errors='coerce')

    # 3. Merge: one ground-truth row can match several predictions
    df_merged = pd.merge(
        df_gt[['unique_id', 'MedDatum', 'EDSS']],
        df_pred,
        on=['unique_id', 'MedDatum'],
        how='inner'
    )

    # Report how many merged rows lack a value on either side
    nan_in_gt = df_merged['EDSS'].isna().sum()
    nan_in_pred = df_merged['edss_pred'].isna().sum()

    print("-" * 40)
    print(f"TOTAL MERGED ROWS: {len(df_merged)}")
    print(f"Rows with missing Ground Truth EDSS: {nan_in_gt}")
    print(f"Rows with missing Prediction EDSS:    {nan_in_pred}")
    print("-" * 40)

    # Only rows with a value on both sides can enter the matrix
    df_final = df_merged.dropna(subset=['EDSS', 'edss_pred']).copy()
    print(f"FINAL ROWS FOR CONFUSION MATRIX: {len(df_final)}")
    print("-" * 40)

    if df_final.empty:
        print("No prediction could be matched to a ground-truth EDSS; nothing to plot.")
        return

    # 4. Categorize for the matrix
    y_true = df_final['EDSS'].apply(categorize_edss)
    y_pred = df_final['edss_pred'].apply(categorize_edss)

    # 5. Generate and print the raw matrix
    cm = confusion_matrix(y_true, y_pred, labels=fixed_labels)

    cm_df = pd.DataFrame(cm, index=[f"True_{l}" for l in fixed_labels],
                         columns=[f"Pred_{l}" for l in fixed_labels])
    print("\nRAW CONFUSION MATRIX (Rows=True, Cols=Pred):")
    print(cm_df)

    # 6. Plotting
    fig, ax = plt.subplots(figsize=(12, 10))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=fixed_labels)

    # values_format='d' shows whole numbers instead of scientific notation
    disp.plot(cmap=plt.cm.Blues, values_format='d', ax=ax)

    # NOTE(review): "~400 cases" is hard-coded in the title and is not derived
    # from the data - confirm it still matches the data set.
    plt.title(f'EDSS Confusion Matrix\n(n={len(df_final)} iterations across ~400 cases)', fontsize=14)
    plt.ylabel('Ground Truth (Clinician)')
    plt.xlabel('LLM Prediction')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

##
# %% Subcategories

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt

def plot_subcategory_analysis(json_dir_path, ground_truth_path):
    """Plot per-subcategory accuracy and mean absolute deviation of the LLM scores.

    The ground truth is read from a ';'-separated CSV. Predictions are read
    from every ``*.json`` file in ``json_dir_path`` (entries with a truthy
    ``"success"`` value); subcategory scores are taken from
    ``result["subcategories"]``. Both sides are joined on
    (``unique_id``, ``MedDatum``). For each subcategory, rows where either
    side is not numeric are ignored; accuracy is the share of exactly equal
    scores and deviation is the mean absolute difference.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Shows the figure; prints a message and returns early when there
        is nothing to evaluate.
    """
    # 1. Column Mapping (JSON Key : CSV Column)
    # NOTE(review): "Sensibiliät" looks like a misspelling of "Sensibilität";
    # it has to match the CSV header exactly - confirm against the file.
    mapping = {
        "VISUAL_OPTIC_FUNCTIONS": "Sehvermögen",
        "BRAINSTEM_FUNCTIONS": "Hirnstamm",
        "PYRAMIDAL_FUNCTIONS": "Pyramidalmotorik",
        "CEREBELLAR_FUNCTIONS": "Cerebellum",
        "SENSORY_FUNCTIONS": "Sensibiliät",
        "BOWEL_AND_BLADDER_FUNCTIONS": "Blasen-_und_Mastdarmfunktion",
        "CEREBRAL_FUNCTIONS": "Cerebrale_Funktion",
        "AMBULATION": "Ambulation"
    }

    # 2. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str)
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str)

    # 3. Load Predictions including Subcategories
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue

            # 'subcategories' may be absent or null; treat both as "no scores"
            sub_scores = res.get('subcategories')
            if not isinstance(sub_scores, dict):
                sub_scores = {}

            row = {
                'unique_id': str(res.get('unique_id')),
                'MedDatum': str(res.get('MedDatum'))
            }
            for json_key in mapping:
                row[json_key] = sub_scores.get(json_key)
            all_preds.append(row)

    if not all_preds:
        # An empty frame has no key columns to merge on.
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)

    # 4. Merge
    df_merged = pd.merge(df_gt, df_pred, on=['unique_id', 'MedDatum'], suffixes=('_gt', '_llm'))

    # 5. Calculate Metrics
    results = []
    for json_key, csv_col in mapping.items():
        if csv_col not in df_merged.columns or json_key not in df_merged.columns:
            print(f"Skipping {csv_col}: column not available after merge")
            continue

        # Ensure numeric
        true_vals = pd.to_numeric(df_merged[csv_col], errors='coerce')
        pred_vals = pd.to_numeric(df_merged[json_key], errors='coerce')

        # Drop NaNs for this specific subcategory
        mask = true_vals.notna() & pred_vals.notna()
        y_t = true_vals[mask]
        y_p = pred_vals[mask]

        if len(y_t) > 0:
            results.append({
                'Subcategory': csv_col,
                'Accuracy': (y_t == y_p).mean() * 100,
                'Deviation': np.abs(y_t - y_p).mean()  # Mean Absolute Error
            })

    if not results:
        # sort_values('Accuracy') would fail on a frame without columns.
        print("No subcategory had comparable values; nothing to plot.")
        return

    stats_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

    # 6. Plotting
    fig, ax1 = plt.subplots(figsize=(14, 7))

    # Bar chart for Accuracy
    bars = ax1.bar(stats_df['Subcategory'], stats_df['Accuracy'],
                   color='#3498db', alpha=0.8, label='Accuracy (%)')
    ax1.set_ylabel('Accuracy (%)', color='#2980b9', fontsize=12, fontweight='bold')
    ax1.set_ylim(0, 115)  # Extra head room for labels
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Rotate labels
    plt.xticks(rotation=30, ha='right', fontsize=10)

    # Line chart for Deviation
    ax2 = ax1.twinx()
    ax2.plot(stats_df['Subcategory'], stats_df['Deviation'],
             color='#e74c3c', marker='o', linewidth=2.5, markersize=8,
             label='Mean Abs. Deviation (Score Points)')
    ax2.set_ylabel('Mean Absolute Deviation', color='#c0392b', fontsize=12, fontweight='bold')

    # Head room so the line stays clear of the accuracy labels; fall back to 1
    # when every deviation is 0, because identical limits are not a valid range.
    deviation_top = stats_df['Deviation'].max() * 1.5
    ax2.set_ylim(0, deviation_top if deviation_top > 0 else 1)

    # plt.title('Subcategory Performance: Accuracy vs. Mean Deviation', fontsize=14, pad=20)

    # Combine legends from both axes and place them above the plot
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2,
               loc='upper center', bbox_to_anchor=(0.5, 1.12),
               ncol=2, frameon=False, fontsize=11)

    # Add percentage labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 5), textcoords="offset points",
                     ha='center', va='bottom', fontweight='bold', color='#2c3e50')

    plt.tight_layout()
    plt.show()
##

# %% Certainty
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt

def categorize_edss(value):
    """Bin an EDSS score into a one-point-wide category label.

    Returns ``np.nan`` for missing input. Upper bounds are inclusive
    (3.0 -> '2-3'); everything above 10.0 is reported as '10+'.
    """
    if pd.isna(value):
        return np.nan

    ordered_labels = ['0-1', '1-2', '2-3', '3-4', '4-5',
                      '5-6', '6-7', '7-8', '8-9', '9-10']

    # The label at position k covers scores up to and including k + 1.
    for position, label in enumerate(ordered_labels):
        if value <= position + 1.0:
            return label

    return '10+'

def plot_certainty_vs_accuracy_by_category(json_dir_path, ground_truth_path):
    """Plot exact-match accuracy next to mean LLM certainty per EDSS category.

    The ground truth is read from a ';'-separated CSV with the columns
    ``unique_id``, ``MedDatum`` and ``EDSS``. Predictions are read from every
    ``*.json`` file in ``json_dir_path`` (entries with a truthy ``"success"``
    value); ``result["EDSS"]`` and ``result["certainty_percent"]`` are used.
    Both sides are joined on (``unique_id``, ``MedDatum``) and rows lacking a
    numeric ground truth, prediction or certainty are dropped. A prediction
    counts as correct when it equals the ground truth after rounding both to
    one decimal. Rows are grouped by the ground-truth category.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Shows the figure; prints a message and returns early when there
        is nothing to compare.
    """
    fixed_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']

    # 1. Data Loading & Merging
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str)
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str)
    df_gt['EDSS_true'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            all_preds.append({
                'unique_id': str(res.get('unique_id')),
                'MedDatum': str(res.get('MedDatum')),
                'EDSS_pred': res.get('EDSS'),
                'certainty': res.get('certainty_percent')
            })

    if not all_preds:
        # An empty frame has no 'EDSS_pred' column; stop before it is accessed.
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)
    df_pred['EDSS_pred'] = pd.to_numeric(df_pred['EDSS_pred'], errors='coerce')
    # Certainty is averaged below, so it has to be numeric as well
    df_pred['certainty'] = pd.to_numeric(df_pred['certainty'], errors='coerce')

    df = pd.merge(df_gt[['unique_id', 'MedDatum', 'EDSS_true']],
                  df_pred, on=['unique_id', 'MedDatum'])
    df = df.dropna(subset=['EDSS_true', 'EDSS_pred', 'certainty']).copy()

    if df.empty:
        print("No prediction could be matched to a ground-truth EDSS; nothing to plot.")
        return

    # 2. Process Metrics
    df['gt_category'] = df['EDSS_true'].apply(categorize_edss)
    df['is_correct'] = (df['EDSS_true'].round(1) == df['EDSS_pred'].round(1))

    # Mean accuracy, mean certainty and sample size per ground-truth category;
    # categories without data are shown as 0.
    stats = df.groupby('gt_category').agg({
        'is_correct': 'mean',
        'certainty': 'mean',
        'unique_id': 'count'
    }).reindex(fixed_labels)

    stats['accuracy_percent'] = stats['is_correct'] * 100
    stats = stats.fillna(0)

    # 3. Plotting
    x = np.arange(len(fixed_labels))
    width = 0.35  # Width of the bars

    fig, ax = plt.subplots(figsize=(14, 8))

    # Both bars side-by-side
    rects1 = ax.bar(x - width/2, stats['accuracy_percent'], width,
                    label='Actual Accuracy (%)', color='#2ecc71', alpha=0.8)
    rects2 = ax.bar(x + width/2, stats['certainty'], width,
                    label='LLM Avg. Certainty (%)', color='#e67e22', alpha=0.8)

    ax.set_ylabel('Percentage (%)', fontsize=12)
    ax.set_xlabel('Ground Truth EDSS Category', fontsize=12)
    # ax.set_title('Comparison: LLM Confidence (Certainty) vs. Real Accuracy per EDSS Range', fontsize=15, pad=25)
    ax.set_xticks(x)
    ax.set_xticklabels(fixed_labels)
    ax.set_ylim(0, 115)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.08), ncol=2, frameon=False)
    ax.grid(axis='y', linestyle=':', alpha=0.5)

    # Helper function to label bar heights
    def autolabel(rects):
        for rect in rects:
            height = rect.get_height()
            if height > 0:
                ax.annotate(f'{height:.0f}%',
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points",
                            ha='center', va='bottom', fontsize=9, fontweight='bold')

    autolabel(rects1)
    autolabel(rects2)

    # Add sample size (n) at the bottom of each category
    for i, count in enumerate(stats['unique_id']):
        ax.text(i, 2, f'n={int(count)}', ha='center', va='bottom', fontsize=10, color='white', fontweight='bold')

    plt.tight_layout()
    plt.show()

##


# %% Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import re
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

def natural_key(string_):
    """Build a sort key that orders embedded digit runs by numeric value.

    The string is split into alternating non-digit and digit chunks; digit
    chunks become ints, so 'Iter 2' sorts before 'Iter 10'.
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key

def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Draw box plots of the EDSS scores: ground truth next to every iteration.

    The ground truth is the numeric ``EDSS`` column of a ';'-separated CSV.
    Every ``*.json`` file in ``json_dir_path`` (natural-sorted by path) is one
    iteration, labelled ``Iter 1`` ... ``Iter N`` by position; its box contains
    the ``result["EDSS"]`` values of all entries with a truthy ``"success"``
    value. Scores that are missing or not numeric are left out.

    Args:
        json_dir_path: Directory containing the iteration JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Shows the figure.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(df_gt['EDSS'], errors='coerce').dropna().tolist()

    # 2. Load Iterations
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))
    json_files.sort(key=natural_key)
    n_iterations = len(json_files)

    plot_data = [gt_values]
    labels = ['Ground Truth']

    for i, file_path in enumerate(json_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            data = []
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            data = []

        iteration_values = []
        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            try:
                score = float(res.get("EDSS"))
            except (TypeError, ValueError):
                # Missing or non-numeric score
                continue
            if not pd.isna(score):
                iteration_values.append(score)

        # Unreadable files still get an (empty) box so labels stay aligned
        plot_data.append(iteration_values)
        labels.append(f"Iter {i+1}")

    # 3. Plotting Configuration
    plt.figure(figsize=(14, 8))

    # Define colors
    gt_color = '#ff9999'  # Soft Red
    iter_color = '#66b3ff'  # Soft Blue

    # Create the boxplot. Tick labels are set through plt.xticks below instead
    # of the `labels=` keyword, which newer Matplotlib releases deprecate.
    bplot = plt.boxplot(plot_data, patch_artist=True,
                        notch=False,
                        medianprops={'color': 'black', 'linewidth': 2},
                        flierprops={'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5, 'alpha': 0.5},
                        showmeans=True,
                        meanprops={"marker": "D", "markerfacecolor": "white", "markeredgecolor": "black", "markersize": 6})

    # 4. Fill boxes with colors
    colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

    # 5. Construct the complete legend
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label=f'LLM Iterations (1-{n_iterations})'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]

    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Formatting
    plt.title(f'Distribution of EDSS Scores: Ground Truth vs. {n_iterations} LLM Iterations',
              fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    # Boxes are drawn at positions 1..N by default
    plt.xticks(range(1, len(labels) + 1), labels, rotation=45)

    plt.tight_layout()
    plt.show()
##

# %% Audit


import pandas as pd
import numpy as np
import json
import glob
import os

def audit_matches(json_dir_path, ground_truth_path):
    """Report which successful JSON predictions have no ground-truth counterpart.

    The ground truth is read from a ';'-separated CSV. Predictions are read
    from every ``*.json`` file in ``json_dir_path`` (entries with a truthy
    ``"success"`` value). ``unique_id`` and ``MedDatum`` are stripped and
    lower-cased on both sides and combined into one key; a prediction is
    matched when its key occurs in the ground truth.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Prints the totals, the first 10 unmatched entries and the
        number of unmatched entries per JSON file.
    """
    # 1. Load GT
    df_gt = pd.read_csv(ground_truth_path, sep=';')

    # 2. Normalize the key columns
    def clean_series(s):
        return s.astype(str).str.strip().str.lower()

    df_gt['unique_id'] = clean_series(df_gt['unique_id'])
    df_gt['MedDatum'] = clean_series(df_gt['MedDatum'])

    # 3. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        file_name = os.path.basename(file_path)
        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'file': file_name
            })

    print("--- AUDIT RESULTS ---")
    if not all_preds:
        # An empty frame has no key columns to build the comparison key from.
        print(f"Total rows in JSON: 0 (no successful entries found in {json_dir_path})")
        return

    df_pred = pd.DataFrame(all_preds)

    # 4. Find the entries that are in the JSON files but not in the GT
    df_gt['key'] = df_gt['unique_id'] + "_" + df_gt['MedDatum']
    df_pred['key'] = df_pred['unique_id'] + "_" + df_pred['MedDatum']

    gt_keys = set(df_gt['key'])
    df_pred['is_matched'] = df_pred['key'].isin(gt_keys)

    unmatched_summary = df_pred[~df_pred['is_matched']]

    print(f"Total rows in JSON: {len(df_pred)}")
    print(f"Rows that matched GT: {df_pred['is_matched'].sum()}")
    print(f"Rows that FAILED to match: {len(unmatched_summary)}")

    if not unmatched_summary.empty:
        print("\nFirst 10 Unmatched Entries (check these against your CSV):")
        print(unmatched_summary[['unique_id', 'MedDatum', 'file']].head(10))

        # Breakdown by file - shows whether specific JSON files are affected
        print("\nFailure count per JSON file:")
        print(unmatched_summary['file'].value_counts())

# Executed as soon as this cell/module runs: audits the iteration JSON files
# against the ground-truth CSV, both at hard-coded local paths.
audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

##


# %% Confidence accuracy correlation

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt

def categorize_edss(value):
    """Bin an EDSS score into a one-point-wide category label.

    Missing values yield ``np.nan``. Upper bounds are inclusive
    (6.0 -> '5-6'); scores above 10.0 are labelled '10+'.

    The previous version of this table skipped the '5-6' label, so every
    score above 5 was reported one category too high and scores between
    9 and 10 ended up in '10+'.
    """
    if pd.isna(value):
        return np.nan

    labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    # The label at position k covers scores up to and including k + 1.
    for upper, label in enumerate(labels, start=1):
        if value <= upper:
            return label
    return '10+'

def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Plot category accuracy per confidence bin against a reference line.

    The ground truth is read from a ';'-separated CSV with the columns
    ``unique_id``, ``MedDatum`` and ``EDSS``. Predictions are read from every
    ``*.json`` file in ``json_dir_path`` (entries with a truthy ``"success"``
    value); ``result["EDSS"]`` and ``result["certainty_percent"]`` are used.
    Join keys are stripped and lower-cased. A prediction is correct when its
    EDSS category equals the ground-truth category. Rows without a category
    on either side or without a numeric confidence are excluded rather than
    scored as wrong.

    Confidence is binned with inclusive upper edges: [0, 60], (60, 70],
    (70, 80], (80, 90], (90, 100]. Values outside 0-100 are not counted.

    Args:
        json_dir_path: Directory containing the prediction JSON files.
        ground_truth_path: Path to the ground-truth CSV file.

    Returns:
        None. Shows the figure; prints a message and returns early when there
        is nothing to evaluate.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors
            print(f"Error reading {file_path}: {e}")
            continue
        if not isinstance(data, list):
            print(f"Error reading {file_path}: expected a JSON list of entries")
            continue

        for entry in data:
            if not (isinstance(entry, dict) and entry.get("success")):
                continue
            res = entry.get("result")
            if not isinstance(res, dict):
                continue
            all_preds.append({
                'unique_id': str(res.get('unique_id')).strip().lower(),
                'MedDatum': str(res.get('MedDatum')).strip().lower(),
                'EDSS_pred': res.get('EDSS'),
                'confidence': res.get('certainty_percent')
            })

    if not all_preds:
        # An empty frame has no key columns to merge on.
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)
    # Coerce first so a non-numeric score becomes "missing" instead of raising
    df_pred['pred_cat'] = pd.to_numeric(df_pred['EDSS_pred'], errors='coerce').apply(categorize_edss)
    df_pred['confidence'] = pd.to_numeric(df_pred['confidence'], errors='coerce')

    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner')

    # NaN never compares equal, so incomplete rows would otherwise be counted
    # as wrong predictions and pull the accuracy down.
    df_merged = df_merged.dropna(subset=['pred_cat', 'gt_cat', 'confidence']).copy()

    if df_merged.empty:
        print("No prediction could be matched to a ground-truth EDSS; nothing to plot.")
        return

    # Define correctness
    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)

    # 3. Create Confidence Bins
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    # include_lowest keeps a confidence of exactly 0 in the first bin
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # Average accuracy and sample size per bin; every bin is kept, also empty ones
    calibration_stats = (
        df_merged.groupby('conf_bin', observed=False)['is_correct']
        .agg(['mean', 'count'])
    )
    calibration_stats.index = calibration_stats.index.astype(str)
    calibration_stats = calibration_stats.reindex(labels)
    accuracies = calibration_stats['mean'].fillna(0)
    counts = calibration_stats['count'].fillna(0).astype(int)

    # 4. Plotting
    plt.figure(figsize=(10, 6))

    # Bar chart for actual accuracy
    bars = plt.bar(labels, accuracies,
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')

    # Reference line: accuracy equal to the midpoint of each confidence bin
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]
    plt.plot(labels, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')

    # 5. Label each bar with its accuracy and sample size
    for bar, count in zip(bars, counts):
        yval = bar.get_height()
        text = f'Acc: {yval:.1%}\n(n={count})' if count else '(n=0)'
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 text, ha='center', va='bottom', fontsize=9)

    # Legend and Labels
    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    plt.legend(loc='upper left', frameon=True, shadow=True)

    plt.tight_layout()
    plt.show()
##


# %%  Confidence comparison

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Compare predicted and ground-truth EDSS scores per LLM certainty bin.

    Predictions are read from every ``*.json`` file in ``json_dir_path`` and
    joined to the ground-truth CSV on ``unique_id`` and ``MedDatum``. The
    chart overlays a box plot of the predicted EDSS values with a strip plot
    of the ground-truth values, grouped by binned ``certainty_percent``.

    Args:
        json_dir_path: Directory containing the result JSON files. Each file
            is read as a list of entries; only entries with a truthy
            ``"success"`` value are used, and their ``"result"`` dict must
            provide ``unique_id``, ``MedDatum``, ``EDSS`` and
            ``certainty_percent``.
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    # Join keys are normalised (string, trimmed, lower-case) on both sides so
    # that formatting differences do not break the merge.
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions from all JSONs
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                        })
            except Exception as e:
                # Best effort: a broken file is reported and skipped so the
                # remaining iterations can still be plotted.
                print(f"Skipping {file_path}: {e}")

    # Without any prediction the DataFrame would have no columns and the merge
    # below would fail with a KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Clean
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner')
    df_plot = df_merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()
    if df_plot.empty:
        print("No prediction could be matched to a ground-truth row; nothing to plot.")
        return

    # 4. Bin Confidence (X-Axis Categories)
    # We group confidence into bins to create a readable boxplot.
    # include_lowest=True closes the first interval on the left, so a
    # certainty of exactly 0 lands in '<60%' instead of being dropped.
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bins, labels=labels,
                                 include_lowest=True)

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # A. Boxplot: Shows the distribution of LLM PREDICTIONS
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred',
                color='#3498db', width=0.5, showfliers=False,
                boxprops=dict(alpha=0.4, edgecolor='navy'))

    # B. Stripplot (Dots): Shows individual GROUND TRUTH scores
    # We add jitter so dots don't hide each other
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt',
                  color='#e74c3c', alpha=0.4, jitter=0.2, size=5)

    # 6. Create a CLEAR Legend
    # The handles are built by hand because the seaborn calls above are not
    # given labels of their own.
    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w', label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend")

    # Final Labels
    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()


##


# %% EDSS vs Boxplot

import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch

def categorize_edss(value):
    """Map a numeric EDSS score to its one-point band label.

    Missing values yield ``np.nan``. Otherwise the label of the first band
    whose upper bound is not exceeded is returned (``'0-1'`` ... ``'9-10'``);
    anything above 10 is labelled ``'10+'``.
    """
    if pd.isna(value):
        return np.nan

    # Bands are one point wide and closed on the right: (k-1, k] -> 'k-1-k'.
    for upper in range(1, 11):
        if value <= float(upper):
            return f'{upper - 1}-{upper}'

    return '10+'

def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Show how the LLM certainty is distributed within each predicted EDSS band.

    Every ``*.json`` file in ``json_dir_path`` is read as a list of entries.
    Entries with a truthy ``"success"`` value contribute one point, provided
    both ``EDSS`` and ``certainty_percent`` of their ``"result"`` dict are
    numeric. The figure combines a box plot with a strip plot of the
    individual points and is displayed with ``plt.show()``.
    """
    # 1. Collect one (category, certainty) record per usable prediction.
    records = []
    for path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        for item in payload:
            if not item.get("success"):
                continue

            result = item["result"]
            edss = pd.to_numeric(result.get('EDSS'), errors='coerce')
            certainty = pd.to_numeric(result.get('certainty_percent'), errors='coerce')
            if pd.isna(edss) or pd.isna(certainty):
                continue

            records.append({
                'edss_cat': categorize_edss(edss),
                'confidence': certainty
            })

    frame = pd.DataFrame(records)

    # 2. An ordered categorical fixes the left-to-right order of the x-axis.
    # Labels that are not listed here (e.g. '10+') turn into NaN.
    band_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    frame['edss_cat'] = pd.Categorical(frame['edss_cat'], categories=band_order, ordered=True)

    # 3. Draw boxes first, then the individual points on top of them.
    fig, ax = plt.subplots(figsize=(14, 8))

    sns.boxplot(data=frame, x='edss_cat', y='confidence',
                palette="Blues", width=0.6, showfliers=False, ax=ax)

    sns.stripplot(data=frame, x='edss_cat', y='confidence',
                  color='black', alpha=0.15, jitter=0.2, size=3, ax=ax)

    # 4. Hand-made legend entries for the two layers.
    box_handle = Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)')
    dot_handle = plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                            markersize=4, alpha=0.4, label='Individual Predictions')
    ax.legend(handles=[box_handle, dot_handle], loc='lower left', frameon=True)

    ax.set_title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    ax.set_xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    ax.set_ylabel('Confidence Score (%)', fontsize=12)
    ax.set_ylim(0, 105)
    ax.grid(axis='y', linestyle='--', alpha=0.3)

    fig.tight_layout()
    plt.show()
##


# %% Correlation Boxplot
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from sklearn.metrics import cohen_kappa_score

def categorize_edss(value):
    """Return the EDSS band label for ``value`` (``np.nan`` if it is missing).

    The bands are looked up in ascending order of their inclusive upper
    bound; scores above 10 fall through to ``'10+'``.
    """
    if pd.isna(value):
        return np.nan

    # (inclusive upper bound, label), ascending so the first hit is the band.
    bands = (
        (1.0, '0-1'),
        (2.0, '1-2'),
        (3.0, '2-3'),
        (4.0, '3-4'),
        (5.0, '4-5'),
        (6.0, '5-6'),
        (7.0, '6-7'),
        (8.0, '7-8'),
        (9.0, '8-9'),
        (10.0, '9-10'),
    )
    return next((label for limit, label in bands if value <= limit), '10+')

def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Plot predicted EDSS band against ground-truth EDSS band.

    Predictions from every ``*.json`` file in ``json_dir_path`` are joined to
    the ground-truth CSV on ``unique_id`` and ``MedDatum``; both EDSS values
    are converted to band labels with ``categorize_edss``. The chart shows a
    box plot and strip plot of the predicted band index for each ground-truth
    band, a diagonal marking perfect agreement, and the linear-weighted
    Cohen's kappa in the legend.

    Args:
        json_dir_path: Directory containing the result JSON files (each a
            list of entries with ``"success"`` and ``"result"`` keys).
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
                    })

    # An empty DataFrame has no key columns, so the merge would raise KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 4. Set Order and Numeric Mapping for Plotting
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}

    # Only the ten bands of the axes can be drawn or weighted; rows labelled
    # '10+' (EDSS above 10) are left out of both the plot and the kappa.
    on_scale = df_merged['gt_cat'].isin(cat_order) & df_merged['pred_cat'].isin(cat_order)
    df_merged = df_merged[on_scale].copy()
    if df_merged.empty:
        print("No prediction could be matched to a ground-truth row; nothing to plot.")
        return

    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map)

    # Calculate Cohen's Kappa (Standard for categorical agreement).
    # labels=cat_order pins the label order used for the linear weights;
    # otherwise the weights are derived from the sorted labels that happen to
    # be present, which shrinks distances whenever a band is missing.
    kappa = cohen_kappa_score(df_merged['gt_cat'], df_merged['pred_cat'],
                              labels=cat_order, weights='linear')

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # BOXPLOT: Distribution of Predicted Categories relative to Ground Truth.
    # order=cat_order fixes the x positions to 0..9; without it the string
    # column is laid out in order of appearance and the diagonal below would
    # not line up with the categories.
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx', order=cat_order,
                palette="rocket", width=0.6, showfliers=False, boxprops=dict(alpha=0.5))

    # STRIPPLOT: Individual counts
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx', order=cat_order,
                  color='black', alpha=0.1, jitter=0.3, size=4)

    # DIAGONAL REFERENCE: Perfect category match
    last_idx = len(cat_order) - 1
    plt.plot([0, last_idx], [0, last_idx], color='red', linestyle='--', linewidth=2)

    # 6. Formatting Legend & Axes
    # The y-axis holds band indices; label the ticks with the band names.
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)

    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=4, alpha=0.3, label='Iteration Matches'),
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Agreement Metrics")

    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)

    plt.tight_layout()
    plt.show()
##


# %% Violin plot: error distribution by confidence
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    """Draw violin plots of the signed EDSS error for each certainty bracket.

    Predictions from every ``*.json`` file in ``json_dir_path`` are joined to
    the ground-truth CSV on ``unique_id`` and ``MedDatum``. The error is
    ``EDSS_pred - EDSS_gt``; rows are grouped into four brackets of
    ``certainty_percent`` and each bracket's sample size is shown in its
    tick label.

    Args:
        json_dir_path: Directory containing the result JSON files (each a
            list of entries with ``"success"`` and ``"result"`` keys).
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })

    # An empty DataFrame has no key columns, so the merge would raise KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()
    if df_merged.empty:
        print("No prediction could be matched to a ground-truth row; nothing to plot.")
        return

    # 3. Calculate Error (positive = the LLM scored higher than the ground truth)
    df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt']

    # 4. Bin Confidence
    # include_lowest=True closes the first interval on the left, so a
    # certainty of exactly 0 is counted as 'Low' instead of being dropped.
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # Calculate counts for labels
    counts = df_merged['conf_bin'].value_counts().reindex(labels, fill_value=0)
    new_labels = [f"{l}\n(n={int(counts[l])})" for l in labels]

    # 5. Plotting
    plt.figure(figsize=(13, 8))

    # Using a sequential color palette (Light blue to Dark blue)
    palette_colors = sns.color_palette("Blues", n_colors=len(labels))

    # cut=0 is passed so the violins are not extended past the observed errors.
    sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile",
                   palette=palette_colors, cut=0)

    # Reference line at 0
    plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5)

    # 6. UPDATED LEGEND WITH CORRECT COLORS
    legend_elements = [
        # Legend items for the color gradient
        Patch(facecolor=palette_colors[0], label='Confidence: <70%'),
        Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'),
        Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'),
        Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'),
        # Legend items for the symbols
        Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'),
        Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)')
    ]

    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient")

    # Formatting
    plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Group', fontsize=12)
    plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12)
    # Replace the plain bracket names with the versions that carry n=...
    plt.xticks(ticks=range(len(labels)), labels=new_labels)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    plt.tight_layout()
    plt.show()

# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
##


# %% Certainty vs Delta
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def plot_confidence_vs_abs_error_refined(json_dir_path, ground_truth_path):
    """Bar chart of the mean absolute EDSS error per certainty bracket.

    Predictions from every ``*.json`` file in ``json_dir_path`` are joined to
    the ground-truth CSV on ``unique_id`` and ``MedDatum``. For each of four
    ``certainty_percent`` brackets the chart shows the mean absolute error
    (bar), its standard error (error bar) and a straight line fitted through
    the bracket means.

    Args:
        json_dir_path: Directory containing the result JSON files (each a
            list of entries with ``"success"`` and ``"result"`` keys).
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load and Merge Data
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })

    # An empty DataFrame has no key columns, so the merge would raise KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 2. Calculate Absolute Delta
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning
    # include_lowest=True keeps a certainty of exactly 0 in the 'Low' bracket.
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # observed=False keeps one row per bracket even when a bracket is empty,
    # so bar positions, colours and legend entries always line up.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count']).reset_index()

    has_data = (stats['count'] > 0).to_numpy()
    if not has_data.any():
        print("No matched prediction has a certainty between 0 and 100; nothing to plot.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    heights = np.where(has_data, means, 0.0)  # empty brackets get a zero-height bar
    sem = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    has_sem = ~np.isnan(sem)  # the SEM is undefined for brackets with fewer than 2 rows

    # 4. Plotting
    plt.figure(figsize=(12, 8))
    # Sequential palette: light to dark
    colors = sns.color_palette("Blues", n_colors=len(labels))

    bars = plt.bar(x_idx, heights, color=colors, edgecolor='black', linewidth=1.2)
    plt.xticks(ticks=x_idx, labels=labels)

    # Standard Error Bars
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], heights[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=6, elinewidth=1.5)

    # Trend Line (Linear Fit) through the populated brackets only; a line
    # needs at least two points.
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx[has_data], slope * x_idx[has_data] + intercept,
                 color="#e74c3c", linestyle="--", linewidth=2.5)

    # 5. THE COMPLETE LEGEND
    # We create a specific handle for every single thing on the chart
    legend_elements = [
        # Explicit color mapping for bins
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        # Statistical components
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=2.5, label='Correlation Trend (Inverse Rel.)'),
        # Metric definition
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]

    plt.legend(handles=legend_elements, loc='upper right', frameon=True,
               shadow=True, fontsize=10, title="Legend")

    # Final Labels & Clean-up
    plt.title('Validation: Inverse Correlation of Confidence vs. Error Magnitude', fontsize=15, pad=20)
    plt.ylabel('Mean Absolute Error (Δ EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)

    # Text annotations for MAE on bars (empty brackets have no MAE to show)
    for i, bar in enumerate(bars):
        if not has_data[i]:
            continue
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.05,
                 f'MAE: {means[i]:.2f}', ha='center', fontweight='bold')

    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.tight_layout()
    plt.show()
##


# %% Final thesis error chart
import pandas as pd
import numpy as np
import json
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def plot_final_thesis_error_chart(json_dir_path, ground_truth_path):
    """Bar chart of mean absolute EDSS error and sample size per certainty bracket.

    Predictions from every ``*.json`` file in ``json_dir_path`` are joined to
    the ground-truth CSV on ``unique_id`` and ``MedDatum``. For each of four
    ``certainty_percent`` brackets the chart shows the mean absolute error
    (bar, annotated with MAE and n), its standard error (error bar) and a
    straight line fitted through the bracket means.

    Args:
        json_dir_path: Directory containing the result JSON files (each a
            list of entries with ``"success"`` and ``"result"`` keys).
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load Ground Truth & Predictions
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })

    # An empty DataFrame has no key columns, so the merge would raise KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_dir_path}; nothing to plot.")
        return

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 2. Metric Calculation
    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 3. Binning & Stats
    # include_lowest=True keeps a certainty of exactly 0 in the 'Low' bracket.
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # observed=False keeps one row per bracket even when a bracket is empty,
    # so bar positions, colours and legend entries always line up.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count']).reset_index()

    has_data = (stats['count'] > 0).to_numpy()
    if not has_data.any():
        print("No matched prediction has a certainty between 0 and 100; nothing to plot.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    heights = np.where(has_data, means, 0.0)  # empty brackets get a zero-height bar
    sem = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    has_sem = ~np.isnan(sem)  # the SEM is undefined for brackets with fewer than 2 rows

    # 4. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    # BARS (MAE)
    bars = plt.bar(x_idx, heights, color=colors, edgecolor='black', alpha=0.85)
    plt.xticks(ticks=x_idx, labels=labels)

    # ERROR BARS (Standard Error of the Mean)
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], heights[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=8, elinewidth=1.5)

    # CORRELATION TREND LINE through the populated brackets only; a line
    # needs at least two points.
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx[has_data], slope * x_idx[has_data] + intercept,
                 color="#e74c3c", linestyle="--", linewidth=3, zorder=5)

    # 5. DATA LABELS (n and MAE)
    for i, bar in enumerate(bars):
        n_count = int(stats.loc[i, 'count'])
        if has_data[i]:
            annotation = f'MAE: {means[i]:.2f}\nn={n_count}'
        else:
            annotation = 'n=0'  # no MAE exists for an empty bracket
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04,
                 annotation,
                 ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 6. THE COMPLETE LEGEND
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Correlation Trend (Inverse Relationship)'),
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Standard Error (SEM)'),
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Chart Components")

    # Formatting
    plt.title('Clinical Validation: LLM Certainty vs. Prediction Accuracy', fontsize=16, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, means[has_data].max() + 0.6) # Add room for labels

    plt.tight_layout()
    plt.show()

# plot_final_thesis_error_chart('jsons_folder/', 'gt.csv')
##


# %% Single-JSON error analysis
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

def plot_single_json_error_analysis(json_file_path, ground_truth_path):
    """Bar chart of mean absolute EDSS error per certainty bracket for one JSON file.

    The predictions of a single result file are joined to the ground-truth
    CSV on ``unique_id`` and ``MedDatum``. For each of four
    ``certainty_percent`` brackets the chart shows the mean absolute error
    (bar, annotated with MAE and n), its standard error (error bar) and a
    straight line fitted through the bracket means.

    Args:
        json_file_path: Path of one result JSON file, read as a list of
            entries with ``"success"`` and ``"result"`` keys.
        ground_truth_path: Semicolon-separated CSV with the columns
            ``unique_id``, ``MedDatum`` and ``EDSS``.

    Returns:
        None. The figure is shown with ``plt.show()``; if there is nothing to
        plot a message is printed and no figure is created.
    """
    # 1. Load Ground Truth
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load the Specific JSON
    all_preds = []
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for entry in data:
            if entry.get("success"):
                res = entry["result"]
                all_preds.append({
                    'unique_id': str(res.get('unique_id')).strip().lower(),
                    'MedDatum': str(res.get('MedDatum')).strip().lower(),
                    'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                    'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                })

    # An empty DataFrame has no key columns, so the merge would raise KeyError.
    if not all_preds:
        print(f"No successful predictions found in {json_file_path}; nothing to plot.")
        return

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Calculate Absolute Error
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    df_merged['abs_error'] = (df_merged['EDSS_pred'] - df_merged['EDSS_gt']).abs()

    # 4. Binning and Statistics
    # include_lowest=True keeps a certainty of exactly 0 in the 'Low' bracket.
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels,
                                   include_lowest=True)

    # observed=False keeps one row per bracket even when a bracket is empty
    # (likely with a single file), so bar positions, colours and legend
    # entries always line up.
    stats = df_merged.groupby('conf_bin', observed=False)['abs_error'].agg(['mean', 'std', 'count']).reset_index()

    has_data = (stats['count'] > 0).to_numpy()
    if not has_data.any():
        print("No matched prediction has a certainty between 0 and 100; nothing to plot.")
        return

    x_idx = np.arange(len(labels))
    means = stats['mean'].to_numpy(dtype=float)
    heights = np.where(has_data, means, 0.0)  # empty brackets get a zero-height bar
    sem = (stats['std'] / np.sqrt(stats['count'])).to_numpy(dtype=float)
    has_sem = ~np.isnan(sem)  # the SEM is undefined for brackets with fewer than 2 rows

    # 5. Plotting
    plt.figure(figsize=(13, 8))
    colors = sns.color_palette("Blues", n_colors=len(labels))

    # BARS (MAE)
    bars = plt.bar(x_idx, heights, color=colors, edgecolor='black', alpha=0.85)
    plt.xticks(ticks=x_idx, labels=labels)

    # ERROR BARS (SEM)
    if has_sem.any():
        plt.errorbar(x_idx[has_sem], heights[has_sem], yerr=sem[has_sem],
                     fmt='none', c='black', capsize=8, elinewidth=1.5)

    # CORRELATION TREND LINE through the populated brackets only; a line
    # needs at least two points.
    if has_data.sum() >= 2:
        slope, intercept = np.polyfit(x_idx[has_data], means[has_data], 1)
        plt.plot(x_idx[has_data], slope * x_idx[has_data] + intercept,
                 color="#e74c3c", linestyle="--", linewidth=3, zorder=5)

    # 6. DATA LABELS (n and MAE)
    for i, bar in enumerate(bars):
        n_count = int(stats.loc[i, 'count'])
        if has_data[i]:
            annotation = f'MAE: {means[i]:.2f}\nn={n_count}'
        else:
            annotation = 'n=0'  # no MAE exists for an empty bracket
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.04,
                 annotation,
                 ha='center', va='bottom', fontweight='bold', fontsize=10)

    # 7. COMPREHENSIVE LEGEND
    legend_elements = [
        Patch(facecolor=colors[0], edgecolor='black', label=f'Bin 1: {labels[0]}'),
        Patch(facecolor=colors[1], edgecolor='black', label=f'Bin 2: {labels[1]}'),
        Patch(facecolor=colors[2], edgecolor='black', label=f'Bin 3: {labels[2]}'),
        Patch(facecolor=colors[3], edgecolor='black', label=f'Bin 4: {labels[3]}'),
        Line2D([0], [0], color='#e74c3c', linestyle='--', lw=3, label='Inverse Trend Line'),
        Line2D([0], [0], color='black', marker='_', linestyle='None', markersize=10, label='Std Error (SEM)'),
        Patch(color='none', label='Metric: Mean Absolute Error (MAE)')
    ]
    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Final Styling
    # NOTE: the title is fixed to "Iteration 1" regardless of which file is passed.
    plt.title('Validation: Confidence vs. Error Magnitude (Iteration 1 Only)', fontsize=15, pad=30)
    plt.ylabel('Mean Absolute Error (EDSS Points)', fontsize=12)
    plt.xlabel('LLM Confidence Bracket', fontsize=12)
    plt.grid(axis='y', linestyle=':', alpha=0.5)
    plt.ylim(0, means[has_data].max() + 0.6)  # head-room for the bar annotations

    plt.tight_layout()
    plt.show()

# --- RUN THE PLOT ---
# Hard-coded, machine-specific inputs: one iteration's result JSON and the
# ground-truth CSV. Adjust both paths before running this cell elsewhere.
json_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique_results_iter_1_20260212_020628.json"
gt_path = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv"

# Runs unconditionally whenever this cell (or the whole file) is executed.
plot_single_json_error_analysis(json_path, gt_path)
##


# %% Usage

# --- Usage ---
#plot_categorized_edss('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration',
#                           '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')


#plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_refined('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_confidence_vs_abs_error_with_counts('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
#plot_final_thesis_error_chart('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')

##