updated plot certainty
This commit is contained in:
@@ -610,6 +610,88 @@ def plot_certainty_vs_accuracy_by_category(json_dir_path, ground_truth_path):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Boxplot
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from matplotlib.lines import Line2D
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
|
||||||
|
def natural_key(string_):
    """Return a sort key that compares embedded digit runs numerically.

    Splitting on r'(\d+)' keeps the digit runs as separate chunks, so
    'iter2' sorts before 'iter10' instead of lexicographically after it.
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        key.append(int(chunk) if chunk.isdigit() else chunk)
    return key
|
||||||
|
|
||||||
|
def plot_edss_boxplot(json_dir_path, ground_truth_path):
    """Boxplot comparing the EDSS score distribution of the ground truth
    against each LLM iteration's predictions.

    Args:
        json_dir_path: Directory containing per-iteration ``*.json`` result files.
        ground_truth_path: Semicolon-separated CSV with an ``EDSS`` column.

    Displays the figure via ``plt.show()``; returns None.
    """
    # 1. Load Ground Truth — non-numeric EDSS entries become NaN and are dropped.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    gt_values = pd.to_numeric(df_gt['EDSS'], errors='coerce').dropna().tolist()

    # 2. Load Iterations — natural sort so 'iter2' comes before 'iter10'.
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))
    json_files.sort(key=natural_key)

    plot_data = [gt_values]
    labels = ['Ground Truth']

    for i, file_path in enumerate(json_files):
        iteration_values = []
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # Only successful extractions with a non-null EDSS value are kept.
            for entry in data:
                if entry.get("success"):
                    val = entry["result"].get("EDSS")
                    if val is not None:
                        iteration_values.append(float(val))
        plot_data.append(iteration_values)
        labels.append(f"Iter {i+1}")

    # 3. Plotting Configuration
    plt.figure(figsize=(14, 8))

    # Define colors
    gt_color = '#ff9999'    # Soft Red
    iter_color = '#66b3ff'  # Soft Blue

    # Create the boxplot (means shown as white diamonds, medians as black lines).
    bplot = plt.boxplot(plot_data, labels=labels, patch_artist=True,
                        notch=False,
                        medianprops={'color': 'black', 'linewidth': 2},
                        flierprops={'marker': 'o', 'markerfacecolor': 'gray', 'markersize': 5, 'alpha': 0.5},
                        showmeans=True,
                        meanprops={"marker":"D", "markerfacecolor":"white", "markeredgecolor":"black", "markersize": 6})

    # 4. Fill boxes with colors — first box is ground truth, the rest are iterations.
    colors = [gt_color] + [iter_color] * (len(plot_data) - 1)
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

    # 5. Construct the complete legend (proxy artists for each plot element).
    legend_elements = [
        Patch(facecolor=gt_color, edgecolor='black', label='Ground Truth'),
        Patch(facecolor=iter_color, edgecolor='black', label='LLM Iterations (1-10)'),
        Line2D([0], [0], color='black', lw=2, label='Median'),
        Line2D([0], [0], marker='D', color='w', label='Mean Score',
               markerfacecolor='white', markeredgecolor='black', markersize=8),
        Line2D([0], [0], marker='o', color='w', label='Outliers',
               markerfacecolor='gray', markersize=6, alpha=0.5)
    ]

    plt.legend(handles=legend_elements, loc='upper right', frameon=True, shadow=True, title="Legend")

    # Formatting
    plt.title('Distribution of EDSS Scores: Ground Truth vs. 10 LLM Iterations', fontsize=16, pad=20)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.xlabel('Data Source', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.ylim(-0.5, 10.5)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()
|
||||||
|
##
|
||||||
|
|
||||||
# %% Audit
|
# %% Audit
|
||||||
|
|
||||||
@@ -674,6 +756,464 @@ def audit_matches(json_dir_path, ground_truth_path):
|
|||||||
|
|
||||||
##
|
##
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Confidence accuracy correlation
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def categorize_edss(value):
    """Map a numeric EDSS score to its one-point clinical category label.

    Bins are closed on the right: a score of exactly 6.0 falls into '5-6'.
    Returns ``np.nan`` for missing values so callers can drop them.
    """
    if pd.isna(value):
        return np.nan
    elif value <= 1.0:
        return '0-1'
    elif value <= 2.0:
        return '1-2'
    elif value <= 3.0:
        return '2-3'
    elif value <= 4.0:
        return '3-4'
    elif value <= 5.0:
        return '4-5'
    # Fixed: labels above 5.0 were shifted by one bin ('5-6' was missing and
    # scores in (5, 6] were labeled '6-7'), disagreeing with the other
    # categorize_edss definitions in this file. Now consistent with them.
    elif value <= 6.0:
        return '5-6'
    elif value <= 7.0:
        return '6-7'
    elif value <= 8.0:
        return '7-8'
    elif value <= 9.0:
        return '8-9'
    elif value <= 10.0:
        return '9-10'
    else:
        return '10+'
|
||||||
|
|
||||||
|
def plot_binned_calibration(json_dir_path, ground_truth_path):
    """Bar chart of actual category-accuracy per LLM confidence bin, with a
    reference line for (approximately) perfect calibration.

    Args:
        json_dir_path: Directory of per-iteration ``*.json`` result files.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Displays the figure via ``plt.show()``; returns None.
    """
    # 1. Load Ground Truth — normalize join keys (strip/lower) to make the
    # merge with predictions robust against whitespace/case differences.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions — pools all iterations' successful entries together.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(res.get('EDSS')),
                        'confidence': res.get('certainty_percent')
                    })

    df_pred = pd.DataFrame(all_preds)
    # Inner join: only predictions with a matching ground-truth row survive.
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner')

    # Define correctness: exact category match (1/0 for easy averaging).
    df_merged['is_correct'] = (df_merged['pred_cat'] == df_merged['gt_cat']).astype(int)

    # 3. Create Confidence Bins (e.g., 0-60, 60-70, 70-80, 80-90, 90-100)
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)

    # Calculate average accuracy and sample size per bin.
    calibration_stats = df_merged.groupby('conf_bin')['is_correct'].agg(['mean', 'count']).reset_index()

    # 4. Plotting
    plt.figure(figsize=(10, 6))

    # Bar chart for actual accuracy
    bars = plt.bar(calibration_stats['conf_bin'], calibration_stats['mean'],
                   color='skyblue', edgecolor='navy', alpha=0.7, label='Actual Accuracy')

    # Add the "Perfect Calibration" line:
    # if confidence is 95%, accuracy should be ~0.95.
    expected_x = np.arange(len(labels))
    expected_y = [0.3, 0.65, 0.75, 0.85, 0.95]  # Midpoints of the bins for visual reference
    plt.plot(expected_x, expected_y, color='red', marker='o', linestyle='--',
             linewidth=2, label='Perfect Calibration (Theoretical)')

    # 5. Add text labels on top of bars: accuracy % and sample size per bin.
    for i, bar in enumerate(bars):
        yval = bar.get_height()
        count = calibration_stats.loc[i, 'count']
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02,
                 f'Acc: {yval:.1%}\n(n={count})', ha='center', va='bottom', fontsize=9)

    # Legend and Labels
    plt.title('Model Calibration: Does Confidence Match Accuracy?', fontsize=14, pad=15)
    plt.xlabel('LLM Confidence Score Bin', fontsize=12)
    plt.ylabel('Actual Accuracy (Correct Category %)', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    # Adding a clean, informative legend
    plt.legend(loc='upper left', frameon=True, shadow=True)

    plt.tight_layout()
    plt.show()
|
||||||
|
##
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Confidence comparison
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from matplotlib.lines import Line2D
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
|
||||||
|
def plot_edss_confidence_comparison(json_dir_path, ground_truth_path):
    """Per-confidence-bin comparison of predicted EDSS (boxes) vs. ground
    truth EDSS (dots).

    Args:
        json_dir_path: Directory of per-iteration ``*.json`` result files.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Displays the figure via ``plt.show()``; returns None.
    """
    # 1. Load Ground Truth — normalize join keys for a robust merge.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions from all JSONs — malformed files are skipped
    # (best-effort: a warning is printed and processing continues).
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                for entry in data:
                    if entry.get("success"):
                        res = entry["result"]
                        all_preds.append({
                            'unique_id': str(res.get('unique_id')).strip().lower(),
                            'MedDatum': str(res.get('MedDatum')).strip().lower(),
                            'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                            'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                        })
            except Exception as e:
                print(f"Skipping {file_path}: {e}")

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge and Clean — keep only rows with all three values present.
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner')
    df_plot = df_merged.dropna(subset=['EDSS_pred', 'EDSS_gt', 'confidence']).copy()

    # 4. Bin Confidence (X-Axis Categories)
    # We group confidence into bins to create a readable boxplot.
    bins = [0, 60, 70, 80, 90, 100]
    labels = ['<60%', '60-70%', '70-80%', '80-90%', '90-100%']
    df_plot['conf_bin'] = pd.cut(df_plot['confidence'], bins=bins, labels=labels)

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # A. Boxplot: shows the distribution of LLM PREDICTIONS per bin.
    sns.boxplot(data=df_plot, x='conf_bin', y='EDSS_pred',
                color='#3498db', width=0.5, showfliers=False,
                boxprops=dict(alpha=0.4, edgecolor='navy'))

    # B. Stripplot (dots): individual GROUND TRUTH scores.
    # Jitter is added so dots don't hide each other.
    sns.stripplot(data=df_plot, x='conf_bin', y='EDSS_gt',
                  color='#e74c3c', alpha=0.4, jitter=0.2, size=5)

    # 6. Create a clear legend with proxy artists for both layers.
    legend_elements = [
        Patch(facecolor='#3498db', edgecolor='navy', alpha=0.4,
              label='LLM Predictions (Box = Distribution)'),
        Line2D([0], [0], marker='o', color='w', label='Ground Truth (Dots = Clinician Scores)',
               markerfacecolor='#e74c3c', markersize=8, alpha=0.6),
        Line2D([0], [0], color='black', lw=2, label='Median Predicted EDSS')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend")

    # Final Labels
    plt.title('Comparison of EDSS Scores Across Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Score (%)', fontsize=12)
    plt.ylabel('EDSS Score (0-10)', fontsize=12)
    plt.ylim(-0.5, 10.5)
    plt.yticks(np.arange(0, 11, 1))
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
##
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% EDSS vs Boxplot
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
|
||||||
|
def categorize_edss(value):
    """Bucket a numeric EDSS score into its one-point clinical band.

    Bins are closed on the right (exactly 6.0 maps to '5-6'); scores above
    10.0 map to '10+'. Missing values pass through as np.nan.
    """
    if pd.isna(value):
        return np.nan
    bands = [(1.0, '0-1'), (2.0, '1-2'), (3.0, '2-3'), (4.0, '3-4'),
             (5.0, '4-5'), (6.0, '5-6'), (7.0, '6-7'), (8.0, '7-8'),
             (9.0, '8-9'), (10.0, '9-10')]
    for upper, label in bands:
        if value <= upper:
            return label
    return '10+'
|
||||||
|
|
||||||
|
def plot_edss_vs_confidence_boxplot(json_dir_path):
    """Boxplot of LLM confidence scores grouped by predicted EDSS category.

    Args:
        json_dir_path: Directory of per-iteration ``*.json`` result files.

    No ground truth is needed: this only looks at the model's own
    predictions and confidence. Displays the figure; returns None.
    """
    # 1. Load all Predictions across every iteration file.
    all_preds = []
    json_files = glob.glob(os.path.join(json_dir_path, "*.json"))

    for file_path in json_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    edss_val = pd.to_numeric(res.get('EDSS'), errors='coerce')
                    conf_val = pd.to_numeric(res.get('certainty_percent'), errors='coerce')

                    # Keep only rows where both values parsed cleanly.
                    if not pd.isna(edss_val) and not pd.isna(conf_val):
                        all_preds.append({
                            'edss_cat': categorize_edss(edss_val),
                            'confidence': conf_val
                        })

    df = pd.DataFrame(all_preds)

    # 2. Sort categories correctly for the x-axis (ordered categorical,
    # otherwise seaborn would order bins alphabetically).
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    df['edss_cat'] = pd.Categorical(df['edss_cat'], categories=cat_order, ordered=True)

    # 3. Plotting
    plt.figure(figsize=(14, 8))

    # Create Boxplot
    sns.boxplot(data=df, x='edss_cat', y='confidence',
                palette="Blues", width=0.6, showfliers=False)

    # Add Stripplot (Dots) to show density of cases
    sns.stripplot(data=df, x='edss_cat', y='confidence',
                  color='black', alpha=0.15, jitter=0.2, size=3)

    # 4. Legend and Labels
    # Since boxplot color is clear, we add a legend for the components.
    legend_elements = [
        Patch(facecolor='#6da7d1', label='Confidence Distribution (IQR)'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='',
                   markersize=4, alpha=0.4, label='Individual Predictions')
    ]
    plt.legend(handles=legend_elements, loc='lower left', frameon=True)

    plt.title('LLM Confidence Levels Across Clinical EDSS Categories', fontsize=16, pad=20)
    plt.xlabel('Predicted EDSS Category (Clinical Severity)', fontsize=12)
    plt.ylabel('Confidence Score (%)', fontsize=12)
    plt.ylim(0, 105)
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    plt.tight_layout()
    plt.show()
|
||||||
|
##
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Correlation Boxplot
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
from sklearn.metrics import cohen_kappa_score
|
||||||
|
|
||||||
|
def categorize_edss(value):
    """Standardized clinical categorization.

    Maps a numeric EDSS score to its one-point band; bins are closed on the
    right (6.0 -> '5-6'), values above 10.0 map to '10+', and missing
    values come back as np.nan.
    """
    if pd.isna(value):
        return np.nan
    upper_bounds = (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0)
    band_labels = ('0-1', '1-2', '2-3', '3-4', '4-5',
                   '5-6', '6-7', '7-8', '8-9', '9-10')
    for bound, label in zip(upper_bounds, band_labels):
        if value <= bound:
            return label
    return '10+'
|
||||||
|
|
||||||
|
def plot_categorical_vs_categorical(json_dir_path, ground_truth_path):
    """Categorical agreement plot: ground-truth EDSS category (x) vs.
    predicted category (y), with a linear-weighted Cohen's kappa in the
    legend.

    Args:
        json_dir_path: Directory of per-iteration ``*.json`` result files.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Displays the figure via ``plt.show()``; returns None.
    """
    # 1. Load Ground Truth — normalize join keys for a robust merge.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['gt_cat'] = pd.to_numeric(df_gt['EDSS'], errors='coerce').apply(categorize_edss)

    # 2. Load Predictions — pooled across all iteration files.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'pred_cat': categorize_edss(pd.to_numeric(res.get('EDSS'), errors='coerce'))
                    })

    df_pred = pd.DataFrame(all_preds)

    # 3. Merge — inner join, then drop rows missing either category.
    df_merged = pd.merge(df_pred, df_gt[['unique_id', 'MedDatum', 'gt_cat']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 4. Set Order and Numeric Mapping for Plotting — categories get integer
    # positions so the predicted category can sit on a numeric y-axis.
    cat_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
    cat_map = {cat: i for i, cat in enumerate(cat_order)}

    df_merged['gt_idx'] = df_merged['gt_cat'].map(cat_map)
    df_merged['pred_idx'] = df_merged['pred_cat'].map(cat_map)

    # Calculate Cohen's Kappa (standard metric for categorical agreement;
    # linear weights penalize bigger category disagreements more).
    kappa = cohen_kappa_score(df_merged['gt_cat'], df_merged['pred_cat'], weights='linear')

    # 5. Plotting
    plt.figure(figsize=(14, 8))

    # BOXPLOT: distribution of predicted categories per ground-truth category.
    sns.boxplot(data=df_merged, x='gt_cat', y='pred_idx',
                palette="rocket", width=0.6, showfliers=False, boxprops=dict(alpha=0.5))

    # STRIPPLOT: individual (iteration, patient) matches.
    sns.stripplot(data=df_merged, x='gt_cat', y='pred_idx',
                  color='black', alpha=0.1, jitter=0.3, size=4)

    # DIAGONAL REFERENCE: perfect category match.
    plt.plot([0, 9], [0, 9], color='red', linestyle='--', linewidth=2)

    # 6. Formatting Legend & Axes — map integer ticks back to category labels.
    plt.yticks(ticks=range(len(cat_order)), labels=cat_order)

    legend_elements = [
        Patch(facecolor='#ae3e50', alpha=0.5, label='Predicted Category Spread'),
        plt.Line2D([0], [0], color='red', linestyle='--', label='Perfect Category Agreement'),
        plt.Line2D([0], [0], color='black', marker='o', linestyle='', markersize=4, alpha=0.3, label='Iteration Matches'),
        # Invisible patch used to smuggle the kappa value into the legend box.
        Patch(color='none', label=f'Linear Weighted Kappa: {kappa:.3f}')
    ]
    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Agreement Metrics")

    plt.title('Categorical Agreement: Ground Truth vs. LLM Prediction', fontsize=16, pad=20)
    plt.xlabel('Ground Truth Category (Clinician)', fontsize=12)
    plt.ylabel('LLM Predicted Category', fontsize=12)
    plt.grid(axis='both', linestyle=':', alpha=0.4)

    plt.tight_layout()
    plt.show()
|
||||||
|
##
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% rainplot
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import json
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from matplotlib.patches import Patch
|
||||||
|
from matplotlib.lines import Line2D
|
||||||
|
|
||||||
|
def plot_error_distribution_by_confidence(json_dir_path, ground_truth_path):
    """Violin plot of the prediction error (EDSS_pred - EDSS_gt) per LLM
    confidence group.

    Args:
        json_dir_path: Directory of per-iteration ``*.json`` result files.
        ground_truth_path: Semicolon-separated CSV with ``unique_id``,
            ``MedDatum`` and ``EDSS`` columns.

    Displays the figure via ``plt.show()``; returns None.
    """
    # 1. Load Ground Truth — normalize join keys for a robust merge.
    df_gt = pd.read_csv(ground_truth_path, sep=';')
    df_gt['unique_id'] = df_gt['unique_id'].astype(str).str.strip().str.lower()
    df_gt['MedDatum'] = df_gt['MedDatum'].astype(str).str.strip().str.lower()
    df_gt['EDSS_gt'] = pd.to_numeric(df_gt['EDSS'], errors='coerce')

    # 2. Load Predictions — pooled across all iteration files.
    all_preds = []
    for file_path in glob.glob(os.path.join(json_dir_path, "*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for entry in data:
                if entry.get("success"):
                    res = entry["result"]
                    all_preds.append({
                        'unique_id': str(res.get('unique_id')).strip().lower(),
                        'MedDatum': str(res.get('MedDatum')).strip().lower(),
                        'EDSS_pred': pd.to_numeric(res.get('EDSS'), errors='coerce'),
                        'confidence': pd.to_numeric(res.get('certainty_percent'), errors='coerce')
                    })

    df_merged = pd.merge(pd.DataFrame(all_preds), df_gt[['unique_id', 'MedDatum', 'EDSS_gt']],
                         on=['unique_id', 'MedDatum'], how='inner').dropna()

    # 3. Calculate Error — signed delta: positive means the LLM over-scored.
    df_merged['error'] = df_merged['EDSS_pred'] - df_merged['EDSS_gt']

    # 4. Bin Confidence — coarser bins than the calibration plot.
    bins = [0, 70, 80, 90, 100]
    labels = ['Low (<70%)', 'Moderate (70-80%)', 'High (80-90%)', 'Very High (90-100%)']
    df_merged['conf_bin'] = pd.cut(df_merged['confidence'], bins=bins, labels=labels)

    # Calculate counts for the x-axis labels. NOTE(review): value_counts on a
    # categorical includes zero-count bins, but if that ever changed,
    # reindex without fill_value would yield NaN and int() would raise.
    counts = df_merged['conf_bin'].value_counts().reindex(labels)
    new_labels = [f"{l}\n(n={int(counts[l])})" for l in labels]

    # 5. Plotting
    plt.figure(figsize=(13, 8))

    # Using a sequential color palette (light blue to dark blue), one shade
    # per confidence group.
    palette_colors = sns.color_palette("Blues", n_colors=len(labels))

    vplot = sns.violinplot(data=df_merged, x='conf_bin', y='error', inner="quartile",
                           palette=palette_colors, cut=0)

    # Reference line at 0 (perfect prediction).
    plt.axhline(0, color='#d9534f', linestyle='--', linewidth=2.5)

    # 6. Legend — proxy artists matching the violin colors and line styles.
    legend_elements = [
        # Legend items for the color gradient
        Patch(facecolor=palette_colors[0], label='Confidence: <70%'),
        Patch(facecolor=palette_colors[1], label='Confidence: 70-80%'),
        Patch(facecolor=palette_colors[2], label='Confidence: 80-90%'),
        Patch(facecolor=palette_colors[3], label='Confidence: 90-100%'),
        # Legend items for the symbols
        Line2D([0], [0], color='black', linestyle=':', label='Quartile Lines (25th, 50th, 75th)'),
        Line2D([0], [0], color='#d9534f', linestyle='--', lw=2.5, label='Zero Error (Perfect Match)')
    ]

    plt.legend(handles=legend_elements, loc='upper left', frameon=True, shadow=True, title="Legend & Confidence Gradient")

    # Formatting
    plt.title('Error Magnitude vs. LLM Confidence Levels', fontsize=16, pad=20)
    plt.xlabel('LLM Certainty Group', fontsize=12)
    plt.ylabel('Prediction Delta (EDSS_pred - EDSS_gt)', fontsize=12)
    plt.xticks(ticks=range(len(labels)), labels=new_labels)
    plt.grid(axis='y', linestyle=':', alpha=0.5)

    plt.tight_layout()
    plt.show()
|
||||||
|
|
||||||
|
# plot_error_distribution_by_confidence('jsons_folder/', 'ground_truth.csv')
|
||||||
|
##
|
||||||
|
|
||||||
# %% Usage
|
# %% Usage
|
||||||
|
|
||||||
# --- Usage ---
|
# --- Usage ---
|
||||||
@@ -681,7 +1221,15 @@ def audit_matches(json_dir_path, ground_truth_path):
|
|||||||
# '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
# '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
|
||||||
#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
#plot_subcategory_analysis('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
#plot_certainty_vs_accuracy_by_category('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
#audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
#audit_matches('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
|
||||||
|
|
||||||
|
#plot_edss_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
#plot_binned_calibration('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
|
||||||
|
#plot_edss_vs_confidence_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration')
|
||||||
|
#plot_gt_vs_llm_boxplot('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
#plot_categorical_vs_categorical('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
|
plot_error_distribution_by_confidence('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/iteration', '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/GT_Numbers.csv')
|
||||||
##
|
##
|
||||||
|
|||||||
Reference in New Issue
Block a user