Adjusting and cleaning

2026-02-08 01:59:38 +01:00
parent f4bf37f71c
commit 2f507bcf20
4 changed files with 148 additions and 209 deletions
@@ -1828,6 +1828,97 @@ plt.tight_layout()
 plt.show()
 ##

+
+
+# %% name
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+import numpy as np
+
+# Diverging bar chart of signed scoring error (result - ground truth) per functional system.
+# --- Configuration & Theme ---
+plt.rcParams['font.family'] = 'Arial'
+figure_save_path = 'project/visuals/functional_systems_magnitude_focus.svg'  # NOTE(review): not used in this cell (no savefig) - confirm
+
+# --- 1. Process Error Data with Magnitude Breakdown ---
+system_names = [name.split('.')[1] for name, _ in functional_systems_to_plot]
+plot_list = []
+
+for gt_col, res_col in functional_systems_to_plot:
+    sys_name = gt_col.split('.')[1]
+    # Robust parsing
+    gt = df[gt_col].apply(safe_parse)
+    res = df[res_col].apply(safe_parse)
+    error = res - gt  # signed: negative = under-scored, positive = over-scored
+
+    # Granular counts per error magnitude (NaN errors satisfy none of these comparisons)
+    matches = (error == 0).sum()
+    u_1 = (error == -1).sum()
+    u_2plus = (error <= -2).sum()
+    o_1 = (error == 1).sum()
+    o_2plus = (error >= 2).sum()
+
+    total = error.count()  # count() already ignores NaN, i.e. only comparable pairs
+    divisor = max(total, 1)  # guard against division by zero when nothing is comparable
+
+    plot_list.append({
+        'System': sys_name.replace('_', ' ').title(),
+        'Matches': matches, 'MatchPct': (matches / divisor) * 100,
+        'U1': u_1, 'U2': u_2plus, 'UnderTotal': u_1 + u_2plus,
+        'UnderPct': ((u_1 + u_2plus) / divisor) * 100,
+        'O1': o_1, 'O2': o_2plus, 'OverTotal': o_1 + o_2plus,
+        'OverPct': ((o_1 + o_2plus) / divisor) * 100
+    })
+
+stats_df = pd.DataFrame(plot_list)
+
+# --- 2. Plotting ---
+fig, ax = plt.subplots(figsize=(13, 8))
+
+# Define Magnitude Colors
+c_under_dark, c_under_light = '#C0392B', '#E74C3C' # Dark Red (-2+), Soft Red (-1)
+c_over_dark, c_over_light   = '#2980B9', '#3498DB' # Dark Blue (+2+), Soft Blue (+1)
+bar_height = 0.6
+y_pos = np.arange(len(stats_df))
+
+# Plot Under-scored (Stacked: -2+ then -1)
+ax.barh(y_pos, -stats_df['U2'], bar_height, color=c_under_dark, label='Under -2+', edgecolor='white')
+ax.barh(y_pos, -stats_df['U1'], bar_height, left=-stats_df['U2'], color=c_under_light, label='Under -1', edgecolor='white')
+
+# Plot Over-scored (Stacked: +1 then +2+)
+ax.barh(y_pos, stats_df['O1'], bar_height, color=c_over_light, label='Over +1', edgecolor='white')
+ax.barh(y_pos, stats_df['O2'], bar_height, left=stats_df['O1'], color=c_over_dark, label='Over +2+', edgecolor='white')
+
+# --- 3. Aesthetics & Table Labels (summary text sits to the left of the bars) ---
+for i, row in stats_df.iterrows():
+    bold_name = row['System'].replace(' ', '\\ ')  # mathtext ignores plain spaces; escape them to keep word gaps
+    label_text = (
+        f"$\\mathbf{{{bold_name}}}$\n"
+        f"Match: {int(row['Matches'])} ({row['MatchPct']:.1f}%)\n"
+        f"Under: {int(row['UnderTotal'])} ({row['UnderPct']:.1f}%) | Over: {int(row['OverTotal'])} ({row['OverPct']:.1f}%)"
+    )
+    ax.text(ax.get_xlim()[0] - 0.5, i, label_text, va='center', ha='right', fontsize=9, color='#333333', linespacing=1.4)
+
+# Formatting
+ax.axvline(0, color='black', linewidth=1.2)
+ax.set_yticks([])
+ax.set_xlabel('Number of Patients with Error', fontsize=11, fontweight='bold')
+#ax.set_title('Directional Error Magnitude (Under vs. Over Scoring)', fontsize=14, pad=35)
+
+# Absolute X-axis labels: a formatter (not fixed label strings) stays correct if ticks are re-located
+ax.xaxis.set_major_formatter(lambda value, _pos: f"{abs(value):g}")
+
+# Remove spines and add grid
+for spine in ['top', 'right', 'left']: ax.spines[spine].set_visible(False)
+ax.xaxis.grid(True, linestyle='--', alpha=0.3)
+
+# Legend with magnitude info
+ax.legend(loc='upper right', frameon=False, bbox_to_anchor=(1, 1.1), ncol=2)
+
+plt.tight_layout()
+plt.show()
+##
 # %% test
 # Diagnose: what are the actual differences?
 print("\n🔍 Raw differences (first 5 rows per system):")
@@ -1,135 +0,0 @@
-import pandas as pd
-import numpy as np
-import seaborn as sns
-import matplotlib.pyplot as plt
-import dataframe_image as dfi
-# Load data
-df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
-
-# 1. Identify all GT and result columns
-gt_columns = [col for col in df.columns if col.startswith('GT.')]
-result_columns = [col for col in df.columns if col.startswith('result.')]
-
-print("GT Columns found:", gt_columns)
-print("Result Columns found:", result_columns)
-
-# 2. Create proper mapping between GT and result columns
-# Handle various naming conventions (spaces, underscores, etc.)
-column_mapping = {}
-
-for gt_col in gt_columns:
-    base_name = gt_col.replace('GT.', '')
-
-    # Clean the base name for matching - remove spaces, underscores, etc.
-    # Try different matching approaches
-    candidates = [
-        f'result.{base_name}',  # Exact match
-        f'result.{base_name.replace(" ", "_")}',  # With underscores
-        f'result.{base_name.replace("_", " ")}',  # With spaces
-        f'result.{base_name.replace(" ", "")}',   # No spaces
-        f'result.{base_name.replace("_", "")}'    # No underscores
-    ]
-
-    # Also try case-insensitive matching
-    candidates.append(f'result.{base_name.lower()}')
-    candidates.append(f'result.{base_name.upper()}')
-
-    # Try to find matching result column
-    matched = False
-    for candidate in candidates:
-        if candidate in result_columns:
-            column_mapping[gt_col] = candidate
-            matched = True
-            break
-
-    # If no exact match found, try partial matching
-    if not matched:
-        # Try to match by removing special characters and comparing
-        base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' '])
-        for result_col in result_columns:
-            result_base = result_col.replace('result.', '')
-            result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' '])
-            if base_clean.lower() == result_clean.lower():
-                column_mapping[gt_col] = result_col
-                matched = True
-                break
-
-print("Column mapping:", column_mapping)
-
-# 3. Faster, vectorized computation using the corrected mapping
-data_list = []
-
-for gt_col, result_col in column_mapping.items():
-    print(f"Processing {gt_col} vs {result_col}")
-
-    # Convert to numeric, forcing errors to NaN
-    s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
-    s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
-
-    # Calculate matches (abs difference <= 0.5)
-    diff = np.abs(s1 - s2)
-    matches = (diff <= 0.5).sum()
-
-    # Determine the denominator (total valid comparisons)
-    valid_count = diff.notna().sum()
-
-    if valid_count > 0:
-        percentage = (matches / valid_count) * 100
-    else:
-        percentage = 0
-
-    # Extract clean base name for display
-    base_name = gt_col.replace('GT.', '')
-
-    data_list.append({
-        'GT': base_name,
-        'Match %': round(percentage, 1)
-    })
-
-
-
-# 4. Prepare Data for Plotting
-match_df = pd.DataFrame(data_list)
-match_df = match_df.sort_values('Match %', ascending=False) # Sort for better visual flow
-
-# 5. Create the Styled Gradient Table
-def style_agreement_table(df):
-    return (df.style
-        .format({'Match %': '{:.1f}%'}) # Add % sign
-        .background_gradient(cmap='RdYlGn', subset=['Match %'], vmin=50, vmax=100) # Red to Green gradient
-        .set_properties(**{
-            'text-align': 'center',
-            'font-size': '12pt',
-            'border-collapse': 'collapse',
-            'border': '1px solid #D3D3D3'
-        })
-        .set_table_styles([
-            # Style the header
-            {'selector': 'th', 'props': [
-                ('background-color', '#404040'), 
-                ('color', 'white'),
-                ('font-weight', 'bold'),
-                ('text-transform', 'uppercase'),
-                ('padding', '10px')
-            ]},
-            # Add hover effect
-            {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]}
-        ])
-        .set_caption("EDSS Agreement Analysis: Ground Truth vs. Results (Tolerance ±0.5)")
-    )
-
-# To display in a Jupyter Notebook:
-styled_table = style_agreement_table(match_df)
-styled_table
-
-dfi.export(styled_table, "styled_table.png")
-#styled_table.to_html("agreement_report.html")
-# 6. Save as SVG
-
-#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
-#print("Successfully saved agreement_table.svg")
-
-# Show plot if running in a GUI environment
-plt.show()
-
-
@@ -1,74 +0,0 @@
-import pandas as pd
-import numpy as np
-import seaborn as sns
-
-# Sample data (replace with your actual df)
-df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
-
-# Identify GT and Result columns
-gt_columns = [col for col in df.columns if col.startswith('GT.')]
-result_columns = [col for col in df.columns if col.startswith('result.')]
-
-# Create mapping
-column_mapping = {}
-for gt_col in gt_columns:
-    base_name = gt_col.replace('GT.', '')
-    result_col = f'result.{base_name}'
-    if result_col in result_columns:
-        column_mapping[gt_col] = result_col
-
-# Function to compute match percentage for each GT-Result pair
-def compute_match_percentages(df, column_mapping):
-    percentages = []
-    for gt_col, result_col in column_mapping.items():
-        count = 0
-        total = len(df)
-
-        for _, row in df.iterrows():
-            gt_val = row[gt_col]
-            result_val = row[result_col]
-
-            # Handle NaN values
-            if pd.isna(gt_val) or pd.isna(result_val):
-                continue
-
-            # Handle non-numeric values
-            try:
-                gt_float = float(gt_val)
-                result_float = float(result_val)
-            except (ValueError, TypeError):
-                # Skip rows with non-numeric values
-                continue
-
-            # Check if values are within 0.5 tolerance
-            if abs(gt_float - result_float) <= 0.5:
-                count += 1
-
-        percentage = (count / total) * 100
-        percentages.append({
-            'GT_Column': gt_col,
-            'Result_Column': result_col,
-            'Match_Percentage': round(percentage, 1)
-        })
-
-    return pd.DataFrame(percentages)
-
-# Compute match percentages
-match_df = compute_match_percentages(df, column_mapping)
-
-# Create a pivot table for gradient display (optional but helpful)
-pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
-
-# Apply gradient background
-cm = sns.light_palette("green", as_cmap=True)
-styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
-
-# Display result
-print("Agreement Percentage Table (with gradient):")
-styled_table
-
-
-
-# Save the styled table to a file
-styled_table.to_html("agreement_report.html")
-print("Report saved to agreement_report.html")