Adjusting and cleaning

2026-02-08 01:59:38 +01:00
parent f4bf37f71c
commit 2f507bcf20
4 changed files with 148 additions and 209 deletions
@@ -1828,6 +1828,97 @@ plt.tight_layout()
 plt.show()
 ##
 # %% name
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
 import numpy as np
 # --- Configuration & Theme ---
 plt.rcParams['font.family'] = 'Arial'
 figure_save_path = 'project/visuals/functional_systems_magnitude_focus.svg'
 # --- 1. Process Error Data with Magnitude Breakdown ---
 # Short system name = second dot-separated token of each ground-truth column name.
 system_names = [gt_name.split('.')[1] for gt_name, _unused in functional_systems_to_plot]
 plot_list = []
 for gt_col, res_col in functional_systems_to_plot:
    # Both columns go through safe_parse before subtraction.
    truth = df[gt_col].apply(safe_parse)
    predicted = df[res_col].apply(safe_parse)
    # Signed error: negative = result below ground truth, positive = above.
    error = predicted - truth
    # Bucket the errors by direction and size.
    exact = (error == 0).sum()
    under_by_1 = (error == -1).sum()
    under_by_2plus = (error <= -2).sum()
    over_by_1 = (error == 1).sum()
    over_by_2plus = (error >= 2).sum()
    under_all = under_by_1 + under_by_2plus
    over_all = over_by_1 + over_by_2plus
    # Denominator: number of non-missing errors, floored at 1 so the
    # percentage divisions below never divide by zero.
    divisor = max(error.count(), 1)
    plot_list.append({
        'System': gt_col.split('.')[1].replace('_', ' ').title(),
        'Matches': exact,
        'MatchPct': (exact / divisor) * 100,
        'U1': under_by_1,
        'U2': under_by_2plus,
        'UnderTotal': under_all,
        'UnderPct': (under_all / divisor) * 100,
        'O1': over_by_1,
        'O2': over_by_2plus,
        'OverTotal': over_all,
        'OverPct': (over_all / divisor) * 100,
    })
 # One row per functional system, columns in the insertion order above.
 stats_df = pd.DataFrame(plot_list)
 # --- 2. Plotting ---
 # Diverging horizontal stacked bars, one row per entry of stats_df:
 # under-scored counts are drawn with negative widths (left of zero),
 # over-scored counts with positive widths (right of zero).
 from matplotlib.ticker import FuncFormatter
 fig, ax = plt.subplots(figsize=(13, 8))
 # Define Magnitude Colors
 c_under_dark, c_under_light = '#C0392B', '#E74C3C' # Dark Red (-2+), Soft Red (-1)
 c_over_dark, c_over_light   = '#2980B9', '#3498DB' # Dark Blue (+2+), Soft Blue (+1)
 bar_height = 0.6
 y_pos = np.arange(len(stats_df))
 # Plot Under-scored (Stacked: -2+ then -1)
 ax.barh(y_pos, -stats_df['U2'], bar_height, color=c_under_dark, label='Under -2+', edgecolor='white')
 ax.barh(y_pos, -stats_df['U1'], bar_height, left=-stats_df['U2'], color=c_under_light, label='Under -1', edgecolor='white')
 # Plot Over-scored (Stacked: +1 then +2+)
 ax.barh(y_pos, stats_df['O1'], bar_height, color=c_over_light, label='Over +1', edgecolor='white')
 ax.barh(y_pos, stats_df['O2'], bar_height, left=stats_df['O1'], color=c_over_dark, label='Over +2+', edgecolor='white')
 # --- 3. Aesthetics & Table Labels ---
 # Read the left x-limit once; every label is right-aligned just outside it.
 label_x = ax.get_xlim()[0] - 0.5
 for i, row in stats_df.iterrows():
    # Mathtext drops plain spaces, so escape them to keep multi-word
    # system names readable inside \mathbf{...}.
    bold_name = row['System'].replace(' ', r'\ ')
    label_text = (
        f"$\\mathbf{{{bold_name}}}$\n"
        f"Match: {int(row['Matches'])} ({row['MatchPct']:.1f}%)\n"
        f"Under: {int(row['UnderTotal'])} ({row['UnderPct']:.1f}%) | Over: {int(row['OverTotal'])} ({row['OverPct']:.1f}%)"
    )
    # Position table text to the left
    ax.text(label_x, i, label_text, va='center', ha='right', fontsize=9, color='#333333', linespacing=1.4)
 # Formatting
 ax.axvline(0, color='black', linewidth=1.2)
 ax.set_yticks([])
 ax.set_xlabel('Number of Patients with Error', fontsize=11, fontweight='bold')
 #ax.set_title('Directional Error Magnitude (Under vs. Over Scoring)', fontsize=14, pad=35)
 # Absolute X-axis labels. A formatter (rather than set_xticklabels) keeps the
 # labels tied to the tick positions even if the ticks move after layout.
 ax.xaxis.set_major_formatter(FuncFormatter(lambda tick, _pos: f"{abs(tick):g}"))
 # Remove spines and add grid
 for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)
 ax.xaxis.grid(True, linestyle='--', alpha=0.3)
 # Legend with magnitude info
 ax.legend(loc='upper right', frameon=False, bbox_to_anchor=(1, 1.1), ncol=2)
 plt.tight_layout()
 plt.show()
 ##
 # %% test
 # Diagnose: what are the actual differences?
 print("\n🔍 Raw differences (first 5 rows per system):")
@@ -1,135 +0,0 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 import dataframe_image as dfi
 # Load data
 df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
 # 1. Collect ground-truth ('GT.') and result ('result.') columns by prefix
 gt_columns = [name for name in df.columns if name.startswith('GT.')]
 result_columns = [name for name in df.columns if name.startswith('result.')]
 print("GT Columns found:", gt_columns)
 print("Result Columns found:", result_columns)
 # 2. Pair every GT column with a result column.
 # GT columns for which nothing matches are left out of the mapping.
 column_mapping = {}
 for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    # Spelling variants, tried in this order; the first one that exists
    # among the result columns wins.
    spellings = (
        base_name,                    # Exact match
        base_name.replace(" ", "_"),  # With underscores
        base_name.replace("_", " "),  # With spaces
        base_name.replace(" ", ""),   # No spaces
        base_name.replace("_", ""),   # No underscores
        base_name.lower(),            # Lower-cased
        base_name.upper(),            # Upper-cased
    )
    chosen = next(
        (f'result.{variant}' for variant in spellings if f'result.{variant}' in result_columns),
        None,
    )
    if chosen is None:
        # Fallback: case-insensitive comparison after dropping every
        # character that is not alphanumeric, '_' or ' '.
        base_clean = ''.join(ch for ch in base_name if ch.isalnum() or ch in ['_', ' '])
        for result_col in result_columns:
            result_base = result_col.replace('result.', '')
            result_clean = ''.join(ch for ch in result_base if ch.isalnum() or ch in ['_', ' '])
            if base_clean.lower() == result_clean.lower():
                chosen = result_col
                break
    if chosen is not None:
        column_mapping[gt_col] = chosen
 print("Column mapping:", column_mapping)
 # 3. Vectorized agreement rate for every mapped GT/result pair
 data_list = []
 for gt_col, result_col in column_mapping.items():
    print(f"Processing {gt_col} vs {result_col}")
    # Anything that cannot be read as a number becomes NaN
    truth = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
    predicted = pd.to_numeric(df[result_col], errors='coerce').astype(float)
    gap = (truth - predicted).abs()
    # A row agrees when the absolute difference is at most 0.5;
    # NaN gaps compare False and therefore never count as a match.
    n_match = (gap <= 0.5).sum()
    # Only rows where both values were numeric enter the denominator
    n_valid = gap.notna().sum()
    percentage = (n_match / n_valid) * 100 if n_valid > 0 else 0
    data_list.append({
        'GT': gt_col.replace('GT.', ''),  # display name without the prefix
        'Match %': round(percentage, 1),
    })
 # 4. Prepare Data for Plotting: best-agreeing systems first
 match_df = pd.DataFrame(data_list).sort_values('Match %', ascending=False)
 # 5. Create the Styled Gradient Table
 def style_agreement_table(df):
    return (df.style
        .format({'Match %': '{:.1f}%'}) # Add % sign
        .background_gradient(cmap='RdYlGn', subset=['Match %'], vmin=50, vmax=100) # Red to Green gradient
        .set_properties(**{
            'text-align': 'center',
            'font-size': '12pt',
            'border-collapse': 'collapse',
            'border': '1px solid #D3D3D3'
        })
        .set_table_styles([
            # Style the header
            {'selector': 'th', 'props': [
                ('background-color', '#404040'), 
                ('color', 'white'),
                ('font-weight', 'bold'),
                ('text-transform', 'uppercase'),
                ('padding', '10px')
            ]},
            # Add hover effect
            {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]}
        ])
        .set_caption("EDSS Agreement Analysis: Ground Truth vs. Results (Tolerance ±0.5)")
    )
 # Build the styled table from the sorted match percentages.
 styled_table = style_agreement_table(match_df)
 # Bare expression kept from the original; per the author's note it is meant
 # for display in a Jupyter Notebook.
 styled_table
 # Export the styled table through dataframe_image (imported as dfi) to
 # "styled_table.png" — presumably relative to the working directory.
 dfi.export(styled_table, "styled_table.png")
 #styled_table.to_html("agreement_report.html")
 # 6. Save as SVG (currently disabled)
 #plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
 #print("Successfully saved agreement_table.svg")
 # Show plot if running in a GUI environment
 # NOTE(review): no matplotlib figure is created in the visible part of this
 # script, so this call may be a leftover — confirm before removing.
 plt.show()
@@ -1,74 +0,0 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 # Sample data (replace with your actual df)
 df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
 # Split the columns into ground truth ('GT.') and results ('result.') by prefix
 gt_columns = [name for name in df.columns if name.startswith('GT.')]
 result_columns = [name for name in df.columns if name.startswith('result.')]
 # Map each GT column to the result column with the same base name;
 # GT columns without an exact counterpart are left out.
 column_mapping = {
    gt_col: f"result.{gt_col.replace('GT.', '')}"
    for gt_col in gt_columns
    if f"result.{gt_col.replace('GT.', '')}" in result_columns
 }
 # Function to compute match percentage for each GT-Result pair
 def compute_match_percentages(df, column_mapping, tolerance=0.5):
    """Compute, per GT/result column pair, the share of rows that agree.

    A row is a *valid comparison* when both values are present and can be
    converted with ``float()``. It is a *match* when the absolute difference
    of the two values is at most ``tolerance``. The percentage is
    matches / valid comparisons, so rows that are skipped do not drag the
    result down, and a pair with no valid comparison yields 0.0 instead of
    raising ``ZeroDivisionError``.

    Args:
        df: DataFrame holding the columns named in ``column_mapping``.
        column_mapping: Mapping of ground-truth column name to result column name.
        tolerance: Largest absolute difference still counted as a match.

    Returns:
        DataFrame with columns 'GT_Column', 'Result_Column' and
        'Match_Percentage' (rounded to one decimal), one row per pair. The
        columns are present even when ``column_mapping`` is empty.
    """
    percentages = []
    for gt_col, result_col in column_mapping.items():
        matches = 0
        valid = 0
        # Walk the two columns in lockstep; no per-row Series is built.
        for gt_val, result_val in zip(df[gt_col], df[result_col]):
            # Handle NaN values
            if pd.isna(gt_val) or pd.isna(result_val):
                continue
            # Handle non-numeric values
            try:
                gap = abs(float(gt_val) - float(result_val))
            except (ValueError, TypeError):
                # Skip rows with non-numeric values
                continue
            # Strings such as "nan" survive float(); they are not comparable either.
            if np.isnan(gap):
                continue
            valid += 1
            if gap <= tolerance:
                matches += 1
        percentage = (matches / valid) * 100 if valid else 0.0
        percentages.append({
            'GT_Column': gt_col,
            'Result_Column': result_col,
            'Match_Percentage': round(percentage, 1)
        })
    return pd.DataFrame(
        percentages,
        columns=['GT_Column', 'Result_Column', 'Match_Percentage'],
    )
 # Match percentage for every mapped GT/result pair
 match_df = compute_match_percentages(df, column_mapping)
 # Reshape to a GT-by-result grid; pairs that were never computed show as 0
 indexed_matches = match_df.set_index(['GT_Column', 'Result_Column'])
 pivot_table = indexed_matches['Match_Percentage'].unstack(fill_value=0)
 # Shade the grid with one light-green colormap over the whole table (axis=None)
 cm = sns.light_palette("green", as_cmap=True)
 pivot_styler = pivot_table.style
 styled_table = pivot_styler.background_gradient(cmap=cm, axis=None)
 # Display result
 print("Agreement Percentage Table (with gradient):")
 styled_table
 # Write the styled table out as an HTML file
 styled_table.to_html("agreement_report.html")
 print("Report saved to agreement_report.html")
@@ -263,3 +263,60 @@ plt.legend(frameon=False, loc='upper center', bbox_to_anchor=(0.5, -0.05))
 plt.tight_layout()
 plt.show()
 ##
 # %% name
 import matplotlib.pyplot as plt
 # Data: per 'Visit' value, the matching 'patient_count' at the same list position
 data = {
    'Visit': [9, 8, 7, 6, 5, 4, 3, 2, 1],
    'patient_count': [2, 3, 3, 6, 13, 17, 28, 24, 32]
 }
 # The source lists run from 9 down to 1; order the pairs by visit so the bars
 # read left to right in increasing order.
 ordered_pairs = sorted(zip(data['Visit'], data['patient_count']), key=lambda pair: pair[0])
 visit_sorted = [visit for visit, _count in ordered_pairs]
 count_sorted = [count for _visit, count in ordered_pairs]
 # Create figure and axis, then draw the bars once (no clear-and-redraw)
 fig, ax = plt.subplots(figsize=(10, 6))
 bars = ax.bar(visit_sorted, count_sorted, color='darkblue', label='Patients by Visit Count')
 # Axis labels
 ax.set_xlabel('Number of Visits', fontsize=12)
 ax.set_ylabel('Number of Unique Patients', fontsize=12)
 #ax.set_title('Number of Patients by Visit Number', fontsize=14)
 # Add legend
 ax.legend()
 # Improve layout and grid
 ax.grid(axis='y', linestyle='--', alpha=0.7)
 ax.set_xticks(visit_sorted)  # Ensure all integer visit numbers are shown
 # Show the plot
 plt.tight_layout()
 plt.show()
 ##