Adjusting and cleaning

show directional errors
Directional Errors of each functional system.
2026-02-08 01:59:38 +01:00 · 2026-02-08 01:27:48 +01:00 · 2026-02-04 18:01:11 +01:00 · 2026-02-04 15:29:56 +01:00 · 2026-01-26 02:02:19 +01:00 · 2026-01-20 14:47:53 +01:00
5 changed files with 1260 additions and 212 deletions
@@ -7,6 +7,10 @@
 __pycache__/
 *.pyc
 # Local reference material and exported figures
 # (not tracked in the repository)
 /reference/
 *.svg
 # 2. Ignore virtual environments COMPLETELY
 # This must come BEFORE the unignore rule
 env*/
@@ -1,135 +0,0 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 import dataframe_image as dfi
 # Load the joined table (tab-separated) that holds both the ground-truth
 # ("GT.*") and the predicted ("result.*") columns.
 df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
 # 1. Identify all GT and result columns by their prefix
 gt_columns = [col for col in df.columns if col.startswith('GT.')]
 result_columns = [col for col in df.columns if col.startswith('result.')]
 print("GT Columns found:", gt_columns)
 print("Result Columns found:", result_columns)
 # 2. Map every GT column to its result column.
 def _clean_name(name):
    """Lower-case *name* and keep only alphanumerics, underscores and spaces."""
    return ''.join(ch for ch in name if ch.isalnum() or ch in '_ ').lower()
 column_mapping = {}
 for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    # Spelling variants tried in priority order: exact name first, then
    # space/underscore swaps and removals, then lower/upper case.
    candidates = [
        f'result.{base_name}',
        f'result.{base_name.replace(" ", "_")}',
        f'result.{base_name.replace("_", " ")}',
        f'result.{base_name.replace(" ", "")}',
        f'result.{base_name.replace("_", "")}',
        f'result.{base_name.lower()}',
        f'result.{base_name.upper()}',
    ]
    matched_col = next((c for c in candidates if c in result_columns), None)
    if matched_col is None:
        # Fallback: case-insensitive comparison after dropping every
        # character that is not alphanumeric, '_' or ' '.
        wanted = _clean_name(base_name)
        matched_col = next(
            (rc for rc in result_columns
             if _clean_name(rc.replace('result.', '')) == wanted),
            None,
        )
    # GT columns without any counterpart are left out of the mapping.
    if matched_col is not None:
        column_mapping[gt_col] = matched_col
 print("Column mapping:", column_mapping)
 # 3. Vectorized agreement computation for every mapped pair
 data_list = []
 for gt_col, result_col in column_mapping.items():
    print(f"Processing {gt_col} vs {result_col}")
    # Non-numeric entries become NaN and therefore drop out of the comparison.
    gt_values = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
    result_values = pd.to_numeric(df[result_col], errors='coerce').astype(float)
    abs_diff = np.abs(gt_values - result_values)
    # A row counts as a match when both values differ by at most 0.5.
    matches = (abs_diff <= 0.5).sum()
    # Denominator: rows where both values are numeric.
    valid_count = abs_diff.notna().sum()
    percentage = (matches / valid_count) * 100 if valid_count > 0 else 0
    data_list.append({
        'GT': gt_col.replace('GT.', ''),
        'Match %': round(percentage, 1)
    })
 # 4. Prepare data for the table.
 # The columns are pinned explicitly so that an empty mapping yields an empty
 # frame that can still be sorted instead of raising KeyError on 'Match %'.
 match_df = pd.DataFrame(data_list, columns=['GT', 'Match %'])
 match_df = match_df.sort_values('Match %', ascending=False)  # best agreement first
 # 5. Create the Styled Gradient Table
 def style_agreement_table(df):
    return (df.style
        .format({'Match %': '{:.1f}%'}) # Add % sign
        .background_gradient(cmap='RdYlGn', subset=['Match %'], vmin=50, vmax=100) # Red to Green gradient
        .set_properties(**{
            'text-align': 'center',
            'font-size': '12pt',
            'border-collapse': 'collapse',
            'border': '1px solid #D3D3D3'
        })
        .set_table_styles([
            # Style the header
            {'selector': 'th', 'props': [
                ('background-color', '#404040'), 
                ('color', 'white'),
                ('font-weight', 'bold'),
                ('text-transform', 'uppercase'),
                ('padding', '10px')
            ]},
            # Add hover effect
            {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]}
        ])
        .set_caption("EDSS Agreement Analysis: Ground Truth vs. Results (Tolerance ±0.5)")
    )
 # Build the styled agreement table from the sorted match percentages.
 styled_table = style_agreement_table(match_df)
 # Bare expression: has no effect when run as a script; presumably kept for
 # notebook-style display of the table — TODO confirm it is still needed.
 styled_table
 # Export the styled table as an image file named "styled_table.png".
 dfi.export(styled_table, "styled_table.png")
 # Alternative outputs, currently disabled:
 #styled_table.to_html("agreement_report.html")
 # 6. Save as SVG
 #plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
 #print("Successfully saved agreement_table.svg")
 # NOTE(review): no Matplotlib figure is created in the visible part of this
 # script, so this call presumably has nothing to show — confirm and remove.
 plt.show()
@@ -1,74 +0,0 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 # Load the joined table (tab-separated)
 df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
 # Split the column names into ground-truth and result columns by prefix
 gt_columns = [name for name in df.columns if name.startswith('GT.')]
 result_columns = [name for name in df.columns if name.startswith('result.')]
 # Pair each GT column with the result column of exactly the same base name;
 # GT columns without such a counterpart are left out.
 column_mapping = {
    gt_name: f"result.{gt_name.replace('GT.', '')}"
    for gt_name in gt_columns
    if f"result.{gt_name.replace('GT.', '')}" in result_columns
 }
 def compute_match_percentages(df, column_mapping):
    """Compute the agreement percentage for each GT/result column pair.

    A row counts as a match when both values can be converted to float and
    differ by at most 0.5. Rows with a missing or non-numeric value are not
    counted as matches, but they remain part of the denominator, which is the
    total number of rows in *df*.

    Args:
        df: DataFrame containing every column named in *column_mapping*.
        column_mapping: dict mapping a GT column name to a result column name.

    Returns:
        DataFrame with one row per pair and the columns 'GT_Column',
        'Result_Column' and 'Match_Percentage' (rounded to one decimal).
        An empty *df* yields 0.0 for every pair instead of raising
        ZeroDivisionError.
    """
    total = len(df)  # same for every pair, so computed once
    percentages = []
    for gt_col, result_col in column_mapping.items():
        count = 0
        # Walk both columns in lockstep; no per-row Series is needed.
        for gt_val, result_val in zip(df[gt_col], df[result_col]):
            # Skip rows where either value is missing.
            if pd.isna(gt_val) or pd.isna(result_val):
                continue
            try:
                difference = abs(float(gt_val) - float(result_val))
            except (ValueError, TypeError):
                # Skip rows with non-numeric values.
                continue
            if difference <= 0.5:
                count += 1
        # Guard against an empty frame (total == 0).
        percentage = (count / total) * 100 if total else 0.0
        percentages.append({
            'GT_Column': gt_col,
            'Result_Column': result_col,
            'Match_Percentage': round(percentage, 1)
        })
    return pd.DataFrame(percentages)
 # Agreement percentage for every mapped GT/result pair
 match_df = compute_match_percentages(df, column_mapping)
 # Colormap used for the gradient shading
 cm = sns.light_palette("green", as_cmap=True)
 # Reshape to GT rows x result columns; pairs without a value are filled with 0
 indexed_matches = match_df.set_index(['GT_Column', 'Result_Column'])
 pivot_table = indexed_matches['Match_Percentage'].unstack(fill_value=0)
 # Shade the whole table on one common scale (axis=None)
 table_style = pivot_table.style
 styled_table = table_style.background_gradient(cmap=cm, axis=None)
 # Announce and display the result
 print("Agreement Percentage Table (with gradient):")
 styled_table
 # Write the styled table to an HTML file
 styled_table.to_html("agreement_report.html")
 print("Report saved to agreement_report.html")
@@ -263,3 +263,60 @@ plt.legend(frameon=False, loc='upper center', bbox_to_anchor=(0.5, -0.05))
 plt.tight_layout()
 plt.show()
 ##
 # %% name
 import matplotlib.pyplot as plt
 # Data
 data = {
    'Visit': [9, 8, 7, 6, 5, 4, 3, 2, 1],
    'patient_count': [2, 3, 3, 6, 13, 17, 28, 24, 32]
 }
 # The raw lists run from 9 down to 1; order the pairs by visit number so the
 # values (and the tick positions below) run in increasing order.
 pairs = sorted(zip(data['Visit'], data['patient_count']), key=lambda pair: pair[0])
 visit_sorted = [visit for visit, _ in pairs]
 count_sorted = [count for _, count in pairs]
 # Create figure and axis, then draw the bar chart once.
 # (The earlier version drew an unsorted chart with labels and a title first
 # and immediately discarded all of it with ax.clear().)
 fig, ax = plt.subplots(figsize=(10, 6))
 bars = ax.bar(visit_sorted, count_sorted, color='darkblue', label='Patients by Visit Count')
 # Axis labels; the title is intentionally disabled
 ax.set_xlabel('Number of Visits', fontsize=12)
 ax.set_ylabel('Number of Unique Patients', fontsize=12)
 #ax.set_title('Number of Patients by Visit Number', fontsize=14)
 # Add legend
 ax.legend()
 # Horizontal grid lines only
 ax.grid(axis='y', linestyle='--', alpha=0.7)
 plt.xticks(visit_sorted)  # Ensure all integer visit numbers are shown
 # Show the plot
 plt.tight_layout()
 plt.show()
 ##
Author	SHA1	Message	Date
shahin	2f507bcf20	Adjusting and cleaning	2026-02-08 01:59:38 +01:00
shahin	f4bf37f71c	show directional errors Directional Errors of each functional system.	2026-02-08 01:27:48 +01:00
shahin	bc63d1ee72	added new confusion matrix	2026-02-04 18:01:11 +01:00
shahin	c2ccb8cd11	update gitignore	2026-02-04 15:29:56 +01:00
shahin	b2e9ccd2b6	adding some visualizations	2026-01-26 02:02:19 +01:00
shahin	2f1bd2bfd0	save	2026-01-20 14:47:53 +01:00