ugly plot

2026-01-19 01:04:00 +01:00
parent eabde3fcb1
commit 8f34f06578
1 changed files with 82 additions and 223 deletions
@@ -400,200 +400,9 @@ for old_name, new_name in column_mapping.items():
 ##
 # %% name
 import pandas as pd
 import numpy as np
 import seaborn as sns
 # Load the tab-separated input table (swap in your own DataFrame if needed)
 df = pd.read_csv(
    "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv",
    sep='\t',
 )
 # Split the column names into the 'GT.'-prefixed and 'result.'-prefixed groups
 gt_columns = [name for name in df.columns if name.startswith('GT.')]
 result_columns = [name for name in df.columns if name.startswith('result.')]
 # Pair every GT column with the result column sharing its base name;
 # GT columns without such a counterpart are left out of the mapping
 column_mapping = {}
 for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    result_col = 'result.' + base_name
    if result_col not in result_columns:
        continue
    column_mapping[gt_col] = result_col
 def compute_match_percentages(df, column_mapping):
    """Compute, per column pair, the percentage of rows whose values agree.

    Two values agree when both convert to float and differ by at most 0.5.
    Rows where either value is missing or non-numeric never count as a
    match, but they still count toward the denominator (``len(df)``).

    Args:
        df: DataFrame holding every column named in ``column_mapping``.
        column_mapping: Mapping of GT column name -> result column name.

    Returns:
        DataFrame with one row per pair and the columns ``GT_Column``,
        ``Result_Column`` and ``Match_Percentage`` (rounded to 1 decimal).
        For an empty ``df`` every percentage is 0.0 instead of raising
        ZeroDivisionError.
    """
    # The denominator is the same for every pair, so compute it once
    total = len(df)
    percentages = []
    for gt_col, result_col in column_mapping.items():
        count = 0
        # Walk the two columns in lockstep; avoids building a Series per row
        for gt_val, result_val in zip(df[gt_col], df[result_col]):
            # Missing values can never match
            if pd.isna(gt_val) or pd.isna(result_val):
                continue
            try:
                gt_float = float(gt_val)
                result_float = float(result_val)
            except (ValueError, TypeError):
                # Non-numeric values can never match
                continue
            # Agreement means an absolute difference within the 0.5 tolerance
            if abs(gt_float - result_float) <= 0.5:
                count += 1
        # Guard the division so an empty frame yields 0.0
        percentage = (count / total) * 100 if total else 0.0
        percentages.append({
            'GT_Column': gt_col,
            'Result_Column': result_col,
            'Match_Percentage': round(percentage, 1)
        })
    return pd.DataFrame(percentages)
 # Agreement percentage for every mapped GT/result pair
 match_df = compute_match_percentages(df, column_mapping)
 # Reshape to GT columns (rows) x result columns (columns); pairs that were
 # never compared are filled with 0 so the gradient has a value everywhere
 pivot_table = (
    match_df
    .set_index(['GT_Column', 'Result_Column'])['Match_Percentage']
    .unstack(fill_value=0)
 )
 # Shade the cells with a light-green colormap scaled over the whole table
 cm = sns.light_palette("green", as_cmap=True)
 styled_table = pivot_table.style.background_gradient(axis=None, cmap=cm)
 # Trailing bare expression so an interactive cell can render the table
 print("Agreement Percentage Table (with gradient):")
 styled_table
-##
+# %% Styled table
 # %% name
 import pandas as pd
 import numpy as np
 import seaborn as sns
 # first, identify the gt and result columns. the gt prefix is matched
 # case-insensitively: the other cells of this file select the same columns
 # with an upper-case 'GT.' prefix, which a plain startswith('gt.') never
 # matches, leaving the mapping empty
 gt_prefix = 'gt.'
 result_prefix = 'result.'
 gt_columns = [col for col in df.columns if col.lower().startswith(gt_prefix)]
 result_columns = [col for col in df.columns if col.startswith(result_prefix)]
 # create a mapping dictionary for matching columns
 column_mapping = {}
 for gt_col in gt_columns:
    # extract the base name by slicing off only the leading prefix, so a
    # 'gt.' occurring later in the name is kept
    base_name = gt_col[len(gt_prefix):]
    # find matching result column
    result_col = f'{result_prefix}{base_name}'
    if result_col in result_columns:
        column_mapping[gt_col] = result_col
 def safe_float_convert(value):
    '''Convert ``value`` to float, returning nan when that is not possible.

    Missing values (anything ``pd.isna`` reports as missing) and values that
    ``float()`` rejects both yield ``np.nan``.

    Args:
        value: scalar to convert (number, numeric string, None, ...).

    Returns:
        The value as a float, or ``np.nan``.
    '''
    if pd.isna(value):
        return np.nan
    try:
        return float(value)
    # the builtin exception classes must be spelled ValueError / TypeError;
    # lower-case names are undefined and raise NameError on non-numeric input
    except (ValueError, TypeError):
        return np.nan
 def create_comparison_table(df, column_mapping):
    '''Build a per-row 0/1 match table for every mapped column pair.

    A cell is 1 when both values of the pair convert to float (via
    ``safe_float_convert``) and differ by at most 0.5; otherwise it is 0,
    including when either value is missing or non-numeric.

    Args:
        df: DataFrame holding every column named in ``column_mapping``.
        column_mapping: mapping of gt column name -> result column name.

    Returns:
        DataFrame indexed like ``df`` with one integer column per pair,
        named ``"<gt_col>_vs_<result_col>"``.
    '''
    flags_by_pair = {}
    for gt_col, result_col in column_mapping.items():
        pair_flags = []
        # iterate positionally: label-based .loc writes would hit every row
        # sharing a duplicated index label
        for gt_val, result_val in zip(df[gt_col], df[result_col]):
            # the helper returns nan for missing and non-numeric values
            gt_float = safe_float_convert(gt_val)
            result_float = safe_float_convert(result_val)
            is_match = (
                not pd.isna(gt_float)
                and not pd.isna(result_float)
                and abs(gt_float - result_float) <= 0.5
            )
            pair_flags.append(int(is_match))
        flags_by_pair[f"{gt_col}_vs_{result_col}"] = pair_flags
    # building from lists yields integer columns, which the Styler's numeric
    # gradient can colour (object columns are skipped)
    return pd.DataFrame(flags_by_pair, index=df.index)
 # generate the comparison matrix
 comprehensive_matrix = create_comparison_table(df, column_mapping)
 # create summary statistics: per pair, count the rows where both values are
 # numeric and differ by at most 0.5; the denominator is every row of df
 total_count = len(df)
 summary_data = []
 for gt_col, result_col in column_mapping.items():
    match_count = 0
    for gt_val, result_val in zip(df[gt_col], df[result_col]):
        # safe_float_convert maps missing and non-numeric values to nan
        gt_float = safe_float_convert(gt_val)
        result_float = safe_float_convert(result_val)
        if pd.isna(gt_float) or pd.isna(result_float):
            continue
        if abs(gt_float - result_float) <= 0.5:
            match_count += 1
    summary_data.append({
        'gt_column': gt_col,
        'result_column': result_col,
        'match_count': match_count,
        'total_records': total_count,
        'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
    })
 # the pandas constructor is pd.DataFrame (pd.dataframe does not exist)
 summary_df = pd.DataFrame(summary_data)
 # display the summary
 print("comparison summary:")
 print("="*80)
 for _, row in summary_df.iterrows():
    print(f"{row['gt_column']} vs {row['result_column']}:")
    print(f"  matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})")
    print()
 # create gradient styled table (the boolean literal is True, not true)
 cm = sns.light_palette("green", as_cmap=True)
 print("comparison results with gradient:")
 comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)
 # display the gradient table
 # NOTE(review): a bare expression in the middle of a cell is probably not
 # rendered by interactive front-ends - confirm, or move it to the end
 comprehensive_gradient
 # if you want to see the actual comparison data
 print("\nraw comparison data:")
 print(comprehensive_matrix.head())
 ##
 # %% name
 import pandas as pd
 import numpy as np
 import seaborn as sns
@@ -602,23 +411,67 @@ import matplotlib.pyplot as plt
 # Load data
 df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
-# 1. Faster, vectorized computation
+# 1. Identify all GT and result columns
 gt_columns = [col for col in df.columns if col.startswith('GT.')]
-data_list = []
+result_columns = [col for col in df.columns if col.startswith('result.')]
 print("GT Columns found:", gt_columns)
 print("Result Columns found:", result_columns)
 # 2. Create proper mapping between GT and result columns
 # Handle various naming conventions (spaces, underscores, etc.)
 column_mapping = {}
 for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    result_col = f'result.{base_name}'
-    if result_col in df.columns:
+    # Clean the base name for matching - remove spaces, underscores, etc.
-        # 1. Convert to numeric, forcing errors to NaN
+    # Try different matching approaches
-        # 2. Explicitly cast to float to prevent Boolean subtraction errors
+    candidates = [
        f'result.{base_name}',  # Exact match
        f'result.{base_name.replace(" ", "_")}',  # With underscores
        f'result.{base_name.replace("_", " ")}',  # With spaces
        f'result.{base_name.replace(" ", "")}',   # No spaces
        f'result.{base_name.replace("_", "")}'    # No underscores
    ]
    # Also try case-insensitive matching
    candidates.append(f'result.{base_name.lower()}')
    candidates.append(f'result.{base_name.upper()}')
    # Try to find matching result column
    matched = False
    for candidate in candidates:
        if candidate in result_columns:
            column_mapping[gt_col] = candidate
            matched = True
            break
    # If no exact match found, try partial matching
    if not matched:
        # Try to match by removing special characters and comparing
        base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' '])
        for result_col in result_columns:
            result_base = result_col.replace('result.', '')
            result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' '])
            if base_clean.lower() == result_clean.lower():
                column_mapping[gt_col] = result_col
                matched = True
                break
 print("Column mapping:", column_mapping)
 # 3. Faster, vectorized computation using the corrected mapping
 data_list = []
 for gt_col, result_col in column_mapping.items():
    print(f"Processing {gt_col} vs {result_col}")
    # Convert to numeric, forcing errors to NaN
    s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
    s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
    # Calculate matches (abs difference <= 0.5)
        # We use .count() to find how many non-NaN values we have if you want
        # a more accurate percentage, or keep len(df) for the total row count.
    diff = np.abs(s1 - s2)
    matches = (diff <= 0.5).sum()
@@ -630,24 +483,29 @@ for gt_col in gt_columns:
    else:
        percentage = 0
    # Extract clean base name for display
    base_name = gt_col.replace('GT.', '')
    data_list.append({
        'GT': base_name,
        'Match %': round(percentage, 1)
    })
-
+# 4. Prepare Data for Plotting
 # 2. Prepare Data for Plotting
 match_df = pd.DataFrame(data_list)
 # We pivot to get a format suitable for a heatmap
 plot_data = match_df.set_index('GT')
-# 3. Create the Plot
+# Handle case where no matches were found
 if len(match_df) == 0:
    print("No valid column pairs found for comparison")
    exit()
 # 5. Create the Plot
 plt.figure(figsize=(10, 8))
 sns.set_theme(style="white")
 # Create heatmap
 ax = sns.heatmap(
-    plot_data,
+    match_df.set_index('GT')[['Match %']],  # Just the percentage column
    annot=True,          # Show the numbers in the boxes
    fmt=".1f",           # Format to 1 decimal place
    cmap="YlGnBu",       # Yellow-Green-Blue color palette
@@ -658,10 +516,11 @@ ax = sns.heatmap(
 plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
 plt.tight_layout()
-# 4. Save as SVG
+# 6. Save as SVG
-plt.savefig("agreement_table.svg", format='svg')
+#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
-print("Successfully saved agreement_table.svg")
+#print("Successfully saved agreement_table.svg")
 # Show plot if running in a GUI environment
 plt.show()
 ##