ugly plot

2026-01-19 01:04:00 +01:00
parent eabde3fcb1
commit 8f34f06578


@@ -400,200 +400,9 @@ for old_name, new_name in column_mapping.items():
##
# %% name
import pandas as pd
import numpy as np
import seaborn as sns
# Load the data (adjust the path to your own file if needed)
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
# Identify GT and Result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]
# Create mapping
column_mapping = {}
for gt_col in gt_columns:
base_name = gt_col.replace('GT.', '')
result_col = f'result.{base_name}'
if result_col in result_columns:
column_mapping[gt_col] = result_col
# Function to compute match percentage for each GT-Result pair
def compute_match_percentages(df, column_mapping):
percentages = []
for gt_col, result_col in column_mapping.items():
count = 0
total = len(df)
for _, row in df.iterrows():
gt_val = row[gt_col]
result_val = row[result_col]
# Handle NaN values
if pd.isna(gt_val) or pd.isna(result_val):
continue
# Handle non-numeric values
try:
gt_float = float(gt_val)
result_float = float(result_val)
except (ValueError, TypeError):
# Skip rows with non-numeric values
continue
# Check if values are within 0.5 tolerance
if abs(gt_float - result_float) <= 0.5:
count += 1
percentage = (count / total) * 100
percentages.append({
'GT_Column': gt_col,
'Result_Column': result_col,
'Match_Percentage': round(percentage, 1)
})
return pd.DataFrame(percentages)
# Compute match percentages
match_df = compute_match_percentages(df, column_mapping)
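# Note: rows with NaN or non-numeric values are skipped in the numerator but
# still counted in the denominator (total = len(df)), so missing data lowers
# the reported match percentage.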
# Create a pivot table for gradient display (optional but helpful)
pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
# Apply gradient background
cm = sns.light_palette("green", as_cmap=True)
styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
# Display result
print("Agreement Percentage Table (with gradient):")
styled_table
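# Note: the gradient only renders when the Styler is displayed in a notebook cell.
# A minimal sketch for exporting it elsewhere (assumes pandas >= 1.3, which
# provides Styler.to_html):
# styled_table.to_html("agreement_table.html")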
##
# %% name
import pandas as pd
import numpy as np
import seaborn as sns
# first, let's identify the gt and result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]
# create a mapping dictionary for matching columns
column_mapping = {}
for gt_col in gt_columns:
# extract the base name (remove 'GT.' prefix)
base_name = gt_col.replace('GT.', '')
# find matching result column
result_col = f'result.{base_name}'
if result_col in result_columns:
column_mapping[gt_col] = result_col
# create comparison dataframe with error handling
def safe_float_convert(value):
'''safely convert value to float, returning nan for non-numeric values'''
if pd.isna(value):
return np.nan
try:
return float(value)
except (ValueError, TypeError):
return np.nan
def create_comparison_table(df, column_mapping):
# create a table showing match status for all comparisons
comparison_matrix = pd.DataFrame(index=df.index, columns=[f"{gt_col}_vs_{result_col}" for gt_col, result_col in column_mapping.items()])
for idx, row in df.iterrows():
for gt_col, result_col in column_mapping.items():
gt_val = row[gt_col]
result_val = row[result_col]
# handle nan values
if pd.isna(gt_val) or pd.isna(result_val):
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
else:
# safely convert to float
gt_float = safe_float_convert(gt_val)
result_float = safe_float_convert(result_val)
# if either conversion failed, mark as no match
if pd.isna(gt_float) or pd.isna(result_float):
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
else:
# check if values are within 0.5 tolerance
if abs(gt_float - result_float) <= 0.5:
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 1
else:
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
return comparison_matrix
# generate the comparison matrix
comprehensive_matrix = create_comparison_table(df, column_mapping)
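# Note: a DataFrame built from index/columns alone holds object dtype; if the
# 0/1 matrix is needed as numbers downstream, a cast such as
# comprehensive_matrix.astype(int) may help (an assumption, not in this commit).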
# create summary statistics
summary_data = []
for gt_col, result_col in column_mapping.items():
match_count = 0
total_count = len(df)
for idx, row in df.iterrows():
gt_val = row[gt_col]
result_val = row[result_col]
# handle nan values
if pd.isna(gt_val) or pd.isna(result_val):
continue
else:
# safely convert to float
gt_float = safe_float_convert(gt_val)
result_float = safe_float_convert(result_val)
# if both conversions succeeded, check tolerance
if not pd.isna(gt_float) and not pd.isna(result_float):
if abs(gt_float - result_float) <= 0.5:
match_count += 1
summary_data.append({
'gt_column': gt_col,
'result_column': result_col,
'match_count': match_count,
'total_records': total_count,
'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
})
summary_df = pd.DataFrame(summary_data)
# display the summary
print("comparison summary:")
print("="*80)
for _, row in summary_df.iterrows():
print(f"{row['gt_column']} vs {row['result_column']}:")
print(f" matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})")
print()
# create gradient styled table
cm = sns.light_palette("green", as_cmap=True)
print("comparison results with gradient:")
comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)
# display the gradient table
comprehensive_gradient
# if you want to see the actual comparison data
print("\nraw comparison data:")
print(comprehensive_matrix.head())
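# A vectorized sketch of the same 0/1 matrix (an assumption, not part of this
# commit); NaN and non-numeric values compare as False and therefore stay 0:
# for gt_col, result_col in column_mapping.items():
#     s1 = pd.to_numeric(df[gt_col], errors='coerce')
#     s2 = pd.to_numeric(df[result_col], errors='coerce')
#     comprehensive_matrix[f"{gt_col}_vs_{result_col}"] = ((s1 - s2).abs() <= 0.5).astype(int)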
##
# %% name
# %% Styled table
import pandas as pd
import numpy as np
import seaborn as sns
@@ -602,23 +411,67 @@ import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
# 1. Faster, vectorized computation
# 1. Identify all GT and result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
data_list = []
result_columns = [col for col in df.columns if col.startswith('result.')]
print("GT Columns found:", gt_columns)
print("Result Columns found:", result_columns)
# 2. Create proper mapping between GT and result columns
# Handle various naming conventions (spaces, underscores, etc.)
column_mapping = {}
for gt_col in gt_columns:
base_name = gt_col.replace('GT.', '')
result_col = f'result.{base_name}'
if result_col in df.columns:
# 1. Convert to numeric, forcing errors to NaN
# 2. Explicitly cast to float to prevent Boolean subtraction errors
# Clean the base name for matching - remove spaces, underscores, etc.
# Try different matching approaches
candidates = [
f'result.{base_name}', # Exact match
f'result.{base_name.replace(" ", "_")}', # With underscores
f'result.{base_name.replace("_", " ")}', # With spaces
f'result.{base_name.replace(" ", "")}', # No spaces
f'result.{base_name.replace("_", "")}' # No underscores
]
# Also try case-insensitive matching
candidates.append(f'result.{base_name.lower()}')
candidates.append(f'result.{base_name.upper()}')
# Try to find matching result column
matched = False
for candidate in candidates:
if candidate in result_columns:
column_mapping[gt_col] = candidate
matched = True
break
# If no exact match found, try partial matching
if not matched:
# Try to match by removing special characters and comparing
base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' '])
for result_col in result_columns:
result_base = result_col.replace('result.', '')
result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' '])
if base_clean.lower() == result_clean.lower():
column_mapping[gt_col] = result_col
matched = True
break
print("Column mapping:", column_mapping)
# 3. Faster, vectorized computation using the corrected mapping
data_list = []
for gt_col, result_col in column_mapping.items():
print(f"Processing {gt_col} vs {result_col}")
# Convert to numeric, forcing errors to NaN
s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
# Calculate matches (abs difference <= 0.5)
# Use .count() on the non-NaN pairs as the denominator for a more accurate
# percentage, or keep len(df) to measure agreement against the total row count.
diff = np.abs(s1 - s2)
matches = (diff <= 0.5).sum()
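# Hypothetical illustration of the two denominators mentioned above:
#   valid = diff.notna().sum()             # rows where both values are numeric
#   percentage = matches / valid * 100     # agreement among comparable rows only
#   percentage = matches / len(df) * 100   # agreement measured against all rows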
@@ -630,24 +483,29 @@ for gt_col in gt_columns:
else:
percentage = 0
# Extract clean base name for display
base_name = gt_col.replace('GT.', '')
data_list.append({
'GT': base_name,
'Match %': round(percentage, 1)
})
# 2. Prepare Data for Plotting
# 4. Prepare Data for Plotting
match_df = pd.DataFrame(data_list)
# We pivot to get a format suitable for a heatmap
plot_data = match_df.set_index('GT')
# 3. Create the Plot
# Handle the case where no column pairs could be mapped
if len(match_df) == 0:
print("No valid column pairs found for comparison")
exit()
# 5. Create the Plot
plt.figure(figsize=(10, 8))
sns.set_theme(style="white")
# Create heatmap
ax = sns.heatmap(
plot_data,
match_df.set_index('GT')[['Match %']], # Just the percentage column
annot=True, # Show the numbers in the boxes
fmt=".1f", # Format to 1 decimal place
cmap="YlGnBu", # Yellow-Green-Blue color palette
@@ -658,10 +516,11 @@ ax = sns.heatmap(
plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
plt.tight_layout()
# 4. Save as SVG
plt.savefig("agreement_table.svg", format='svg')
print("Successfully saved agreement_table.svg")
# 6. Save as SVG
#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
#print("Successfully saved agreement_table.svg")
# Show plot if running in a GUI environment
plt.show()
##