"""EDSS agreement report.

Reads a tab-separated file, pairs every ``GT.*`` column with its
``result.*`` counterpart, computes the share of rows where both values agree
within a tolerance of 0.5, and exports the result as a styled table image.
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dataframe_image as dfi

GT_PREFIX = 'GT.'
RESULT_PREFIX = 'result.'


def _strip_special(text):
    """Return *text* keeping only alphanumerics, underscores and spaces."""
    return ''.join(ch for ch in text if ch.isalnum() or ch in '_ ')


def _find_result_column(base_name, result_columns):
    """Return the result column corresponding to *base_name*, or ``None``.

    Args:
        base_name: GT column name with the ``GT.`` prefix removed.
        result_columns: All column names that start with ``result.``.

    Returns:
        The first matching column name, or ``None`` when nothing matches.
    """
    # Direct candidates, tried in order of decreasing strictness.
    candidates = [
        f'{RESULT_PREFIX}{base_name}',                       # Exact match
        f'{RESULT_PREFIX}{base_name.replace(" ", "_")}',     # With underscores
        f'{RESULT_PREFIX}{base_name.replace("_", " ")}',     # With spaces
        f'{RESULT_PREFIX}{base_name.replace(" ", "")}',      # No spaces
        f'{RESULT_PREFIX}{base_name.replace("_", "")}',      # No underscores
        f'{RESULT_PREFIX}{base_name.lower()}',               # All lower case
        f'{RESULT_PREFIX}{base_name.upper()}',               # All upper case
    ]
    for candidate in candidates:
        if candidate in result_columns:
            return candidate

    # Fallback: compare case-insensitively after dropping special characters.
    base_clean = _strip_special(base_name).lower()
    for result_col in result_columns:
        # Slice instead of str.replace so only the leading prefix is removed.
        result_base = result_col[len(RESULT_PREFIX):]
        if _strip_special(result_base).lower() == base_clean:
            return result_col

    return None


def style_agreement_table(df):
    """Return a pandas Styler rendering the agreement table.

    Args:
        df: DataFrame with a ``'Match %'`` column holding percentages.

    Returns:
        The Styler with percentage formatting, a red-to-green gradient on
        ``'Match %'`` (scaled from 50 to 100), cell/header styling and a
        caption applied.
    """
    return (
        df.style
        .format({'Match %': '{:.1f}%'})  # Add % sign
        # Red to Green gradient
        .background_gradient(cmap='RdYlGn', subset=['Match %'], vmin=50, vmax=100)
        .set_properties(**{
            'text-align': 'center',
            'font-size': '12pt',
            'border-collapse': 'collapse',
            'border': '1px solid #D3D3D3',
        })
        .set_table_styles([
            # Style the header
            {'selector': 'th', 'props': [
                ('background-color', '#404040'),
                ('color', 'white'),
                ('font-weight', 'bold'),
                ('text-transform', 'uppercase'),
                ('padding', '10px'),
            ]},
            # Add hover effect
            {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]},
        ])
        .set_caption("EDSS Agreement Analysis: Ground Truth vs. Results (Tolerance ±0.5)")
    )


# Load data
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')

# 1. Identify all GT and result columns
gt_columns = [col for col in df.columns if col.startswith(GT_PREFIX)]
result_columns = [col for col in df.columns if col.startswith(RESULT_PREFIX)]
print("GT Columns found:", gt_columns)
print("Result Columns found:", result_columns)

# 2. Create proper mapping between GT and result columns
# Handle various naming conventions (spaces, underscores, etc.)
column_mapping = {}
for gt_col in gt_columns:
    result_col = _find_result_column(gt_col[len(GT_PREFIX):], result_columns)
    if result_col is None:
        # Make dropped columns visible instead of skipping them silently.
        print(f"Warning: no result column found for {gt_col}; skipping")
        continue
    column_mapping[gt_col] = result_col
print("Column mapping:", column_mapping)

if not column_mapping:
    # Without any pair the table would have no 'Match %' column and the
    # sort below would fail with an unhelpful KeyError.
    raise ValueError("No matching GT./result. column pairs found; nothing to report.")

# 3. Faster, vectorized computation using the corrected mapping
data_list = []
for gt_col, result_col in column_mapping.items():
    print(f"Processing {gt_col} vs {result_col}")

    # Convert to numeric, forcing errors to NaN
    s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
    s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)

    # Calculate matches (abs difference <= 0.5); NaN compares False,
    # so rows with a missing value never count as a match.
    diff = np.abs(s1 - s2)
    matches = (diff <= 0.5).sum()

    # Determine the denominator (total valid comparisons)
    valid_count = diff.notna().sum()
    percentage = (matches / valid_count) * 100 if valid_count > 0 else 0.0

    data_list.append({
        'GT': gt_col[len(GT_PREFIX):],  # Clean base name for display
        'Match %': round(percentage, 1),
    })

# 4. Prepare Data for Plotting
match_df = pd.DataFrame(data_list)
match_df = match_df.sort_values('Match %', ascending=False)  # Sort for better visual flow

# 5. Create the Styled Gradient Table
# In a Jupyter notebook, evaluate `styled_table` as the last expression of a
# cell to display it inline.
styled_table = style_agreement_table(match_df)
dfi.export(styled_table, "styled_table.png")
#styled_table.to_html("agreement_report.html")

# 6. Save as SVG
#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
#print("Successfully saved agreement_table.svg")

# Show plot if running in a GUI environment
plt.show()