From 8f34f06578e7946122be3b447a83239c2f2bd578 Mon Sep 17 00:00:00 2001 From: Shahin Ramezanzadeh Date: Mon, 19 Jan 2026 01:04:00 +0100 Subject: [PATCH] ugly plot --- Data/show_plots.py | 305 ++++++++++++--------------------------------- 1 file changed, 82 insertions(+), 223 deletions(-) diff --git a/Data/show_plots.py b/Data/show_plots.py index 8803a35..b24578e 100644 --- a/Data/show_plots.py +++ b/Data/show_plots.py @@ -400,200 +400,9 @@ for old_name, new_name in column_mapping.items(): ## -# %% name -import pandas as pd -import numpy as np -import seaborn as sns - -# Sample data (replace with your actual df) -df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t') - -# Identify GT and Result columns -gt_columns = [col for col in df.columns if col.startswith('GT.')] -result_columns = [col for col in df.columns if col.startswith('result.')] - -# Create mapping -column_mapping = {} -for gt_col in gt_columns: - base_name = gt_col.replace('GT.', '') - result_col = f'result.{base_name}' - if result_col in result_columns: - column_mapping[gt_col] = result_col - -# Function to compute match percentage for each GT-Result pair -def compute_match_percentages(df, column_mapping): - percentages = [] - for gt_col, result_col in column_mapping.items(): - count = 0 - total = len(df) - - for _, row in df.iterrows(): - gt_val = row[gt_col] - result_val = row[result_col] - - # Handle NaN values - if pd.isna(gt_val) or pd.isna(result_val): - continue - - # Handle non-numeric values - try: - gt_float = float(gt_val) - result_float = float(result_val) - except (ValueError, TypeError): - # Skip rows with non-numeric values - continue - - # Check if values are within 0.5 tolerance - if abs(gt_float - result_float) <= 0.5: - count += 1 - - percentage = (count / total) * 100 - percentages.append({ - 'GT_Column': gt_col, - 'Result_Column': result_col, - 'Match_Percentage': round(percentage, 1) - }) - - return pd.DataFrame(percentages) - -# Compute match percentages -match_df = compute_match_percentages(df, column_mapping) - -# Create a pivot table for gradient display (optional but helpful) -pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0) - -# Apply gradient background -cm = sns.light_palette("green", as_cmap=True) -styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None) - -# Display result -print("Agreement Percentage Table (with gradient):") -styled_table -## - - - - -# %% name -import pandas as pd -import numpy as np -import seaborn as sns - -# first, let's identify the gt and result columns -gt_columns = [col for col in df.columns if col.startswith('gt.')] -result_columns = [col for col in df.columns if col.startswith('result.')] - -# create a mapping dictionary for matching columns -column_mapping = {} -for gt_col in gt_columns: - # extract the base name (remove 'gt.' prefix) - base_name = gt_col.replace('gt.', '') - - # find matching result column - result_col = f'result.{base_name}' - if result_col in result_columns: - column_mapping[gt_col] = result_col - -# create comparison dataframe with error handling -def safe_float_convert(value): - '''safely convert value to float, returning nan for non-numeric values''' - if pd.isna(value): - return np.nan - try: - return float(value) - except (valueerror, typeerror): - return np.nan - -def create_comparison_table(df, column_mapping): - # create a table showing match status for all comparisons - comparison_matrix = pd.dataframe(index=df.index, columns=[f"{gt_col}_vs_{result_col}" for gt_col, result_col in column_mapping.items()]) - - for idx, row in df.iterrows(): - for gt_col, result_col in column_mapping.items(): - gt_val = row[gt_col] - result_val = row[result_col] - - # handle nan values - if pd.isna(gt_val) or pd.isna(result_val): - comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0 - else: - # safely convert to float - gt_float = safe_float_convert(gt_val) - result_float = safe_float_convert(result_val) - - # if either conversion failed, mark as no match - if pd.isna(gt_float) or pd.isna(result_float): - comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0 - else: - # check if values are within 0.5 tolerance - if abs(gt_float - result_float) <= 0.5: - comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 1 - else: - comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0 - - return comparison_matrix - -# generate the comparison matrix -comprehensive_matrix = create_comparison_table(df, column_mapping) - -# create summary statistics -summary_data = [] -for gt_col, result_col in column_mapping.items(): - match_count = 0 - total_count = len(df) - - for idx, row in df.iterrows(): - gt_val = row[gt_col] - result_val = row[result_col] - - # handle nan values - if pd.isna(gt_val) or pd.isna(result_val): - continue - else: - # safely convert to float - gt_float = safe_float_convert(gt_val) - result_float = safe_float_convert(result_val) - - # if both conversions succeeded, check tolerance - if not pd.isna(gt_float) and not pd.isna(result_float): - if abs(gt_float - result_float) <= 0.5: - match_count += 1 - - summary_data.append({ - 'gt_column': gt_col, - 'result_column': result_col, - 'match_count': match_count, - 'total_records': total_count, - 'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%" - }) - -summary_df = pd.dataframe(summary_data) - -# display the summary -print("comparison summary:") -print("="*80) -for _, row in summary_df.iterrows(): - print(f"{row['gt_column']} vs {row['result_column']}:") - print(f" matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})") - print() - -# create gradient styled table -cm = sns.light_palette("green", as_cmap=true) -print("comparison results with gradient:") -comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0) - -# display the gradient table -comprehensive_gradient - -# if you want to see the actual comparison data -print("\nraw comparison data:") -print(comprehensive_matrix.head()) - -## - -# %% name +# %% Styled table import pandas as pd import numpy as np import seaborn as sns @@ -602,52 +411,101 @@ import matplotlib.pyplot as plt # Load data df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t') -# 1. Faster, vectorized computation +# 1. Identify all GT and result columns gt_columns = [col for col in df.columns if col.startswith('GT.')] -data_list = [] +result_columns = [col for col in df.columns if col.startswith('result.')] + +print("GT Columns found:", gt_columns) +print("Result Columns found:", result_columns) + +# 2. Create proper mapping between GT and result columns +# Handle various naming conventions (spaces, underscores, etc.) +column_mapping = {} for gt_col in gt_columns: base_name = gt_col.replace('GT.', '') - result_col = f'result.{base_name}' - if result_col in df.columns: - # 1. Convert to numeric, forcing errors to NaN - # 2. Explicitly cast to float to prevent Boolean subtraction errors - s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float) - s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float) + # Clean the base name for matching - remove spaces, underscores, etc. + # Try different matching approaches + candidates = [ + f'result.{base_name}', # Exact match + f'result.{base_name.replace(" ", "_")}', # With underscores + f'result.{base_name.replace("_", " ")}', # With spaces + f'result.{base_name.replace(" ", "")}', # No spaces + f'result.{base_name.replace("_", "")}' # No underscores + ] - # Calculate matches (abs difference <= 0.5) - # We use .count() to find how many non-NaN values we have if you want - # a more accurate percentage, or keep len(df) for the total row count. - diff = np.abs(s1 - s2) - matches = (diff <= 0.5).sum() + # Also try case-insensitive matching + candidates.append(f'result.{base_name.lower()}') + candidates.append(f'result.{base_name.upper()}') - # Determine the denominator (total valid comparisons) - valid_count = diff.notna().sum() + # Try to find matching result column + matched = False + for candidate in candidates: + if candidate in result_columns: + column_mapping[gt_col] = candidate + matched = True + break - if valid_count > 0: - percentage = (matches / valid_count) * 100 - else: - percentage = 0 + # If no exact match found, try partial matching + if not matched: + # Try to match by removing special characters and comparing + base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' ']) + for result_col in result_columns: + result_base = result_col.replace('result.', '') + result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' ']) + if base_clean.lower() == result_clean.lower(): + column_mapping[gt_col] = result_col + matched = True + break - data_list.append({ - 'GT': base_name, - 'Match %': round(percentage, 1) - }) +print("Column mapping:", column_mapping) +# 3. Faster, vectorized computation using the corrected mapping +data_list = [] -# 2. Prepare Data for Plotting +for gt_col, result_col in column_mapping.items(): + print(f"Processing {gt_col} vs {result_col}") + + # Convert to numeric, forcing errors to NaN + s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float) + s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float) + + # Calculate matches (abs difference <= 0.5) + diff = np.abs(s1 - s2) + matches = (diff <= 0.5).sum() + + # Determine the denominator (total valid comparisons) + valid_count = diff.notna().sum() + + if valid_count > 0: + percentage = (matches / valid_count) * 100 + else: + percentage = 0 + + # Extract clean base name for display + base_name = gt_col.replace('GT.', '') + + data_list.append({ + 'GT': base_name, + 'Match %': round(percentage, 1) + }) + +# 4. Prepare Data for Plotting match_df = pd.DataFrame(data_list) -# We pivot to get a format suitable for a heatmap -plot_data = match_df.set_index('GT') -# 3. Create the Plot +# Handle case where no matches were found +if len(match_df) == 0: + print("No valid column pairs found for comparison") + exit() + +# 5. Create the Plot plt.figure(figsize=(10, 8)) sns.set_theme(style="white") # Create heatmap ax = sns.heatmap( - plot_data, + match_df.set_index('GT')[['Match %']], # Just the percentage column annot=True, # Show the numbers in the boxes fmt=".1f", # Format to 1 decimal place cmap="YlGnBu", # Yellow-Green-Blue color palette @@ -658,10 +516,11 @@ ax = sns.heatmap( plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20) plt.tight_layout() -# 4. Save as SVG -plt.savefig("agreement_table.svg", format='svg') -print("Successfully saved agreement_table.svg") +# 6. Save as SVG +#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight') +#print("Successfully saved agreement_table.svg") # Show plot if running in a GUI environment plt.show() + ##