ugly plot

2026-01-19 01:04:00 +01:00
parent eabde3fcb1
commit 8f34f06578


@@ -400,200 +400,9 @@ for old_name, new_name in column_mapping.items():
##
# %% name
import pandas as pd
import numpy as np
import seaborn as sns
# Load the data (adjust the path to your own file if needed)
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
# Identify GT and Result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]
# Create mapping
column_mapping = {}
for gt_col in gt_columns:
base_name = gt_col.replace('GT.', '')
result_col = f'result.{base_name}'
if result_col in result_columns:
column_mapping[gt_col] = result_col
# Function to compute match percentage for each GT-Result pair
def compute_match_percentages(df, column_mapping):
percentages = []
for gt_col, result_col in column_mapping.items():
count = 0
total = len(df)
for _, row in df.iterrows():
gt_val = row[gt_col]
result_val = row[result_col]
# Handle NaN values
if pd.isna(gt_val) or pd.isna(result_val):
continue
# Handle non-numeric values
try:
gt_float = float(gt_val)
result_float = float(result_val)
except (ValueError, TypeError):
# Skip rows with non-numeric values
continue
# Check if values are within 0.5 tolerance
if abs(gt_float - result_float) <= 0.5:
count += 1
percentage = (count / total) * 100
percentages.append({
'GT_Column': gt_col,
'Result_Column': result_col,
'Match_Percentage': round(percentage, 1)
})
return pd.DataFrame(percentages)
# Compute match percentages
match_df = compute_match_percentages(df, column_mapping)
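# Note: rows with NaN or non-numeric values are skipped in the numerator but
# still counted in the denominator (total = len(df)), so missing data lowers
# the reported match percentage.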
# Create a pivot table for gradient display (optional but helpful)
pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
# Apply gradient background
cm = sns.light_palette("green", as_cmap=True)
styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
# Display result
print("Agreement Percentage Table (with gradient):")
styled_table
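# Note: the gradient only renders when the Styler is displayed in a notebook cell.
# A minimal sketch for exporting it elsewhere (assumes pandas >= 1.3, which
# provides Styler.to_html):
# styled_table.to_html("agreement_table.html")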
##
# %% name
import pandas as pd
import numpy as np
import seaborn as sns
# first, let's identify the gt and result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]
# create a mapping dictionary for matching columns
column_mapping = {}
for gt_col in gt_columns:
# extract the base name (remove 'GT.' prefix)
base_name = gt_col.replace('GT.', '')
# find matching result column
result_col = f'result.{base_name}'
if result_col in result_columns:
column_mapping[gt_col] = result_col
# create comparison dataframe with error handling
def safe_float_convert(value):
'''safely convert value to float, returning nan for non-numeric values'''
if pd.isna(value):
return np.nan
try:
return float(value)
except (ValueError, TypeError):
return np.nan
def create_comparison_table(df, column_mapping):
# create a table showing match status for all comparisons
comparison_matrix = pd.DataFrame(index=df.index, columns=[f"{gt_col}_vs_{result_col}" for gt_col, result_col in column_mapping.items()])
for idx, row in df.iterrows():
for gt_col, result_col in column_mapping.items():
gt_val = row[gt_col]
result_val = row[result_col]
# handle nan values
if pd.isna(gt_val) or pd.isna(result_val):
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
else:
# safely convert to float
gt_float = safe_float_convert(gt_val)
result_float = safe_float_convert(result_val)
# if either conversion failed, mark as no match
if pd.isna(gt_float) or pd.isna(result_float):
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
else:
# check if values are within 0.5 tolerance
if abs(gt_float - result_float) <= 0.5:
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 1
else:
comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
return comparison_matrix
# generate the comparison matrix
comprehensive_matrix = create_comparison_table(df, column_mapping)
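# Note: a DataFrame built from index/columns alone holds object dtype; if the
# 0/1 matrix is needed as numbers downstream, a cast such as
# comprehensive_matrix.astype(int) may help (an assumption, not in this commit).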
# create summary statistics
summary_data = []
for gt_col, result_col in column_mapping.items():
match_count = 0
total_count = len(df)
for idx, row in df.iterrows():
gt_val = row[gt_col]
result_val = row[result_col]
# handle nan values
if pd.isna(gt_val) or pd.isna(result_val):
continue
else:
# safely convert to float
gt_float = safe_float_convert(gt_val)
result_float = safe_float_convert(result_val)
# if both conversions succeeded, check tolerance
if not pd.isna(gt_float) and not pd.isna(result_float):
if abs(gt_float - result_float) <= 0.5:
match_count += 1
summary_data.append({
'gt_column': gt_col,
'result_column': result_col,
'match_count': match_count,
'total_records': total_count,
'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
})
summary_df = pd.DataFrame(summary_data)
# display the summary
print("comparison summary:")
print("="*80)
for _, row in summary_df.iterrows():
print(f"{row['gt_column']} vs {row['result_column']}:")
print(f" matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})")
print()
# create gradient styled table
cm = sns.light_palette("green", as_cmap=True)
print("comparison results with gradient:")
comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)
# display the gradient table
comprehensive_gradient
# if you want to see the actual comparison data
print("\nraw comparison data:")
print(comprehensive_matrix.head())
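# A vectorized sketch of the same 0/1 matrix (an assumption, not part of this
# commit); NaN and non-numeric values compare as False and therefore stay 0:
# for gt_col, result_col in column_mapping.items():
#     s1 = pd.to_numeric(df[gt_col], errors='coerce')
#     s2 = pd.to_numeric(df[result_col], errors='coerce')
#     comprehensive_matrix[f"{gt_col}_vs_{result_col}"] = ((s1 - s2).abs() <= 0.5).astype(int)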
##
# %% name
# %% Styled table
import pandas as pd
import numpy as np
import seaborn as sns
@@ -602,23 +411,67 @@ import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
# 1. Faster, vectorized computation
# 1. Identify all GT and result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
data_list = []
result_columns = [col for col in df.columns if col.startswith('result.')]
print("GT Columns found:", gt_columns)
print("Result Columns found:", result_columns)
# 2. Create proper mapping between GT and result columns
# Handle various naming conventions (spaces, underscores, etc.)
column_mapping = {}
for gt_col in gt_columns:
base_name = gt_col.replace('GT.', '')
result_col = f'result.{base_name}'
if result_col in df.columns:
# 1. Convert to numeric, forcing errors to NaN
# 2. Explicitly cast to float to prevent Boolean subtraction errors
# Clean the base name for matching - remove spaces, underscores, etc.
# Try different matching approaches
candidates = [
f'result.{base_name}', # Exact match
f'result.{base_name.replace(" ", "_")}', # With underscores
f'result.{base_name.replace("_", " ")}', # With spaces
f'result.{base_name.replace(" ", "")}', # No spaces
f'result.{base_name.replace("_", "")}' # No underscores
]
# Also try case-insensitive matching
candidates.append(f'result.{base_name.lower()}')
candidates.append(f'result.{base_name.upper()}')
# Try to find matching result column
matched = False
for candidate in candidates:
if candidate in result_columns:
column_mapping[gt_col] = candidate
matched = True
break
# If no exact match found, try partial matching
if not matched:
# Try to match by removing special characters and comparing
base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' '])
for result_col in result_columns:
result_base = result_col.replace('result.', '')
result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' '])
if base_clean.lower() == result_clean.lower():
column_mapping[gt_col] = result_col
matched = True
break
print("Column mapping:", column_mapping)
# 3. Faster, vectorized computation using the corrected mapping
data_list = []
for gt_col, result_col in column_mapping.items():
print(f"Processing {gt_col} vs {result_col}")
# Convert to numeric, forcing errors to NaN
s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
# Calculate matches (abs difference <= 0.5)
# Use .count() on the non-NaN pairs as the denominator for a more accurate
# percentage, or keep len(df) to measure agreement against the total row count.
diff = np.abs(s1 - s2)
matches = (diff <= 0.5).sum()
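# Hypothetical illustration of the two denominators mentioned above:
#   valid = diff.notna().sum()             # rows where both values are numeric
#   percentage = matches / valid * 100     # agreement among comparable rows only
#   percentage = matches / len(df) * 100   # agreement measured against all rows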
@@ -630,24 +483,29 @@ for gt_col in gt_columns:
else:
percentage = 0
# Extract clean base name for display
base_name = gt_col.replace('GT.', '')
data_list.append({
'GT': base_name,
'Match %': round(percentage, 1)
})
# 2. Prepare Data for Plotting
# 4. Prepare Data for Plotting
match_df = pd.DataFrame(data_list)
# We pivot to get a format suitable for a heatmap
plot_data = match_df.set_index('GT')
# 3. Create the Plot
# Handle the case where no column pairs could be mapped
if len(match_df) == 0:
print("No valid column pairs found for comparison")
exit()
# 5. Create the Plot
plt.figure(figsize=(10, 8))
sns.set_theme(style="white")
# Create heatmap
ax = sns.heatmap(
plot_data,
match_df.set_index('GT')[['Match %']], # Just the percentage column
annot=True, # Show the numbers in the boxes
fmt=".1f", # Format to 1 decimal place
cmap="YlGnBu", # Yellow-Green-Blue color palette
@@ -658,10 +516,11 @@ ax = sns.heatmap(
plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
plt.tight_layout()
# 4. Save as SVG
plt.savefig("agreement_table.svg", format='svg')
print("Successfully saved agreement_table.svg")
# 6. Save as SVG
#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
#print("Successfully saved agreement_table.svg")
# Show plot if running in a GUI environment
plt.show()
##