ugly plot
This commit is contained in:
@@ -400,200 +400,9 @@ for old_name, new_name in column_mapping.items():
|
|||||||
##
|
##
|
||||||
|
|
||||||
|
|
||||||
# %% name
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
# Sample data (replace with your actual df)
|
|
||||||
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
|
|
||||||
|
|
||||||
# Identify GT and Result columns
|
|
||||||
gt_columns = [col for col in df.columns if col.startswith('GT.')]
|
|
||||||
result_columns = [col for col in df.columns if col.startswith('result.')]
|
|
||||||
|
|
||||||
# Create mapping
|
|
||||||
column_mapping = {}
|
|
||||||
for gt_col in gt_columns:
|
|
||||||
base_name = gt_col.replace('GT.', '')
|
|
||||||
result_col = f'result.{base_name}'
|
|
||||||
if result_col in result_columns:
|
|
||||||
column_mapping[gt_col] = result_col
|
|
||||||
|
|
||||||
# Function to compute match percentage for each GT-Result pair
|
|
||||||
def compute_match_percentages(df, column_mapping):
    """Return the share of rows where each GT/result pair agrees within 0.5.

    Args:
        df: DataFrame holding every column named in ``column_mapping``.
        column_mapping: Mapping of ground-truth column name to the result
            column name it is compared against.

    Returns:
        A DataFrame with one row per pair and the columns ``GT_Column``,
        ``Result_Column`` and ``Match_Percentage`` (rounded to one decimal).
        The percentage is taken over ``len(df)``: rows with missing or
        non-numeric values stay in the denominator and never count as a
        match. An empty ``df`` gives ``0.0`` rather than dividing by zero.
    """
    total = len(df)
    percentages = []

    for gt_col, result_col in column_mapping.items():
        # Coerce non-numeric cells to NaN; a NaN difference compares False
        # against the tolerance, so such rows are simply not counted.
        gt_values = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
        result_values = pd.to_numeric(df[result_col], errors='coerce').astype(float)

        count = int(((gt_values - result_values).abs() <= 0.5).sum())
        percentage = (count / total) * 100 if total else 0.0

        percentages.append({
            'GT_Column': gt_col,
            'Result_Column': result_col,
            'Match_Percentage': round(percentage, 1),
        })

    # Name the columns explicitly so an empty mapping still yields a frame
    # with the expected schema.
    return pd.DataFrame(
        percentages,
        columns=['GT_Column', 'Result_Column', 'Match_Percentage'],
    )
|
|
||||||
|
|
||||||
# Compute match percentages
|
|
||||||
match_df = compute_match_percentages(df, column_mapping)
|
|
||||||
|
|
||||||
# Create a pivot table for gradient display (optional but helpful)
|
|
||||||
pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
|
|
||||||
|
|
||||||
# Apply gradient background
|
|
||||||
cm = sns.light_palette("green", as_cmap=True)
|
|
||||||
styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
|
|
||||||
|
|
||||||
# Display result
|
|
||||||
print("Agreement Percentage Table (with gradient):")
|
|
||||||
styled_table
|
|
||||||
|
|
||||||
|
|
||||||
##
|
# %% Styled table
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# %% name
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
# first, let's identify the gt and result columns
|
|
||||||
gt_columns = [col for col in df.columns if col.startswith('gt.')]
|
|
||||||
result_columns = [col for col in df.columns if col.startswith('result.')]
|
|
||||||
|
|
||||||
# create a mapping dictionary for matching columns
|
|
||||||
column_mapping = {}
|
|
||||||
for gt_col in gt_columns:
|
|
||||||
# extract the base name (remove 'gt.' prefix)
|
|
||||||
base_name = gt_col.replace('gt.', '')
|
|
||||||
|
|
||||||
# find matching result column
|
|
||||||
result_col = f'result.{base_name}'
|
|
||||||
if result_col in result_columns:
|
|
||||||
column_mapping[gt_col] = result_col
|
|
||||||
|
|
||||||
# create comparison dataframe with error handling
|
|
||||||
def safe_float_convert(value):
    """Convert ``value`` to ``float``, returning NaN when that is not possible.

    Args:
        value: A scalar cell value (number, string, ``None``, NaN, ...).

    Returns:
        ``float(value)`` when the conversion succeeds, otherwise ``np.nan``.
        Missing values (as judged by ``pd.isna``) also map to ``np.nan``.
    """
    if pd.isna(value):
        return np.nan
    try:
        return float(value)
    except (ValueError, TypeError):
        # ValueError: unparsable string; TypeError: unsupported object type.
        return np.nan
|
|
||||||
|
|
||||||
def create_comparison_table(df, column_mapping):
    """Build a per-row 0/1 agreement matrix for every GT/result column pair.

    Args:
        df: DataFrame holding every column named in ``column_mapping``.
        column_mapping: Mapping of ground-truth column name to the result
            column name it is compared against.

    Returns:
        A DataFrame indexed like ``df`` with one integer column per pair,
        named ``"<gt_col>_vs_<result_col>"``. A cell is 1 when both values
        are numeric and differ by at most 0.5; missing or non-numeric values
        give 0.
    """
    match_flags = {}

    for gt_col, result_col in column_mapping.items():
        # Coerce non-numeric cells to NaN; NaN never satisfies the tolerance
        # check, so those rows come out as 0.
        gt_values = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
        result_values = pd.to_numeric(df[result_col], errors='coerce').astype(float)
        within_tolerance = (gt_values - result_values).abs() <= 0.5

        # Store plain arrays so the frame below is assembled positionally,
        # which also works when df.index contains duplicate labels.
        match_flags[f"{gt_col}_vs_{result_col}"] = within_tolerance.astype(int).to_numpy()

    return pd.DataFrame(match_flags, index=df.index)
|
|
||||||
|
|
||||||
# Generate the comparison matrix: one 0/1 column per GT/result pair.
comprehensive_matrix = create_comparison_table(df, column_mapping)

# Summary statistics. The matrix already holds the per-row tolerance check,
# so each pair's match count is the sum of its column; no second pass over
# the rows is needed.
total_count = len(df)
summary_data = []
for gt_col, result_col in column_mapping.items():
    match_count = int(comprehensive_matrix[f"{gt_col}_vs_{result_col}"].sum())
    summary_data.append({
        'gt_column': gt_col,
        'result_column': result_col,
        'match_count': match_count,
        'total_records': total_count,
        # Guard against an empty frame instead of dividing by zero.
        'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
    })

summary_df = pd.DataFrame(summary_data)

# Display the summary.
print("comparison summary:")
print("="*80)
for entry in summary_data:
    print(f"{entry['gt_column']} vs {entry['result_column']}:")
    print(f" matches: {entry['match_count']}/{entry['total_records']} ({entry['match_percentage']})")
    print()

# Create the gradient-styled table (coloured per column, axis=0).
cm = sns.light_palette("green", as_cmap=True)
print("comparison results with gradient:")
comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)

# Bare expression: renders the styled table when run as a notebook/cell.
comprehensive_gradient

# Raw comparison data, for inspecting the underlying 0/1 values.
print("\nraw comparison data:")
print(comprehensive_matrix.head())
|
|
||||||
|
|
||||||
##
|
|
||||||
|
|
||||||
# %% name
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
@@ -602,23 +411,67 @@ import matplotlib.pyplot as plt
|
|||||||
# Load data
|
# Load data
|
||||||
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
|
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
|
||||||
|
|
||||||
# 1. Faster, vectorized computation
|
# 1. Identify all GT and result columns
|
||||||
gt_columns = [col for col in df.columns if col.startswith('GT.')]
|
gt_columns = [col for col in df.columns if col.startswith('GT.')]
|
||||||
data_list = []
|
result_columns = [col for col in df.columns if col.startswith('result.')]
|
||||||
|
|
||||||
|
print("GT Columns found:", gt_columns)
|
||||||
|
print("Result Columns found:", result_columns)
|
||||||
|
|
||||||
|
# 2. Create proper mapping between GT and result columns
|
||||||
|
# Handle various naming conventions (spaces, underscores, etc.)
|
||||||
|
column_mapping = {}
|
||||||
|
|
||||||
for gt_col in gt_columns:
|
for gt_col in gt_columns:
|
||||||
base_name = gt_col.replace('GT.', '')
|
base_name = gt_col.replace('GT.', '')
|
||||||
result_col = f'result.{base_name}'
|
|
||||||
|
|
||||||
if result_col in df.columns:
|
# Clean the base name for matching - remove spaces, underscores, etc.
|
||||||
# 1. Convert to numeric, forcing errors to NaN
|
# Try different matching approaches
|
||||||
# 2. Explicitly cast to float to prevent Boolean subtraction errors
|
candidates = [
|
||||||
|
f'result.{base_name}', # Exact match
|
||||||
|
f'result.{base_name.replace(" ", "_")}', # With underscores
|
||||||
|
f'result.{base_name.replace("_", " ")}', # With spaces
|
||||||
|
f'result.{base_name.replace(" ", "")}', # No spaces
|
||||||
|
f'result.{base_name.replace("_", "")}' # No underscores
|
||||||
|
]
|
||||||
|
|
||||||
|
# Also try case-insensitive matching
|
||||||
|
candidates.append(f'result.{base_name.lower()}')
|
||||||
|
candidates.append(f'result.{base_name.upper()}')
|
||||||
|
|
||||||
|
# Try to find matching result column
|
||||||
|
matched = False
|
||||||
|
for candidate in candidates:
|
||||||
|
if candidate in result_columns:
|
||||||
|
column_mapping[gt_col] = candidate
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# If no exact match found, try partial matching
|
||||||
|
if not matched:
|
||||||
|
# Try to match by removing special characters and comparing
|
||||||
|
base_clean = ''.join(e for e in base_name if e.isalnum() or e in ['_', ' '])
|
||||||
|
for result_col in result_columns:
|
||||||
|
result_base = result_col.replace('result.', '')
|
||||||
|
result_clean = ''.join(e for e in result_base if e.isalnum() or e in ['_', ' '])
|
||||||
|
if base_clean.lower() == result_clean.lower():
|
||||||
|
column_mapping[gt_col] = result_col
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
|
||||||
|
print("Column mapping:", column_mapping)
|
||||||
|
|
||||||
|
# 3. Faster, vectorized computation using the corrected mapping
|
||||||
|
data_list = []
|
||||||
|
|
||||||
|
for gt_col, result_col in column_mapping.items():
|
||||||
|
print(f"Processing {gt_col} vs {result_col}")
|
||||||
|
|
||||||
|
# Convert to numeric, forcing errors to NaN
|
||||||
s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
|
s1 = pd.to_numeric(df[gt_col], errors='coerce').astype(float)
|
||||||
s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
|
s2 = pd.to_numeric(df[result_col], errors='coerce').astype(float)
|
||||||
|
|
||||||
# Calculate matches (abs difference <= 0.5)
|
# Calculate matches (abs difference <= 0.5)
|
||||||
# We use .count() to find how many non-NaN values we have if you want
|
|
||||||
# a more accurate percentage, or keep len(df) for the total row count.
|
|
||||||
diff = np.abs(s1 - s2)
|
diff = np.abs(s1 - s2)
|
||||||
matches = (diff <= 0.5).sum()
|
matches = (diff <= 0.5).sum()
|
||||||
|
|
||||||
@@ -630,24 +483,29 @@ for gt_col in gt_columns:
|
|||||||
else:
|
else:
|
||||||
percentage = 0
|
percentage = 0
|
||||||
|
|
||||||
|
# Extract clean base name for display
|
||||||
|
base_name = gt_col.replace('GT.', '')
|
||||||
|
|
||||||
data_list.append({
|
data_list.append({
|
||||||
'GT': base_name,
|
'GT': base_name,
|
||||||
'Match %': round(percentage, 1)
|
'Match %': round(percentage, 1)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# 4. Prepare Data for Plotting
|
||||||
# 2. Prepare Data for Plotting
|
|
||||||
match_df = pd.DataFrame(data_list)
|
match_df = pd.DataFrame(data_list)
|
||||||
# We pivot to get a format suitable for a heatmap
|
|
||||||
plot_data = match_df.set_index('GT')
|
|
||||||
|
|
||||||
# 3. Create the Plot
|
# Handle case where no matches were found
|
||||||
|
if len(match_df) == 0:
|
||||||
|
print("No valid column pairs found for comparison")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
# 5. Create the Plot
|
||||||
plt.figure(figsize=(10, 8))
|
plt.figure(figsize=(10, 8))
|
||||||
sns.set_theme(style="white")
|
sns.set_theme(style="white")
|
||||||
|
|
||||||
# Create heatmap
|
# Create heatmap
|
||||||
ax = sns.heatmap(
|
ax = sns.heatmap(
|
||||||
plot_data,
|
match_df.set_index('GT')[['Match %']], # Just the percentage column
|
||||||
annot=True, # Show the numbers in the boxes
|
annot=True, # Show the numbers in the boxes
|
||||||
fmt=".1f", # Format to 1 decimal place
|
fmt=".1f", # Format to 1 decimal place
|
||||||
cmap="YlGnBu", # Yellow-Green-Blue color palette
|
cmap="YlGnBu", # Yellow-Green-Blue color palette
|
||||||
@@ -658,10 +516,11 @@ ax = sns.heatmap(
|
|||||||
plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
|
plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
|
|
||||||
# 4. Save as SVG
|
# 6. Save as SVG
|
||||||
plt.savefig("agreement_table.svg", format='svg')
|
#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
|
||||||
print("Successfully saved agreement_table.svg")
|
#print("Successfully saved agreement_table.svg")
|
||||||
|
|
||||||
# Show plot if running in a GUI environment
|
# Show plot if running in a GUI environment
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
##
|
##
|
||||||
|
|||||||
Reference in New Issue
Block a user