From a415632552baef4dfe8246c776d75f1e29d64b41 Mon Sep 17 00:00:00 2001
From: Shahin Ramezanzadeh
Date: Mon, 19 Jan 2026 00:39:13 +0100
Subject: [PATCH] updated git ignore and new files

---
 .gitignore            |  20 +-
 Data/show_plots.py    | 594 ++++++++++++++++++++++++++++++++++++++++++
 Data/styled_tables.py |  71 +++++
 3 files changed, 679 insertions(+), 6 deletions(-)
 create mode 100644 Data/show_plots.py
 create mode 100644 Data/styled_tables.py

diff --git a/.gitignore b/.gitignore
index 3edd7e9..072bc4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,16 @@
-# Ignore all contents of these directories
-
-/Data/
-/attach/
-/results/
-/enarcelona/
+# 1. Broad Ignores
+/Data/*
+/attach/*
+/results/*
+/enarcelona/*
 .env
+__pycache__/
+*.pyc
+
+# 2. Ignore virtual environments COMPLETELY
+# This must come BEFORE the unignore rule
+env*/
+
+# 3. The "Unignore" rule (Whitelisting)
+# We only unignore .py files that aren't already blocked by the rules above
 !**/*.py
diff --git a/Data/show_plots.py b/Data/show_plots.py
new file mode 100644
index 0000000..ea441bd
--- /dev/null
+++ b/Data/show_plots.py
@@ -0,0 +1,594 @@
+# %% Scatter
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Load your data from TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_results+MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
+df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
+df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
+
+# Convert to float (handle invalid entries gracefully)
+df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
+df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
+
+# Drop rows where either column is NaN
+df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
+
+# Create scatter plot
+plt.figure(figsize=(8, 6))
+plt.scatter(df_clean['GT_EDSS'], df_clean['LLM_Results'], alpha=0.7, color='blue')
+
+# Add labels and title
+plt.xlabel('GT_EDSS')
+plt.ylabel('LLM_Results')
+plt.title('Comparison of GT_EDSS vs LLM_Results')
+
+# Optional: Add a diagonal line for reference (perfect prediction)
+plt.plot([0, max(df_clean['GT_EDSS'])], [0, max(df_clean['GT_EDSS'])],
+         color='red', linestyle='--', label='Perfect Prediction')
+plt.legend()
+
+# Show plot
+plt.grid(True)
+plt.tight_layout()
+plt.show()
+
+##
+
+
+# %% Bland-Altman
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import statsmodels.api as sm
+
+# Load your data from TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_results+MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
+df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
+df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
+
+# Convert to float (handle invalid entries gracefully)
+df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
+df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
+
+# Drop rows where either column is NaN
+df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
+
+# Create Bland-Altman plot
+f, ax = plt.subplots(1, figsize=(8, 5))
+sm.graphics.mean_diff_plot(df_clean['GT_EDSS'], df_clean['LLM_Results'], ax=ax)
+
+# Add labels and title
+ax.set_title('Bland-Altman Plot: GT_EDSS vs LLM_Results')
+ax.set_xlabel('Mean of GT_EDSS and LLM_Results')
+ax.set_ylabel('Difference between GT_EDSS and LLM_Results')
+
+# Display Bland-Altman plot
+plt.tight_layout()
+plt.show()
+
+# Print some statistics
+mean_diff = np.mean(df_clean['GT_EDSS'] - df_clean['LLM_Results'])
+std_diff = np.std(df_clean['GT_EDSS'] - df_clean['LLM_Results'])
+print(f"Mean difference: {mean_diff:.3f}")
+print(f"Standard deviation of differences: {std_diff:.3f}")
+print(f"95% Limits of Agreement: [{mean_diff - 1.96*std_diff:.3f}, {mean_diff + 1.96*std_diff:.3f}]")
+
+##
+
+
+# %% Confusion matrix
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.metrics import confusion_matrix, classification_report
+import seaborn as sns
+
+# Load your data from TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
+df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
+df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
+
+# Convert to float (handle invalid entries gracefully)
+df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
+df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
+
+# Drop rows where either column is NaN
+df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
+
+# For the confusion matrix we need to categorize the values.
+# Create categories up to 10 (0-1, 1-2, 2-3, ..., 9-10)
+def categorize_edss(value):
+    if pd.isna(value):
+        return np.nan
+    elif value <= 1.0:
+        return '0-1'
+    elif value <= 2.0:
+        return '1-2'
+    elif value <= 3.0:
+        return '2-3'
+    elif value <= 4.0:
+        return '3-4'
+    elif value <= 5.0:
+        return '4-5'
+    elif value <= 6.0:
+        return '5-6'
+    elif value <= 7.0:
+        return '6-7'
+    elif value <= 8.0:
+        return '7-8'
+    elif value <= 9.0:
+        return '8-9'
+    elif value <= 10.0:
+        return '9-10'
+    else:
+        return '10+'
+
+# Create categorical versions
+df_clean['GT_EDSS_cat'] = df_clean['GT_EDSS'].apply(categorize_edss)
+df_clean['LLM_Results_cat'] = df_clean['LLM_Results'].apply(categorize_edss)
+
+# Remove any NaN categories
+df_clean = df_clean.dropna(subset=['GT_EDSS_cat', 'LLM_Results_cat'])
+
+# Define the label order once and reuse it for the matrix and the plot
+edss_labels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']
+
+# Create confusion matrix
+cm = confusion_matrix(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat'],
+                      labels=edss_labels)
+
+# Plot confusion matrix
+plt.figure(figsize=(10, 8))
+sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+            xticklabels=edss_labels,
+            yticklabels=edss_labels)
+plt.title('Confusion Matrix: GT_EDSS vs inferred EDSS (Categorized 0-10)')
+plt.xlabel('LLM_Results Category')
+plt.ylabel('GT_EDSS Category')
+plt.tight_layout()
+plt.show()
+
+# Print classification report
+print("Classification Report:")
+print(classification_report(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat']))
+
+# Print raw counts
+print("\nConfusion Matrix (Raw Counts):")
+print(cm)
+
+##
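+
+
+# %% Optional: summary agreement metrics
+# Illustrative addition, not part of the original analysis: a few scalar
+# agreement measures for the same cleaned pairs (assumes the cell above has
+# run, so df_clean exists). Quadratic weighted kappa is a common summary for
+# ordinal scales like the EDSS; note that scikit-learn spaces the observed
+# score levels evenly by rank, which is an approximation if some EDSS steps
+# never occur in the data. The ±0.5 tolerance mirrors the matching rule used
+# in the comparison cells further below.
+from sklearn.metrics import cohen_kappa_score
+
+abs_err = (df_clean['GT_EDSS'] - df_clean['LLM_Results']).abs()
+qwk = cohen_kappa_score(df_clean['GT_EDSS'], df_clean['LLM_Results'],
+                        weights='quadratic')
+print(f"MAE: {abs_err.mean():.3f}")
+print(f"Within ±0.5 of GT: {(abs_err <= 0.5).mean() * 100:.1f}%")
+print(f"Quadratic weighted kappa: {qwk:.3f}")
+
+##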
+
+
+# %% Classification
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import confusion_matrix
+import numpy as np
+
+# Load your data from TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Check data structure
+print("Data shape:", df.shape)
+print("First few rows:")
+print(df.head())
+print("\nColumn names:")
+for col in df.columns:
+    print(f"  {col}")
+
+# Function to safely convert to boolean
+def safe_bool_convert(series):
+    '''Safely convert series to boolean, handling various input formats'''
+    # Convert to string first, then to boolean
+    series_str = series.astype(str).str.strip().str.lower()
+
+    # Handle different true/false representations
+    bool_map = {
+        'true': True, '1': True, 'yes': True, 'y': True,
+        'false': False, '0': False, 'no': False, 'n': False
+    }
+
+    converted = series_str.map(bool_map)
+
+    # Handle remaining NaN values
+    converted = converted.fillna(False)  # or True, depending on your preference
+
+    return converted
+
+# Convert columns safely
+if 'LLM_klassifizierbar' in df.columns:
+    print("\nLLM_klassifizierbar column info:")
+    print(df['LLM_klassifizierbar'].head(10))
+    print("Unique values:", df['LLM_klassifizierbar'].unique())
+
+    df['LLM_klassifizierbar'] = safe_bool_convert(df['LLM_klassifizierbar'])
+    print("After conversion:")
+    print(df['LLM_klassifizierbar'].value_counts())
+
+if 'GT_klassifizierbar' in df.columns:
+    print("\nGT_klassifizierbar column info:")
+    print(df['GT_klassifizierbar'].head(10))
+    print("Unique values:", df['GT_klassifizierbar'].unique())
+
+    df['GT_klassifizierbar'] = safe_bool_convert(df['GT_klassifizierbar'])
+    print("After conversion:")
+    print(df['GT_klassifizierbar'].value_counts())
+
+# Create bar chart showing only True values for klassifizierbar
+if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
+    # Get counts for True values only
+    llm_true_count = df['LLM_klassifizierbar'].sum()
+    gt_true_count = df['GT_klassifizierbar'].sum()
+
+    # Plot using matplotlib directly
+    fig, ax = plt.subplots(figsize=(8, 6))
+
+    x = np.arange(2)
+    width = 0.35
+
+    # Center each bar on its tick so the tick labels line up with the bars
+    bars1 = ax.bar(x[0], llm_true_count, width, label='LLM', color='skyblue', alpha=0.8)
+    bars2 = ax.bar(x[1], gt_true_count, width, label='GT', color='lightcoral', alpha=0.8)
+
+    # Add value labels on bars
+    ax.annotate(f'{llm_true_count}',
+                xy=(x[0], llm_true_count),
+                xytext=(0, 3),
+                textcoords="offset points",
+                ha='center', va='bottom')
+
+    ax.annotate(f'{gt_true_count}',
+                xy=(x[1], gt_true_count),
+                xytext=(0, 3),
+                textcoords="offset points",
+                ha='center', va='bottom')
+
+    ax.set_xlabel('Classification Status (klassifizierbar)')
+    ax.set_ylabel('Count')
+    ax.set_title('True Values Comparison: LLM vs GT for "klassifizierbar"')
+    ax.set_xticks(x)
+    ax.set_xticklabels(['LLM', 'GT'])
+    ax.legend()
+
+    plt.tight_layout()
+    plt.show()
+
+# Create confusion matrix if both columns exist
+if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
+    try:
+        # Ensure both columns are boolean
+        llm_bool = df['LLM_klassifizierbar'].fillna(False).astype(bool)
+        gt_bool = df['GT_klassifizierbar'].fillna(False).astype(bool)
+
+        cm = confusion_matrix(gt_bool, llm_bool)
+
+        # Plot confusion matrix
+        fig, ax = plt.subplots(figsize=(8, 6))
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+                    xticklabels=['False', 'True'],
+                    yticklabels=['False', 'True'],
+                    ax=ax)
+        ax.set_xlabel('LLM Predictions')
+        ax.set_ylabel('GT Labels')
+        ax.set_title('Confusion Matrix: LLM vs GT for "klassifizierbar"')
+
+        plt.tight_layout()
+        plt.show()
+
+        print("Confusion Matrix:")
+        print(cm)
+
+    except Exception as e:
+        print(f"Error creating confusion matrix: {e}")
+
+# Show final data info (DataFrame.info() prints directly, so no print() wrapper)
+print("\nFinal DataFrame info:")
+df[['LLM_klassifizierbar', 'GT_klassifizierbar']].info()
+
+##
+
+
+# %% Boxplot
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+
+# Load your data from TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
+df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
+df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
+
+# Convert to float (handle invalid entries gracefully)
+df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
+df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
+
+# Drop rows where either column is NaN
+df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
+
+# 1. DEFINE CATEGORY ORDER
+# This ensures the X-axis is numerically logical (0-1 comes before 1-2).
+# NOTE: categorize_edss is defined in the confusion-matrix cell above, so
+# that cell has to be run first.
+category_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10+']
+
+# Convert the column to a Categorical type with the specific order
+df_clean['GT_EDSS_cat'] = pd.Categorical(df_clean['GT_EDSS'].apply(categorize_edss),
+                                         categories=category_order,
+                                         ordered=True)
+
+plt.figure(figsize=(14, 8))
+
+# 2. ADD HUE FOR LEGEND
+# Assigning x to 'hue' allows Seaborn to generate a legend automatically
+box_plot = sns.boxplot(
+    data=df_clean,
+    x='GT_EDSS_cat',
+    y='LLM_Results',
+    hue='GT_EDSS_cat',  # Added hue
+    palette='viridis',
+    linewidth=1.5,
+    legend=True  # Ensure legend is enabled (requires seaborn >= 0.13)
+)
+
+# 3. CUSTOMIZE PLOT
+plt.title('Distribution of LLM_Results by GT_EDSS Category', fontsize=18, pad=20)
+plt.xlabel('Ground Truth EDSS Category', fontsize=14)
+plt.ylabel('LLM Predicted EDSS', fontsize=14)
+
+# Move legend to the side or top
+plt.legend(title="EDSS Categories", bbox_to_anchor=(1.05, 1), loc='upper left')
+
+plt.xticks(rotation=45, ha='right', fontsize=10)
+plt.grid(True, axis='y', alpha=0.3)
+plt.tight_layout()
+
+plt.show()
+
+##
+
+
+# %% Postprocessing column names
+import pandas as pd
+
+# Read the TSV file
+file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv'
+df = pd.read_csv(file_path, sep='\t')
+
+# Create a mapping dictionary for German to English column names
+column_mapping = {
+    'EDSS': 'GT.EDSS',
+    'klassifizierbar': 'GT.klassifizierbar',
+    'Sehvermögen': 'GT.VISUAL_OPTIC_FUNCTIONS',
+    'Cerebellum': 'GT.CEREBELLAR_FUNCTIONS',
+    'Hirnstamm': 'GT.BRAINSTEM_FUNCTIONS',
+    'Sensibiliät': 'GT.SENSORY_FUNCTIONS',
+    'Pyramidalmotorik': 'GT.PYRAMIDAL_FUNCTIONS',
+    'Ambulation': 'GT.AMBULATION',
+    'Cerebrale_Funktion': 'GT.CEREBRAL_FUNCTIONS',
+    'Blasen-_und_Mastdarmfunktion': 'GT.BOWEL_AND_BLADDER_FUNCTIONS'
+}
+
+# Rename columns
+df = df.rename(columns=column_mapping)
+
+# Save the modified dataframe back to the same TSV file (overwrites the input)
+df.to_csv(file_path, sep='\t', index=False)
+
+print("Columns have been successfully renamed!")
+print("Renamed columns:")
+# After the rename the old names are gone, so check for the new names
+for old_name, new_name in column_mapping.items():
+    if new_name in df.columns:
+        print(f"  {old_name} -> {new_name}")
+
+##
+
+
+# %% Agreement percentages per subscore
+import pandas as pd
+import numpy as np
+import seaborn as sns
+
+# Sample data (replace with your actual df)
+df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
+
+# Identify GT and Result columns
+gt_columns = [col for col in df.columns if col.startswith('GT.')]
+result_columns = [col for col in df.columns if col.startswith('result.')]
+
+# Create mapping
+column_mapping = {}
+for gt_col in gt_columns:
+    base_name = gt_col.replace('GT.', '')
+    result_col = f'result.{base_name}'
+    if result_col in result_columns:
+        column_mapping[gt_col] = result_col
+
+# Function to compute match percentage for each GT-Result pair
+def compute_match_percentages(df, column_mapping):
+    percentages = []
+    for gt_col, result_col in column_mapping.items():
+        count = 0
+        # NOTE: rows skipped below (NaN or non-numeric) still count in the
+        # denominator, so missing values are treated as non-matches
+        total = len(df)
+
+        for _, row in df.iterrows():
+            gt_val = row[gt_col]
+            result_val = row[result_col]
+
+            # Handle NaN values
+            if pd.isna(gt_val) or pd.isna(result_val):
+                continue
+
+            # Handle non-numeric values
+            try:
+                gt_float = float(gt_val)
+                result_float = float(result_val)
+            except (ValueError, TypeError):
+                # Skip rows with non-numeric values
+                continue
+
+            # Check if values are within 0.5 tolerance
+            if abs(gt_float - result_float) <= 0.5:
+                count += 1
+
+        percentage = (count / total) * 100
+        percentages.append({
+            'GT_Column': gt_col,
+            'Result_Column': result_col,
+            'Match_Percentage': round(percentage, 1)
+        })
+
+    return pd.DataFrame(percentages)
+
+# Compute match percentages
+match_df = compute_match_percentages(df, column_mapping)
+
+# Create a pivot table for gradient display (optional but helpful)
+pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
+
+# Apply gradient background
+cm = sns.light_palette("green", as_cmap=True)
+styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
+
+# Display result (Styler objects only render in notebook/interactive cells)
+print("Agreement Percentage Table (with gradient):")
+styled_table
+
+##
+
+
+# %% Per-row comparison matrix
+import pandas as pd
+import numpy as np
+import seaborn as sns
+
+# First, let's identify the GT and result columns
+# (reuses the df loaded in the previous cell)
+gt_columns = [col for col in df.columns if col.startswith('GT.')]
+result_columns = [col for col in df.columns if col.startswith('result.')]
+
+# Create a mapping dictionary for matching columns
+column_mapping = {}
+for gt_col in gt_columns:
+    # Extract the base name (remove 'GT.' prefix)
+    base_name = gt_col.replace('GT.', '')
+
+    # Find matching result column
+    result_col = f'result.{base_name}'
+    if result_col in result_columns:
+        column_mapping[gt_col] = result_col
+
+# Create comparison dataframe with error handling
+def safe_float_convert(value):
+    '''Safely convert value to float, returning NaN for non-numeric values'''
+    if pd.isna(value):
+        return np.nan
+    try:
+        return float(value)
+    except (ValueError, TypeError):
+        return np.nan
+
+def create_comparison_table(df, column_mapping):
+    # Create a table showing match status for all comparisons
+    comparison_matrix = pd.DataFrame(index=df.index,
+                                     columns=[f"{gt_col}_vs_{result_col}" for gt_col, result_col in column_mapping.items()])
+
+    for idx, row in df.iterrows():
+        for gt_col, result_col in column_mapping.items():
+            gt_val = row[gt_col]
+            result_val = row[result_col]
+
+            # Handle NaN values
+            if pd.isna(gt_val) or pd.isna(result_val):
+                comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
+            else:
+                # Safely convert to float
+                gt_float = safe_float_convert(gt_val)
+                result_float = safe_float_convert(result_val)
+
+                # If either conversion failed, mark as no match
+                if pd.isna(gt_float) or pd.isna(result_float):
+                    comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
+                else:
+                    # Check if values are within 0.5 tolerance
+                    if abs(gt_float - result_float) <= 0.5:
+                        comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 1
+                    else:
+                        comparison_matrix.loc[idx, f"{gt_col}_vs_{result_col}"] = 0
+
+    # Cast to int so background_gradient can operate on numeric data
+    return comparison_matrix.astype(int)
+
+# Generate the comparison matrix
+comprehensive_matrix = create_comparison_table(df, column_mapping)
+
+# Create summary statistics
+summary_data = []
+for gt_col, result_col in column_mapping.items():
+    match_count = 0
+    total_count = len(df)
+
+    for idx, row in df.iterrows():
+        gt_val = row[gt_col]
+        result_val = row[result_col]
+
+        # Handle NaN values
+        if pd.isna(gt_val) or pd.isna(result_val):
+            continue
+        else:
+            # Safely convert to float
+            gt_float = safe_float_convert(gt_val)
+            result_float = safe_float_convert(result_val)
+
+            # If both conversions succeeded, check tolerance
+            if not pd.isna(gt_float) and not pd.isna(result_float):
+                if abs(gt_float - result_float) <= 0.5:
+                    match_count += 1
+
+    summary_data.append({
+        'gt_column': gt_col,
+        'result_column': result_col,
+        'match_count': match_count,
+        'total_records': total_count,
+        'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
+    })
+
+summary_df = pd.DataFrame(summary_data)
+
+# Display the summary
+print("Comparison summary:")
+print("=" * 80)
+for _, row in summary_df.iterrows():
+    print(f"{row['gt_column']} vs {row['result_column']}:")
+    print(f"  matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})")
+    print()
+
+# Create gradient styled table
+cm = sns.light_palette("green", as_cmap=True)
+print("Comparison results with gradient:")
+comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)
+
+# Display the gradient table (renders in notebook/interactive cells only)
+comprehensive_gradient
+
+# If you want to see the actual comparison data
+print("\nRaw comparison data:")
+print(comprehensive_matrix.head())
+
+##
diff --git a/Data/styled_tables.py b/Data/styled_tables.py
new file mode 100644
index 0000000..a42c651
--- /dev/null
+++ b/Data/styled_tables.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import numpy as np
+import seaborn as sns
+
+# Sample data (replace with your actual df)
+df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')
+
+# Identify GT and Result columns
+gt_columns = [col for col in df.columns if col.startswith('GT.')]
+result_columns = [col for col in df.columns if col.startswith('result.')]
+
+# Create mapping
+column_mapping = {}
+for gt_col in gt_columns:
+    base_name = gt_col.replace('GT.', '')
+    result_col = f'result.{base_name}'
+    if result_col in result_columns:
+        column_mapping[gt_col] = result_col
+
+# Function to compute match percentage for each GT-Result pair
+def compute_match_percentages(df, column_mapping):
+    percentages = []
+    for gt_col, result_col in column_mapping.items():
+        count = 0
+        total = len(df)
+
+        for _, row in df.iterrows():
+            gt_val = row[gt_col]
+            result_val = row[result_col]
+
+            # Handle NaN values
+            if pd.isna(gt_val) or pd.isna(result_val):
+                continue
+
+            # Handle non-numeric values
+            try:
+                gt_float = float(gt_val)
+                result_float = float(result_val)
+            except (ValueError, TypeError):
+                # Skip rows with non-numeric values
+                continue
+
+            # Check if values are within 0.5 tolerance
+            if abs(gt_float - result_float) <= 0.5:
+                count += 1
+
+        percentage = (count / total) * 100
+        percentages.append({
+            'GT_Column': gt_col,
+            'Result_Column': result_col,
+            'Match_Percentage': round(percentage, 1)
+        })
+
+    return pd.DataFrame(percentages)
+
+# Compute match percentages
+match_df = compute_match_percentages(df, column_mapping)
+
+# Create a pivot table for gradient display (optional but helpful)
+pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)
+
+# Apply gradient background
+cm = sns.light_palette("green", as_cmap=True)
+styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)
+
+# Display result
+print("Agreement Percentage Table (with gradient):")
+styled_table
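+
+# Optional export (illustrative addition, not part of the original file):
+# a bare `styled_table` expression only renders inside a notebook or
+# interactive cell. When this module runs as a plain script, the gradient
+# table can be written out as HTML instead; Styler.to_html() is available in
+# pandas >= 1.3. The output filename is just an example.
+with open('agreement_table.html', 'w') as f:
+    f.write(styled_table.to_html())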