# %% Scatter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load your data from TSV file
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_results+MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.tsv'
df = pd.read_csv(file_path, sep='\t')

# Replace the decimal comma with a dot, then convert to float.
# Entries that still cannot be parsed become NaN (errors='coerce').
for edss_col in ('GT_EDSS', 'LLM_Results'):
    df[edss_col] = pd.to_numeric(
        df[edss_col].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# Drop rows where either column is NaN
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])

# Create scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(df_clean['GT_EDSS'], df_clean['LLM_Results'], alpha=0.7, color='blue')

# Add labels and title
plt.xlabel('GT_EDSS')
plt.ylabel('LLM_Results')
plt.title('Comparison of GT_EDSS vs LLM_Results')

# Diagonal reference line (perfect prediction). It spans the larger of the
# two columns so that points with LLM_Results above the GT maximum are still
# covered by the line.
diag_max = max(df_clean['GT_EDSS'].max(), df_clean['LLM_Results'].max())
plt.plot([0, diag_max], [0, diag_max],
         color='red', linestyle='--', label='Perfect Prediction')
plt.legend()

# Show plot
plt.grid(True)
plt.tight_layout()
plt.show()
##

# %% Bland-Altman
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

# Load your data from TSV file
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_results+MS_Briefe_400_with_unique_id_SHA3_explore_cleaned.tsv'
df = pd.read_csv(file_path, sep='\t')

# Replace the decimal comma with a dot, then convert to float (invalid -> NaN)
for edss_col in ('GT_EDSS', 'LLM_Results'):
    df[edss_col] = pd.to_numeric(
        df[edss_col].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# Drop rows where either column is NaN
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])

# Create Bland-Altman plot
f, ax = plt.subplots(1, figsize=(8, 5))
sm.graphics.mean_diff_plot(df_clean['GT_EDSS'], df_clean['LLM_Results'], ax=ax)

# Add labels and title
ax.set_title('Bland-Altman Plot: GT_EDSS vs LLM_Results')
ax.set_xlabel('Mean of GT_EDSS and LLM_Results')
ax.set_ylabel('Difference between GT_EDSS and LLM_Results')

# Display Bland-Altman plot
plt.tight_layout()
plt.show()

# Print some statistics. The differences are computed once and reused.
# np.std uses its default ddof=0 here (population standard deviation).
edss_diff = df_clean['GT_EDSS'] - df_clean['LLM_Results']
mean_diff = np.mean(edss_diff)
std_diff = np.std(edss_diff)
print(f"Mean difference: {mean_diff:.3f}")
print(f"Standard deviation of differences: {std_diff:.3f}")
print(f"95% Limits of Agreement: [{mean_diff - 1.96*std_diff:.3f}, {mean_diff + 1.96*std_diff:.3f}]")
##

# %% Confusion matrix
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Load your data from TSV file
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
df = pd.read_csv(file_path, sep='\t')

# Replace the decimal comma with a dot, then convert to float (invalid -> NaN)
for edss_col in ('GT_EDSS', 'LLM_Results'):
    df[edss_col] = pd.to_numeric(
        df[edss_col].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# Drop rows where either column is NaN. The explicit copy makes df_clean an
# independent frame, so adding the category columns below does not trigger
# pandas' SettingWithCopyWarning.
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results']).copy()

# Category labels for the confusion matrix, in ascending order. The label at
# position i (0-based) covers values up to and including i + 1.
EDSS_CATEGORY_LABELS = ['0-1', '1-2', '2-3', '3-4', '4-5',
                        '5-6', '6-7', '7-8', '8-9', '9-10']


def categorize_edss(value):
    """Map a numeric score to its one-point-wide category label.

    Upper bounds are inclusive: everything <= 1.0 (including negative
    values) is '0-1', (1.0, 2.0] is '1-2', ..., (9.0, 10.0] is '9-10'.
    Values above 10.0 are labelled '10+'. Missing values are returned
    as NaN.
    """
    if pd.isna(value):
        return np.nan
    for upper_bound, label in enumerate(EDSS_CATEGORY_LABELS, start=1):
        if value <= upper_bound:
            return label
    return '10+'


# Create categorical versions
df_clean['GT_EDSS_cat'] = df_clean['GT_EDSS'].apply(categorize_edss)
df_clean['LLM_Results_cat'] = df_clean['LLM_Results'].apply(categorize_edss)

# Remove any NaN categories
df_clean = df_clean.dropna(subset=['GT_EDSS_cat', 'LLM_Results_cat'])

# Create confusion matrix. Only the ten regular categories are listed, so
# rows categorised as '10+' are not counted in the matrix.
cm = confusion_matrix(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat'],
                      labels=EDSS_CATEGORY_LABELS)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=EDSS_CATEGORY_LABELS,
            yticklabels=EDSS_CATEGORY_LABELS)
plt.title('Confusion Matrix: GT_EDSS vs interferred EDSS (Categorized 0-10)')
plt.xlabel('LLM_Results Category')
plt.ylabel('GT_EDSS Category')
plt.tight_layout()
plt.show()

# Print classification report
print("Classification Report:")
print(classification_report(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat']))

# Print raw counts
print("\nConfusion Matrix (Raw Counts):")
print(cm)
##

# %% Classification
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# Load your data from TSV file
file_path ='/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
df = pd.read_csv(file_path, sep='\t')

# Check data structure
print("Data shape:", df.shape)
print("First few rows:")
print(df.head())
print("\nColumn names:")
for col in df.columns:
    print(f" {col}")


def safe_bool_convert(series):
    """Convert a Series of mixed true/false spellings to a bool Series.

    Values are compared as stripped, lower-cased strings. Recognised
    spellings are true/false, 1/0 (also '1.0'/'0.0', which is what an
    integer flag column becomes once pandas stores it as float because
    of missing values), yes/no and y/n. Everything else, including
    missing values, is treated as False.

    Returns a Series with dtype bool and the same index as the input.
    """
    bool_map = {
        'true': True, '1': True, '1.0': True, 'yes': True, 'y': True,
        'false': False, '0': False, '0.0': False, 'no': False, 'n': False,
    }
    series_str = series.astype(str).str.strip().str.lower()
    # Look every value up with an explicit False default instead of
    # map() + fillna(), which would leave an object-dtype column behind.
    return series_str.map(lambda text: bool_map.get(text, False)).astype(bool)


# Convert columns safely
for klass_col in ('LLM_klassifizierbar', 'GT_klassifizierbar'):
    if klass_col in df.columns:
        print(f"\n{klass_col} column info:")
        print(df[klass_col].head(10))
        print("Unique values:", df[klass_col].unique())
        df[klass_col] = safe_bool_convert(df[klass_col])
        print("After conversion:")
        print(df[klass_col].value_counts())

has_both_klass_cols = ('LLM_klassifizierbar' in df.columns
                       and 'GT_klassifizierbar' in df.columns)

# Create bar chart showing only True values for klassifizierbar
if has_both_klass_cols:
    # Get counts for True values only
    llm_true_count = df['LLM_klassifizierbar'].sum()
    gt_true_count = df['GT_klassifizierbar'].sum()

    # Plot using matplotlib directly
    fig, ax = plt.subplots(figsize=(8, 6))
    x = np.arange(2)
    width = 0.35

    # One bar per tick, centred on the tick position so that the tick labels
    # and the value annotations (both placed at x) line up with the bars.
    ax.bar(x[0], llm_true_count, width, label='LLM', color='skyblue', alpha=0.8)
    ax.bar(x[1], gt_true_count, width, label='GT', color='lightcoral', alpha=0.8)

    # Add value labels on bars
    ax.annotate(f'{llm_true_count}', xy=(x[0], llm_true_count), xytext=(0, 3),
                textcoords="offset points", ha='center', va='bottom')
    ax.annotate(f'{gt_true_count}', xy=(x[1], gt_true_count), xytext=(0, 3),
                textcoords="offset points", ha='center', va='bottom')

    ax.set_xlabel('Classification Status (klassifizierbar)')
    ax.set_ylabel('Count')
    ax.set_title('True Values Comparison: LLM vs GT for "klassifizierbar"')
    ax.set_xticks(x)
    ax.set_xticklabels(['LLM', 'GT'])
    ax.legend()
    plt.tight_layout()
    plt.show()

# Create confusion matrix if both columns exist
if has_both_klass_cols:
    try:
        # Ensure both columns are boolean
        llm_bool = df['LLM_klassifizierbar'].fillna(False).astype(bool)
        gt_bool = df['GT_klassifizierbar'].fillna(False).astype(bool)

        # Fix the label order so the matrix is always 2x2 (False, True),
        # even when only one of the two classes occurs in the data;
        # otherwise the hard-coded tick labels below would not match.
        cm = confusion_matrix(gt_bool, llm_bool, labels=[False, True])

        # Plot confusion matrix
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['False ', 'True '],
                    yticklabels=['False', 'True '],
                    ax=ax)
        ax.set_xlabel('LLM Predictions ')
        ax.set_ylabel('GT Labels ')
        ax.set_title('Confusion Matrix: LLM vs GT for "klassifizierbar"')
        plt.tight_layout()
        plt.show()

        print("Confusion Matrix:")
        print(cm)
    except ValueError as e:
        print(f"Error creating confusion matrix: {e}")

# Show final data info. DataFrame.info() writes its report itself and returns
# None, so it is called directly instead of being wrapped in print(). Only
# columns that actually exist are selected to avoid a KeyError.
print("\nFinal DataFrame info:")
present_klass_cols = [c for c in ('LLM_klassifizierbar', 'GT_klassifizierbar')
                      if c in df.columns]
df[present_klass_cols].info()
##

# %% Boxplot
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load your data from TSV file
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
df = pd.read_csv(file_path, sep='\t')

# Replace the decimal comma with a dot, then convert to float (invalid -> NaN)
for edss_col in ('GT_EDSS', 'LLM_Results'):
    df[edss_col] = pd.to_numeric(
        df[edss_col].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# Drop rows where either column is NaN
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])

# 1. DEFINE CATEGORY ORDER
# This ensures the X-axis is numerically logical (0-1 comes before 1-2)
category_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6',
                  '6-7', '7-8', '8-9', '9-10', '10+']

# Convert the column to a Categorical type with the specific order.
# categorize_edss is defined in the "Confusion matrix" cell, so that cell
# must have been run before this one. assign() returns a new frame, which
# avoids writing into the result of dropna() (SettingWithCopyWarning).
df_clean = df_clean.assign(
    GT_EDSS_cat=pd.Categorical(
        df_clean['GT_EDSS'].apply(categorize_edss),
        categories=category_order,
        ordered=True,
    )
)

plt.figure(figsize=(14, 8))

# 2. ADD HUE FOR LEGEND
# Assigning x to 'hue' allows Seaborn to generate a legend automatically
box_plot = sns.boxplot(
    data=df_clean,
    x='GT_EDSS_cat',
    y='LLM_Results',
    hue='GT_EDSS_cat',  # Added hue
    palette='viridis',
    linewidth=1.5,
    legend=True  # Ensure legend is enabled
)

# 3. CUSTOMIZE PLOT
plt.title('Distribution of LLM_Results by GT_EDSS Category', fontsize=18, pad=20)
plt.xlabel('Ground Truth EDSS Category', fontsize=14)
plt.ylabel('LLM Predicted EDSS', fontsize=14)

# Move legend to the side or top
plt.legend(title="EDSS Categories", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
##

# %% Postprocessing Column names
import pandas as pd

# Read the TSV file
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv'
df = pd.read_csv(file_path, sep='\t')

# Mapping from the original (German) column names to the 'GT.'-prefixed
# English names used by the agreement cells below.
# NOTE(review): 'Sensibiliät' is spelled without the second 't' - confirm
# that this matches the header in the TSV file, otherwise that column is
# silently left unrenamed.
column_mapping = {
    'EDSS': 'GT.EDSS',
    'klassifizierbar': 'GT.klassifizierbar',
    'Sehvermögen': 'GT.VISUAL_OPTIC_FUNCTIONS',
    'Cerebellum': 'GT.CEREBELLAR_FUNCTIONS',
    'Hirnstamm': 'GT.BRAINSTEM_FUNCTIONS',
    'Sensibiliät': 'GT.SENSORY_FUNCTIONS',
    'Pyramidalmotorik': 'GT.PYRAMIDAL_FUNCTIONS',
    'Ambulation': 'GT.AMBULATION',
    'Cerebrale_Funktion': 'GT.CEREBRAL_FUNCTIONS',
    'Blasen-_und_Mastdarmfunktion': 'GT.BOWEL_AND_BLADDER_FUNCTIONS'
}

# Work out which columns will really be renamed BEFORE renaming: afterwards
# the old names are no longer in df.columns, so checking then would report
# nothing at all.
renamed_columns = {old_name: new_name
                   for old_name, new_name in column_mapping.items()
                   if old_name in df.columns}
missing_columns = [old_name for old_name in column_mapping
                   if old_name not in df.columns]

# Rename columns
df = df.rename(columns=renamed_columns)

# Save the modified dataframe back to the same TSV file (overwrites it)
df.to_csv(file_path, sep='\t', index=False)

print("Columns have been successfully renamed!")
print("Renamed columns:")
for old_name, new_name in renamed_columns.items():
    print(f" {old_name} -> {new_name}")
if missing_columns:
    # Typically the case when the file has already been processed once.
    print("Not found in file (left untouched):", ", ".join(missing_columns))
##

# %% Agreement table (styled DataFrame)
import pandas as pd
import numpy as np
import seaborn as sns

# Load the table with the renamed columns
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')

# Identify GT and Result columns
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]

# Pair every 'GT.<name>' column with its 'result.<name>' counterpart
column_mapping = {}
for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    result_col = f'result.{base_name}'
    if result_col in result_columns:
        column_mapping[gt_col] = result_col


def compute_match_percentages(df, column_mapping):
    """Return the share of rows in which each GT/result pair agrees.

    Two values agree when both can be converted with float() and their
    absolute difference is at most 0.5. Rows with a missing or
    non-numeric value in either column count as "no match"; the
    denominator is always the total number of rows in ``df``.

    Args:
        df: DataFrame containing all columns named in ``column_mapping``.
        column_mapping: dict mapping a GT column name to its result column.

    Returns:
        DataFrame with one row per pair and the columns 'GT_Column',
        'Result_Column' and 'Match_Percentage' (rounded to one decimal).
        For an empty ``df`` every percentage is 0.0.
    """
    total = len(df)
    percentages = []
    for gt_col, result_col in column_mapping.items():
        count = 0
        # Walk the two columns in lockstep instead of using iterrows(),
        # which builds a full Series object for every row.
        for gt_val, result_val in zip(df[gt_col], df[result_col]):
            if pd.isna(gt_val) or pd.isna(result_val):
                continue
            try:
                gt_float = float(gt_val)
                result_float = float(result_val)
            except (ValueError, TypeError):
                # Skip rows with non-numeric values
                continue
            if abs(gt_float - result_float) <= 0.5:
                count += 1

        # Guard against division by zero for an empty table
        percentage = (count / total) * 100 if total else 0.0
        percentages.append({
            'GT_Column': gt_col,
            'Result_Column': result_col,
            'Match_Percentage': round(percentage, 1)
        })
    return pd.DataFrame(percentages)


# Compute match percentages
match_df = compute_match_percentages(df, column_mapping)

# Create a pivot table for gradient display (optional but helpful)
pivot_table = match_df.set_index(['GT_Column', 'Result_Column'])['Match_Percentage'].unstack(fill_value=0)

# Apply gradient background
cm = sns.light_palette("green", as_cmap=True)
styled_table = pivot_table.style.background_gradient(cmap=cm, axis=None)

# Display result
print("Agreement Percentage Table (with gradient):")
styled_table
##

# %% Per-row agreement matrix
import pandas as pd
import numpy as np
import seaborn as sns

# This cell does not load data itself; it works on the df that is already
# in memory from the previous cell.

# First, identify the GT and result columns. The prefix must be spelled
# 'GT.' to match the renamed column names.
gt_columns = [col for col in df.columns if col.startswith('GT.')]
result_columns = [col for col in df.columns if col.startswith('result.')]

# Create a mapping dictionary for matching columns
column_mapping = {}
for gt_col in gt_columns:
    # Extract the base name (remove the 'GT.' prefix)
    base_name = gt_col.replace('GT.', '')
    # Find the matching result column
    result_col = f'result.{base_name}'
    if result_col in result_columns:
        column_mapping[gt_col] = result_col


def safe_float_convert(value):
    """Return ``value`` as a float, or NaN if it is missing or not numeric."""
    if pd.isna(value):
        return np.nan
    try:
        return float(value)
    except (ValueError, TypeError):
        return np.nan


def create_comparison_table(df, column_mapping):
    """Build a per-row 0/1 match matrix for every GT/result column pair.

    A cell is 1 when both values are numeric and differ by at most 0.5,
    otherwise 0 (this includes missing and non-numeric values).

    Args:
        df: DataFrame containing all columns named in ``column_mapping``.
        column_mapping: dict mapping a GT column name to its result column.

    Returns:
        Integer DataFrame with the same index as ``df`` and one column
        named '<gt_col>_vs_<result_col>' per pair.
    """
    match_flags = {}
    for gt_col, result_col in column_mapping.items():
        # safe_float_convert yields NaN for anything unusable, and any
        # comparison involving NaN is False, so those rows become 0.
        match_flags[f"{gt_col}_vs_{result_col}"] = [
            int(abs(safe_float_convert(gt_val) - safe_float_convert(result_val)) <= 0.5)
            for gt_val, result_val in zip(df[gt_col], df[result_col])
        ]
    return pd.DataFrame(match_flags, index=df.index)


# Generate the comparison matrix
comprehensive_matrix = create_comparison_table(df, column_mapping)

# Summary statistics: number and share of matching rows per column pair.
# Relies on df, column_mapping, safe_float_convert and comprehensive_matrix
# from the first part of this cell.
summary_data = []
total_count = len(df)
for gt_col, result_col in column_mapping.items():
    # A row matches when both values convert to float and differ by at most
    # 0.5. safe_float_convert returns NaN for missing/non-numeric values and
    # any comparison with NaN is False, so such rows are not counted.
    match_count = sum(
        abs(safe_float_convert(gt_val) - safe_float_convert(result_val)) <= 0.5
        for gt_val, result_val in zip(df[gt_col], df[result_col])
    )
    summary_data.append({
        'gt_column': gt_col,
        'result_column': result_col,
        'match_count': match_count,
        'total_records': total_count,
        'match_percentage': f"{(match_count/total_count*100):.1f}%" if total_count > 0 else "0.0%"
    })

summary_df = pd.DataFrame(summary_data)

# Display the summary
print("comparison summary:")
print("="*80)
for _, row in summary_df.iterrows():
    print(f"{row['gt_column']} vs {row['result_column']}:")
    print(f" matches: {row['match_count']}/{row['total_records']} ({row['match_percentage']})")
    print()

# Create gradient styled table
cm = sns.light_palette("green", as_cmap=True)
print("comparison results with gradient:")
comprehensive_gradient = comprehensive_matrix.style.background_gradient(cmap=cm, axis=0)

# NOTE(review): a bare expression is only rendered by notebook-style front
# ends when it is the LAST statement of a cell; here more statements follow,
# so this line presumably shows nothing - confirm in your environment.
comprehensive_gradient

# If you want to see the actual comparison data
print("\nraw comparison data:")
print(comprehensive_matrix.head())
##

# %% Agreement heatmap (vectorized, saved as SVG)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv", sep='\t')

# 1. Faster, vectorized computation
# For every 'GT.<name>' column that has a 'result.<name>' partner, compute the
# percentage of rows (relative to all rows of df) whose two values differ by
# at most 0.5. df is the table loaded at the top of this cell.
gt_columns = [col for col in df.columns if col.startswith('GT.')]
data_list = []

for gt_col in gt_columns:
    base_name = gt_col.replace('GT.', '')
    result_col = f'result.{base_name}'
    if result_col not in df.columns:
        # No counterpart to compare against
        continue

    # Anything that is not a number becomes NaN
    gt_values = pd.to_numeric(df[gt_col], errors='coerce')
    result_values = pd.to_numeric(df[result_col], errors='coerce')

    # Comparisons with NaN are False, so incomplete rows never count as a
    # match; summing the boolean mask counts the True entries.
    within_tolerance = np.abs(gt_values - result_values) <= 0.5
    matches = within_tolerance.sum()
    percentage = (matches / len(df)) * 100

    data_list.append({
        'GT': base_name,
        'Match %': round(percentage, 1)
    })

# 2. Prepare Data for Plotting
# Index by function name so the heatmap gets one labelled row per function
match_df = pd.DataFrame(data_list)
plot_data = match_df.set_index('GT')

# 3. Create the Plot
plt.figure(figsize=(10, 8))
sns.set_theme(style="white")

# Keyword arguments for the heatmap, collected in one place
heatmap_options = dict(
    annot=True,                            # Show the numbers in the boxes
    fmt=".1f",                             # Format to 1 decimal place
    cmap="YlGnBu",                         # Yellow-Green-Blue color palette
    cbar_kws={'label': 'Agreement (%)'},
    linewidths=.5,
)
ax = sns.heatmap(plot_data, **heatmap_options)

plt.title('Agreement Percentage (Tolerance ±0.5)', pad=20)
plt.tight_layout()

# 4. Save as SVG (written to the current working directory)
plt.savefig("agreement_table.svg", format='svg')
print("Successfully saved agreement_table.svg")

# Show plot if running in a GUI environment
plt.show()
##