From ce3baff6ccf3d6679dd5d862452f6577960cd4e9 Mon Sep 17 00:00:00 2001 From: Shahin Ramezanzadeh Date: Mon, 19 Jan 2026 02:29:38 +0100 Subject: [PATCH] optimize with new column names --- Data/show_plots.py | 98 +++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/Data/show_plots.py b/Data/show_plots.py index bda9e3c..51f24ef 100644 --- a/Data/show_plots.py +++ b/Data/show_plots.py @@ -93,19 +93,19 @@ from sklearn.metrics import confusion_matrix, classification_report import seaborn as sns # Load your data from TSV file -file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv' +file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv' df = pd.read_csv(file_path, sep='\t') -# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results -df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.') -df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.') +# Replace comma with dot for numeric conversion in GT.EDSS and result.EDSS +df['GT.EDSS'] = df['GT.EDSS'].astype(str).str.replace(',', '.') +df['result.EDSS'] = df['result.EDSS'].astype(str).str.replace(',', '.') # Convert to float (handle invalid entries gracefully) -df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce') -df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce') +df['GT.EDSS'] = pd.to_numeric(df['GT.EDSS'], errors='coerce') +df['result.EDSS'] = pd.to_numeric(df['result.EDSS'], errors='coerce') # Drop rows where either column is NaN -df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results']) +df_clean = df.dropna(subset=['GT.EDSS', 'result.EDSS']) # For confusion matrix, we need to categorize the values # Let's create categories up to 10 (0-1, 1-2, 2-3, ..., 9-10) @@ -136,14 +136,14 @@ def categorize_edss(value): return '10+' # Create categorical versions -df_clean['GT_EDSS_cat'] = df_clean['GT_EDSS'].apply(categorize_edss) -df_clean['LLM_Results_cat'] = df_clean['LLM_Results'].apply(categorize_edss) +df_clean['GT.EDSS_cat'] = df_clean['GT.EDSS'].apply(categorize_edss) +df_clean['result.EDSS_cat'] = df_clean['result.EDSS'].apply(categorize_edss) # Remove any NaN categories -df_clean = df_clean.dropna(subset=['GT_EDSS_cat', 'LLM_Results_cat']) +df_clean = df_clean.dropna(subset=['GT.EDSS_cat', 'result.EDSS_cat']) # Create confusion matrix -cm = confusion_matrix(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat'], +cm = confusion_matrix(df_clean['GT.EDSS_cat'], df_clean['result.EDSS_cat'], labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']) # Plot confusion matrix @@ -151,15 +151,15 @@ plt.figure(figsize=(10, 8)) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10'], yticklabels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10']) -plt.title('Confusion Matrix: GT_EDSS vs interferred EDSS (Categorized 0-10)') -plt.xlabel('LLM_Results Category') -plt.ylabel('GT_EDSS Category') +plt.title('Confusion Matrix: Ground truth EDSS vs interferred EDSS (Categorized 0-10)') +plt.xlabel('LLM Generated EDSS') +plt.ylabel('Ground Truth EDSS') plt.tight_layout() plt.show() # Print classification report print("Classification Report:") -print(classification_report(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat'])) +print(classification_report(df_clean['GT.EDSS_cat'], df_clean['result.EDSS_cat'])) # Print raw counts print("\nConfusion Matrix (Raw Counts):") @@ -177,7 +177,7 @@ from sklearn.metrics import confusion_matrix import numpy as np # Load your data from TSV file -file_path ='/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv' +file_path ='/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv' df = pd.read_csv(file_path, sep='\t') @@ -209,29 +209,29 @@ def safe_bool_convert(series): return converted # Convert columns safely -if 'LLM_klassifizierbar' in df.columns: - print("\nLLM_klassifizierbar column info:") - print(df['LLM_klassifizierbar'].head(10)) - print("Unique values:", df['LLM_klassifizierbar'].unique()) +if 'result.klassifizierbar' in df.columns: + print("\nresult.klassifizierbar column info:") + print(df['result.klassifizierbar'].head(10)) + print("Unique values:", df['result.klassifizierbar'].unique()) - df['LLM_klassifizierbar'] = safe_bool_convert(df['LLM_klassifizierbar']) + df['result.klassifizierbar'] = safe_bool_convert(df['result.klassifizierbar']) print("After conversion:") - print(df['LLM_klassifizierbar'].value_counts()) + print(df['result.klassifizierbar'].value_counts()) -if 'GT_klassifizierbar' in df.columns: - print("\nGT_klassifizierbar column info:") - print(df['GT_klassifizierbar'].head(10)) - print("Unique values:", df['GT_klassifizierbar'].unique()) +if 'GT.klassifizierbar' in df.columns: + print("\nGT.klassifizierbar column info:") + print(df['GT.klassifizierbar'].head(10)) + print("Unique values:", df['GT.klassifizierbar'].unique()) - df['GT_klassifizierbar'] = safe_bool_convert(df['GT_klassifizierbar']) + df['GT.klassifizierbar'] = safe_bool_convert(df['GT.klassifizierbar']) print("After conversion:") - print(df['GT_klassifizierbar'].value_counts()) + print(df['GT.klassifizierbar'].value_counts()) # Create bar chart showing only True values for klassifizierbar -if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns: +if 'result.klassifizierbar' in df.columns and 'GT.klassifizierbar' in df.columns: # Get counts for True values only - llm_true_count = df['LLM_klassifizierbar'].sum() - gt_true_count = df['GT_klassifizierbar'].sum() + llm_true_count = df['result.klassifizierbar'].sum() + gt_true_count = df['GT.klassifizierbar'].sum() # Plot using matplotlib directly fig, ax = plt.subplots(figsize=(8, 6)) @@ -266,11 +266,11 @@ if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns: plt.show() # Create confusion matrix if both columns exist -if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns: +if 'result.klassifizierbar' in df.columns and 'GT.klassifizierbar' in df.columns: try: # Ensure both columns are boolean - llm_bool = df['LLM_klassifizierbar'].fillna(False).astype(bool) - gt_bool = df['GT_klassifizierbar'].fillna(False).astype(bool) + llm_bool = df['result.klassifizierbar'].fillna(False).astype(bool) + gt_bool = df['GT.klassifizierbar'].fillna(False).astype(bool) cm = confusion_matrix(gt_bool, llm_bool) @@ -295,7 +295,7 @@ if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns: # Show final data info print("\nFinal DataFrame info:") -print(df[['LLM_klassifizierbar', 'GT_klassifizierbar']].info()) +print(df[['result.klassifizierbar', 'GT.klassifizierbar']].info()) ## @@ -312,23 +312,23 @@ import numpy as np file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv' df = pd.read_csv(file_path, sep='\t') -# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results -df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.') -df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.') +# Replace comma with dot for numeric conversion in GT.EDSS and result.EDSS +df['GT.EDSS'] = df['GT.EDSS'].astype(str).str.replace(',', '.') +df['result.EDSS'] = df['result.EDSS'].astype(str).str.replace(',', '.') # Convert to float (handle invalid entries gracefully) -df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce') -df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce') +df['GT.EDSS'] = pd.to_numeric(df['GT.EDSS'], errors='coerce') +df['result.EDSS'] = pd.to_numeric(df['result.EDSS'], errors='coerce') # Drop rows where either column is NaN -df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results']) +df_clean = df.dropna(subset=['GT.EDSS', 'result.EDSS']) # 1. DEFINE CATEGORY ORDER # This ensures the X-axis is numerically logical (0-1 comes before 1-2) category_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10+'] # Convert the column to a Categorical type with the specific order -df_clean['GT_EDSS_cat'] = pd.Categorical(df_clean['GT_EDSS'].apply(categorize_edss), +df_clean['GT.EDSS_cat'] = pd.Categorical(df_clean['GT.EDSS'].apply(categorize_edss), categories=category_order, ordered=True) @@ -338,16 +338,16 @@ plt.figure(figsize=(14, 8)) # Assigning x to 'hue' allows Seaborn to generate a legend automatically box_plot = sns.boxplot( data=df_clean, - x='GT_EDSS_cat', - y='LLM_Results', - hue='GT_EDSS_cat', # Added hue + x='GT.EDSS_cat', + y='result.EDSS', + hue='GT.EDSS_cat', # Added hue palette='viridis', linewidth=1.5, legend=True # Ensure legend is enabled ) # 3. CUSTOMIZE PLOT -plt.title('Distribution of LLM_Results by GT_EDSS Category', fontsize=18, pad=20) +plt.title('Distribution of result.EDSS by GT.EDSS Category', fontsize=18, pad=20) plt.xlabel('Ground Truth EDSS Category', fontsize=14) plt.ylabel('LLM Predicted EDSS', fontsize=14) @@ -545,7 +545,7 @@ def create_luxury_table(df, output_file="edss_agreement.png"): plt.title('EDSS Subcategory Consistency Analysis', fontsize=16, pad=40, fontweight='bold', color='#2c3e50') # Add a subtle footer - plt.figtext(0.5, 0.02, "Tolerance: ±0.5 points | N = [Total Samples]", + plt.figtext(0.5, 0.0, "Tolerance: ±0.5 points", wrap=True, horizontalalignment='center', fontsize=10, color='gray', style='italic') # Save with high resolution @@ -561,8 +561,8 @@ create_luxury_table(match_df) save_styled_table(match_df) # 6. Save as SVG -#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight') -#print("Successfully saved agreement_table.svg") +plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight') +print("Successfully saved agreement_table.svg") # Show plot if running in a GUI environment plt.show()