optimize with new column names
This commit is contained in:
@@ -93,19 +93,19 @@ from sklearn.metrics import confusion_matrix, classification_report
|
||||
import seaborn as sns
|
||||
|
||||
# Load your data from TSV file
|
||||
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
|
||||
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv'
|
||||
df = pd.read_csv(file_path, sep='\t')
|
||||
|
||||
# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
|
||||
df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
|
||||
df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
|
||||
# Replace comma with dot for numeric conversion in GT.EDSS and result.EDSS
|
||||
df['GT.EDSS'] = df['GT.EDSS'].astype(str).str.replace(',', '.')
|
||||
df['result.EDSS'] = df['result.EDSS'].astype(str).str.replace(',', '.')
|
||||
|
||||
# Convert to float (handle invalid entries gracefully)
|
||||
df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
|
||||
df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
|
||||
df['GT.EDSS'] = pd.to_numeric(df['GT.EDSS'], errors='coerce')
|
||||
df['result.EDSS'] = pd.to_numeric(df['result.EDSS'], errors='coerce')
|
||||
|
||||
# Drop rows where either column is NaN
|
||||
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
|
||||
df_clean = df.dropna(subset=['GT.EDSS', 'result.EDSS'])
|
||||
|
||||
# For confusion matrix, we need to categorize the values
|
||||
# Let's create categories up to 10 (0-1, 1-2, 2-3, ..., 9-10)
|
||||
@@ -136,14 +136,14 @@ def categorize_edss(value):
|
||||
return '10+'
|
||||
|
||||
# Create categorical versions
|
||||
df_clean['GT_EDSS_cat'] = df_clean['GT_EDSS'].apply(categorize_edss)
|
||||
df_clean['LLM_Results_cat'] = df_clean['LLM_Results'].apply(categorize_edss)
|
||||
df_clean['GT.EDSS_cat'] = df_clean['GT.EDSS'].apply(categorize_edss)
|
||||
df_clean['result.EDSS_cat'] = df_clean['result.EDSS'].apply(categorize_edss)
|
||||
|
||||
# Remove any NaN categories
|
||||
df_clean = df_clean.dropna(subset=['GT_EDSS_cat', 'LLM_Results_cat'])
|
||||
df_clean = df_clean.dropna(subset=['GT.EDSS_cat', 'result.EDSS_cat'])
|
||||
|
||||
# Create confusion matrix
|
||||
cm = confusion_matrix(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat'],
|
||||
cm = confusion_matrix(df_clean['GT.EDSS_cat'], df_clean['result.EDSS_cat'],
|
||||
labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10'])
|
||||
|
||||
# Plot confusion matrix
|
||||
@@ -151,15 +151,15 @@ plt.figure(figsize=(10, 8))
|
||||
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
|
||||
xticklabels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10'],
|
||||
yticklabels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10'])
|
||||
plt.title('Confusion Matrix: GT_EDSS vs interferred EDSS (Categorized 0-10)')
|
||||
plt.xlabel('LLM_Results Category')
|
||||
plt.ylabel('GT_EDSS Category')
|
||||
plt.title('Confusion Matrix: Ground truth EDSS vs interferred EDSS (Categorized 0-10)')
|
||||
plt.xlabel('LLM Generated EDSS')
|
||||
plt.ylabel('Ground Truth EDSS')
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
# Print classification report
|
||||
print("Classification Report:")
|
||||
print(classification_report(df_clean['GT_EDSS_cat'], df_clean['LLM_Results_cat']))
|
||||
print(classification_report(df_clean['GT.EDSS_cat'], df_clean['result.EDSS_cat']))
|
||||
|
||||
# Print raw counts
|
||||
print("\nConfusion Matrix (Raw Counts):")
|
||||
@@ -177,7 +177,7 @@ from sklearn.metrics import confusion_matrix
|
||||
import numpy as np
|
||||
|
||||
# Load your data from TSV file
|
||||
file_path ='/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
|
||||
file_path ='/home/shahin/Lab/Doktorarbeit/Barcelona/Data/Join_edssandsub.tsv'
|
||||
|
||||
df = pd.read_csv(file_path, sep='\t')
|
||||
|
||||
@@ -209,29 +209,29 @@ def safe_bool_convert(series):
|
||||
return converted
|
||||
|
||||
# Convert columns safely
|
||||
if 'LLM_klassifizierbar' in df.columns:
|
||||
print("\nLLM_klassifizierbar column info:")
|
||||
print(df['LLM_klassifizierbar'].head(10))
|
||||
print("Unique values:", df['LLM_klassifizierbar'].unique())
|
||||
if 'result.klassifizierbar' in df.columns:
|
||||
print("\nresult.klassifizierbar column info:")
|
||||
print(df['result.klassifizierbar'].head(10))
|
||||
print("Unique values:", df['result.klassifizierbar'].unique())
|
||||
|
||||
df['LLM_klassifizierbar'] = safe_bool_convert(df['LLM_klassifizierbar'])
|
||||
df['result.klassifizierbar'] = safe_bool_convert(df['result.klassifizierbar'])
|
||||
print("After conversion:")
|
||||
print(df['LLM_klassifizierbar'].value_counts())
|
||||
print(df['result.klassifizierbar'].value_counts())
|
||||
|
||||
if 'GT_klassifizierbar' in df.columns:
|
||||
print("\nGT_klassifizierbar column info:")
|
||||
print(df['GT_klassifizierbar'].head(10))
|
||||
print("Unique values:", df['GT_klassifizierbar'].unique())
|
||||
if 'GT.klassifizierbar' in df.columns:
|
||||
print("\nGT.klassifizierbar column info:")
|
||||
print(df['GT.klassifizierbar'].head(10))
|
||||
print("Unique values:", df['GT.klassifizierbar'].unique())
|
||||
|
||||
df['GT_klassifizierbar'] = safe_bool_convert(df['GT_klassifizierbar'])
|
||||
df['GT.klassifizierbar'] = safe_bool_convert(df['GT.klassifizierbar'])
|
||||
print("After conversion:")
|
||||
print(df['GT_klassifizierbar'].value_counts())
|
||||
print(df['GT.klassifizierbar'].value_counts())
|
||||
|
||||
# Create bar chart showing only True values for klassifizierbar
|
||||
if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
|
||||
if 'result.klassifizierbar' in df.columns and 'GT.klassifizierbar' in df.columns:
|
||||
# Get counts for True values only
|
||||
llm_true_count = df['LLM_klassifizierbar'].sum()
|
||||
gt_true_count = df['GT_klassifizierbar'].sum()
|
||||
llm_true_count = df['result.klassifizierbar'].sum()
|
||||
gt_true_count = df['GT.klassifizierbar'].sum()
|
||||
|
||||
# Plot using matplotlib directly
|
||||
fig, ax = plt.subplots(figsize=(8, 6))
|
||||
@@ -266,11 +266,11 @@ if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
|
||||
plt.show()
|
||||
|
||||
# Create confusion matrix if both columns exist
|
||||
if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
|
||||
if 'result.klassifizierbar' in df.columns and 'GT.klassifizierbar' in df.columns:
|
||||
try:
|
||||
# Ensure both columns are boolean
|
||||
llm_bool = df['LLM_klassifizierbar'].fillna(False).astype(bool)
|
||||
gt_bool = df['GT_klassifizierbar'].fillna(False).astype(bool)
|
||||
llm_bool = df['result.klassifizierbar'].fillna(False).astype(bool)
|
||||
gt_bool = df['GT.klassifizierbar'].fillna(False).astype(bool)
|
||||
|
||||
cm = confusion_matrix(gt_bool, llm_bool)
|
||||
|
||||
@@ -295,7 +295,7 @@ if 'LLM_klassifizierbar' in df.columns and 'GT_klassifizierbar' in df.columns:
|
||||
|
||||
# Show final data info
|
||||
print("\nFinal DataFrame info:")
|
||||
print(df[['LLM_klassifizierbar', 'GT_klassifizierbar']].info())
|
||||
print(df[['result.klassifizierbar', 'GT.klassifizierbar']].info())
|
||||
|
||||
##
|
||||
|
||||
@@ -312,23 +312,23 @@ import numpy as np
|
||||
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/join_results_unique.tsv'
|
||||
df = pd.read_csv(file_path, sep='\t')
|
||||
|
||||
# Replace comma with dot for numeric conversion in GT_EDSS and LLM_Results
|
||||
df['GT_EDSS'] = df['GT_EDSS'].astype(str).str.replace(',', '.')
|
||||
df['LLM_Results'] = df['LLM_Results'].astype(str).str.replace(',', '.')
|
||||
# Replace comma with dot for numeric conversion in GT.EDSS and result.EDSS
|
||||
df['GT.EDSS'] = df['GT.EDSS'].astype(str).str.replace(',', '.')
|
||||
df['result.EDSS'] = df['result.EDSS'].astype(str).str.replace(',', '.')
|
||||
|
||||
# Convert to float (handle invalid entries gracefully)
|
||||
df['GT_EDSS'] = pd.to_numeric(df['GT_EDSS'], errors='coerce')
|
||||
df['LLM_Results'] = pd.to_numeric(df['LLM_Results'], errors='coerce')
|
||||
df['GT.EDSS'] = pd.to_numeric(df['GT.EDSS'], errors='coerce')
|
||||
df['result.EDSS'] = pd.to_numeric(df['result.EDSS'], errors='coerce')
|
||||
|
||||
# Drop rows where either column is NaN
|
||||
df_clean = df.dropna(subset=['GT_EDSS', 'LLM_Results'])
|
||||
df_clean = df.dropna(subset=['GT.EDSS', 'result.EDSS'])
|
||||
|
||||
# 1. DEFINE CATEGORY ORDER
|
||||
# This ensures the X-axis is numerically logical (0-1 comes before 1-2)
|
||||
category_order = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10+']
|
||||
|
||||
# Convert the column to a Categorical type with the specific order
|
||||
df_clean['GT_EDSS_cat'] = pd.Categorical(df_clean['GT_EDSS'].apply(categorize_edss),
|
||||
df_clean['GT.EDSS_cat'] = pd.Categorical(df_clean['GT.EDSS'].apply(categorize_edss),
|
||||
categories=category_order,
|
||||
ordered=True)
|
||||
|
||||
@@ -338,16 +338,16 @@ plt.figure(figsize=(14, 8))
|
||||
# Assigning x to 'hue' allows Seaborn to generate a legend automatically
|
||||
box_plot = sns.boxplot(
|
||||
data=df_clean,
|
||||
x='GT_EDSS_cat',
|
||||
y='LLM_Results',
|
||||
hue='GT_EDSS_cat', # Added hue
|
||||
x='GT.EDSS_cat',
|
||||
y='result.EDSS',
|
||||
hue='GT.EDSS_cat', # Added hue
|
||||
palette='viridis',
|
||||
linewidth=1.5,
|
||||
legend=True # Ensure legend is enabled
|
||||
)
|
||||
|
||||
# 3. CUSTOMIZE PLOT
|
||||
plt.title('Distribution of LLM_Results by GT_EDSS Category', fontsize=18, pad=20)
|
||||
plt.title('Distribution of result.EDSS by GT.EDSS Category', fontsize=18, pad=20)
|
||||
plt.xlabel('Ground Truth EDSS Category', fontsize=14)
|
||||
plt.ylabel('LLM Predicted EDSS', fontsize=14)
|
||||
|
||||
@@ -545,7 +545,7 @@ def create_luxury_table(df, output_file="edss_agreement.png"):
|
||||
plt.title('EDSS Subcategory Consistency Analysis', fontsize=16, pad=40, fontweight='bold', color='#2c3e50')
|
||||
|
||||
# Add a subtle footer
|
||||
plt.figtext(0.5, 0.02, "Tolerance: ±0.5 points | N = [Total Samples]",
|
||||
plt.figtext(0.5, 0.0, "Tolerance: ±0.5 points",
|
||||
wrap=True, horizontalalignment='center', fontsize=10, color='gray', style='italic')
|
||||
|
||||
# Save with high resolution
|
||||
@@ -561,8 +561,8 @@ create_luxury_table(match_df)
|
||||
save_styled_table(match_df)
|
||||
# 6. Save as SVG
|
||||
|
||||
#plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
|
||||
#print("Successfully saved agreement_table.svg")
|
||||
plt.savefig("agreement_table.svg", format='svg', dpi=300, bbox_inches='tight')
|
||||
print("Successfully saved agreement_table.svg")
|
||||
|
||||
# Show plot if running in a GUI environment
|
||||
plt.show()
|
||||
|
||||
Reference in New Issue
Block a user