Files
EDSS-calc/figure1.py
2026-02-23 15:06:54 +01:00

383 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# %% Explore
import pandas as pd

# Read the semicolon-separated dataset into `df`.
file_path = '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv'
df = pd.read_csv(file_path, sep=';')

# --- Overview: dimensions and column names ---
print("Dataset shape:", df.shape)
print("\nColumn names:")
for col in df.columns:
    print(f" {col}")

# --- Per-patient statistics ---
# `patient_counts` holds, for every distinct `unique_id`, the number of rows
# carrying that id (most frequent first).
patient_counts = df['unique_id'].value_counts()
unique_patients = df['unique_id'].nunique()
average_appearances = patient_counts.mean()

print(f"\nNumber of unique patients: {unique_patients}")
print(f"\nAverage number of times each patient appeared: {average_appearances:.2f}")
print(f"\nMinimum appearances for any patient: {patient_counts.min()}")
print(f"Maximum appearances for any patient: {patient_counts.max()}")

# The ten most frequently occurring ids.
print("\nFirst 10 patients and their appearance counts:")
print(patient_counts.head(10))

# --- Ids occurring exactly once vs. more than once ---
single_occurrence = patient_counts.eq(1).sum()
print(f"\nNumber of patients who appeared exactly once: {single_occurrence}")
multiple_occurrence = patient_counts.gt(1).sum()
print(f"Number of patients who appeared more than once: {multiple_occurrence}")
##
# %% EDSS Dist
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# (Re)load the dataset so this cell does not depend on an earlier cell having
# run; whatever `df` was bound to before is replaced.
df = pd.read_csv(
    '/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv',
    sep=';',
)
# Create EDSS categories function with better error handling
def categorize_edss(edss_value):
    """Map a raw EDSS entry to a one-point-wide category label.

    Returns:
        'No EDSS'  -- for NaN, None or the empty string.
        '0-1', '1-2', ..., '9-10' -- for numeric values in [0, 10]; each
            bucket includes its upper bound, and 0 falls into '0-1'.
        'Invalid'  -- for known non-numeric markers, text that is blank after
            stripping, unparsable text, and numbers outside [0, 10].

    A decimal comma ('1,5') is accepted and read as a decimal point.
    """
    # Missing entries are reported separately from malformed ones.
    if pd.isna(edss_value) or edss_value == '' or edss_value is None:
        return 'No EDSS'

    text = str(edss_value).strip()

    # Free-text placeholders that are known not to carry a score.
    non_numeric_markers = ('unverändert', 'unchanged', 'n/a', 'na', 'none', 'null', '')
    if text.lower() in non_numeric_markers:
        return 'Invalid'

    try:
        score = float(text.replace(',', '.'))
    except (ValueError, TypeError):
        return 'Invalid'

    # NaN fails this comparison as well and therefore ends up 'Invalid'.
    if not 0 <= score <= 10:
        return 'Invalid'

    # The first integer upper bound that is >= score identifies the bucket.
    for upper in range(1, 11):
        if score <= upper:
            return f'{upper - 1}-{upper}'
    return 'Invalid'
# Label every row with its EDSS bucket, then tally rows per bucket
# (buckets ordered by label).
df['EDSS_Category'] = df['EDSS'].apply(categorize_edss)
edss_counts = df['EDSS_Category'].value_counts().sort_index()

print("Patient counts by EDSS category:")
print(edss_counts)

# Bar chart: one bar per category.
plt.figure(figsize=(12, 6))
bars = plt.bar(
    edss_counts.index,
    edss_counts.values,
    color='skyblue',
    edgecolor='navy',
    alpha=0.7,
)

# Write each bar's count just above its top edge, centred horizontally.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(x_center, height, f'{int(height)}', ha='center', va='bottom')

# Axis labelling and cosmetics.
plt.title('Distribution of Patients by EDSS Score Categories')
plt.xlabel('EDSS Score Categories')
plt.ylabel('Number of Cases')
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()
##
# %% Pie Chart
import matplotlib.pyplot as plt
import pandas as pd

# Number of patients (`patient_count`) who had exactly `Visit` visits.
data = {
    'Visit': [9, 8, 7, 6, 5, 4, 3, 2, 1],
    'patient_count': [2, 3, 3, 6, 13, 17, 28, 24, 32]
}
# Stored under its own name so that the dataset bound to `df` (which other
# cells read columns such as 'EDSS' from) is not overwritten by this table.
visit_df = pd.DataFrame(data)

# Totals are derived from the table instead of being hard-coded, so the
# labels, title and summary stay correct if the counts above are edited.
total_patients = int(visit_df['patient_count'].sum())  # 128 for the data above
total_cases = int((visit_df['Visit'] * visit_df['patient_count']).sum())  # 396 for the data above

# Create pie chart
plt.figure(figsize=(10, 8))

# One colour per slice.
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#ff99cc',
          '#c2c2f0', '#ffb3b3', '#99ffcc', '#ffccff']

# Slice labels show visit count, patient count and share of all patients.
labels = []
for visit, count in zip(visit_df['Visit'], visit_df['patient_count']):
    noun = 'Visit' if visit == 1 else 'Visits'  # avoid "1 Visits"
    share = count / total_patients * 100
    labels.append(f'{visit} {noun} \n{count} patients\n({share:.1f}%)')

wedges, texts, autotexts = plt.pie(visit_df['patient_count'],
                                   labels=labels,
                                   colors=colors,
                                   autopct='%1.1f%%',
                                   startangle=90,
                                   explode=[0.05] * len(visit_df))  # Slightly separate slices

plt.title(
    'Patient Visit Frequency Distribution\n'
    f'Total Patients: {total_patients}\n'
    f'Total Cases: {total_cases}',
    fontsize=16, pad=20)

# Ensure pie chart is circular
plt.axis('equal')

# Adjust layout so the slice labels are not cut off
plt.tight_layout()

# Save while the figure is still open (i.e. before show()).
# SVG: vector graphic; PNG: raster copy for reference.
plt.savefig('patient_visit_frequency.svg', format='svg', bbox_inches='tight', dpi=300)
plt.savefig('patient_visit_frequency.png', format='png', bbox_inches='tight', dpi=300)

# Show the chart
plt.show()

# Print summary statistics
print(f"Total patients: {total_patients}")
print("\nVisit frequency breakdown:")
for visit, count in zip(visit_df['Visit'], visit_df['patient_count']):
    noun = 'Visit' if visit == 1 else 'Visits'
    percentage = (count / total_patients) * 100
    print(f"{visit} {noun} : {count} patients ({percentage:.1f}%)")

print("\nFiles created:")
print("- patient_visit_frequency.svg (SVG format)")
print("- patient_visit_frequency.png (PNG format)")
##
# %% Slope Chart
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. Data prep
# Load the dataset here instead of relying on whatever `df` is currently bound
# to: earlier cells may have rebound `df` to a different table, in which case
# the column look-ups below would raise a KeyError.
df = pd.read_csv('/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv', sep=';')

# Accept a decimal comma in EDSS; anything non-numeric becomes NaN and is
# dropped together with rows lacking an id or a date.
df['EDSS'] = pd.to_numeric(
    df['EDSS'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce',
)
# NOTE(review): rows are ordered by the raw 'MedDatum' values. If that column
# holds date strings in a non-ISO format, this order is not chronological --
# confirm, and convert with pd.to_datetime using the right format if needed.
df = df.dropna(subset=['unique_id', 'EDSS', 'MedDatum']).sort_values(['unique_id', 'MedDatum'])

# Keep only patients with more than one remaining row, then take each
# patient's first and last row. Selecting with head/tail (rather than
# groupby.apply returning frames or None) keeps 'unique_id' as a regular
# column regardless of how the installed pandas treats grouping columns
# inside apply().
multi_visit = df[df.duplicated('unique_id', keep=False)]
per_patient = multi_visit.groupby('unique_id', sort=False)
first_last = pd.concat([
    per_patient.head(1).assign(Visit_Type='First Visit'),
    per_patient.tail(1).assign(Visit_Type='Last Visit'),
])
# Two rows per patient: 'First Visit' followed by 'Last Visit'.
first_last = first_last.sort_values(['unique_id', 'Visit_Type']).reset_index(drop=True)

# 2. Set the style
sns.set_style("white")
plt.figure(figsize=(10, 8))

# Line colours by direction of change
color_worsened = "#E67E22"  # Muted Orange
color_improved = "#2ECC71"  # Muted Green
color_stable = "#BDC3C7"    # Soft Grey
avg_color = "#2C3E50"       # Deep Navy

# 3. One line per patient, from first to last EDSS
for pid, p_data in first_last.groupby('unique_id', sort=False):
    start, end = p_data['EDSS'].iloc[0], p_data['EDSS'].iloc[1]

    # Higher EDSS at the last visit is drawn as "worsened", lower as
    # "improved"; unchanged scores get a fainter, thinner line.
    if end > start:
        color, alpha, lw = color_worsened, 0.3, 1.2
    elif end < start:
        color, alpha, lw = color_improved, 0.3, 1.2
    else:
        color, alpha, lw = color_stable, 0.15, 0.8

    plt.plot(p_data['Visit_Type'], p_data['EDSS'],
             color=color, alpha=alpha, linewidth=lw, marker='o', markersize=4, markerfacecolor='white')

# 4. Background distribution (violin plot); the explicit order keeps the
# violins on the same x positions as the lines above.
sns.violinplot(x='Visit_Type', y='EDSS', data=first_last,
               order=['First Visit', 'Last Visit'],
               inner=None, color=".95", linewidth=0)

# 5. Population mean at first and last visit
summary = first_last.groupby('Visit_Type')['EDSS'].mean().reindex(['First Visit', 'Last Visit'])
plt.plot(summary.index, summary.values, color=avg_color, linewidth=4,
         marker='s', markersize=10, label='Average Population Trend', zorder=10)

# 6. Annotate the mean values slightly above their markers
for i, val in enumerate(summary.values):
    plt.text(i, val + 0.2, f'{val:.2f}', color=avg_color, fontweight='bold', ha='center')

# Aesthetics
plt.title('Evolution of EDSS Scores: First vs. Last Clinical Visit', fontsize=16, pad=20, fontweight='bold')
plt.ylabel('EDSS Score', fontsize=12)
plt.xlabel('')
plt.xticks(fontsize=12, fontweight='bold')
plt.yticks(np.arange(0, 10.5, 1))  # ticks at 0, 1, ..., 10
plt.ylim(-0.5, 10)

# Remove chart junk
sns.despine(left=True, bottom=True)
plt.grid(axis='y', color='gray', linestyle='--', alpha=0.2)
plt.legend(frameon=False, loc='upper center', bbox_to_anchor=(0.5, -0.05))
plt.tight_layout()
plt.show()
##
# %% name
import matplotlib.pyplot as plt

# Number of patients (`patient_count`) who had exactly `Visit` visits.
data = {
    'Visit': [9, 8, 7, 6, 5, 4, 3, 2, 1],
    'patient_count': [2, 3, 3, 6, 13, 17, 28, 24, 32]
}

# The lists above are in descending visit order; reorder both so the bars run
# from 1 visit (left) to 9 visits (right).
sorted_indices = sorted(range(len(data['Visit'])), key=data['Visit'].__getitem__)
visit_sorted = [data['Visit'][i] for i in sorted_indices]
count_sorted = [data['patient_count'][i] for i in sorted_indices]

# Draw the chart once, directly in its final form (previously it was drawn
# unsorted, cleared with ax.clear(), and drawn again).
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(visit_sorted, count_sorted, color='darkblue', label='Patients by Visit Count')

ax.set_xlabel('Number of Visits', fontsize=12)
ax.set_ylabel('Number of Unique Patients', fontsize=12)
#ax.set_title('Number of Patients by Visit Number', fontsize=14)

ax.legend()

# Horizontal guide lines, and one tick per visit count.
ax.grid(axis='y', linestyle='--', alpha=0.7)
ax.set_xticks(visit_sorted)

plt.tight_layout()
plt.show()
##
# %% Patientjourney Bubble chart
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl

# Global text defaults (these affect every figure created afterwards).
mpl.rcParams["font.family"] = "DejaVu Sans"  # or "Arial", "Calibri", "Times New Roman", ...
mpl.rcParams["font.size"] = 12               # default size for text
mpl.rcParams["axes.titlesize"] = 14
mpl.rcParams["axes.titleweight"] = "bold"

# patient_count[i] = number of patients with exactly visits[i] visits.
visits = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
patient_count = np.array([32, 24, 28, 17, 13, 6, 3, 3, 2])

# remaining[i] = number of patients with at least visits[i] visits,
# i.e. the cumulative sum taken from the right-hand end.
remaining = patient_count[::-1].cumsum()[::-1]

# --- Plot ---
fig, ax = plt.subplots(figsize=(12, 3))

y = 0.0  # all bubbles sit on one horizontal line

# Baseline connecting the bubbles
ax.hlines(y, visits.min() - 0.4, visits.max() + 0.4, color="#1f77b4", linewidth=3)

# Bubble area (points^2) proportional to the patient count; tweak the
# multiplier for bigger/smaller bubbles.
sizes = patient_count * 35
# dtype=float: `visits` is an integer array, so without it a non-integer `y`
# would be truncated when filling the array.
ax.scatter(visits, np.full_like(visits, y, dtype=float), s=sizes, color="#1f77b4", zorder=3)

# Title
#ax.set_title("Patient Journey by Visit Count", fontsize=14, pad=18)

# Top labels: "1 visit", "2 visits", ...
for x in visits:
    label = f"{x} visit" if x == 1 else f"{x} visits"
    ax.text(x, y + 0.18, label, ha="center", va="bottom", fontsize=10)

# Bottom labels: "X patients" and "Y remaining"
for x, pc, rem in zip(visits, patient_count, remaining):
    ax.text(x, y - 0.20, f"{pc} patients", ha="center", va="top", fontsize=9)
    ax.text(x, y - 0.32, f"{rem} remaining", ha="center", va="top", fontsize=9)

# Cosmetics: hide ticks and frame, keep some margin around the bubbles
ax.set_xlim(visits.min() - 0.6, visits.max() + 0.6)
ax.set_ylim(-0.5, 0.35)
ax.set_xticks([])
ax.set_yticks([])
for spine in ax.spines.values():
    spine.set_visible(False)

plt.tight_layout()

# Save before show(): once show() returns, the figure may already have been
# closed, and a pyplot-level savefig would then write an empty canvas.
fig.savefig("patient_journey.svg", format="svg", bbox_inches="tight")
plt.show()
##