# %% s2t iterative
# ------------------------------------------------------------
# osce_pipeline.py
# ------------------------------------------------------------
"""Transcribe OSCE examiner feedback recordings and summarise them.

For every ``prfling<ID>`` folder inside the input directory, each MP3 file is
sent to the speech-to-text endpoint, the transcript is stored in the admin
output tree, and an LLM summary of that transcript is stored in both the admin
and the student output tree.

Configuration comes from the environment (optionally through a ``.env`` file):
``OPENAI_API_KEY`` and ``OPENAI_BASE_URL`` are required, while
``OUTPUT_ADMIN_DIR`` and ``OUTPUT_STUDENT_DIR`` optionally rename the two
output folders (defaults: ``output_admin`` / ``output_student``).
"""
import os
import re
from pathlib import Path

# Prefix of the per-examinee input folders (e.g. "prfling12345").
_PRUEFLING_PREFIX = "prfling"
_ENV_FILE = Path(".env")
_TRANSCRIPTION_MODEL = "faster-whisper-large-v3"
_SUMMARY_MODEL = "GPT-OSS-120B"
_CREDENTIALS_ERROR = "Please set OPENAI_API_KEY and OPENAI_BASE_URL in your .env file"
_ENV_TEMPLATE = (
    "# OpenAI Configuration\n"
    "OPENAI_API_KEY=your_api_key_here\n"
    "OPENAI_BASE_URL=https://api.openai.com/v1\n"
)

# Characters that are not allowed in file names on common file systems.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')

# User prompt sent to the summarisation model; the transcript is appended
# verbatim after the trailing blank lines.
_SUMMARY_INSTRUCTIONS = '''
Aufgabe: Fasse das folgende Prüferinnen-Feedback aus einer medizinischen Prüfung präzise, vollständig und übersichtlich zusammen.

Ziel: Eine konsolidierte, neutrale und strukturierte Zusammenfassung, die alle wesentlichen Inhalte widerspiegelt.

Bitte führe Folgendes durch:

Kernthemen des Feedbacks identifizieren.

Stärken der geprüften Person klar benennen.

Verbesserungspotenziale mit konkretem Bezug zu Inhalten oder Verhalten hervorheben.

Fachliche Aspekte getrennt von kommunikativen/sozialen Kompetenzen darstellen.

Eventuelle konkrete Beispiele oder Situationen aus dem Feedback erwähnen.

Widersprüche oder unterschiedliche Sichtweisen zwischen Prüferinnen markieren.

Eine kurze abschließende Meta-Synthese formulieren, die die wichtigsten Prioritäten für zukünftige Prüfungen ableitet.

Format der Antwort:

Thematische Zusammenfassung (Stichpunkte)

Stärken (Stichpunkte)

Verbesserungspotenziale (Stichpunkte)

Auffällige Unterschiede im Feedback (falls vorhanden)

Priorisierte Handlungsempfehlungen (max. 5 Punkte)

Hier ist das Feedback:
\n\n'''

# Shared API client, created on first use by _get_client().
_client = None


def _load_credentials():
    """Load a ``.env`` file if one is found and return ``(api_key, base_url)``.

    Either element is ``None`` when the corresponding variable is unset.
    """
    # Imported here rather than at module level so that importing this module
    # has no side effects and never fails before main() can give guidance.
    from dotenv import load_dotenv

    load_dotenv()
    return os.getenv("OPENAI_API_KEY"), os.getenv("OPENAI_BASE_URL")


def _get_client():
    """Return the shared OpenAI client, creating it on first use.

    Raises:
        ValueError: If OPENAI_API_KEY or OPENAI_BASE_URL is not set.
    """
    global _client
    if _client is None:
        api_key, base_url = _load_credentials()
        if not api_key or not base_url:
            raise ValueError(_CREDENTIALS_ERROR)

        from openai import OpenAI

        _client = OpenAI(
            api_key=api_key,
            base_url=base_url,
            timeout=600,
        )
    return _client


def __getattr__(name):
    """Lazily serve the former module globals ``client``, ``API_KEY``, ``BASE_URL``.

    Keeps ``from osce_pipeline import client`` working although the client is
    no longer built (and credentials no longer validated) at import time.
    """
    if name == "client":
        return _get_client()
    if name == "API_KEY":
        return _load_credentials()[0]
    if name == "BASE_URL":
        return _load_credentials()[1]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def sanitize_filename(filename: str) -> str:
    """Return *filename* with each of ``< > : " / \\ | ? *`` replaced by ``_``."""
    return _INVALID_FILENAME_CHARS.sub("_", filename)


def _pruefling_id(dir_name):
    """Return the examinee ID encoded in a folder name (``prfling12345`` -> ``12345``).

    Only a leading prefix is removed; names without the prefix are returned
    unchanged.
    """
    if dir_name.startswith(_PRUEFLING_PREFIX):
        return dir_name[len(_PRUEFLING_PREFIX):]
    return dir_name


def _pruefling_dirs(input_dir):
    """Return the ``prfling*`` sub-directories of *input_dir*, sorted by name."""
    return sorted(
        entry
        for entry in Path(input_dir).iterdir()
        if entry.is_dir() and entry.name.startswith(_PRUEFLING_PREFIX)
    )


def _ensure_output_dirs(pruefling_id, output_admin_dir, output_student_dir):
    """Create and return the three output folders of one examinee.

    Returns:
        Tuple ``(admin transcripts, admin summaries, student summaries)``.
    """
    folder_name = f"Prüfling_{pruefling_id}"
    admin_dir = Path(output_admin_dir) / folder_name
    student_dir = Path(output_student_dir) / folder_name

    targets = (
        admin_dir / "Transkripte",
        admin_dir / "Zusammenfassungen",
        # Students only receive summaries, never the raw transcripts.
        student_dir / "Zusammenfassungen",
    )
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
    return targets


def create_output_structure(input_dir, output_admin_dir, output_student_dir):
    """Create the admin and student output folders for every examinee.

    Only sub-directories of *input_dir* whose name starts with ``prfling`` are
    considered, which is the same set of folders that ``main`` processes.

    Args:
        input_dir: Directory containing the ``prfling<ID>`` folders.
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.
    """
    for pruefling_dir in _pruefling_dirs(input_dir):
        pruefling_id = _pruefling_id(pruefling_dir.name)
        _ensure_output_dirs(pruefling_id, output_admin_dir, output_student_dir)
        print(f"Created structure for Prüfling_{pruefling_id}")


def _transcribe(client, audio_path):
    """Return the transcript text of *audio_path* (empty string if none)."""
    with audio_path.open("rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model=_TRANSCRIPTION_MODEL,
            file=audio_file,
        )
    return transcription.text or ""


def _summarize(client, transcript):
    """Return the model's summary of *transcript* (empty string if none)."""
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": _SUMMARY_INSTRUCTIONS + transcript,
            },
        ],
        model=_SUMMARY_MODEL,
        max_tokens=1024,
        temperature=0.3,
    )
    return response.choices[0].message.content or ""


def process_pruefling(pruefling_dir, output_admin_dir, output_student_dir):
    """Transcribe and summarise every MP3 file of one examinee.

    The transcript goes to ``<admin>/Prüfling_<ID>/Transkripte``; the summary
    goes to the ``Zusammenfassungen`` folder of both the admin and the student
    tree. A failure on one recording is reported and does not stop the others.

    Args:
        pruefling_dir: The examinee's input folder (``prfling<ID>``).
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.

    Raises:
        ValueError: If recordings exist but the API credentials are not set.
    """
    pruefling_dir = Path(pruefling_dir)
    pruefling_id = _pruefling_id(pruefling_dir.name)

    # Suffix comparison instead of glob("*.mp3") so that ".MP3" is also found
    # on case-sensitive file systems; sorted for a reproducible order.
    mp3_files = sorted(
        entry
        for entry in pruefling_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".mp3"
    )
    if not mp3_files:
        print(f"No MP3 files found in {pruefling_dir}")
        return

    print(f"Found {len(mp3_files)} MP3 files for Prüfling_{pruefling_id}.")

    # Outside the per-file try block: missing credentials are fatal and should
    # not be reported once per recording.
    client = _get_client()
    transcripts_dir, admin_summaries_dir, student_summaries_dir = _ensure_output_dirs(
        pruefling_id, output_admin_dir, output_student_dir
    )

    for mp3_path in mp3_files:
        try:
            print(f"\nProcessing: {mp3_path.name}")

            transcript = _transcribe(client, mp3_path)

            base_filename = mp3_path.stem
            txt_filename = sanitize_filename(f"{base_filename}.txt")
            summary_filename = sanitize_filename(f"{base_filename}-summary.txt")

            transcript_output_path = transcripts_dir / txt_filename
            transcript_output_path.write_text(transcript, encoding="utf-8")
            print(f"Transcript saved to {transcript_output_path}")

            if not transcript.strip():
                # Summarising nothing would make the model invent feedback.
                print(f"Transcript of {mp3_path.name} is empty; skipping summary")
                continue

            summary = _summarize(client, transcript)
            if not summary:
                print(f"Warning: model returned an empty summary for {mp3_path.name}")

            summary_output_path = admin_summaries_dir / summary_filename
            summary_output_path.write_text(summary, encoding="utf-8")
            print(f"Summary saved to {summary_output_path}")

            student_summary_path = student_summaries_dir / summary_filename
            student_summary_path.write_text(summary, encoding="utf-8")
            print(f"Summary copied to student directory: {student_summary_path}")

        except Exception as e:
            # Per-recording boundary: report and continue with the next file.
            print(f"Error processing {mp3_path.name}: {e}")


def main():
    """Prompt for the input folder and run the pipeline for every examinee."""
    # Drag-and-drop into a terminal often adds quotes or trailing spaces.
    raw_path = input("Enter the input directory path: ").strip().strip("\"'")
    if not raw_path:
        print("Error: No input directory given!")
        return

    input_base_dir = Path(raw_path).expanduser()

    if not input_base_dir.exists():
        print(f"Error: Input directory {input_base_dir} does not exist!")
        return

    if not input_base_dir.is_dir():
        print(f"Error: {input_base_dir} is not a directory!")
        return

    # Resolve so that ".parent" is the real parent even for inputs like ".".
    input_base_dir = input_base_dir.resolve()

    api_key, base_url = _load_credentials()
    if not api_key or not base_url:
        if _ENV_FILE.exists():
            print(f"Error: {_CREDENTIALS_ERROR}")
        else:
            _ENV_FILE.write_text(_ENV_TEMPLATE, encoding="utf-8")
            print("Created .env file. Please fill in your credentials.")
        return

    output_admin_name = os.getenv("OUTPUT_ADMIN_DIR", "output_admin")
    output_student_name = os.getenv("OUTPUT_STUDENT_DIR", "output_student")

    # Outputs are written next to the input folder, not inside it.
    parent_output_dir = input_base_dir.parent
    output_admin_dir = parent_output_dir / output_admin_name
    output_student_dir = parent_output_dir / output_student_name

    print(f"Input directory: {input_base_dir}")
    print(f"Admin output directory: {output_admin_dir}")
    print(f"Student output directory: {output_student_dir}")

    output_admin_dir.mkdir(parents=True, exist_ok=True)
    output_student_dir.mkdir(parents=True, exist_ok=True)

    create_output_structure(input_base_dir, output_admin_dir, output_student_dir)

    pruefling_dirs = _pruefling_dirs(input_base_dir)
    if not pruefling_dirs:
        print(f"No '{_PRUEFLING_PREFIX}*' directories found in {input_base_dir}")
        return

    for pruefling_dir in pruefling_dirs:
        print(f"\nProcessing {pruefling_dir.name}")
        process_pruefling(pruefling_dir, output_admin_dir, output_student_dir)

    print("\nProcessing complete!")


if __name__ == "__main__":
    main()