# %% s2t iterative
# ------------------------------------------------------------
# osce_pipeline.py
# ------------------------------------------------------------
"""Transcribe OSCE examiner feedback recordings and summarize them.

For every ``prfling<ID>`` directory below a user-supplied input directory,
each MP3 file is sent to a speech-to-text model, the transcript is stored in
the admin output tree, and an LLM summary of the transcript is stored in both
the admin and the student output tree.
"""
import os
import re
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Prefix that marks an examinee directory (e.g. "prfling12345").
PRUEFLING_PREFIX = "prfling"

# Model identifiers passed to the API endpoint.
TRANSCRIPTION_MODEL = "faster-whisper-large-v3"
SUMMARY_MODEL = "GPT-OSS-120B"

# Location of the credentials file, relative to the current working directory.
ENV_FILE = Path(".env")

# Characters that are replaced by underscores in output filenames.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')

# Instruction part of the summarization prompt; the transcript is appended
# after two newlines. This text is sent to the model verbatim.
SUMMARY_INSTRUCTIONS = '''
Aufgabe: Fasse das folgende Prüferinnen-Feedback aus einer medizinischen Prüfung präzise, vollständig und übersichtlich zusammen.
Ziel: Eine konsolidierte, neutrale und strukturierte Zusammenfassung, die alle wesentlichen Inhalte widerspiegelt.

Bitte führe Folgendes durch:

Kernthemen des Feedbacks identifizieren.
Stärken der geprüften Person klar benennen.
Verbesserungspotenziale mit konkretem Bezug zu Inhalten oder Verhalten hervorheben.
Fachliche Aspekte getrennt von kommunikativen/sozialen Kompetenzen darstellen.
Eventuelle konkrete Beispiele oder Situationen aus dem Feedback erwähnen.
Widersprüche oder unterschiedliche Sichtweisen zwischen Prüferinnen markieren.
Eine kurze abschließende Meta-Synthese formulieren, die die wichtigsten Prioritäten für zukünftige Prüfungen ableitet.

Format der Antwort:

Thematische Zusammenfassung (Stichpunkte)
Stärken (Stichpunkte)
Verbesserungspotenziale (Stichpunkte)
Auffällige Unterschiede im Feedback (falls vorhanden)
Priorisierte Handlungsempfehlungen (max. 5 Punkte)

Hier ist das Feedback: '''


def _write_env_template(env_file):
    """Write a placeholder credentials file to *env_file* and tell the user."""
    env_file.write_text(
        "# OpenAI Configuration\n"
        "OPENAI_API_KEY=your_api_key_here\n"
        "OPENAI_BASE_URL=https://api.openai.com/v1\n",
        encoding="utf-8",
    )
    print("Created .env file. Please fill in your credentials.")


# Load environment variables from .env file
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
BASE_URL = os.getenv("OPENAI_BASE_URL")

# Validate that credentials are loaded. The template is created here, at the
# only point where credentials are known to be missing; creating it later
# (inside main) could never happen because this check raises first.
if not API_KEY or not BASE_URL:
    if not ENV_FILE.exists():
        _write_env_template(ENV_FILE)
    raise ValueError("Please set OPENAI_API_KEY and OPENAI_BASE_URL in your .env file")

client = OpenAI(
    api_key=API_KEY,
    base_url=BASE_URL,
    timeout=600,
)


def sanitize_filename(filename):
    """Return *filename* with the characters ``<>:"/\\|?*`` replaced by ``_``."""
    return _INVALID_FILENAME_CHARS.sub('_', filename)


def _is_pruefling_dir(path):
    """Return True if *path* is a directory whose name starts with the prefix."""
    return path.is_dir() and path.name.startswith(PRUEFLING_PREFIX)


def _pruefling_id(pruefling_dir):
    """Return the directory name without its leading prefix (prfling123 -> 123)."""
    name = pruefling_dir.name
    if name.startswith(PRUEFLING_PREFIX):
        return name[len(PRUEFLING_PREFIX):]
    return name


def _output_dirs(pruefling_dir, output_admin_dir, output_student_dir):
    """Return (admin transcripts, admin summaries, student summaries) dirs."""
    label = f"Prüfling_{_pruefling_id(pruefling_dir)}"
    admin_pruefling_dir = Path(output_admin_dir) / label
    student_pruefling_dir = Path(output_student_dir) / label
    return (
        admin_pruefling_dir / "Transkripte",
        admin_pruefling_dir / "Zusammenfassungen",
        student_pruefling_dir / "Zusammenfassungen",
    )


def create_output_structure(input_dir, output_admin_dir, output_student_dir):
    """Create the admin and student output folders for every examinee.

    Only subdirectories of *input_dir* whose name starts with ``prfling`` are
    considered, which is the same selection ``main`` uses when processing.
    For each one, ``Prüfling_<id>/Transkripte`` and
    ``Prüfling_<id>/Zusammenfassungen`` are created below *output_admin_dir*
    and ``Prüfling_<id>/Zusammenfassungen`` below *output_student_dir*.
    """
    for pruefling_dir in sorted(Path(input_dir).iterdir()):
        if not _is_pruefling_dir(pruefling_dir):
            continue
        for target in _output_dirs(pruefling_dir, output_admin_dir, output_student_dir):
            target.mkdir(parents=True, exist_ok=True)
        print(f"Created structure for Prüfling_{_pruefling_id(pruefling_dir)}")


def _transcribe(mp3_path):
    """Send the audio file to the transcription model and return its text."""
    with mp3_path.open("rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model=TRANSCRIPTION_MODEL,
            file=audio_file,
        )
    return transcription.text


def _summarize(transcript):
    """Ask the chat model for a summary of *transcript*; return "" if none."""
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": f"{SUMMARY_INSTRUCTIONS}\n\n{transcript}",
            },
        ],
        model=SUMMARY_MODEL,
        max_tokens=1024,
        temperature=0.3,
    )
    # The message content may be None; store an empty summary in that case.
    return response.choices[0].message.content or ""


def _process_recording(mp3_path, transcript_dir, admin_summary_dir, student_summary_dir):
    """Transcribe one MP3, then write its transcript and both summary copies."""
    print(f"\nProcessing: {mp3_path.name}")
    transcript = _transcribe(mp3_path)

    # Output names are derived from the recording's name without extension.
    base_filename = mp3_path.stem
    txt_filename = sanitize_filename(f"{base_filename}.txt")
    summary_filename = sanitize_filename(f"{base_filename}-summary.txt")

    transcript_output_path = transcript_dir / txt_filename
    transcript_output_path.write_text(transcript, encoding="utf-8")
    print(f"Transcript saved to {transcript_output_path}")

    summary = _summarize(transcript)

    summary_output_path = admin_summary_dir / summary_filename
    summary_output_path.write_text(summary, encoding="utf-8")
    print(f"Summary saved to {summary_output_path}")

    student_summary_path = student_summary_dir / summary_filename
    student_summary_path.write_text(summary, encoding="utf-8")
    print(f"Summary copied to student directory: {student_summary_path}")


def process_pruefling(pruefling_dir, output_admin_dir, output_student_dir):
    """Process all MP3 files of one examinee directory.

    Each ``*.mp3`` file directly inside *pruefling_dir* is transcribed and
    summarized. Transcripts go to the admin tree only; summaries go to both
    the admin and the student tree. A failure on one recording is reported
    and does not stop the remaining recordings.
    """
    pruefling_dir = Path(pruefling_dir)
    pruefling_id = _pruefling_id(pruefling_dir)
    transcript_dir, admin_summary_dir, student_summary_dir = _output_dirs(
        pruefling_dir, output_admin_dir, output_student_dir
    )

    # Sorted so the processing order is the same on every run.
    mp3_files = sorted(pruefling_dir.glob("*.mp3"))
    if not mp3_files:
        print(f"No MP3 files found in {pruefling_dir}")
        return

    print(f"Found {len(mp3_files)} MP3 files for Prüfling_{pruefling_id}.")

    # Make sure the targets exist even when create_output_structure was not
    # called beforehand.
    for target in (transcript_dir, admin_summary_dir, student_summary_dir):
        target.mkdir(parents=True, exist_ok=True)

    for mp3_path in mp3_files:
        try:
            _process_recording(
                mp3_path, transcript_dir, admin_summary_dir, student_summary_dir
            )
        except Exception as e:
            # Deliberate best effort: report and continue with the next file.
            print(f"Error processing {mp3_path.name}: {e}")


def main():
    """Prompt for the input directory and run the pipeline on it.

    The output folders are created next to the input directory. Their names
    come from the environment variables ``OUTPUT_ADMIN_DIR`` and
    ``OUTPUT_STUDENT_DIR`` (defaults ``output_admin`` / ``output_student``).
    """
    raw_path = input("Enter the input directory path: ").strip()
    if not raw_path:
        # An empty path would otherwise silently mean the current directory.
        print("Error: No input directory given!")
        return

    input_base_dir = Path(raw_path)

    # Validate input directory exists
    if not input_base_dir.exists():
        print(f"Error: Input directory {input_base_dir} does not exist!")
        return

    if not input_base_dir.is_dir():
        print(f"Error: {input_base_dir} is not a directory!")
        return

    # Resolve first: the parent of a relative path such as "." or ".." is
    # not its real parent, which would place the outputs inside the input.
    input_base_dir = input_base_dir.resolve()

    # Get output directory names from environment variables
    output_admin_name = os.getenv("OUTPUT_ADMIN_DIR", "output_admin")
    output_student_name = os.getenv("OUTPUT_STUDENT_DIR", "output_student")

    # Outputs live next to the input directory
    parent_output_dir = input_base_dir.parent
    output_admin_dir = parent_output_dir / output_admin_name
    output_student_dir = parent_output_dir / output_student_name

    print(f"Input directory: {input_base_dir}")
    print(f"Admin output directory: {output_admin_dir}")
    print(f"Student output directory: {output_student_dir}")

    # Create output directories
    output_admin_dir.mkdir(parents=True, exist_ok=True)
    output_student_dir.mkdir(parents=True, exist_ok=True)

    # Create directory structure
    create_output_structure(input_base_dir, output_admin_dir, output_student_dir)

    # Process all pruefling directories
    for pruefling_dir in sorted(input_base_dir.iterdir()):
        if _is_pruefling_dir(pruefling_dir):
            print(f"\nProcessing {pruefling_dir.name}")
            process_pruefling(pruefling_dir, output_admin_dir, output_student_dir)

    print("\nProcessing complete!")


if __name__ == "__main__":
    main()