# %% s2t iterative
# ------------------------------------------------------------
# osce_pipeline.py
# ------------------------------------------------------------
import os
import re
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file; this must run before the
# os.getenv() lookups below so that values from the file are visible.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
BASE_URL = os.getenv("OPENAI_BASE_URL")

# Validate that credentials are loaded. This runs at import time, so the
# module cannot be imported (or run) when either value is missing or empty.
if not API_KEY or not BASE_URL:
    raise ValueError("Please set OPENAI_API_KEY and OPENAI_BASE_URL in your .env file")

# Shared API client used for both the transcription and the summary requests.
# NOTE(review): openai-python treats a bare number as seconds (600 = 10 min);
# confirm this is the intended budget for long recordings.
client = OpenAI(
    api_key=API_KEY,
    base_url=BASE_URL,
    timeout=600,
)
def sanitize_filename(filename):
    r"""Return *filename* with characters that are unsafe in file names replaced.

    Every occurrence of ``< > : " / \ | ? *`` and every ASCII control
    character (code points 0-31, e.g. NUL, tab, newline) is replaced by a
    single underscore. All other characters, including non-ASCII letters such
    as umlauts, are kept unchanged.

    Args:
        filename: The candidate file name (name only, not a full path, since
            path separators are replaced as well).

    Returns:
        The sanitized file name; an empty string stays empty.
    """
    # Control characters are included because they are rejected by Windows
    # and NUL is rejected by every platform's open().
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
def create_output_structure(input_dir, output_admin_dir, output_student_dir):
    """Create the per-examinee output folders in the admin and student trees.

    For every sub-directory of *input_dir* whose name starts with ``prfling``
    (e.g. ``prfling12345``) the following folders are created if missing:

    * ``<output_admin_dir>/Prüfling_<id>/Transkripte``
    * ``<output_admin_dir>/Prüfling_<id>/Zusammenfassungen``
    * ``<output_student_dir>/Prüfling_<id>/Zusammenfassungen``

    where ``<id>`` is the directory name without the ``prfling`` prefix.
    Plain files and directories without the prefix are ignored, which matches
    the filter ``main()`` applies before processing a directory.

    Args:
        input_dir: Directory that contains the ``prfling<id>`` folders.
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.
    """
    prefix = "prfling"
    admin_root = Path(output_admin_dir)
    student_root = Path(output_student_dir)

    # Sorted so the progress output is in a reproducible order.
    for pruefling_dir in sorted(Path(input_dir).iterdir()):
        if not (pruefling_dir.is_dir() and pruefling_dir.name.startswith(prefix)):
            continue

        # Strip only the leading prefix (e.g. prfling12345 -> 12345); a later
        # occurrence of the token inside the name is left untouched.
        pruefling_id = pruefling_dir.name[len(prefix):]

        admin_pruefling_dir = admin_root / f"Prüfling_{pruefling_id}"
        student_pruefling_dir = student_root / f"Prüfling_{pruefling_id}"

        # Admin gets transcripts and summaries, students only the summaries.
        wanted_dirs = (
            admin_pruefling_dir / "Transkripte",
            admin_pruefling_dir / "Zusammenfassungen",
            student_pruefling_dir / "Zusammenfassungen",
        )
        for wanted in wanted_dirs:
            wanted.mkdir(parents=True, exist_ok=True)

        print(f"Created structure for Prüfling_{pruefling_id}")
def _build_summary_prompt(transcript):
    """Return the German summarisation prompt with *transcript* appended."""
    return f'''
Aufgabe: Fasse das folgende Prüferinnen-Feedback aus einer medizinischen Prüfung präzise, vollständig und übersichtlich zusammen.

Ziel: Eine konsolidierte, neutrale und strukturierte Zusammenfassung, die alle wesentlichen Inhalte widerspiegelt.

Bitte führe Folgendes durch:

Kernthemen des Feedbacks identifizieren.

Stärken der geprüften Person klar benennen.

Verbesserungspotenziale mit konkretem Bezug zu Inhalten oder Verhalten hervorheben.

Fachliche Aspekte getrennt von kommunikativen/sozialen Kompetenzen darstellen.

Eventuelle konkrete Beispiele oder Situationen aus dem Feedback erwähnen.

Widersprüche oder unterschiedliche Sichtweisen zwischen Prüferinnen markieren.

Eine kurze abschließende Meta-Synthese formulieren, die die wichtigsten Prioritäten für zukünftige Prüfungen ableitet.

Format der Antwort:

Thematische Zusammenfassung (Stichpunkte)

Stärken (Stichpunkte)

Verbesserungspotenziale (Stichpunkte)

Auffällige Unterschiede im Feedback (falls vorhanden)

Priorisierte Handlungsempfehlungen (max. 5 Punkte)

Hier ist das Feedback:
\n\n{transcript}'''


def _transcribe_audio(audio_path):
    """Send the audio file at *audio_path* to the transcription model and return its text."""
    with audio_path.open("rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="faster-whisper-large-v3",
            file=audio_file,
        )
    return transcription.text


def _summarize_transcript(transcript):
    """Request a summary of *transcript* from the chat model and return it as a string."""
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": _build_summary_prompt(transcript)
            }
        ],
        model="GPT-OSS-120B",
        max_tokens=1024,
        temperature=0.3
    )
    # The message content may be None; store an empty summary in that case.
    return response.choices[0].message.content or ""


def process_pruefling(pruefling_dir, output_admin_dir, output_student_dir):
    """Transcribe and summarise every MP3 file of one examinee directory.

    For each ``*.mp3`` file (extension matched case-insensitively) directly
    inside *pruefling_dir*:

    1. the audio is transcribed and the text is written to
       ``<output_admin_dir>/Prüfling_<id>/Transkripte/<stem>.txt``;
    2. the transcript is summarised and the summary is written to
       ``Zusammenfassungen/<stem>-summary.txt`` below both the admin and the
       student ``Prüfling_<id>`` folder.

    ``<id>`` is the directory name without its leading ``prfling`` prefix.
    Missing output folders are created. A failure while handling one file is
    reported on stdout and does not stop the remaining files.

    Args:
        pruefling_dir: Directory holding the examinee's MP3 recordings.
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.

    Returns:
        None. When the directory contains no MP3 file a message is printed
        and nothing is created.
    """
    pruefling_dir = Path(pruefling_dir)

    # Strip only a leading "prfling"; names without the prefix are used as-is.
    prefix = "prfling"
    dir_name = pruefling_dir.name
    pruefling_id = dir_name[len(prefix):] if dir_name.startswith(prefix) else dir_name

    # Define output directories for this pruefling
    admin_pruefling_dir = Path(output_admin_dir) / f"Prüfling_{pruefling_id}"
    student_pruefling_dir = Path(output_student_dir) / f"Prüfling_{pruefling_id}"

    transcribe_admin_dir = admin_pruefling_dir / "Transkripte"
    summary_admin_dir = admin_pruefling_dir / "Zusammenfassungen"
    summary_student_dir = student_pruefling_dir / "Zusammenfassungen"

    # Compare the suffix case-insensitively so "*.MP3" recordings are found on
    # case-sensitive filesystems too; sort for a reproducible processing order.
    mp3_files = sorted(
        entry
        for entry in pruefling_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".mp3"
    )

    if not mp3_files:
        print(f"No MP3 files found in {pruefling_dir}")
        return

    print(f"Found {len(mp3_files)} MP3 files for Prüfling_{pruefling_id}.")

    # Make sure the targets exist even if the structure was not prepared
    # beforehand by create_output_structure().
    for target_dir in (transcribe_admin_dir, summary_admin_dir, summary_student_dir):
        target_dir.mkdir(parents=True, exist_ok=True)

    # Process each MP3 file
    for mp3_path in mp3_files:
        # Per-file boundary: any error is reported and the loop continues
        # with the next recording.
        try:
            print(f"\nProcessing: {mp3_path.name}")

            transcript_text = _transcribe_audio(mp3_path)

            # Create safe output filenames from the name without extension
            base_filename = mp3_path.stem
            txt_filename = sanitize_filename(f"{base_filename}.txt")
            summary_filename = sanitize_filename(f"{base_filename}-summary.txt")

            # Save transcription to admin directory
            transcript_output_path = transcribe_admin_dir / txt_filename
            with open(transcript_output_path, "w", encoding="utf-8") as f:
                f.write(transcript_text)
            print(f"Transcript saved to {transcript_output_path}")

            # Summarise once and reuse the text for both destinations.
            summary_text = _summarize_transcript(transcript_text)

            # Save summary to admin directory
            summary_output_path = summary_admin_dir / summary_filename
            with open(summary_output_path, "w", encoding="utf-8") as f:
                f.write(summary_text)
            print(f"Summary saved to {summary_output_path}")

            # Copy summary to student directory
            student_summary_path = summary_student_dir / summary_filename
            with open(student_summary_path, "w", encoding="utf-8") as f:
                f.write(summary_text)
            print(f"Summary copied to student directory: {student_summary_path}")

        except Exception as e:
            print(f"Error processing {mp3_path.name}: {str(e)}")
def main():
    """Prompt for an input directory and run the pipeline on every examinee folder.

    Steps:

    1. Read the input directory from stdin and check that it exists and is a
       directory.
    2. If no ``.env`` file exists in the current working directory, write a
       template and stop so the credentials can be filled in.
    3. Derive the admin and student output roots as siblings of the input
       directory; their names come from the ``OUTPUT_ADMIN_DIR`` and
       ``OUTPUT_STUDENT_DIR`` environment variables (defaults
       ``output_admin`` / ``output_student``).
    4. Create the output structure and process every sub-directory whose name
       starts with ``prfling``.

    Problems with the input are reported on stdout and end the function early;
    nothing is raised for them.
    """
    # Tolerate surrounding whitespace and quotes, as produced by copy/paste or
    # drag-and-drop into a terminal.
    raw_path = input("Enter the input directory path: ").strip().strip("\"'")
    if not raw_path:
        # An empty answer would otherwise silently mean the current directory.
        print("Error: No input directory given!")
        return

    input_base_dir = Path(raw_path).expanduser()

    # Validate input directory exists
    if not input_base_dir.exists():
        print(f"Error: Input directory {input_base_dir} does not exist!")
        return

    if not input_base_dir.is_dir():
        print(f"Error: {input_base_dir} is not a directory!")
        return

    # Resolve to an absolute path so that `.parent` below is the real parent
    # directory; for a relative input such as "." the lexical parent is "."
    # itself, which would place the outputs inside the input directory.
    input_base_dir = input_base_dir.resolve()

    # Create .env file template if it doesn't exist.
    # NOTE(review): the module-level credential check raises before main()
    # runs when the credentials are missing, so this branch is only reached
    # when they are supplied through the process environment instead of .env.
    env_file = Path(".env")
    if not env_file.exists():
        with open(env_file, "w", encoding="utf-8") as f:
            f.write("# OpenAI Configuration\n")
            f.write("OPENAI_API_KEY=your_api_key_here\n")
            f.write("OPENAI_BASE_URL=https://api.openai.com/v1\n")
        print("Created .env file. Please fill in your credentials.")
        return

    # Get output directory names from environment variables
    output_admin_name = os.getenv("OUTPUT_ADMIN_DIR", "output_admin")
    output_student_name = os.getenv("OUTPUT_STUDENT_DIR", "output_student")

    # Outputs are placed next to the input directory, not inside it.
    parent_output_dir = input_base_dir.parent
    output_admin_dir = parent_output_dir / output_admin_name
    output_student_dir = parent_output_dir / output_student_name

    print(f"Input directory: {input_base_dir}")
    print(f"Admin output directory: {output_admin_dir}")
    print(f"Student output directory: {output_student_dir}")

    # parents=True so a nested name from the environment (e.g. "out/admin")
    # does not fail on a missing intermediate folder.
    output_admin_dir.mkdir(parents=True, exist_ok=True)
    output_student_dir.mkdir(parents=True, exist_ok=True)

    # Create directory structure
    create_output_structure(input_base_dir, output_admin_dir, output_student_dir)

    # Process all pruefling directories in a reproducible (sorted) order
    for pruefling_dir in sorted(input_base_dir.iterdir()):
        if pruefling_dir.is_dir() and pruefling_dir.name.startswith('prfling'):
            print(f"\nProcessing {pruefling_dir.name}")
            process_pruefling(pruefling_dir, output_admin_dir, output_student_dir)

    print("\nProcessing complete!")


if __name__ == "__main__":
    main()