initial commit/ pipeline

This commit is contained in:
2026-01-06 13:25:08 +01:00
commit e6e704cd18
2 changed files with 277 additions and 0 deletions

48
.gitignore vendored Normal file
View File

@@ -0,0 +1,48 @@
# ======================
# Python
# ======================
__pycache__/
*.pyc
*.pyo
*.pyd
# Virtual environments
enisper/
venv/
*.env/
# ======================
# Secrets / local configuration
# ======================
# The `*.env/` pattern above ends in a slash and therefore matches
# directories only; the credentials file itself has to be listed explicitly.
.env
# ======================
# Audio / Video / Media
# ======================
*.mp3
*.mp4
*.wav
# ======================
# Data / Outputs
# ======================
input/
output/
summary/
*.json
*.txt
# ======================
# Archives
# ======================
*.zip
# ======================
# OS / Editor
# ======================
.DS_Store
Thumbs.db
.vscode/
# ======================
# Experimental scripts
# ======================
2moodle.py
app.old_inside_pipeline.py
feedback_bewertung.py

229
pipeline/osce_pipeline.py Normal file
View File

@@ -0,0 +1,229 @@
# %% s2t iterative
# ------------------------------------------------------------
# osce_pipeline.py
# ------------------------------------------------------------
import os
from pathlib import Path
import re
from dotenv import load_dotenv
# Pull settings from a .env file into the process environment.
load_dotenv()

API_KEY = os.environ.get("OPENAI_API_KEY")
BASE_URL = os.environ.get("OPENAI_BASE_URL")

# Fail at import time when either setting is missing or empty.
if not (API_KEY and BASE_URL):
    raise ValueError("Please set OPENAI_API_KEY and OPENAI_BASE_URL in your .env file")

# Imported only after the settings have been validated.
from openai import OpenAI

# Shared client used by process_pruefling() for transcription and
# summarisation requests.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=600)
# Characters replaced in generated file names: angle brackets, colon, double
# quote, forward slash, backslash, pipe, question mark and asterisk.
_INVALID_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def sanitize_filename(filename):
    """Return *filename* with every invalid character replaced by ``_``.

    Each matching character is replaced individually, so the length of the
    name is preserved. Names without such characters are returned unchanged.
    """
    cleaned = _INVALID_FILENAME_CHARS.sub('_', filename)
    return cleaned
def create_output_structure(input_dir, output_admin_dir, output_student_dir):
    """Create the admin and student output folders for every examinee.

    Only sub-directories of *input_dir* whose name starts with ``prfling``
    are considered, which is the same filter ``main()`` applies when it
    processes the input. Other directories and plain files are skipped so no
    stray ``Prüfling_*`` folders are created for them.

    For an input directory ``prfling12345`` this creates:

    * ``<output_admin_dir>/Prüfling_12345/Transkripte``
    * ``<output_admin_dir>/Prüfling_12345/Zusammenfassungen``
    * ``<output_student_dir>/Prüfling_12345/Zusammenfassungen``

    Args:
        input_dir: Directory containing the ``prfling*`` folders
            (``str`` or ``Path``).
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.

    Existing folders are left untouched; missing parents are created.
    """
    prefix = 'prfling'
    admin_root = Path(output_admin_dir)
    student_root = Path(output_student_dir)

    # Sorted so folders are created (and reported) in a stable order.
    for pruefling_dir in sorted(Path(input_dir).iterdir()):
        if not (pruefling_dir.is_dir() and pruefling_dir.name.startswith(prefix)):
            continue

        # Same ID derivation as process_pruefling(), so both functions agree
        # on the output folder name (e.g. prfling12345 -> 12345).
        pruefling_id = pruefling_dir.name.replace(prefix, '')
        folder_name = f"Prüfling_{pruefling_id}"

        # Students only receive summaries; transcripts stay on the admin side.
        targets = (
            admin_root / folder_name / "Transkripte",
            admin_root / folder_name / "Zusammenfassungen",
            student_root / folder_name / "Zusammenfassungen",
        )
        for target in targets:
            target.mkdir(parents=True, exist_ok=True)

        print(f"Created structure for Prüfling_{pruefling_id}")
def _build_summary_prompt(feedback_text):
    """Return the German summarisation prompt with *feedback_text* appended."""
    return f'''
Aufgabe: Fasse das folgende Prüferinnen-Feedback aus einer medizinischen Prüfung präzise, vollständig und übersichtlich zusammen.
Ziel: Eine konsolidierte, neutrale und strukturierte Zusammenfassung, die alle wesentlichen Inhalte widerspiegelt.
Bitte führe Folgendes durch:
Kernthemen des Feedbacks identifizieren.
Stärken der geprüften Person klar benennen.
Verbesserungspotenziale mit konkretem Bezug zu Inhalten oder Verhalten hervorheben.
Fachliche Aspekte getrennt von kommunikativen/sozialen Kompetenzen darstellen.
Eventuelle konkrete Beispiele oder Situationen aus dem Feedback erwähnen.
Widersprüche oder unterschiedliche Sichtweisen zwischen Prüferinnen markieren.
Eine kurze abschließende Meta-Synthese formulieren, die die wichtigsten Prioritäten für zukünftige Prüfungen ableitet.
Format der Antwort:
Thematische Zusammenfassung (Stichpunkte)
Stärken (Stichpunkte)
Verbesserungspotenziale (Stichpunkte)
Auffällige Unterschiede im Feedback (falls vorhanden)
Priorisierte Handlungsempfehlungen (max. 5 Punkte)
Hier ist das Feedback:
\n\n{feedback_text}'''


def process_pruefling(pruefling_dir, output_admin_dir, output_student_dir):
    """Transcribe and summarise every MP3 file of one examinee.

    For each ``*.mp3`` directly inside *pruefling_dir* the audio is sent to
    the transcription endpoint, the transcript is stored in the admin tree,
    and a summary generated from it is stored in both the admin and the
    student tree.

    Args:
        pruefling_dir: Input folder of one examinee, named ``prfling<ID>``
            (``str`` or ``Path``).
        output_admin_dir: Root of the admin output tree.
        output_student_dir: Root of the student output tree.

    Returns:
        None. Returns early (after printing a message) when the folder
        contains no MP3 files.

    Raises:
        OSError: If the output folders cannot be created.

    A failure while handling a single file is reported on stdout and does
    not stop the remaining files from being processed.
    """
    pruefling_dir = Path(pruefling_dir)

    # Same ID derivation as create_output_structure(), so both functions
    # agree on the output folder name.
    pruefling_id = pruefling_dir.name.replace('prfling', '')
    folder_name = f"Prüfling_{pruefling_id}"

    transcribe_admin_dir = Path(output_admin_dir) / folder_name / "Transkripte"
    summary_admin_dir = Path(output_admin_dir) / folder_name / "Zusammenfassungen"
    summary_student_dir = Path(output_student_dir) / folder_name / "Zusammenfassungen"

    # Sorted so the processing order does not depend on the filesystem.
    mp3_files = sorted(pruefling_dir.glob("*.mp3"))
    if not mp3_files:
        print(f"No MP3 files found in {pruefling_dir}")
        return
    print(f"Found {len(mp3_files)} MP3 files for Prüfling_{pruefling_id}.")

    # Make sure the targets exist before any remote call is made, so the
    # function also works when create_output_structure() was not run first
    # and no transcription is wasted on an unwritable destination.
    for target_dir in (transcribe_admin_dir, summary_admin_dir, summary_student_dir):
        target_dir.mkdir(parents=True, exist_ok=True)

    for demo_mp3_path in mp3_files:
        try:
            print(f"\nProcessing: {demo_mp3_path.name}")
            with demo_mp3_path.open("rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    model="faster-whisper-large-v3",
                    file=audio_file,
                )

            # Output names are derived from the audio file name without its
            # extension.
            base_filename = demo_mp3_path.stem
            txt_filename = sanitize_filename(f"{base_filename}.txt")
            summary_filename = sanitize_filename(f"{base_filename}-summary.txt")

            # The transcript is stored on the admin side only.
            transcript_output_path = transcribe_admin_dir / txt_filename
            transcript_output_path.write_text(transcription.text, encoding='utf-8')
            print(f"Transcript saved to {transcript_output_path}")

            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant."
                    },
                    {
                        "role": "user",
                        "content": _build_summary_prompt(transcription.text)
                    }
                ],
                model="GPT-OSS-120B",
                max_tokens=1024,
                temperature=0.3
            )

            # The message content may be None; store an empty file then.
            content = response.choices[0].message.content or ""

            summary_output_path = summary_admin_dir / summary_filename
            summary_output_path.write_text(content, encoding='utf-8')
            print(f"Summary saved to {summary_output_path}")

            # The student tree receives the identical summary text.
            student_summary_path = summary_student_dir / summary_filename
            student_summary_path.write_text(content, encoding='utf-8')
            print(f"Summary copied to student directory: {student_summary_path}")
        except Exception as e:
            # Deliberately broad: one broken recording or failed request must
            # not abort the rest of the batch.
            print(f"Error processing {demo_mp3_path.name}: {str(e)}")
def main():
    """Run the pipeline interactively for one input directory.

    Asks for the input directory, derives the admin and student output
    directories as siblings of it (names taken from ``OUTPUT_ADMIN_DIR`` and
    ``OUTPUT_STUDENT_DIR``, defaulting to ``output_admin`` and
    ``output_student``), creates the folder structure and processes every
    ``prfling*`` sub-directory.

    Returns early, after printing a message, when the input path does not
    exist, is not a directory, or when a ``.env`` template had to be created
    in the current working directory.
    """
    raw_path = input("Enter the input directory path: ").strip()
    # Resolve to an absolute path: for a relative answer such as "." the
    # unresolved parent is "." again, which would place the output folders
    # inside the input directory instead of next to it.
    input_base_dir = Path(raw_path).expanduser().resolve()

    if not input_base_dir.exists():
        print(f"Error: Input directory {input_base_dir} does not exist!")
        return
    if not input_base_dir.is_dir():
        print(f"Error: {input_base_dir} is not a directory!")
        return

    # NOTE(review): the module-level credential check has already passed by
    # the time main() runs, so this branch is only reached when the settings
    # come from somewhere other than a .env file in the current working
    # directory. The run then stops here after writing the template --
    # confirm that this is intended.
    env_file = Path(".env")
    if not env_file.exists():
        with open(env_file, "w") as f:
            f.write("# OpenAI Configuration\n")
            f.write("OPENAI_API_KEY=your_api_key_here\n")
            f.write("OPENAI_BASE_URL=https://api.openai.com/v1\n")
        print("Created .env file. Please fill in your credentials.")
        return

    output_admin_name = os.getenv("OUTPUT_ADMIN_DIR", "output_admin")
    output_student_name = os.getenv("OUTPUT_STUDENT_DIR", "output_student")

    # Outputs are written next to the input directory, not inside it.
    parent_output_dir = input_base_dir.parent
    output_admin_dir = parent_output_dir / output_admin_name
    output_student_dir = parent_output_dir / output_student_name

    print(f"Input directory: {input_base_dir}")
    print(f"Admin output directory: {output_admin_dir}")
    print(f"Student output directory: {output_student_dir}")

    output_admin_dir.mkdir(exist_ok=True)
    output_student_dir.mkdir(exist_ok=True)

    create_output_structure(input_base_dir, output_admin_dir, output_student_dir)

    # Sorted so examinees are processed in a stable order.
    for pruefling_dir in sorted(input_base_dir.iterdir()):
        if pruefling_dir.is_dir() and pruefling_dir.name.startswith('prfling'):
            print(f"\nProcessing {pruefling_dir.name}")
            process_pruefling(pruefling_dir, output_admin_dir, output_student_dir)

    print("\nProcessing complete!")


if __name__ == "__main__":
    main()