initial commit/ pipeline
This commit is contained in:
48
.gitignore
vendored
Normal file
48
.gitignore
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
# ======================
|
||||
# Python
|
||||
# ======================
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
# Virtual environments
|
||||
enisper/
|
||||
venv/
|
||||
*.env/
|
||||
|
||||
# ======================
|
||||
# Audio / Video / Media
|
||||
# ======================
|
||||
*.mp3
|
||||
*.mp4
|
||||
*.wav
|
||||
|
||||
# ======================
|
||||
# Data / Outputs
|
||||
# ======================
|
||||
input/
|
||||
output/
|
||||
summary/
|
||||
*.json
|
||||
*.txt
|
||||
|
||||
# ======================
|
||||
# Archives
|
||||
# ======================
|
||||
*.zip
|
||||
|
||||
# ======================
|
||||
# OS / Editor
|
||||
# ======================
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
.vscode/
|
||||
|
||||
# ======================
|
||||
# Experimental scripts
|
||||
# ======================
|
||||
2moodle.py
|
||||
app.old_inside_pipeline.py
|
||||
feedback_bewertung.py
|
||||
|
||||
229
pipeline/osce_pipeline.py
Normal file
229
pipeline/osce_pipeline.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# %% s2t iterative
|
||||
# ------------------------------------------------------------
|
||||
# osce_pipeline.py
|
||||
# ------------------------------------------------------------
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Populate the process environment from a local .env file, if one is present.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
BASE_URL = os.getenv("OPENAI_BASE_URL")

# Fail fast at import time when either setting is missing or empty.
if not (API_KEY and BASE_URL):
    raise ValueError("Please set OPENAI_API_KEY and OPENAI_BASE_URL in your .env file")

from openai import OpenAI

# Single shared client used for both transcription and summarisation requests.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL, timeout=600)
|
||||
|
||||
def sanitize_filename(filename):
    """Return *filename* with filesystem-unsafe characters replaced by underscores.

    The characters treated as unsafe are: < > : " / \\ | ? *
    Every occurrence is replaced one-for-one, so the length never changes.
    """
    unsafe_chars = '<>:"/\\|?*'
    return "".join("_" if ch in unsafe_chars else ch for ch in filename)
|
||||
|
||||
def create_output_structure(input_dir, output_admin_dir, output_student_dir):
    """Create the admin and student output folders for every examinee directory.

    For each sub-directory of ``input_dir`` whose name starts with ``prfling``
    the following folders are created (already existing ones are left as-is):

    * ``<output_admin_dir>/Prüfling_<id>/Transkripte``
    * ``<output_admin_dir>/Prüfling_<id>/Zusammenfassungen``
    * ``<output_student_dir>/Prüfling_<id>/Zusammenfassungen``

    ``<id>`` is the directory name with ``prfling`` removed
    (e.g. ``prfling12345`` -> ``12345``).

    Args:
        input_dir: Directory that contains one sub-directory per examinee.
        output_admin_dir: Root folder for the admin output tree.
        output_student_dir: Root folder for the student output tree.
    """
    prefix = 'prfling'

    # Sorted so folders are created (and reported) in a deterministic order.
    for pruefling_dir in sorted(Path(input_dir).iterdir()):
        # Skip plain files and unrelated folders: main() only processes
        # directories carrying this prefix, so anything else would just
        # leave behind empty "Prüfling_<name>" output folders.
        if not (pruefling_dir.is_dir() and pruefling_dir.name.startswith(prefix)):
            continue

        pruefling_id = pruefling_dir.name.replace(prefix, '')
        folder_name = f"Prüfling_{pruefling_id}"

        admin_pruefling_dir = Path(output_admin_dir) / folder_name
        student_pruefling_dir = Path(output_student_dir) / folder_name

        # parents=True also creates the examinee folder and the output root.
        for target_dir in (
            admin_pruefling_dir / "Transkripte",
            admin_pruefling_dir / "Zusammenfassungen",
            student_pruefling_dir / "Zusammenfassungen",
        ):
            target_dir.mkdir(parents=True, exist_ok=True)

        print(f"Created structure for Prüfling_{pruefling_id}")
|
||||
|
||||
def _build_summary_prompt(feedback_text):
    """Return the German summarisation prompt with *feedback_text* appended."""
    return f'''
Aufgabe: Fasse das folgende Prüferinnen-Feedback aus einer medizinischen Prüfung präzise, vollständig und übersichtlich zusammen.

Ziel: Eine konsolidierte, neutrale und strukturierte Zusammenfassung, die alle wesentlichen Inhalte widerspiegelt.

Bitte führe Folgendes durch:

Kernthemen des Feedbacks identifizieren.

Stärken der geprüften Person klar benennen.

Verbesserungspotenziale mit konkretem Bezug zu Inhalten oder Verhalten hervorheben.

Fachliche Aspekte getrennt von kommunikativen/sozialen Kompetenzen darstellen.

Eventuelle konkrete Beispiele oder Situationen aus dem Feedback erwähnen.

Widersprüche oder unterschiedliche Sichtweisen zwischen Prüferinnen markieren.

Eine kurze abschließende Meta-Synthese formulieren, die die wichtigsten Prioritäten für zukünftige Prüfungen ableitet.

Format der Antwort:

Thematische Zusammenfassung (Stichpunkte)

Stärken (Stichpunkte)

Verbesserungspotenziale (Stichpunkte)

Auffällige Unterschiede im Feedback (falls vorhanden)

Priorisierte Handlungsempfehlungen (max. 5 Punkte)

Hier ist das Feedback:
\n\n{feedback_text}'''


def process_pruefling(pruefling_dir, output_admin_dir, output_student_dir):
    """Transcribe and summarise every MP3 file of one examinee directory.

    For each ``*.mp3`` directly inside ``pruefling_dir`` the file is sent to
    the transcription endpoint of the module-level ``client``; the transcript
    is then summarised through a chat completion.  Results are written to:

    * ``<output_admin_dir>/Prüfling_<id>/Transkripte/<stem>.txt``
    * ``<output_admin_dir>/Prüfling_<id>/Zusammenfassungen/<stem>-summary.txt``
    * ``<output_student_dir>/Prüfling_<id>/Zusammenfassungen/<stem>-summary.txt``

    ``<id>`` is the directory name with ``prfling`` removed.  A failure while
    handling one file is printed and does not stop the remaining files.

    Args:
        pruefling_dir: ``Path`` of the examinee's input directory.
        output_admin_dir: Root folder of the admin output tree.
        output_student_dir: Root folder of the student output tree.

    Returns:
        None.
    """
    pruefling_id = pruefling_dir.name.replace('prfling', '')

    admin_pruefling_dir = Path(output_admin_dir) / f"Prüfling_{pruefling_id}"
    student_pruefling_dir = Path(output_student_dir) / f"Prüfling_{pruefling_id}"

    transcribe_admin_dir = admin_pruefling_dir / "Transkripte"
    summary_admin_dir = admin_pruefling_dir / "Zusammenfassungen"
    summary_student_dir = student_pruefling_dir / "Zusammenfassungen"

    # Sorted so files are processed in a reproducible order.
    mp3_files = sorted(pruefling_dir.glob("*.mp3"))

    if not mp3_files:
        print(f"No MP3 files found in {pruefling_dir}")
        return

    print(f"Found {len(mp3_files)} MP3 files for Prüfling_{pruefling_id}.")

    # Do not rely on the folder structure having been prepared beforehand;
    # otherwise every write below would fail for a missing directory.
    for target_dir in (transcribe_admin_dir, summary_admin_dir, summary_student_dir):
        target_dir.mkdir(parents=True, exist_ok=True)

    for demo_mp3_path in mp3_files:
        try:
            print(f"\nProcessing: {demo_mp3_path.name}")

            with demo_mp3_path.open("rb") as audio_file:
                transcription = client.audio.transcriptions.create(
                    model="faster-whisper-large-v3",
                    file=audio_file,
                )

            # Output names are derived from the audio file name without extension.
            base_filename = demo_mp3_path.stem
            txt_filename = sanitize_filename(f"{base_filename}.txt")
            summary_filename = sanitize_filename(f"{base_filename}-summary.txt")

            # The transcript goes to the admin tree only.
            transcript_output_path = transcribe_admin_dir / txt_filename
            transcript_output_path.write_text(transcription.text, encoding='utf-8')
            print(f"Transcript saved to {transcript_output_path}")

            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant."
                    },
                    {
                        "role": "user",
                        "content": _build_summary_prompt(transcription.text)
                    }
                ],
                model="GPT-OSS-120B",
                max_tokens=1024,
                temperature=0.3
            )

            # The message content may be None; fall back to an empty summary.
            summary_text = response.choices[0].message.content or ""

            summary_output_path = summary_admin_dir / summary_filename
            summary_output_path.write_text(summary_text, encoding='utf-8')
            print(f"Summary saved to {summary_output_path}")

            # Students receive the identical summary, but no transcript.
            student_summary_path = summary_student_dir / summary_filename
            student_summary_path.write_text(summary_text, encoding='utf-8')
            print(f"Summary copied to student directory: {student_summary_path}")

        except Exception as e:
            # Best effort per file: report the problem and carry on with the next one.
            print(f"Error processing {demo_mp3_path.name}: {e}")
|
||||
|
||||
def main():
    """Run the pipeline interactively for one input directory.

    Prompts for the input directory, derives the admin and student output
    roots next to it (names taken from ``OUTPUT_ADMIN_DIR`` /
    ``OUTPUT_STUDENT_DIR``, defaulting to ``output_admin`` /
    ``output_student``), creates the folder structure and processes every
    sub-directory whose name starts with ``prfling``.

    If no ``.env`` file exists in the current working directory, a template
    is written and the function returns without processing anything.
    """
    # Tolerate surrounding whitespace and quotes (common when a path is pasted
    # or dragged into a terminal).
    raw_input_path = input("Enter the input directory path: ").strip().strip('"\'')

    # An empty answer would become Path(""), i.e. the current directory, and
    # would be accepted silently; reject it explicitly instead.
    if not raw_input_path:
        print("Error: No input directory given!")
        return

    # input() does no shell expansion, so resolve a leading "~" here.
    input_base_dir = Path(raw_input_path).expanduser()

    if not input_base_dir.exists():
        print(f"Error: Input directory {input_base_dir} does not exist!")
        return

    if not input_base_dir.is_dir():
        print(f"Error: {input_base_dir} is not a directory!")
        return

    # Write a .env template when none exists and stop so it can be filled in.
    # NOTE(review): the module-level credential check raises before main()
    # runs when the variables are missing, so this branch is only reached when
    # they come from the process environment - confirm this is intended.
    env_file = Path(".env")
    if not env_file.exists():
        with open(env_file, "w") as f:
            f.write(
                "# OpenAI Configuration\n"
                "OPENAI_API_KEY=your_api_key_here\n"
                "OPENAI_BASE_URL=https://api.openai.com/v1\n"
            )
        print("Created .env file. Please fill in your credentials.")
        return

    output_admin_name = os.getenv("OUTPUT_ADMIN_DIR", "output_admin")
    output_student_name = os.getenv("OUTPUT_STUDENT_DIR", "output_student")

    # Both output trees are siblings of the input directory.
    parent_output_dir = input_base_dir.parent
    output_admin_dir = parent_output_dir / output_admin_name
    output_student_dir = parent_output_dir / output_student_name

    print(f"Input directory: {input_base_dir}")
    print(f"Admin output directory: {output_admin_dir}")
    print(f"Student output directory: {output_student_dir}")

    output_admin_dir.mkdir(exist_ok=True)
    output_student_dir.mkdir(exist_ok=True)

    create_output_structure(input_base_dir, output_admin_dir, output_student_dir)

    # Sorted so examinees are processed in a reproducible order.
    for pruefling_dir in sorted(input_base_dir.iterdir()):
        if pruefling_dir.is_dir() and pruefling_dir.name.startswith('prfling'):
            print(f"\nProcessing {pruefling_dir.name}")
            process_pruefling(pruefling_dir, output_admin_dir, output_student_dir)

    print("\nProcessing complete!")


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user