# OSCE-Feedback-Evaluator/pipeline/feedbacker.py


# %% Isabella


import openai
import os
from pathlib import Path
import datetime
import time
import json as json_module
import csv
from pydantic import BaseModel
import math # Importiere das math Modul für floor/ceil/round falls benötigt, aber Python's round() reicht hier

# Client for the OpenAI-compatible endpoint used for all evaluations.
# Credentials are taken from the environment (OPENAI_API_KEY / OPENAI_BASE_URL)
# so real secrets never have to be committed; the literal placeholders remain
# as fallback when the variables are unset or empty.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or "xxx",
    base_url=os.environ.get("OPENAI_BASE_URL") or "xxx",
)

# System prompt (German) sent with every transcript. It lists the nine
# criteria A1..F1, defines the 1 (best) to 5 (worst) scale and prescribes the
# exact JSON structure of the answer. The text is sent verbatim to the model,
# so any edit changes what the model is asked to do.
EVAL_PROMPT = '''
 Du bist ein strenger, objektiver Bewertender für medizinische Lehre. Bewerte das folgende Feedback anhand der Kursinhalte "Feedback in der Lehre: Basics". KRITERIEN (basierend auf Kursmaterial): A1 PERSPEKTIVE (Ich-Botschaften) Bewertung: Wird subjektive Wahrnehmung in Ich-Formulierungen dargestellt? A2 RESPEKT & WERTFREIHEIT Bewertung: Wird respektvoll und wertfrei kommuniziert? B1 KONKRETHEIT "Das Feedback sollte so konkret wie möglich sein. Die Wiedergabe beobachteter Beispiele ist hilfreich." Bewertung: Enthält das Feedback beobachtbare Beispiele statt Verallgemeinerungen? B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION Bewertung: Wird zwischen beobachtbaren Fakten und Interpretationen unterschieden? C1 STRUKTURIERTE LOGIK (WWW/BEB-Prinzip) WWW: "1. Wahrnehmung: Ich habe gesehen ... 2. Wirkung: ... das hat mich nervös gemacht. 3. Wunsch: Ich wünsche mir ..." BEB: "1. Beobachtung: Ich habe gesehen ... 2. Empfehlung: Ich empfehle ... 3. Begründung: Auf diese Weise vermeiden Sie ..." Bewertung: Folgt das Feedback einer klaren Struktur (WWW oder BEB)? D1 ZUKUNGSORIENTIERTE EMPFEHLUNG Bewertung: Gibt es konkrete, zukunftsorientierte Handlungsempfehlungen, die wertschätzend formuliert sind? D2 WERTSCHÄTZENDER ABSCHLUSS Bewertung: Schließt das Feedback wertschätzend ab? E1 KOMMUNIKATIONSEBENEN "Vier Seiten einer Nachricht: Sachinhalt, Selbstoffenbarung, Beziehung, Appell" Bewertung: Berücksichtigt das Feedback die verschiedenen Kommunikationsebenen? F1 FÖRDERUNG VON REFLEXION Bewertung: Fördert das Feedback die Reflexion und das Lernen?

SCORING-ANLEITUNG:
1 = Vollständige Umsetzung (exzellentes Beispiel)
2 = Gute Umsetzung mit minimalen Lücken
3 = Grundlegende Umsetzung mit signifikanten Mängeln
4 = Unzureichende Umsetzung (wichtige Elemente fehlen)
5 = Keine erkennbare Umsetzung (kriterienwidrig)

WICHTIG: Gib die Ergebnisse AUSSCHLIESSLICH als JSON mit EXAKT folgender Struktur zurück:
{
  "scores": {
    "A1": {"score": 1, "justification": "Begründung hier"},
    "A2": {"score": 1, "justification": "Begründung hier"},
    "B1": {"score": 1, "justification": "Begründung hier"},
    "B2": {"score": 1, "justification": "Begründung hier"},
    "C1": {"score": 1, "justification": "Begründung hier"},
    "D1": {"score": 1, "justification": "Begründung hier"},
    "D2": {"score": 1, "justification": "Begründung hier"},
    "E1": {"score": 1, "justification": "Begründung hier"},
    "F1": {"score": 1, "justification": "Begründung hier"}
  },
  "total_score": 0,
  "quality_level": "Beispiel-Qualitätsstufe",
  "strengths": ["Stärke 1", "Stärke 2"],
  "weaknesses": ["Schwäche 1", "Schwäche 2"],
  "improvement_suggestions": ["Vorschlag 1", "Vorschlag 2", "Vorschlag 3"]
}
'''

# Grade for a single criterion together with its written justification.
# Only the type (int) is validated here; the 1-5 range is not enforced.
class ScoreItem(BaseModel):
    score: int  # 1-5 (1=excellent, 5=failed)
    justification: str  # free-text reason given for the grade

# Complete evaluation of one feedback transcript. The fields mirror the JSON
# structure that EVAL_PROMPT asks the model to return.
class EvaluationResult(BaseModel):
    scores: dict[str, ScoreItem]  # keyed by criterion ID ("A1" ... "F1")
    total_score: int  # value reported by the model; the script computes its own average
    quality_level: str  # overwritten by the script after the average is computed
    strengths: list[str]
    weaknesses: list[str]
    improvement_suggestions: list[str]
##


# %% Main

# Label shown for each rounded average grade; 0 carries the error/unknown label.
QUALITY_LEVEL_MAP = {
    1: "Exzellent (1)",
    2: "Gut (2)",
    3: "Befriedigend (3)",
    4: "Ausreichend (4)",
    5: "Mangelhaft/Ungenügend (5)",
    0: "Fehlerhaft/Unbekannt",
}


# Transcripts (*.txt) are read from input_dir.
input_dir = "./cruscloud/Teil3/Transkripte/"
# Hardcoded output directory - CHANGE THIS PATH AS NEEDED
output_dir = "./cruscloud/Teil3/Evaluations_moodle_isabella2"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Start a fresh timing log: a dated title line followed by a ruler.
timing_log_path = Path(output_dir) / "evaluation_timing.log"
run_started = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
timing_log_path.write_text(
    f"FEEDBACK EVALUATION TIMING LOG - {run_started}\n" + "=" * 80 + "\n\n",
    encoding="utf-8",
)

# Start a fresh timing CSV that contains only the header row.
csv_timing_path = Path(output_dir) / "evaluation_timings.csv"
timing_columns = [
    "Filename",
    "Total_Time_sec",
    "API_Evaluation_Time_sec",
    "Start_Time",
    "End_Time",
    "Status",
    "Average_Score",
    "Quality_Level",
]
score_columns = [
    f"{criterion}_Score"
    for criterion in ("A1", "A2", "B1", "B2", "C1", "D1", "D2", "E1", "F1")
]
with csv_timing_path.open("w", encoding="utf-8", newline="") as header_file:
    csv.writer(header_file, delimiter=",").writerow(timing_columns + score_columns)

# Work list, per-file results (keyed by file name) and the overall stopwatch.
files = [*Path(input_dir).glob("*.txt")]
results = dict()
total_start = time.time()

# Criterion IDs in report/CSV column order, mapped to the heading printed for
# each criterion in the text report.
CRITERION_HEADINGS = {
    "A1": "A1 PERSPEKTIVE (Ich-Botschaften)",
    "A2": "A2 RESPEKT & WERTFREIHEIT",
    "B1": "B1 KONKRETHEIT",
    "B2": "B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION",
    "C1": "C1 STRUKTURIERTE LOGIK (WWW/BEB)",
    "D1": "D1 ZUKUNGSORIENTIERTE EMPFEHLUNG",
    "D2": "D2 WERTSCHÄTZENDER ABSCHLUSS",
    "E1": "E1 KOMMUNIKATIONSEBENEN",
    "F1": "F1 FÖRDERUNG VON REFLEXION",
}

# Top-level keys the model's JSON answer must contain (see EVAL_PROMPT).
REQUIRED_RESPONSE_KEYS = (
    "scores",
    "total_score",
    "quality_level",
    "strengths",
    "weaknesses",
    "improvement_suggestions",
)


def _failed_evaluation(justification, quality_level, weaknesses, suggestions):
    """Return a placeholder result that grades every criterion 5 (worst).

    Used when no usable evaluation could be obtained, so that a report and a
    CSV row can still be written for the file.
    """
    return EvaluationResult(
        scores={
            key: ScoreItem(score=5, justification=justification)
            for key in CRITERION_HEADINGS
        },
        total_score=0,
        quality_level=quality_level,
        strengths=["Bewertung fehlgeschlagen"],
        weaknesses=weaknesses,
        improvement_suggestions=suggestions,
    )


def _repair_response(parsed):
    """Return ``(payload, was_repaired)`` for a decoded model answer.

    A complete answer (all required top-level keys and all nine criteria under
    "scores") is passed through untouched. Anything else is rebuilt: usable
    parts are kept and every gap is filled with an error placeholder, so the
    resulting payload always contains all nine criteria.
    """
    source = parsed if isinstance(parsed, dict) else {}
    raw_scores = source.get("scores")
    if not isinstance(raw_scores, dict):
        raw_scores = {}

    is_complete = all(key in source for key in REQUIRED_RESPONSE_KEYS) and all(
        key in raw_scores for key in CRITERION_HEADINGS
    )
    if is_complete:
        return source, False

    def list_or(key, fallback):
        # Keep the model's value only when it has the expected list shape.
        value = source.get(key)
        return value if isinstance(value, list) else fallback

    payload = {
        "scores": {
            key: raw_scores.get(
                key,
                {"score": 5, "justification": "FEHLER: Kriterium nicht bewertet"},
            )
            for key in CRITERION_HEADINGS
        },
        "total_score": source.get("total_score", 0),
        "quality_level": source.get("quality_level") or "Fehlerhaft",
        "strengths": list_or("strengths", ["Strukturfehler in der Bewertung"]),
        "weaknesses": list_or("weaknesses", ["Antwortstruktur nicht korrekt"]),
        "improvement_suggestions": list_or(
            "improvement_suggestions", ["Überprüfen Sie die Feedback-Struktur"]
        ),
    }
    return payload, True


# Evaluate every transcript: one API call per file, then a text report, a CSV
# row and a timing-log entry. A failure for one file never aborts the batch;
# it is recorded as a worst-grade placeholder evaluation instead.
for f in files:
    file_start = time.time()
    started_at = datetime.datetime.now()
    start_time_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n{'='*50}")
    print(f"Beginne Bewertung: {f.name}")
    print(f"Startzeit: {started_at.strftime('%H:%M:%S')}")

    # Read input text
    text = f.read_text(encoding="utf-8")

    status = "Success"
    eval_duration = 0.0  # stays 0.0 when the API call itself fails

    # All criteria are evaluated in one call with a strict JSON structure.
    try:
        eval_start_api = time.time()
        response = client.chat.completions.create(
            model="GPT-OSS-120B",
            messages=[
                {"role": "system", "content": EVAL_PROMPT},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )
        eval_duration = time.time() - eval_start_api  # pure API wait time

        print(f"  • Gesamtbewertung (API-Laufzeit): {eval_duration:.2f} sec")

        raw_content = response.choices[0].message.content
        try:
            parsed_response = json_module.loads(raw_content)
        except json_module.JSONDecodeError as exc:
            print(f"  ! JSON-Decoding-Fehler: {exc}")
            print(f"  ! Antwortinhalt: {raw_content[:200]}...")
            status = f"JSON Error: {str(exc)}"
            evaluation = _failed_evaluation(
                f"FEHLER: Ungültige JSON-Antwort vom API. Details: {str(exc)}",
                "Fehlerhaft",
                ["Ungültiges JSON-Format"],
                ["Überprüfen Sie die Feedback-Struktur"],
            )
        else:
            payload, was_repaired = _repair_response(parsed_response)
            if was_repaired:
                print("  ! Warnung: Ungewöhnliche Antwortstruktur erkannt. Versuche Konvertierung...")
                status = "Partial Structure"
            # Pydantic validation; a failure here is handled by the outer except.
            evaluation = EvaluationResult(**payload)

    except Exception as exc:
        # Batch boundary: record the failure and continue with the next file.
        print(f"  ! Unerwarteter Fehler: {str(exc)}")
        status = f"API Error: {str(exc)}"
        evaluation = _failed_evaluation(
            f"FEHLER: Bewertung fehlgeschlagen. Details: {str(exc)}",
            "Systemfehler",
            [f"Technischer Fehler: {str(exc)}"],
            ["Kontaktieren Sie den Support"],
        )

    results[f.name] = evaluation

    # Calculate the AVERAGE score (not sum), rounded to a whole grade.
    all_scores = [item.score for item in evaluation.scores.values()]
    valid_scores = [s for s in all_scores if isinstance(s, int)]
    average_score = sum(valid_scores) / len(valid_scores) if valid_scores else 5.0
    rounded_average_score = int(round(average_score))

    # Assign the quality level. Every non-success status ("Partial Structure",
    # "JSON Error: ...", "API Error: ...") keeps the level already stored on
    # the evaluation; the statuses carry detail suffixes, so they are not
    # compared against fixed strings.
    if status != "Success":
        final_quality_level = evaluation.quality_level
    else:
        final_quality_level = QUALITY_LEVEL_MAP.get(rounded_average_score, "Fehlerhaft/Unbekannt")
    evaluation.quality_level = final_quality_level

    # Total time is taken here, before the report is written, because the
    # report itself contains it. Local time = total minus pure API wait.
    total_file_duration = time.time() - file_start
    local_processing_time = max(0.0, total_file_duration - eval_duration)

    # Generate detailed text report with timing
    report = f'''FEEDBACK-EVALUATION BERICHT
============================
Eingabedatei: {f.name}
Erstellungsdatum: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Kursreferenz: "Feedback in der Lehre: Basics" (Hochschulmedizin Dresden)

VERARBEITUNGSZEITEN
----------------------------------------
Gesamtverarbeitung: {total_file_duration:.2f} Sekunden
  • API-Bewertungszeit: {eval_duration:.2f} Sekunden
  • Lokale Verarbeitungszeit (Lesen, JSON, Bericht): {local_processing_time:.2f} Sekunden


KRITERIENBEWERTUNG
----------------------------------------
'''

    # One section per criterion, in the fixed A1..F1 order.
    for key, heading in CRITERION_HEADINGS.items():
        item = evaluation.scores[key]
        report += (
            f"{heading}: {item.score}/5\n"
            "(1=exzellent, 5=nicht bestanden)\n"
            f"Begründung: {item.justification}\n"
            "\n"
        )

    report += f'''GESAMTBEWERTUNG
----------------------------------------
Durchschnittliche Bewertung: {rounded_average_score}/5
(1=exzellent, 5=nicht bestanden)

Qualitätsstufe: {evaluation.quality_level}

Stärken:
'''
    report += "".join(f"- {strength}\n" for strength in evaluation.strengths)

    report += "\nSchwächen:\n"
    report += "".join(f"- {weakness}\n" for weakness in evaluation.weaknesses)

    report += "\nVerbesserungsvorschläge:\n"
    report += "".join(f"- {suggestion}\n" for suggestion in evaluation.improvement_suggestions)

    # Save report to output directory
    output_path = Path(output_dir) / f"{f.stem}_evaluation.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(report)

    # Append this file's row to the timing CSV (reopened per file so rows
    # already written survive a later crash).
    with open(csv_timing_path, "a", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow([
            f.name,
            f"{total_file_duration:.2f}",
            f"{eval_duration:.2f}",
            start_time_str,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            status,
            rounded_average_score,
            evaluation.quality_level,
            *(evaluation.scores[key].score for key in CRITERION_HEADINGS),
        ])

    # Log timing to central log file; "Start" is the moment processing of this
    # file began, not the moment the log entry is written.
    with open(timing_log_path, "a", encoding="utf-8") as log:
        log.write(f"Datei: {f.name}\n")
        log.write(f"Start: {started_at.strftime('%H:%M:%S')}\n")
        log.write(f"Dauer: {total_file_duration:.2f} Sekunden\n")
        log.write("Detailierte Zeiten:\n")
        log.write(f"  • API-Bewertung: {eval_duration:.2f} Sekunden\n")
        log.write(f"  • Lokale Verarbeitung: {local_processing_time:.2f} Sekunden\n")
        log.write("-"*50 + "\n\n")

    print(f"\nBewertungsbericht erstellt: {output_path}")
    print(f"Gesamtzeit für {f.name}: {total_file_duration:.2f} Sekunden (API: {eval_duration:.2f}, Lokal: {local_processing_time:.2f})")
    print(f"{'='*50}")

# Final summary for the whole run.
total_duration = time.time() - total_start
print(f"\n{'='*50}")
print("ALLE BEWERTUNGEN ABGESCHLOSSEN")
print(f"Gesamtverarbeitungszeit: {total_duration:.2f} Sekunden für {len(files)} Dateien")
if files:
    print(f"Durchschnittliche Zeit pro Datei: {total_duration/len(files):.2f} Sekunden")
else:
    # No transcripts were found, so a per-file average is undefined;
    # dividing by len(files) here would raise ZeroDivisionError.
    print(f"Keine Eingabedateien (*.txt) in {input_dir} gefunden")
print(f"Bewertungsberichte gespeichert in: {output_dir}")
print(f"Timing-Log aktualisiert: {timing_log_path}")
print(f"CSV-Timing-Datei erstellt: {csv_timing_path}")
print(f"{'='*50}")
##