# OSCE-Feedback-Evaluator/exp/app.old_inside_pipeline.py

# %% Prompt

import openai
import os
from pathlib import Path
import datetime
import time
import json as json_module
import csv
from pydantic import BaseModel

# Initialize the OpenAI-compatible client used for every evaluation request.
# SECURITY: the credential and endpoint used to exist only as literals in this
# file. They can now be overridden through OSCE_EVAL_API_KEY and
# OSCE_EVAL_BASE_URL; the previous literals remain as fallbacks so existing
# runs keep working unchanged.
# TODO(review): rotate this key and remove the hard-coded fallback.
client = openai.OpenAI(
    api_key=os.environ.get("OSCE_EVAL_API_KEY", "sk--T3QiY4gBE67o9oSxEOqxw"),
    base_url=os.environ.get("OSCE_EVAL_BASE_URL", "http://pluto/v1"),
)

# Enhanced prompt with detailed criteria and structured output requirements.
# It is sent as the system message; the transcript follows as the user message.
# Scores use a 0-2 scale per criterion (nine criteria, maximum total 18).
EVAL_PROMPT = """
SYSTEM:
Du bist Expert:in für medizinische Lehre und Feedback-Didaktik an einer medizinischen Fakultät.
Bewerte das folgende Tutor-Feedback gemäß den offiziellen Feedback-Prinzipien der Medizinischen Fakultät Dresden.

KRITERIEN:
A1 PERSPEKTIVE (Ich-Botschaften)
"Ein Feedback ... wird in „Ich-Botschaften“ ausgedrückt."
Bewertung: Wird subjektive Wahrnehmung in Ich-Formulierungen dargestellt?

A2 RESPEKT & WERTFREIHEIT
"Ein Feedback ... ist nicht (ab)wertend."
Bewertung: Wird respektvoll und wertfrei kommuniziert?

B1 KONKRETHEIT
"Das Feedback sollte so konkret wie möglich sein. Die Wiedergabe beobachteter Beispiele ist hilfreich."
Bewertung: Enthält das Feedback beobachtbare Beispiele statt Verallgemeinerungen?

B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION
"Ein Feedback ... gibt erst nach der Äußerung von sinnlich Wahrnehmbarem die Möglichkeit zu Interpretationen, Annahmen und Schlussfolgerungen."
Bewertung: Wird zwischen beobachtbaren Fakten und Interpretationen unterschieden?

C1 STRUKTURIERTE LOGIK (WWW/BEB-Prinzip)
WWW: "1. Wahrnehmung: Ich habe gesehen ... 2. Wirkung: ... das hat mich nervös gemacht. 3. Wunsch: Ich wünsche mir ..."
BEB: "1. Beobachtung: Ich habe gesehen ... 2. Empfehlung: Ich empfehle ... 3. Begründung: Auf diese Weise vermeiden Sie ..."
Bewertung: Folgt das Feedback einer klaren Struktur (WWW oder BEB)?

D1 ZUKUNFTSORIENTIERTE EMPFEHLUNG
"Ein Feedback ... endet mit einer wertschätzenden Anregung für zukünftige Verbesserungen."
Bewertung: Gibt es konkrete, zukunftsorientierte Handlungsempfehlungen?

D2 WERTSCHÄTZENDER ABSCHLUSS
"Ein Feedback ... endet mit einer wertschätzenden Anregung für zukünftige Verbesserungen."
Bewertung: Schließt das Feedback wertschätzend ab?

E1 KOMMUNIKATIONSEBENEN
"Vier Seiten einer Nachricht: Sachinhalt, Selbstoffenbarung, Beziehung, Appell"
Bewertung: Berücksichtigt das Feedback die verschiedenen Kommunikationsebenen?

F1 FÖRDERUNG VON REFLEXION
"Feedback ... ist einer der einflussreichsten Faktoren für den Lernerfolg."
Bewertung: Fördert das Feedback die Reflexion und das Lernen?

SCORING:
Bewerte jedes Kriterium mit:
0 = nicht erfüllt
1 = teilweise erfüllt
2 = vollständig erfüllt

AUFGABE:
1. Bewerte jedes Kriterium mit einer Punktzahl (0-2)
2. Gib eine kurze Begründung für jede Bewertung mit Zitaten oder Paraphrasierungen aus dem Feedback
3. Berechne die Gesamtpunktzahl (max. 18)
4. Weise eine qualitative Bewertungsstufe zu
5. Gib 3 konkrete Verbesserungsvorschläge

OUTPUT FORMAT (JSON):
{
  "scores": {
    "A1": {"score": 0-2, "justification": "..."},
    "A2": {"score": 0-2, "justification": "..."},
    "B1": {"score": 0-2, "justification": "..."},
    "B2": {"score": 0-2, "justification": "..."},
    "C1": {"score": 0-2, "justification": "..."},
    "D1": {"score": 0-2, "justification": "..."},
    "D2": {"score": 0-2, "justification": "..."},
    "E1": {"score": 0-2, "justification": "..."},
    "F1": {"score": 0-2, "justification": "..."}
  },
  "total_score": 0,
  "quality_level": "",
  "strengths": [],
  "weaknesses": [],
  "improvement_suggestions": []
}

TUTOR FEEDBACK:
"""

# Pydantic models for structured output validation
class ScoreItem(BaseModel):
    """Score and justification for a single evaluation criterion."""

    # Intended range is 0-2 (0=not fulfilled, 2=fully fulfilled); the model
    # itself only checks that the value is an integer, not the range.
    score: int  # 0-2 (0=not fulfilled, 2=fully fulfilled)
    # Free-text reasoning for the score, as requested in EVAL_PROMPT.
    justification: str

class EvaluationResult(BaseModel):
    """Complete evaluation of one feedback transcript.

    The fields mirror the JSON object requested in EVAL_PROMPT.
    """

    # Per-criterion results keyed by criterion id ("A1" ... "F1").
    scores: dict[str, ScoreItem]
    # Overall score as reported in the response (the prompt states max. 18).
    total_score: int
    # Qualitative rating label; free text, no fixed vocabulary is enforced.
    quality_level: str
    strengths: list[str]
    weaknesses: list[str]
    improvement_suggestions: list[str]
##

# %% Main
input_dir = "./cruscloud/Teil3/Transkripte/"
# Hardcoded output directory - CHANGE THIS PATH AS NEEDED
output_dir = "./cruscloud/Teil3/Evaluations_moodle2"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Start a fresh timing log for this run; an existing log is overwritten.
timing_log_path = Path(output_dir) / "evaluation_timing.log"
with open(timing_log_path, "w", encoding="utf-8") as log:
    log.write(f"FEEDBACK EVALUATION TIMING LOG - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    log.write("="*80 + "\n\n")

# Start a fresh CSV timing file containing only the header row; one data row
# per evaluated transcript is appended later.
csv_timing_path = Path(output_dir) / "evaluation_timings.csv"
csv_header = [
    "Filename",
    "Total_Time_sec",
    "API_Evaluation_Time_sec",
    "Reasoning_Time_sec",
    "Start_Time",
    "End_Time",
    "Status",
    "Total_Score",
    "Quality_Level",
    "A1_Score",
    "A2_Score",
    "B1_Score",
    "B2_Score",
    "C1_Score",
    "D1_Score",
    "D2_Score",
    "E1_Score",
    "F1_Score",
]
with open(csv_timing_path, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",")
    csv_writer.writerow(csv_header)

# glob() yields entries in arbitrary, platform-dependent order; sorting keeps
# the processing order (and thus the CSV/log row order) reproducible.
files = sorted(Path(input_dir).glob("*.txt"))
results = {}
total_start = time.time()

# Report heading for every criterion id. The insertion order also fixes the
# order of the report sections and of the per-criterion CSV columns.
criteria_labels = {
    "A1": "A1 PERSPEKTIVE (Ich-Botschaften)",
    "A2": "A2 RESPEKT & WERTFREIHEIT",
    "B1": "B1 KONKRETHEIT",
    "B2": "B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION",
    "C1": "C1 STRUKTURIERTE LOGIK (WWW/BEB)",
    "D1": "D1 ZUKUNFTSORIENTIERTE EMPFEHLUNG",
    "D2": "D2 WERTSCHÄTZENDER ABSCHLUSS",
    "E1": "E1 KOMMUNIKATIONSEBENEN",
    "F1": "F1 FÖRDERUNG VON REFLEXION",
}

# Evaluate every transcript: one API call per file, then write a text report,
# append a CSV row and append an entry to the timing log.
for f in files:
    file_start = time.time()
    started_at = datetime.datetime.now()
    start_time_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n{'='*50}")
    print(f"Beginne Bewertung: {f.name}")
    print(f"Startzeit: {started_at.strftime('%H:%M:%S')}")

    # Read input text
    text = f.read_text(encoding="utf-8")

    criterion_timings = {}
    status = "Success"
    # Reset per file so that a failed call never reports the value measured
    # for a previous transcript.
    reasoning_time = 0.0
    eval_start = time.time()

    try:
        # Use reasoning model with specified parameters
        response = client.chat.completions.create(
            model="GPT-OSS-120B",
            messages=[
                {"role": "system", "content": EVAL_PROMPT},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_completion_tokens=1024,
            reasoning_effort="medium",  # Using the reasoning model capabilities
            extra_body={"allowed_openai_params": ["reasoning_effort"]}
        )
        message = response.choices[0].message

        # NOTE: this is measured after the complete response has arrived, so
        # it is the elapsed request time for responses that carry reasoning
        # content, not an isolated reasoning duration.
        if getattr(message, 'reasoning_content', None):
            reasoning_time = time.time() - eval_start
            criterion_timings["Reasoning"] = reasoning_time
            print(f"  • Reasoning: {reasoning_time:.2f} sec")

        eval_duration = time.time() - eval_start
        criterion_timings["Gesamtbewertung"] = eval_duration
        print(f"  • Gesamtbewertung: {eval_duration:.2f} sec")

        # Parse the JSON response
        try:
            parsed_response = json_module.loads(message.content)
        except json_module.JSONDecodeError as e:
            print(f"  ! JSON-Decoding-Fehler: {e}")
            print(f"  ! Antwortinhalt: {message.content[:200]}...")
            status = f"JSON Error: {str(e)}"

            # Placeholder evaluation so that report and CSV can still be written.
            json_error_note = "FEHLER: Ungültige JSON-Antwort vom API"
            evaluation = EvaluationResult(
                scores={
                    key: ScoreItem(score=0, justification=json_error_note)
                    for key in criteria_labels
                },
                total_score=0,
                quality_level="Fehlerhaft",
                strengths=["Bewertung fehlgeschlagen"],
                weaknesses=["Keine Bewertung möglich"],
                improvement_suggestions=["Korrigieren Sie die Feedback-Struktur"]
            )
            results[f.name] = evaluation
        else:
            required_keys = ["scores", "total_score", "quality_level",
                             "strengths", "weaknesses", "improvement_suggestions"]
            raw_scores = parsed_response.get("scores")
            if not isinstance(raw_scores, dict):
                raw_scores = {}
            total_missing = "total_score" not in parsed_response
            incomplete = (
                any(key not in parsed_response for key in required_keys)
                or any(key not in raw_scores for key in criteria_labels)
            )

            if incomplete:
                print("  ! Warnung: Ungewöhnliche Antwortstruktur erkannt. Versuche Konvertierung...")
                status = "Partial Structure"

                # Keep whatever the response provides and fill every gap with
                # a placeholder, so the report below can index all criteria.
                fixed_response = {
                    "scores": {
                        key: raw_scores.get(
                            key,
                            {"score": 0, "justification": "FEHLER: Kriterium nicht bewertet"},
                        )
                        for key in criteria_labels
                    },
                    "total_score": parsed_response.get("total_score", 0),
                    "quality_level": parsed_response.get("quality_level") or "Fehlerhaft",
                }
                list_defaults = {
                    "strengths": ["Strukturfehler in der Bewertung"],
                    "weaknesses": ["Antwortstruktur nicht korrekt"],
                    "improvement_suggestions": ["Überprüfen Sie die Feedback-Struktur"],
                }
                for key, fallback in list_defaults.items():
                    value = parsed_response.get(key)
                    fixed_response[key] = value if isinstance(value, list) else fallback
                parsed_response = fixed_response

            # Validation errors raised here are handled by the outer handler.
            evaluation = EvaluationResult(**parsed_response)
            if total_missing:
                # No total in the response: derive it from the criterion scores.
                evaluation.total_score = sum(
                    evaluation.scores[key].score for key in criteria_labels
                )
            results[f.name] = evaluation

    except Exception as e:
        print(f"  ! Unerwarteter Fehler: {str(e)}")
        status = f"API Error: {str(e)}"

        # Placeholder evaluation so that report and CSV can still be written.
        failure_note = f"FEHLER: Bewertung fehlgeschlagen. Details: {str(e)}"
        evaluation = EvaluationResult(
            scores={
                key: ScoreItem(score=0, justification=failure_note)
                for key in criteria_labels
            },
            total_score=0,
            quality_level="Fehlerhaft",
            strengths=["Bewertung fehlgeschlagen"],
            weaknesses=["Keine Bewertung möglich"],
            improvement_suggestions=["Korrigieren Sie die Feedback-Struktur"]
        )
        results[f.name] = evaluation
        eval_duration = time.time() - eval_start
        criterion_timings["Gesamtbewertung"] = eval_duration

    # Generate detailed text report with timing
    report = f"""FEEDBACK-EVALUATION BERICHT
============================
Eingabedatei: {f.name}
Erstellungsdatum: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Kursreferenz: "Feedback in der Lehre: Basics" (Hochschulmedizin Dresden)

VERARBEITUNGSZEITEN
----------------------------------------
Gesamtverarbeitung: {time.time() - file_start:.2f} Sekunden
"""

    # Add timing for evaluation
    for criterion, duration in criterion_timings.items():
        report += f"  • {criterion}: {duration:.2f} Sekunden\n"

    # Add evaluation results, one section per criterion
    report += """

KRITERIENBEWERTUNG
----------------------------------------
"""
    for key, label in criteria_labels.items():
        item = evaluation.scores[key]
        report += f"{label}: {item.score}/2\nBegründung: {item.justification}\n\n"

    report += f"""GESAMTBEWERTUNG
----------------------------------------
Gesamtpunktzahl: {evaluation.total_score}/18

Qualitätsstufe: {evaluation.quality_level}

Stärken:
"""
    for strength in evaluation.strengths:
        report += f"- {strength}\n"

    report += "\nSchwächen:\n"
    for weakness in evaluation.weaknesses:
        report += f"- {weakness}\n"

    report += "\nVerbesserungsvorschläge:\n"
    for suggestion in evaluation.improvement_suggestions:
        report += f"- {suggestion}\n"

    # Save report to output directory
    output_path = Path(output_dir) / f"{f.stem}_evaluation.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(report)

    # Write timing data to CSV
    with open(csv_timing_path, "a", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow([
            f.name,
            f"{time.time() - file_start:.2f}",
            f"{eval_duration:.2f}",
            f"{reasoning_time:.2f}",
            start_time_str,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            status,
            evaluation.total_score,
            evaluation.quality_level,
            *(evaluation.scores[key].score for key in criteria_labels),
        ])

    # Log timing to central log file
    with open(timing_log_path, "a", encoding="utf-8") as log:
        log.write(f"Datei: {f.name}\n")
        # Use the time captured at the start of this iteration, not "now".
        log.write(f"Start: {started_at.strftime('%H:%M:%S')}\n")
        log.write(f"Dauer: {time.time() - file_start:.2f} Sekunden\n")
        log.write("Detailierte Zeiten:\n")
        for criterion, duration in criterion_timings.items():
            log.write(f"  • {criterion}: {duration:.2f} Sekunden\n")
        log.write("-"*50 + "\n\n")

    print(f"\nBewertungsbericht erstellt: {output_path}")
    print(f"Gesamtzeit für {f.name}: {time.time() - file_start:.2f} Sekunden")
    print(f"{'='*50}")

# Final run summary printed after all transcripts were processed.
total_duration = time.time() - total_start
# An empty input directory would otherwise raise ZeroDivisionError here.
average_duration = total_duration / len(files) if files else 0.0
print(f"\n{'='*50}")
print("ALLE BEWERTUNGEN ABGESCHLOSSEN")
print(f"Gesamtverarbeitungszeit: {total_duration:.2f} Sekunden für {len(files)} Dateien")
print(f"Durchschnittliche Zeit pro Datei: {average_duration:.2f} Sekunden")
print(f"Bewertungsberichte gespeichert in: {output_dir}")
print(f"Timing-Log aktualisiert: {timing_log_path}")
print(f"CSV-Timing-Datei erstellt: {csv_timing_path}")
print(f"{'='*50}")


##


# %% Feedback_Bewertung
import openai
import os
from pathlib import Path
import datetime
import time
import json as json_module
import csv
from pydantic import BaseModel

# OpenAI-compatible client used for every evaluation request in this cell.
# SECURITY: the credential and endpoint used to exist only as literals in this
# file. They can now be overridden through OSCE_EVAL_API_KEY and
# OSCE_EVAL_BASE_URL; the previous literals remain as fallbacks so existing
# runs keep working unchanged.
# TODO(review): rotate this key and remove the hard-coded fallback.
client = openai.OpenAI(
    api_key=os.environ.get("OSCE_EVAL_API_KEY", "sk--T3QiY4gBE67o9oSxEOqxw"),
    base_url=os.environ.get("OSCE_EVAL_BASE_URL", "http://pluto/v1"),
)

# Evaluation prompt using a 1-5 grading scale (1 = best, 5 = worst).
# It is sent as the system message; the transcript follows as the user message.
EVAL_PROMPT = '''
Du bist ein strenger, objektiver Bewertender für medizinische Lehre.
Bewerte das folgende Feedback anhand der Kursinhalte "Feedback in der Lehre: Basics".

KRITERIEN (basierend auf Kursmaterial):
A1 PERSPEKTIVE (Ich-Botschaften)
"Ein Feedback ... wird in „Ich-Botschaften“ ausgedrückt."
Bewertung: Wird subjektive Wahrnehmung in Ich-Formulierungen dargestellt?

A2 RESPEKT & WERTFREIHEIT
"Ein Feedback ... ist nicht (ab)wertend."
Bewertung: Wird respektvoll und wertfrei kommuniziert?

B1 KONKRETHEIT
"Das Feedback sollte so konkret wie möglich sein. Die Wiedergabe beobachteter Beispiele ist hilfreich."
Bewertung: Enthält das Feedback beobachtbare Beispiele statt Verallgemeinerungen?

B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION
"Ein Feedback ... gibt erst nach der Äußerung von sinnlich Wahrnehmbarem die Möglichkeit zu Interpretationen, Annahmen und Schlussfolgerungen."
Bewertung: Wird zwischen beobachtbaren Fakten und Interpretationen unterschieden?

C1 STRUKTURIERTE LOGIK (WWW/BEB-Prinzip)
WWW: "1. Wahrnehmung: Ich habe gesehen ... 2. Wirkung: ... das hat mich nervös gemacht. 3. Wunsch: Ich wünsche mir ..."
BEB: "1. Beobachtung: Ich habe gesehen ... 2. Empfehlung: Ich empfehle ... 3. Begründung: Auf diese Weise vermeiden Sie ..."
Bewertung: Folgt das Feedback einer klaren Struktur (WWW oder BEB)?

D1 ZUKUNFTSORIENTIERTE EMPFEHLUNG
"Ein Feedback ... endet mit einer wertschätzenden Anregung für zukünftige Verbesserungen."
Bewertung: Gibt es konkrete, zukunftsorientierte Handlungsempfehlungen?

D2 WERTSCHÄTZENDER ABSCHLUSS
"Ein Feedback ... endet mit einer wertschätzenden Anregung für zukünftige Verbesserungen."
Bewertung: Schließt das Feedback wertschätzend ab?

E1 KOMMUNIKATIONSEBENEN
"Vier Seiten einer Nachricht: Sachinhalt, Selbstoffenbarung, Beziehung, Appell"
Bewertung: Berücksichtigt das Feedback die verschiedenen Kommunikationsebenen?

F1 FÖRDERUNG VON REFLEXION
"Feedback ... ist einer der einflussreichsten Faktoren für den Lernerfolg."
Bewertung: Fördert das Feedback die Reflexion und das Lernen?

SCORING-ANLEITUNG:
1 = Vollständige Umsetzung (exzellentes Beispiel)
2 = Gute Umsetzung mit minimalen Lücken
3 = Grundlegende Umsetzung mit signifikanten Mängeln
4 = Unzureichende Umsetzung (wichtige Elemente fehlen)
5 = Keine erkennbare Umsetzung (kriterienwidrig)

WICHTIG: Gib die Ergebnisse AUSSCHLIESSLICH als JSON mit EXAKT folgender Struktur zurück:
{
  "scores": {
    "A1": {"score": 1, "justification": "Begründung hier"},
    "A2": {"score": 1, "justification": "Begründung hier"},
    "B1": {"score": 1, "justification": "Begründung hier"},
    "B2": {"score": 1, "justification": "Begründung hier"},
    "C1": {"score": 1, "justification": "Begründung hier"},
    "D1": {"score": 1, "justification": "Begründung hier"},
    "D2": {"score": 1, "justification": "Begründung hier"},
    "E1": {"score": 1, "justification": "Begründung hier"},
    "F1": {"score": 1, "justification": "Begründung hier"}
  },
  "total_score": 0,
  "quality_level": "Beispiel-Qualitätsstufe",
  "strengths": ["Stärke 1", "Stärke 2"],
  "weaknesses": ["Schwäche 1", "Schwäche 2"],
  "improvement_suggestions": ["Vorschlag 1", "Vorschlag 2", "Vorschlag 3"]
}
'''

class ScoreItem(BaseModel):
    """Grade and justification for a single evaluation criterion."""

    # Intended range is 1-5 (1=excellent, 5=failed); the model itself only
    # checks that the value is an integer, not the range.
    score: int  # 1-5 (1=excellent, 5=failed)
    # Free-text reasoning for the grade, as requested in EVAL_PROMPT.
    justification: str

class EvaluationResult(BaseModel):
    """Complete evaluation of one feedback transcript.

    The fields mirror the JSON object requested in EVAL_PROMPT.
    """

    # Per-criterion results keyed by criterion id ("A1" ... "F1").
    scores: dict[str, ScoreItem]
    # Value reported in the response; the report and CSV written by this
    # cell show a separately computed average of the criterion scores.
    total_score: int
    # Qualitative rating label; free text, no fixed vocabulary is enforced.
    quality_level: str
    strengths: list[str]
    weaknesses: list[str]
    improvement_suggestions: list[str]
##

# %% Main
input_dir = "./cruscloud/Teil3/Transkripte/"
# Hardcoded output directory - CHANGE THIS PATH AS NEEDED
output_dir = "./cruscloud/Teil3/Evaluations_moodle3"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Start a fresh timing log for this run; an existing log is overwritten.
timing_log_path = Path(output_dir) / "evaluation_timing.log"
with open(timing_log_path, "w", encoding="utf-8") as log:
    log.write(f"FEEDBACK EVALUATION TIMING LOG - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    log.write("="*80 + "\n\n")

# Start a fresh CSV timing file containing only the header row; one data row
# per evaluated transcript is appended later.
csv_timing_path = Path(output_dir) / "evaluation_timings.csv"
csv_header = [
    "Filename",
    "Total_Time_sec",
    "API_Evaluation_Time_sec",
    "Start_Time",
    "End_Time",
    "Status",
    "Average_Score",
    "Quality_Level",
    "A1_Score",
    "A2_Score",
    "B1_Score",
    "B2_Score",
    "C1_Score",
    "D1_Score",
    "D2_Score",
    "E1_Score",
    "F1_Score",
]
with open(csv_timing_path, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",")
    csv_writer.writerow(csv_header)

# glob() yields entries in arbitrary, platform-dependent order; sorting keeps
# the processing order (and thus the CSV/log row order) reproducible.
files = sorted(Path(input_dir).glob("*.txt"))
results = {}
total_start = time.time()

# Report heading for every criterion id. The insertion order also fixes the
# order of the report sections and of the per-criterion CSV columns.
criteria_labels = {
    "A1": "A1 PERSPEKTIVE (Ich-Botschaften)",
    "A2": "A2 RESPEKT & WERTFREIHEIT",
    "B1": "B1 KONKRETHEIT",
    "B2": "B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION",
    "C1": "C1 STRUKTURIERTE LOGIK (WWW/BEB)",
    "D1": "D1 ZUKUNFTSORIENTIERTE EMPFEHLUNG",
    "D2": "D2 WERTSCHÄTZENDER ABSCHLUSS",
    "E1": "E1 KOMMUNIKATIONSEBENEN",
    "F1": "F1 FÖRDERUNG VON REFLEXION",
}

# Evaluate every transcript: one API call per file, then write a text report,
# append a CSV row and append an entry to the timing log.
for f in files:
    file_start = time.time()
    started_at = datetime.datetime.now()
    start_time_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n{'='*50}")
    print(f"Beginne Bewertung: {f.name}")
    print(f"Startzeit: {started_at.strftime('%H:%M:%S')}")

    # Read input text
    text = f.read_text(encoding="utf-8")

    criterion_timings = {}
    status = "Success"
    eval_start = time.time()

    # All criteria are evaluated in one call with a strict JSON structure
    try:
        response = client.chat.completions.create(
            model="GPT-OSS-120B",
            messages=[
                {"role": "system", "content": EVAL_PROMPT},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )
        message = response.choices[0].message
        eval_duration = time.time() - eval_start
        criterion_timings["Gesamtbewertung"] = eval_duration
        print(f"  • Gesamtbewertung: {eval_duration:.2f} sec")

        # Parse the JSON response
        try:
            parsed_response = json_module.loads(message.content)
        except json_module.JSONDecodeError as e:
            print(f"  ! JSON-Decoding-Fehler: {e}")
            print(f"  ! Antwortinhalt: {message.content[:200]}...")
            status = f"JSON Error: {str(e)}"

            # Placeholder evaluation so that report and CSV can still be written.
            error_explanation = f"FEHLER: Ungültige JSON-Antwort vom API. Details: {str(e)}"
            evaluation = EvaluationResult(
                scores={
                    key: ScoreItem(score=5, justification=error_explanation)
                    for key in criteria_labels
                },
                total_score=0,
                quality_level="Fehlerhaft",
                strengths=["Bewertung fehlgeschlagen"],
                weaknesses=["Ungültiges JSON-Format"],
                improvement_suggestions=["Überprüfen Sie die Feedback-Struktur"]
            )
            results[f.name] = evaluation
        else:
            required_keys = ["scores", "total_score", "quality_level",
                             "strengths", "weaknesses", "improvement_suggestions"]
            raw_scores = parsed_response.get("scores")
            if not isinstance(raw_scores, dict):
                raw_scores = {}
            # A criterion missing inside "scores" needs the same repair as a
            # missing top-level key; the report below indexes every criterion.
            incomplete = (
                any(key not in parsed_response for key in required_keys)
                or any(key not in raw_scores for key in criteria_labels)
            )

            if incomplete:
                print("  ! Warnung: Ungewöhnliche Antwortstruktur erkannt. Versuche Konvertierung...")
                status = "Partial Structure"

                # Keep whatever the response provides and fill every gap with
                # a placeholder (worst grade for unevaluated criteria).
                fixed_response = {
                    "scores": {
                        key: raw_scores.get(
                            key,
                            {"score": 5, "justification": "FEHLER: Kriterium nicht bewertet"},
                        )
                        for key in criteria_labels
                    },
                    "total_score": parsed_response.get("total_score", 0),
                    "quality_level": parsed_response.get("quality_level") or "Fehlerhaft",
                }
                list_defaults = {
                    "strengths": ["Strukturfehler in der Bewertung"],
                    "weaknesses": ["Antwortstruktur nicht korrekt"],
                    "improvement_suggestions": ["Überprüfen Sie die Feedback-Struktur"],
                }
                for key, fallback in list_defaults.items():
                    value = parsed_response.get(key)
                    fixed_response[key] = value if isinstance(value, list) else fallback
                parsed_response = fixed_response

            # Validation errors raised here are handled by the outer handler.
            evaluation = EvaluationResult(**parsed_response)
            results[f.name] = evaluation

    except Exception as e:
        print(f"  ! Unerwarteter Fehler: {str(e)}")
        status = f"API Error: {str(e)}"

        # Placeholder evaluation so that report and CSV can still be written.
        error_explanation = f"FEHLER: Bewertung fehlgeschlagen. Details: {str(e)}"
        evaluation = EvaluationResult(
            scores={
                key: ScoreItem(score=5, justification=error_explanation)
                for key in criteria_labels
            },
            total_score=0,
            quality_level="Systemfehler",
            strengths=["Bewertung fehlgeschlagen"],
            weaknesses=[f"Technischer Fehler: {str(e)}"],
            improvement_suggestions=["Kontaktieren Sie den Support"]
        )
        results[f.name] = evaluation
        eval_duration = time.time() - eval_start
        criterion_timings["Gesamtbewertung"] = eval_duration

    # Calculate the AVERAGE score (not sum) over all returned criterion scores.
    # ScoreItem.score is a validated int, so no further type filtering is needed.
    all_scores = [item.score for item in evaluation.scores.values()]
    average_score = sum(all_scores) / len(all_scores) if all_scores else 5.0

    # Generate detailed text report with timing
    report = f'''FEEDBACK-EVALUATION BERICHT
============================
Eingabedatei: {f.name}
Erstellungsdatum: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Kursreferenz: "Feedback in der Lehre: Basics" (Hochschulmedizin Dresden)

VERARBEITUNGSZEITEN
----------------------------------------
Gesamtverarbeitung: {time.time() - file_start:.2f} Sekunden
'''

    # Add timing for evaluation
    for criterion, duration in criterion_timings.items():
        report += f"  • {criterion}: {duration:.2f} Sekunden\n"

    # Add evaluation results, one section per criterion
    report += '''

KRITERIENBEWERTUNG
----------------------------------------
'''
    for key, label in criteria_labels.items():
        item = evaluation.scores[key]
        report += (
            f"{label}: {item.score}/5\n"
            "(1=exzellent, 5=nicht bestanden)\n"
            f"Begründung: {item.justification}\n\n"
        )

    report += f'''GESAMTBEWERTUNG
----------------------------------------
Durchschnittliche Bewertung: {average_score:.1f}/5
(1=exzellent, 5=nicht bestanden)

Qualitätsstufe: {evaluation.quality_level}

Stärken:
'''
    for strength in evaluation.strengths:
        report += f"- {strength}\n"

    report += "\nSchwächen:\n"
    for weakness in evaluation.weaknesses:
        report += f"- {weakness}\n"

    report += "\nVerbesserungsvorschläge:\n"
    for suggestion in evaluation.improvement_suggestions:
        report += f"- {suggestion}\n"

    # Save report to output directory
    output_path = Path(output_dir) / f"{f.stem}_evaluation.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(report)

    # Write timing data to CSV
    with open(csv_timing_path, "a", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow([
            f.name,
            f"{time.time() - file_start:.2f}",
            f"{eval_duration:.2f}",
            start_time_str,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            status,
            f"{average_score:.1f}",  # Using the calculated average score
            evaluation.quality_level,
            *(evaluation.scores[key].score for key in criteria_labels),
        ])

    # Log timing to central log file
    with open(timing_log_path, "a", encoding="utf-8") as log:
        log.write(f"Datei: {f.name}\n")
        # Use the time captured at the start of this iteration, not "now".
        log.write(f"Start: {started_at.strftime('%H:%M:%S')}\n")
        log.write(f"Dauer: {time.time() - file_start:.2f} Sekunden\n")
        log.write("Detailierte Zeiten:\n")
        for criterion, duration in criterion_timings.items():
            log.write(f"  • {criterion}: {duration:.2f} Sekunden\n")
        log.write("-"*50 + "\n\n")

    print(f"\nBewertungsbericht erstellt: {output_path}")
    print(f"Gesamtzeit für {f.name}: {time.time() - file_start:.2f} Sekunden")
    print(f"{'='*50}")

# Final run summary printed after all transcripts were processed.
total_duration = time.time() - total_start
# An empty input directory would otherwise raise ZeroDivisionError here.
average_duration = total_duration / len(files) if files else 0.0
print(f"\n{'='*50}")
print("ALLE BEWERTUNGEN ABGESCHLOSSEN")
print(f"Gesamtverarbeitungszeit: {total_duration:.2f} Sekunden für {len(files)} Dateien")
print(f"Durchschnittliche Zeit pro Datei: {average_duration:.2f} Sekunden")
print(f"Bewertungsberichte gespeichert in: {output_dir}")
print(f"Timing-Log aktualisiert: {timing_log_path}")
print(f"CSV-Timing-Datei erstellt: {csv_timing_path}")
print(f"{'='*50}")
##


# %% Isabella


import openai
import os
from pathlib import Path
import datetime
import time
import json as json_module
import csv
from pydantic import BaseModel
import math # Importiere das math Modul für floor/ceil/round falls benötigt, aber Python's round() reicht hier

# API client used by the evaluation loops in the following cells.
# NOTE(review): the API key and base URL are hard-coded in source; consider
# loading them from the environment or a config file that is not committed.
client = openai.OpenAI(
    api_key="sk--T3QiY4gBE67o9oSxEOqxw",
    base_url="http://pluto/v1"
)

# System prompt sent together with each transcript. It lists the nine
# criteria (A1-F1), the 1-5 scale (1 = best, 5 = worst) and the exact JSON
# structure the model has to return; that structure mirrors the
# EvaluationResult model defined below.
EVAL_PROMPT = '''
 Du bist ein strenger, objektiver Bewertender für medizinische Lehre. Bewerte das folgende Feedback anhand der Kursinhalte "Feedback in der Lehre: Basics". KRITERIEN (basierend auf Kursmaterial): A1 PERSPEKTIVE (Ich-Botschaften) Bewertung: Wird subjektive Wahrnehmung in Ich-Formulierungen dargestellt? A2 RESPEKT & WERTFREIHEIT Bewertung: Wird respektvoll und wertfrei kommuniziert? B1 KONKRETHEIT "Das Feedback sollte so konkret wie möglich sein. Die Wiedergabe beobachteter Beispiele ist hilfreich." Bewertung: Enthält das Feedback beobachtbare Beispiele statt Verallgemeinerungen? B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION Bewertung: Wird zwischen beobachtbaren Fakten und Interpretationen unterschieden? C1 STRUKTURIERTE LOGIK (WWW/BEB-Prinzip) WWW: "1. Wahrnehmung: Ich habe gesehen ... 2. Wirkung: ... das hat mich nervös gemacht. 3. Wunsch: Ich wünsche mir ..." BEB: "1. Beobachtung: Ich habe gesehen ... 2. Empfehlung: Ich empfehle ... 3. Begründung: Auf diese Weise vermeiden Sie ..." Bewertung: Folgt das Feedback einer klaren Struktur (WWW oder BEB)? D1 ZUKUNGSORIENTIERTE EMPFEHLUNG Bewertung: Gibt es konkrete, zukunftsorientierte Handlungsempfehlungen, die wertschätzend formuliert sind? D2 WERTSCHÄTZENDER ABSCHLUSS Bewertung: Schließt das Feedback wertschätzend ab? E1 KOMMUNIKATIONSEBENEN "Vier Seiten einer Nachricht: Sachinhalt, Selbstoffenbarung, Beziehung, Appell" Bewertung: Berücksichtigt das Feedback die verschiedenen Kommunikationsebenen? F1 FÖRDERUNG VON REFLEXION Bewertung: Fördert das Feedback die Reflexion und das Lernen?

SCORING-ANLEITUNG:
1 = Vollständige Umsetzung (exzellentes Beispiel)
2 = Gute Umsetzung mit minimalen Lücken
3 = Grundlegende Umsetzung mit signifikanten Mängeln
4 = Unzureichende Umsetzung (wichtige Elemente fehlen)
5 = Keine erkennbare Umsetzung (kriterienwidrig)

WICHTIG: Gib die Ergebnisse AUSSCHLIESSLICH als JSON mit EXAKT folgender Struktur zurück:
{
  "scores": {
    "A1": {"score": 1, "justification": "Begründung hier"},
    "A2": {"score": 1, "justification": "Begründung hier"},
    "B1": {"score": 1, "justification": "Begründung hier"},
    "B2": {"score": 1, "justification": "Begründung hier"},
    "C1": {"score": 1, "justification": "Begründung hier"},
    "D1": {"score": 1, "justification": "Begründung hier"},
    "D2": {"score": 1, "justification": "Begründung hier"},
    "E1": {"score": 1, "justification": "Begründung hier"},
    "F1": {"score": 1, "justification": "Begründung hier"}
  },
  "total_score": 0,
  "quality_level": "Beispiel-Qualitätsstufe",
  "strengths": ["Stärke 1", "Stärke 2"],
  "weaknesses": ["Schwäche 1", "Schwäche 2"],
  "improvement_suggestions": ["Vorschlag 1", "Vorschlag 2", "Vorschlag 3"]
}
'''

class ScoreItem(BaseModel):
    """Score and written justification for a single evaluation criterion."""

    score: int  # 1-5 (1=excellent, 5=failed)
    justification: str  # free-text reasoning for the score

class EvaluationResult(BaseModel):
    """Complete evaluation of one transcript.

    The fields match the JSON structure requested from the model in
    EVAL_PROMPT.
    """

    scores: dict[str, ScoreItem]  # keyed by criterion code ("A1" ... "F1")
    total_score: int  # as reported by the model
    quality_level: str
    strengths: list[str]
    weaknesses: list[str]
    improvement_suggestions: list[str]
##


# %% Main
# Transcripts are read from input_dir; reports and logs are written to
# output_dir (hard-coded, adjust both paths as needed).
input_dir = "./cruscloud/Teil3/Transkripte/"
output_dir = "./cruscloud/Teil3/Evaluations_moodle3"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Start a fresh timing log with a dated banner.
timing_log_path = Path(output_dir) / "evaluation_timing.log"
log_banner = f"FEEDBACK EVALUATION TIMING LOG - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
with open(timing_log_path, "w", encoding="utf-8") as log:
    log.write(log_banner)
    log.write("=" * 80 + "\n\n")

# Start a fresh CSV file containing only the header row.
csv_timing_path = Path(output_dir) / "evaluation_timings.csv"
csv_columns = [
    "Filename",
    "Total_Time_sec",
    "API_Evaluation_Time_sec",
    "Start_Time",
    "End_Time",
    "Status",
    "Average_Score",
    "Quality_Level",
]
csv_columns += [
    f"{code}_Score"
    for code in ("A1", "A2", "B1", "B2", "C1", "D1", "D2", "E1", "F1")
]
with open(csv_timing_path, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",")
    csv_writer.writerow(csv_columns)

# Collect the transcripts and start the batch timer.
files = list(Path(input_dir).glob("*.txt"))
results = {}
total_start = time.time()

# Criterion codes in report/CSV column order, mapped to the headings used
# in the text report.
CRITERION_TITLES = {
    "A1": "A1 PERSPEKTIVE (Ich-Botschaften)",
    "A2": "A2 RESPEKT & WERTFREIHEIT",
    "B1": "B1 KONKRETHEIT",
    "B2": "B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION",
    "C1": "C1 STRUKTURIERTE LOGIK (WWW/BEB)",
    "D1": "D1 ZUKUNGSORIENTIERTE EMPFEHLUNG",
    "D2": "D2 WERTSCHÄTZENDER ABSCHLUSS",
    "E1": "E1 KOMMUNIKATIONSEBENEN",
    "F1": "F1 FÖRDERUNG VON REFLEXION",
}

# Top-level keys the model response must contain (see EVAL_PROMPT).
REQUIRED_RESPONSE_KEYS = (
    "scores", "total_score", "quality_level",
    "strengths", "weaknesses", "improvement_suggestions",
)


def _needs_repair(parsed):
    """Return True when the decoded response lacks a required key or criterion."""
    if not isinstance(parsed, dict):
        return True
    if any(key not in parsed for key in REQUIRED_RESPONSE_KEYS):
        return True
    scores = parsed["scores"]
    # A non-dict "scores" value is left for pydantic to reject.
    return isinstance(scores, dict) and any(code not in scores for code in CRITERION_TITLES)


def _repair_response(parsed):
    """Build a complete response dict, keeping whatever usable data *parsed* has.

    Missing criteria get score 5 with an error justification; missing or
    malformed list fields get a one-element placeholder list.
    """
    source = parsed if isinstance(parsed, dict) else {}
    raw_scores = source.get("scores")
    if not isinstance(raw_scores, dict):
        raw_scores = {}

    fixed = {
        "scores": {
            code: raw_scores.get(
                code, {"score": 5, "justification": "FEHLER: Kriterium nicht bewertet"}
            )
            for code in CRITERION_TITLES
        },
        "total_score": source.get("total_score", 0),
        "quality_level": source.get("quality_level") or "Fehlerhaft",
    }
    list_defaults = {
        "strengths": ["Strukturfehler in der Bewertung"],
        "weaknesses": ["Antwortstruktur nicht korrekt"],
        "improvement_suggestions": ["Überprüfen Sie die Feedback-Struktur"],
    }
    for field, default in list_defaults.items():
        value = source.get(field)
        fixed[field] = value if isinstance(value, list) else default
    return fixed


def _failed_evaluation(justification, quality_level, weakness, suggestion):
    """Return a placeholder evaluation (all criteria scored 5) for a failed run."""
    return EvaluationResult(
        scores={
            code: ScoreItem(score=5, justification=justification)
            for code in CRITERION_TITLES
        },
        total_score=0,
        quality_level=quality_level,
        strengths=["Bewertung fehlgeschlagen"],
        weaknesses=[weakness],
        improvement_suggestions=[suggestion],
    )


def _build_report(file_name, evaluation, rounded_average_score,
                  total_file_duration, eval_duration, local_processing_time):
    """Render the plain-text evaluation report for one transcript."""
    rule = "----------------------------------------"
    scale_hint = "(1=exzellent, 5=nicht bestanden)"
    lines = [
        "FEEDBACK-EVALUATION BERICHT",
        "============================",
        f"Eingabedatei: {file_name}",
        f"Erstellungsdatum: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        'Kursreferenz: "Feedback in der Lehre: Basics" (Hochschulmedizin Dresden)',
        "",
        "VERARBEITUNGSZEITEN",
        rule,
        f"Gesamtverarbeitung: {total_file_duration:.2f} Sekunden",
        f"  • API-Bewertungszeit: {eval_duration:.2f} Sekunden",
        f"  • Lokale Verarbeitungszeit (Lesen, JSON, Bericht): {local_processing_time:.2f} Sekunden",
        "",
        "",
        "KRITERIENBEWERTUNG",
        rule,
    ]
    for code, title in CRITERION_TITLES.items():
        item = evaluation.scores[code]
        lines += [
            f"{title}: {item.score}/5",
            scale_hint,
            f"Begründung: {item.justification}",
            "",
        ]
    lines += [
        "GESAMTBEWERTUNG",
        rule,
        f"Durchschnittliche Bewertung: {rounded_average_score}/5",
        scale_hint,
        "",
        f"Qualitätsstufe: {evaluation.quality_level}",
        "",
        "Stärken:",
    ]
    lines += [f"- {strength}" for strength in evaluation.strengths]
    lines += ["", "Schwächen:"]
    lines += [f"- {weakness}" for weakness in evaluation.weaknesses]
    lines += ["", "Verbesserungsvorschläge:"]
    lines += [f"- {suggestion}" for suggestion in evaluation.improvement_suggestions]
    return "\n".join(lines) + "\n"


# Evaluate every transcript: one API call per file, then write the text
# report, one CSV row and one timing-log entry.
for f in files:
    file_start = time.time()
    started_at = datetime.datetime.now()
    start_time_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    start_clock_str = started_at.strftime('%H:%M:%S')
    print(f"\n{'='*50}")
    print(f"Beginne Bewertung: {f.name}")
    print(f"Startzeit: {start_clock_str}")

    # Read input text
    text = f.read_text(encoding="utf-8")

    status = "Success"
    eval_duration = 0.0

    # All criteria are evaluated in one call that requests a JSON object.
    try:
        eval_start_api = time.time()
        response = client.chat.completions.create(
            model="GPT-OSS-120B",
            messages=[
                {"role": "system", "content": EVAL_PROMPT},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )
        eval_duration = time.time() - eval_start_api  # pure API wait time

        print(f"  • Gesamtbewertung (API-Laufzeit): {eval_duration:.2f} sec")

        content = response.choices[0].message.content
        try:
            parsed_response = json_module.loads(content)
        except json_module.JSONDecodeError as e:
            print(f"  ! JSON-Decoding-Fehler: {e}")
            print(f"  ! Antwortinhalt: {content[:200]}...")
            status = f"JSON Error: {str(e)}"
            evaluation = _failed_evaluation(
                f"FEHLER: Ungültige JSON-Antwort vom API. Details: {str(e)}",
                "Fehlerhaft",
                "Ungültiges JSON-Format",
                "Überprüfen Sie die Feedback-Struktur",
            )
        else:
            # Repair incomplete responses (missing top-level keys or missing
            # criteria) so the report code below can rely on all nine scores.
            if _needs_repair(parsed_response):
                print("  ! Warnung: Ungewöhnliche Antwortstruktur erkannt. Versuche Konvertierung...")
                status = "Partial Structure"
                parsed_response = _repair_response(parsed_response)

            evaluation = EvaluationResult(**parsed_response)

    except Exception as e:
        # Batch boundary: API failures and validation errors must not stop
        # the remaining files, so they are recorded as a failed evaluation.
        print(f"  ! Unerwarteter Fehler: {str(e)}")
        status = f"API Error: {str(e)}"
        evaluation = _failed_evaluation(
            f"FEHLER: Bewertung fehlgeschlagen. Details: {str(e)}",
            "Systemfehler",
            f"Technischer Fehler: {str(e)}",
            "Kontaktieren Sie den Support",
        )

    results[f.name] = evaluation

    # Average (not sum) over all returned scores, rounded to a whole grade.
    all_scores = [item.score for item in evaluation.scores.values()]
    average_score = sum(all_scores) / len(all_scores) if all_scores else 5.0
    rounded_average_score = int(round(average_score))

    # Local processing time is the total so far minus the API wait time.
    total_file_duration = time.time() - file_start
    local_processing_time = max(total_file_duration - eval_duration, 0.0)

    report = _build_report(
        f.name, evaluation, rounded_average_score,
        total_file_duration, eval_duration, local_processing_time,
    )

    # Save report to output directory
    output_path = Path(output_dir) / f"{f.stem}_evaluation.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(report)

    # Append this file's timing and score row to the CSV
    with open(csv_timing_path, "a", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow([
            f.name,
            f"{total_file_duration:.2f}",
            f"{eval_duration:.2f}",
            start_time_str,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            status,
            rounded_average_score,
            evaluation.quality_level,
            *(evaluation.scores[code].score for code in CRITERION_TITLES),
        ])

    # Log timing to central log file ("Start" is the real start time)
    with open(timing_log_path, "a", encoding="utf-8") as log:
        log.write(f"Datei: {f.name}\n")
        log.write(f"Start: {start_clock_str}\n")
        log.write(f"Dauer: {total_file_duration:.2f} Sekunden\n")
        log.write("Detailierte Zeiten:\n")
        log.write(f"  • API-Bewertung: {eval_duration:.2f} Sekunden\n")
        log.write(f"  • Lokale Verarbeitung: {local_processing_time:.2f} Sekunden\n")
        log.write("-"*50 + "\n\n")

    print(f"\nBewertungsbericht erstellt: {output_path}")
    print(f"Gesamtzeit für {f.name}: {total_file_duration:.2f} Sekunden (API: {eval_duration:.2f}, Lokal: {local_processing_time:.2f})")
    print(f"{'='*50}")

# Final summary for the whole batch.
total_duration = time.time() - total_start
file_count = len(files)
# With no input files there is nothing to average; avoid a ZeroDivisionError.
average_duration = total_duration / file_count if file_count else 0.0
print(f"\n{'='*50}")
print("ALLE BEWERTUNGEN ABGESCHLOSSEN")
print(f"Gesamtverarbeitungszeit: {total_duration:.2f} Sekunden für {file_count} Dateien")
print(f"Durchschnittliche Zeit pro Datei: {average_duration:.2f} Sekunden")
print(f"Bewertungsberichte gespeichert in: {output_dir}")
print(f"Timing-Log aktualisiert: {timing_log_path}")
print(f"CSV-Timing-Datei erstellt: {csv_timing_path}")
print(f"{'='*50}")

##


# %% Main

# Maps the rounded average score of a transcript to the quality label that
# is written to the report and the CSV. Keys 1-5 follow the grading scale
# (1 = best, 5 = worst); 0 carries the label for an unknown/failed result.
QUALITY_LEVEL_MAP = {
    1: "Exzellent (1)",
    2: "Gut (2)",
    3: "Befriedigend (3)",
    4: "Ausreichend (4)",
    5: "Mangelhaft/Ungenügend (5)",
    0: "Fehlerhaft/Unbekannt"
}


# Transcripts are read from input_dir; reports and logs are written to
# output_dir (hard-coded, adjust both paths as needed).
input_dir = "./cruscloud/Teil3/Transkripte/"
output_dir = "./cruscloud/Teil3/Evaluations_moodle_isabella2"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Start a fresh timing log with a dated banner.
timing_log_path = Path(output_dir) / "evaluation_timing.log"
log_banner = f"FEEDBACK EVALUATION TIMING LOG - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
with open(timing_log_path, "w", encoding="utf-8") as log:
    log.write(log_banner)
    log.write("=" * 80 + "\n\n")

# Start a fresh CSV file containing only the header row.
csv_timing_path = Path(output_dir) / "evaluation_timings.csv"
csv_columns = [
    "Filename",
    "Total_Time_sec",
    "API_Evaluation_Time_sec",
    "Start_Time",
    "End_Time",
    "Status",
    "Average_Score",
    "Quality_Level",
]
csv_columns += [
    f"{code}_Score"
    for code in ("A1", "A2", "B1", "B2", "C1", "D1", "D2", "E1", "F1")
]
with open(csv_timing_path, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",")
    csv_writer.writerow(csv_columns)

# Collect the transcripts and start the batch timer.
files = list(Path(input_dir).glob("*.txt"))
results = {}
total_start = time.time()

# Criterion codes in report/CSV column order, mapped to the headings used
# in the text report.
CRITERION_TITLES = {
    "A1": "A1 PERSPEKTIVE (Ich-Botschaften)",
    "A2": "A2 RESPEKT & WERTFREIHEIT",
    "B1": "B1 KONKRETHEIT",
    "B2": "B2 TRENNUNG VON BEOBACHTUNG UND INTERPRETATION",
    "C1": "C1 STRUKTURIERTE LOGIK (WWW/BEB)",
    "D1": "D1 ZUKUNGSORIENTIERTE EMPFEHLUNG",
    "D2": "D2 WERTSCHÄTZENDER ABSCHLUSS",
    "E1": "E1 KOMMUNIKATIONSEBENEN",
    "F1": "F1 FÖRDERUNG VON REFLEXION",
}

# Top-level keys the model response must contain (see EVAL_PROMPT).
REQUIRED_RESPONSE_KEYS = (
    "scores", "total_score", "quality_level",
    "strengths", "weaknesses", "improvement_suggestions",
)


def _needs_repair(parsed):
    """Return True when the decoded response lacks a required key or criterion."""
    if not isinstance(parsed, dict):
        return True
    if any(key not in parsed for key in REQUIRED_RESPONSE_KEYS):
        return True
    scores = parsed["scores"]
    # A non-dict "scores" value is left for pydantic to reject.
    return isinstance(scores, dict) and any(code not in scores for code in CRITERION_TITLES)


def _repair_response(parsed):
    """Build a complete response dict, keeping whatever usable data *parsed* has.

    Missing criteria get score 5 with an error justification; missing or
    malformed list fields get a one-element placeholder list.
    """
    source = parsed if isinstance(parsed, dict) else {}
    raw_scores = source.get("scores")
    if not isinstance(raw_scores, dict):
        raw_scores = {}

    fixed = {
        "scores": {
            code: raw_scores.get(
                code, {"score": 5, "justification": "FEHLER: Kriterium nicht bewertet"}
            )
            for code in CRITERION_TITLES
        },
        "total_score": source.get("total_score", 0),
        "quality_level": source.get("quality_level") or "Fehlerhaft",
    }
    list_defaults = {
        "strengths": ["Strukturfehler in der Bewertung"],
        "weaknesses": ["Antwortstruktur nicht korrekt"],
        "improvement_suggestions": ["Überprüfen Sie die Feedback-Struktur"],
    }
    for field, default in list_defaults.items():
        value = source.get(field)
        fixed[field] = value if isinstance(value, list) else default
    return fixed


def _failed_evaluation(justification, quality_level, weakness, suggestion):
    """Return a placeholder evaluation (all criteria scored 5) for a failed run."""
    return EvaluationResult(
        scores={
            code: ScoreItem(score=5, justification=justification)
            for code in CRITERION_TITLES
        },
        total_score=0,
        quality_level=quality_level,
        strengths=["Bewertung fehlgeschlagen"],
        weaknesses=[weakness],
        improvement_suggestions=[suggestion],
    )


def _build_report(file_name, evaluation, rounded_average_score,
                  total_file_duration, eval_duration, local_processing_time):
    """Render the plain-text evaluation report for one transcript."""
    rule = "----------------------------------------"
    scale_hint = "(1=exzellent, 5=nicht bestanden)"
    lines = [
        "FEEDBACK-EVALUATION BERICHT",
        "============================",
        f"Eingabedatei: {file_name}",
        f"Erstellungsdatum: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        'Kursreferenz: "Feedback in der Lehre: Basics" (Hochschulmedizin Dresden)',
        "",
        "VERARBEITUNGSZEITEN",
        rule,
        f"Gesamtverarbeitung: {total_file_duration:.2f} Sekunden",
        f"  • API-Bewertungszeit: {eval_duration:.2f} Sekunden",
        f"  • Lokale Verarbeitungszeit (Lesen, JSON, Bericht): {local_processing_time:.2f} Sekunden",
        "",
        "",
        "KRITERIENBEWERTUNG",
        rule,
    ]
    for code, title in CRITERION_TITLES.items():
        item = evaluation.scores[code]
        lines += [
            f"{title}: {item.score}/5",
            scale_hint,
            f"Begründung: {item.justification}",
            "",
        ]
    lines += [
        "GESAMTBEWERTUNG",
        rule,
        f"Durchschnittliche Bewertung: {rounded_average_score}/5",
        scale_hint,
        "",
        f"Qualitätsstufe: {evaluation.quality_level}",
        "",
        "Stärken:",
    ]
    lines += [f"- {strength}" for strength in evaluation.strengths]
    lines += ["", "Schwächen:"]
    lines += [f"- {weakness}" for weakness in evaluation.weaknesses]
    lines += ["", "Verbesserungsvorschläge:"]
    lines += [f"- {suggestion}" for suggestion in evaluation.improvement_suggestions]
    return "\n".join(lines) + "\n"


# Evaluate every transcript: one API call per file, then write the text
# report, one CSV row and one timing-log entry.
for f in files:
    file_start = time.time()
    started_at = datetime.datetime.now()
    start_time_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    start_clock_str = started_at.strftime('%H:%M:%S')
    print(f"\n{'='*50}")
    print(f"Beginne Bewertung: {f.name}")
    print(f"Startzeit: {start_clock_str}")

    # Read input text
    text = f.read_text(encoding="utf-8")

    status = "Success"
    eval_duration = 0.0

    # All criteria are evaluated in one call that requests a JSON object.
    try:
        eval_start_api = time.time()
        response = client.chat.completions.create(
            model="GPT-OSS-120B",
            messages=[
                {"role": "system", "content": EVAL_PROMPT},
                {"role": "user", "content": text},
            ],
            response_format={"type": "json_object"},
            temperature=0.1
        )
        eval_duration = time.time() - eval_start_api  # pure API wait time

        print(f"  • Gesamtbewertung (API-Laufzeit): {eval_duration:.2f} sec")

        content = response.choices[0].message.content
        try:
            parsed_response = json_module.loads(content)
        except json_module.JSONDecodeError as e:
            print(f"  ! JSON-Decoding-Fehler: {e}")
            print(f"  ! Antwortinhalt: {content[:200]}...")
            status = f"JSON Error: {str(e)}"
            evaluation = _failed_evaluation(
                f"FEHLER: Ungültige JSON-Antwort vom API. Details: {str(e)}",
                "Fehlerhaft",
                "Ungültiges JSON-Format",
                "Überprüfen Sie die Feedback-Struktur",
            )
        else:
            # Repair incomplete responses (missing top-level keys or missing
            # criteria) so the report code below can rely on all nine scores.
            if _needs_repair(parsed_response):
                print("  ! Warnung: Ungewöhnliche Antwortstruktur erkannt. Versuche Konvertierung...")
                status = "Partial Structure"
                parsed_response = _repair_response(parsed_response)

            evaluation = EvaluationResult(**parsed_response)

    except Exception as e:
        # Batch boundary: API failures and validation errors must not stop
        # the remaining files, so they are recorded as a failed evaluation.
        print(f"  ! Unerwarteter Fehler: {str(e)}")
        status = f"API Error: {str(e)}"
        evaluation = _failed_evaluation(
            f"FEHLER: Bewertung fehlgeschlagen. Details: {str(e)}",
            "Systemfehler",
            f"Technischer Fehler: {str(e)}",
            "Kontaktieren Sie den Support",
        )

    results[f.name] = evaluation

    # Average (not sum) over all returned scores, rounded to a whole grade.
    all_scores = [item.score for item in evaluation.scores.values()]
    average_score = sum(all_scores) / len(all_scores) if all_scores else 5.0
    rounded_average_score = int(round(average_score))

    # Only a fully successful run gets a grade-based quality label. Every
    # other status ("Partial Structure", "JSON Error: ...", "API Error: ...")
    # keeps the label already stored on the evaluation. The status strings
    # carry error details, so they are compared against "Success" rather
    # than against a list of exact error names.
    if status == "Success":
        evaluation.quality_level = QUALITY_LEVEL_MAP.get(
            rounded_average_score, "Fehlerhaft/Unbekannt"
        )

    # Local processing time is the total so far minus the API wait time.
    total_file_duration = time.time() - file_start
    local_processing_time = max(total_file_duration - eval_duration, 0.0)

    report = _build_report(
        f.name, evaluation, rounded_average_score,
        total_file_duration, eval_duration, local_processing_time,
    )

    # Save report to output directory
    output_path = Path(output_dir) / f"{f.stem}_evaluation.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(report)

    # Append this file's timing and score row to the CSV
    with open(csv_timing_path, "a", encoding="utf-8", newline="") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",")
        csv_writer.writerow([
            f.name,
            f"{total_file_duration:.2f}",
            f"{eval_duration:.2f}",
            start_time_str,
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            status,
            rounded_average_score,
            evaluation.quality_level,
            *(evaluation.scores[code].score for code in CRITERION_TITLES),
        ])

    # Log timing to central log file ("Start" is the real start time)
    with open(timing_log_path, "a", encoding="utf-8") as log:
        log.write(f"Datei: {f.name}\n")
        log.write(f"Start: {start_clock_str}\n")
        log.write(f"Dauer: {total_file_duration:.2f} Sekunden\n")
        log.write("Detailierte Zeiten:\n")
        log.write(f"  • API-Bewertung: {eval_duration:.2f} Sekunden\n")
        log.write(f"  • Lokale Verarbeitung: {local_processing_time:.2f} Sekunden\n")
        log.write("-"*50 + "\n\n")

    print(f"\nBewertungsbericht erstellt: {output_path}")
    print(f"Gesamtzeit für {f.name}: {total_file_duration:.2f} Sekunden (API: {eval_duration:.2f}, Lokal: {local_processing_time:.2f})")
    print(f"{'='*50}")

# Final summary for the whole batch.
total_duration = time.time() - total_start
file_count = len(files)
# With no input files there is nothing to average; avoid a ZeroDivisionError.
average_duration = total_duration / file_count if file_count else 0.0
print(f"\n{'='*50}")
print("ALLE BEWERTUNGEN ABGESCHLOSSEN")
print(f"Gesamtverarbeitungszeit: {total_duration:.2f} Sekunden für {file_count} Dateien")
print(f"Durchschnittliche Zeit pro Datei: {average_duration:.2f} Sekunden")
print(f"Bewertungsberichte gespeichert in: {output_dir}")
print(f"Timing-Log aktualisiert: {timing_log_path}")
print(f"CSV-Timing-Datei erstellt: {csv_timing_path}")
print(f"{'='*50}")
##