Backing up EDSS total

This commit is contained in:
2026-01-18 22:32:24 +01:00
parent 2cab5fd9b3
commit 454273a6cb

View File

# %% API call1
import json
import os
import time
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI  # NOTE(review): diff's right side had `from openai import openai` — wrong case; the client class is `OpenAI`

# Load environment variables
@@ -15,104 +13,132 @@ load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL") OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
MODEL_NAME = "GPT-OSS-120B" MODEL_NAME = "GPT-OSS-120B"
HEALTH_URL = f"{OPENAI_BASE_URL}/health" # Placeholder - actual health check would need to be implemented
CHAT_URL = f"{OPENAI_BASE_URL}/chat/completions"
# File paths
INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv" INPUT_CSV = "/home/shahin/Lab/Doktorarbeit/Barcelona/Data/MS_Briefe_400_with_unique_id_SHA3_explore_cleaned_unique.csv"
EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt" EDSS_INSTRUCTIONS_PATH = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/Komplett.txt"
#GRAMMAR_FILE = "/home/shahin/Lab/Doktorarbeit/Barcelona/attach/just_edss_schema.gbnf"
# Initialize OpenAI client
client = OpenAI( client = OpenAI(
api_key=OPENAI_API_KEY, api_key=OPENAI_API_KEY,
base_url=OPENAI_BASE_URL base_url=OPENAI_BASE_URL
) )
# Read EDSS instructions from file
with open(EDSS_INSTRUCTIONS_PATH, 'r') as f: with open(EDSS_INSTRUCTIONS_PATH, 'r') as f:
EDSS_INSTRUCTIONS = f.read().strip() EDSS_INSTRUCTIONS = f.read().strip()
# === UPDATED RUN INFERENCE WITH RETRY LOGIC === # === RUN INFERENCE 2 ===
def run_inference(patient_text, max_retries=3): def run_inference(patient_text):
prompt = f'''Du bist ein medizinischer Assistent... (rest of your prompt)''' # Kept same as your original prompt = f'''Du bist ein medizinischer Assistent, der spezialisiert darauf ist, EDSS-Scores (Expanded Disability Status Scale) sowie alle Unterkategorien aus klinischen Berichten zu extrahieren.
### Regeln für die Ausgabe:
1. **Reason**: Erstelle eine prägnante Zusammenfassung (max. 400 Zeichen) der Befunde auf **DEUTSCH**, die zur Einstufung führen.
2. **klassifizierbar**:
- Setze dies auf **true**, wenn ein EDSS-Wert identifiziert, berechnet oder basierend auf den klinischen Hinweisen plausibel geschätzt werden kann.
- Setze dies auf **false**, NUR wenn die Daten absolut unzureichend oder so widersprüchlich sind, dass keinerlei Einstufung möglich ist.
3. **EDSS**:
- Dieses Feld ist **VERPFLICHTEND**, wenn "klassifizierbar" auf true steht.
- Es muss eine Zahl zwischen 0.0 und 10.0 sein.
- Versuche stets, den EDSS-Wert so präzise wie möglich zu bestimmen, auch wenn die Datenlage dünn ist (nutze verfügbare Informationen zu Gehstrecke und Funktionssystemen).
- Dieses Feld **DARF NICHT ERSCHEINEN**, wenn "klassifizierbar" auf false steht.
4. **Unterkategorien**:
- Extrahiere alle folgenden Unterkategorien aus dem Bericht:
- VISUAL OPTIC FUNCTIONS (max. 6.0)
- BRAINSTEM FUNCTIONS (max. 6.0)
- PYRAMIDAL FUNCTIONS (max. 6.0)
- CEREBELLAR FUNCTIONS (max. 6.0)
- SENSORY FUNCTIONS (max. 6.0)
- BOWEL AND BLADDER FUNCTIONS (max. 6.0)
- CEREBRAL FUNCTIONS (max. 6.0)
- AMBULATION (max. 10.0)
- Jede Unterkategorie sollte eine Zahl zwischen 0.0 und der jeweiligen Obergrenze enthalten, wenn sie klassifizierbar ist
- Wenn eine Unterkategorie nicht klassifizierbar ist, setze den Wert auf null
### Einschränkungen:
- Erfinde keine Fakten, aber nutze klinische Herleitungen aus dem Bericht, um den EDSS und die Unterkategorien zu bestimmen.
- Priorisiere die Vergabe eines EDSS-Wertes gegenüber der Markierung als nicht klassifizierbar.
- Halte dich strikt an die JSON-Struktur.
- Die Unterkategorien müssen immer enthalten sein, auch wenn sie null sind.
EDSS-Bewertungsrichtlinien:
{EDSS_INSTRUCTIONS}
Patientenbericht:
{patient_text}
'''
# Instructions omitted for brevity, but keep your full prompt here
attempts = 0
while attempts < max_retries:
start_time = time.time() start_time = time.time()
attempts += 1
try: try:
# Make API call using OpenAI client
response = client.chat.completions.create( response = client.chat.completions.create(
messages=[ messages=[
{"role": "system", "content": "You extract EDSS scores..."}, {
{"role": "user", "content": prompt} "role": "system",
"content": "You extract EDSS scores and all subcategories. You prioritize providing values even if data is partial, by using clinical inference."
},
{
"role": "user",
"content": prompt
}
], ],
model=MODEL_NAME, model=MODEL_NAME,
max_tokens=2048, max_tokens=2048,
temperature=0.0, temperature=0.0,
response_format={"type": "json_object"} response_format={"type": "json_object"}
) )
# Extract content from response
content = response.choices[0].message.content content = response.choices[0].message.content
# Check if content is None or empty
# Check if content is empty or None if content is None or content.strip() == "":
if not content or content.strip() == "" or content.strip() == "{}": raise ValueError("API returned empty or None response content")
print(f" [Attempt {attempts}] Warning: Received empty response. Retrying...")
time.sleep(1) # Short pause before retrying
continue
# Parse the JSON response # Parse the JSON response
parsed = json.loads(content) parsed = json.loads(content)
inference_time = time.time() - start_time inference_time = time.time() - start_time
return { return {
"success": True, "success": True,
"result": parsed, "result": parsed,
"inference_time_sec": inference_time, "inference_time_sec": inference_time
"attempts": attempts
} }
except Exception as e: except Exception as e:
print(f" [Attempt {attempts}] Error: {e}") print(f"Inference error: {e}")
if attempts < max_retries:
time.sleep(2) # Wait longer on actual connection errors
else:
return { return {
"success": False, "success": False,
"error": f"Failed after {max_retries} attempts: {str(e)}", "error": str(e),
"inference_time_sec": -1, "inference_time_sec": -1
"attempts": attempts
} }
return {"success": False, "error": "Unknown failure", "attempts": attempts}
# === BUILD PATIENT TEXT === # === BUILD PATIENT TEXT ===
def build_patient_text(row): def build_patient_text(row):
# Handle potential NaN or None values in the row
summary = str(row.get("T_Zusammenfassung", "")) if pd.notna(row.get("T_Zusammenfassung")) else "" summary = str(row.get("T_Zusammenfassung", "")) if pd.notna(row.get("T_Zusammenfassung")) else ""
diagnoses = str(row.get("Diagnosen", "")) if pd.notna(row.get("Diagnosen")) else "" diagnoses = str(row.get("Diagnosen", "")) if pd.notna(row.get("Diagnosen")) else ""
clinical = str(row.get("T_KlinBef", "")) if pd.notna(row.get("T_KlinBef")) else "" clinical = str(row.get("T_KlinBef", "")) if pd.notna(row.get("T_KlinBef")) else ""
findings = str(row.get("T_Befunde", "")) if pd.notna(row.get("T_Befunde")) else "" findings = str(row.get("T_Befunde", "")) if pd.notna(row.get("T_Befunde")) else ""
return "\n".join([summary, diagnoses, clinical, findings]).strip() return "\n".join([summary, diagnoses, clinical, findings]).strip()
if __name__ == "__main__": if __name__ == "__main__":
# Read CSV file ONLY inside main block
df = pd.read_csv(INPUT_CSV, sep=';') df = pd.read_csv(INPUT_CSV, sep=';')
results = [] results = []
# Process each row
for idx, row in df.iterrows(): for idx, row in df.iterrows():
print(f"Processing row {idx + 1}/{len(df)}") print(f"Processing row {idx + 1}/{len(df)}")
try:
patient_text = build_patient_text(row) patient_text = build_patient_text(row)
result = run_inference(patient_text)
# Calling the updated inference function # Add unique_id and MedDatum to result for tracking
result = run_inference(patient_text, max_retries=3)
result["unique_id"] = row.get("unique_id", f"row_{idx}") result["unique_id"] = row.get("unique_id", f"row_{idx}")
result["MedDatum"] = row.get("MedDatum", None) result["MedDatum"] = row.get("MedDatum", None)
results.append(result) results.append(result)
# Optional: only print success to keep console clean print(json.dumps(result, indent=2, ensure_ascii=False))
if result["success"]: except Exception as e:
print(f" Success on attempt {result.get('attempts')}") print(f"Error processing row {idx}: {e}")
results.append({
"success": False,
"error": str(e),
"unique_id": row.get("unique_id", f"row_{idx}"),
"MedDatum": row.get("MedDatum", None)
})
# Save results to a JSON file
output_json = INPUT_CSV.replace(".csv", "_results_total.json") output_json = INPUT_CSV.replace(".csv", "_results_total.json")
with open(output_json, 'w', encoding='utf-8') as f: with open(output_json, 'w', encoding='utf-8') as f:
json.dump(results, f, indent=2, ensure_ascii=False) json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Results saved to {output_json}") print(f"Results saved to {output_json}")