GND_Skript_Test/VLG_API_multi.py

import os
import sys
import time
import json
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
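# Third-party requirements implied by the imports and engines used below
# (package names are the usual pip names; pinning versions is left open):
# requests and pandas are imported directly, and pandas needs odfpy for the
# "odf" engine and openpyxl for the "openpyxl" engine, e.g.:
#   pip install requests pandas odfpy openpyxl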
# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
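# Example invocation (sketch): process the files in "Input CSV" for real, or
# simulate all API calls with the dry-run flag:
#   python VLG_API_multi.py
#   python VLG_API_multi.py --dry-run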
# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
MAX_CONSECUTIVE_FAILURES = 10
CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
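# Expected working-directory layout (illustrative sketch based on the paths
# above; the file names inside "Input CSV" are placeholders):
#   Input CSV/             <- source tables (.csv, .xlsx, .ods)
#   Auswertung Ergebnisse/ <- created automatically, receives the output .ods files
#   api_cache.json         <- created/updated as API responses come in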
# =========================
# Logging
# =========================
def log(level, msg):
    print(f"[{level}] {msg}")
# =========================
# Save cache
# =========================
def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)
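# The cache maps "url + str(params)" keys to the parsed JSON of a successful
# response (or the raw text if the body was not JSON). A minimal sketch of one
# entry, with a made-up term and a truncated response:
#   {
#     "https://lobid.org/gnd/search{'q': 'Glocke', 'format': 'json'}": {"member": [...]}
#   }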
# =========================
# Request with retry & backoff
# =========================
def request_with_retries(api_name, url, params=None):
    if DRY_RUN:
        return {"dummy": True}
    if not API_ACTIVE[api_name]:
        return None
    cache_key = url + (str(params) if params else "")
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    # Response body was not valid JSON; keep the raw text instead.
                    data = r.text
                CACHE[cache_key] = data
                save_cache()
                FAIL_COUNTER[api_name] = 0
                return data
            elif r.status_code in [403, 429]:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code} Stopschalter aktiviert")
                API_ACTIVE[api_name] = False
                return None
            else:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
        except requests.exceptions.Timeout:
            log("ERROR", f"Timeout bei {api_name.upper()}")
        except Exception as e:
            log("ERROR", f"Fehler bei {api_name.upper()}: {e}")
        retries += 1
        sleep_time = min(BACKOFF_FACTOR ** retries, 30)
        time.sleep(sleep_time)
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
        log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} Stopschalter aktiviert")
        API_ACTIVE[api_name] = False
    return None
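# Retry/backoff behaviour in numbers (derived from the constants above): with
# MAX_RETRIES = 3 and BACKOFF_FACTOR = 2, a persistently failing request is
# attempted three times, sleeping min(2**retries, 30) seconds after each
# failure, i.e. 2 s, 4 s and 8 s. Only an exhausted request increments
# FAIL_COUNTER; ten exhausted requests in a row (MAX_CONSECUTIVE_FAILURES)
# disable that API, while any success resets the counter.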
# =========================
# API queries with confidence scores
# =========================
def query_gnd(term, min_conf=0.6):
    if DRY_RUN or not API_ACTIVE["gnd"]:
        return "TEST_GND", 1.0
    # Pass the search term via params so requests URL-encodes it safely.
    url = "https://lobid.org/gnd/search"
    data = request_with_retries("gnd", url, {"q": term, "format": "json"})
    if not data:
        return "API nicht erreichbar", 0.0
    results = []
    scores = []
    for doc in data.get("member", []):
        name = doc.get("preferredName", "")
        conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
        if conf >= min_conf:
            results.append(name)
            scores.append(conf)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0
def query_wikidata(term, min_conf=0.5):
    if DRY_RUN or not API_ACTIVE["wikidata"]:
        return "TEST_WD", 1.0
    url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
    data = request_with_retries("wikidata", url, params)
    if not data:
        return "API nicht erreichbar", 0.0
    results = []
    scores = []
    for entry in data.get("search", []):
        # Entries without a numeric match score default to 0.0 and are dropped below min_conf.
        match_info = entry.get("match", {})
        score = match_info.get("score", 0.0)
        if score >= min_conf:
            results.append(entry["label"])
            scores.append(score)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0
# =========================
# Load input files
# =========================
def load_input_file(file_path):
    try:
        if file_path.suffix.lower() == ".ods":
            df = pd.read_excel(file_path, engine="odf", header=None)
        elif file_path.suffix.lower() == ".xlsx":
            df = pd.read_excel(file_path, engine="openpyxl", header=None)
        elif file_path.suffix.lower() == ".csv":
            df = pd.read_csv(file_path, header=None)
        else:
            log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
            return None
        return df
    except Exception as e:
        log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
        return None
# =========================
# Find the header row
# =========================
def find_header_row(df, keywords=("objektbeschreibung", "objekt/ebene")):
    for i, row in df.iterrows():
        row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
        if any(kw in cell for kw in keywords for cell in row_lower):
            return i, row_lower
    return None, None
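# Minimal sketch of what the detection does (toy frame, cell values invented):
#   df = pd.DataFrame([["Inventar Musterhof", None], ["Objekt/Ebene", "Objektbeschreibung"], ["Saal", "Glocke, Orgel"]])
#   find_header_row(df)  # -> (1, ["objekt/ebene", "objektbeschreibung"])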
# =========================
# Processing
# =========================
def process_files():
    all_terms = []
    output_rows = []
    for file_path in INPUT_DIR.glob("*"):
        if file_path.suffix.lower() not in [".csv", ".xlsx", ".ods"]:
            continue
        log("INFO", f"Verarbeite {file_path.name}")
        df = load_input_file(file_path)
        if df is None:
            continue
        header_idx, header_row = find_header_row(df)
        if header_idx is None:
            log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
            continue
        df.columns = header_row
        df = df.iloc[header_idx + 1:].reset_index(drop=True)
        col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
        col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
        if not col_objdesc:
            log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
            continue
        term_list = []
        obj_level_list = []
        for _, row in df.iterrows():
            terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
            if not terms:
                continue
            for term in [t.strip() for t in terms.split(",") if t.strip()]:
                term_list.append(term)
                obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")
        # API queries
        gnd_results = []
        gnd_scores = []
        wikidata_results = []
        wikidata_scores = []
        for term in term_list:
            gnd_res, gnd_conf = query_gnd(term)
            wikidata_res, wd_conf = query_wikidata(term)
            gnd_results.append(gnd_res)
            gnd_scores.append(gnd_conf)
            wikidata_results.append(wikidata_res)
            wikidata_scores.append(wd_conf)
        for idx, term in enumerate(term_list):
            output_rows.append({
                "Begriff": term,
                "Quelle": file_path.name,
                "Objekt/Ebene": obj_level_list[idx],
                "GND": gnd_results[idx],
                "GND_Confidence": gnd_scores[idx],
                "Wikidata": wikidata_results[idx],
                "Wikidata_Confidence": wikidata_scores[idx]
            })
        all_terms.extend(term_list)
    # Main output
    out_df = pd.DataFrame(output_rows)
    out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
    out_df.to_excel(out_file, index=False, engine="odf")
    log("INFO", f"Hauptauswertung gespeichert: {out_file}")
    # Raw term frequencies
    raw_terms = pd.Series(all_terms).value_counts().reset_index()
    raw_terms.columns = ["Begriff", "Häufigkeit"]
    raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
    raw_terms.to_excel(raw_file, index=False, engine="odf")
    log("INFO", f"Rohbegriffe gespeichert: {raw_file}")
# =========================
# Main
# =========================
if __name__ == "__main__":
    if not INPUT_DIR.exists():
        log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
        sys.exit(1)
    process_files()