#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper – Version 1.4.1
- Detailed (DEBUG) batch logging: buffered log records are written periodically to console + file
- Getty AAT (SPARQL via requests) – API-polite, with timeout/retries/backoff
- Fault tolerance: API outages do not crash the whole run
- Unmatched terms -> separate file (same format as the output)
- Existing normalization/lemmatization/stemming is reused
- Batch-logging mode (configurable)
"""

from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime

# Optional libs
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
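
# Both dependencies above are optional: without rapidfuzz the fuzzy scoring falls back to
# difflib.SequenceMatcher, and without spaCy (or its "de_core_news_sm" model) lemmatization
# degrades to the plain normalized string.
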
# =========================
# Config & paths
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0}

# Logging file
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"

# Batch logging parameters
LOG_BATCH_SIZE = 100        # flush once at least this many entries are buffered
LOG_FLUSH_INTERVAL = 5.0    # seconds between flushes (batch logging)
LOG_LEVEL = "DEBUG"         # verbose logging requested
# =========================
# Buffered/Batched Logger
# =========================
class BatchLogger:
    def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
        self.logfile = logfile
        self.flush_interval = flush_interval
        self.batch_size = batch_size
        self.level = level
        self.q = queue.Queue()
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
        # Ensure logfile exists
        try:
            logfile.parent.mkdir(parents=True, exist_ok=True)
            logfile.touch(exist_ok=True)
        except Exception:
            pass
        self._thread.start()

    def _format(self, level: str, msg: str) -> str:
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{ts} - {level} - {msg}"

    def log(self, level: str, msg: str):
        if self._stop_event.is_set():
            return
        formatted = self._format(level, msg)
        self.q.put((level, formatted))
        # If the queue grows too large, trigger an immediate flush by enqueueing a special token
        if self.q.qsize() >= self.batch_size:
            self.q.put(("__FLUSH__", "__FLUSH__"))

    def debug(self, msg: str):
        # Use the level this instance was configured with (previously checked the global LOG_LEVEL)
        if self.level == "DEBUG":
            self.log("DEBUG", msg)

    def info(self, msg: str):
        self.log("INFO", msg)

    def warning(self, msg: str):
        self.log("WARNING", msg)

    def error(self, msg: str):
        self.log("ERROR", msg)

    def exception(self, msg: str):
        self.log("EXCEPTION", msg)

    def _worker(self):
        buffer = []
        last_flush = time.time()
        while not self._stop_event.is_set() or not self.q.empty():
            try:
                item = None
                try:
                    item = self.q.get(timeout=self.flush_interval)
                except queue.Empty:
                    # time-based flush
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                        last_flush = time.time()
                    continue

                if item is None:
                    continue
                level, formatted = item
                if level == "__FLUSH__":
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                    last_flush = time.time()
                    continue
                buffer.append((level, formatted))

                # flush conditions
                if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
                    self._flush_buffer(buffer)
                    buffer = []
                    last_flush = time.time()
            except Exception as e:
                # As a last resort, write the error immediately to stderr
                try:
                    sys.stderr.write(f"BatchLogger worker error: {e}\n")
                except Exception:
                    pass
                time.sleep(0.5)
        # final flush
        if buffer:
            self._flush_buffer(buffer)

    def _flush_buffer(self, buffer):
        if not buffer:
            return
        # write to console and file
        try:
            # console
            out_lines = [f"{line}\n" for _, line in buffer]
            # write to stdout
            try:
                sys.stdout.writelines(out_lines)
                sys.stdout.flush()
            except Exception:
                pass
            # append to file
            try:
                with open(self.logfile, "a", encoding="utf-8") as f:
                    f.writelines(out_lines)
            except Exception as e:
                try:
                    sys.stderr.write(f"BatchLogger file write error: {e}\n")
                except Exception:
                    pass
        except Exception:
            pass

    def stop(self):
        self._stop_event.set()
        # put sentinel to wake worker
        try:
            self.q.put(("__FLUSH__", "__FLUSH__"))
        except Exception:
            pass
        self._thread.join(timeout=5.0)
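
# How the logger flushes: log() enqueues pre-formatted records; the daemon worker thread writes
# them to stdout and to LOG_FILE whenever batch_size entries have accumulated, flush_interval
# seconds have passed, or a "__FLUSH__" token arrives (also sent by stop(), which is called in the
# __main__ finally block so pending records are not lost on exit).
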
# Instantiate logger
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)")

# =========================
# Load/save cache
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
        logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
    except Exception as e:
        logger.warning(f"Cache konnte nicht geladen werden: {e}")
        CACHE = {}
else:
    CACHE = {}

def save_cache():
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        logger.debug("Cache gespeichert.")
    except Exception as e:
        logger.error(f"Cache konnte nicht gespeichert werden: {e}")
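
# Cache notes: entries are keyed by URL plus the JSON-serialized, sorted request parameters
# (see request_with_retries_generic), so identical API calls are answered from api_cache.json.
# save_cache() is only invoked once, in the finally block of the __main__ section.
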
# =========================
# Normalization / lemmatization / tokenization
# =========================
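# The three helpers below feed every lookup: normalize_text() lowercases and strips punctuation,
# lemmatize_term() memoizes spaCy lemmas in lemma_cache (falling back to the normalized string),
# and compound_split() breaks terms on whitespace, hyphens, underscores and slashes.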
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

def compound_split(term):
    if not term:
        return []
    parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
    return parts if parts else [term]

# =========================
# Load Normvokabular & build index
# =========================
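# load_normvokabular() reads every sheet of the master file except "Master"/"Übersicht". Rows that
# carry an ID start a new parent entry; subsequent rows without an ID inherit that parent ID
# (current_parent_id), so sub-terms resolve to their parent's ID. The function returns an
# exact-match dict, a lemma index, and a lemma->entry map used for fuzzy suggestions.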
def load_normvokabular(file_path):
    try:
        sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower() == ".ods" else None)
    except Exception as e:
        logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
        raise
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
        if not id_col or not word_col:
            continue
        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
    return norm_dict, stem_index, lemma_norm_map

# =========================
# Mapping & suggestions
# =========================
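# Matching cascade in map_to_norm(): exact normalized match -> lemma match -> (for multi-token
# terms) per-token lemma matches -> fuzzy suggestions. get_suggestions() scores candidates with
# rapidfuzz token_set_ratio (scaled to 0..1) or difflib, adds a 0.1 prefix bonus capped at 1.0,
# and keeps the top_n candidates at or above CONF_THRESHOLD.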
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    tokens = compound_split(term_norm)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
        return "KEIN TREFFER", "", suggestions
    else:
        token_matches = []
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in stem_index:
                e = stem_index[t_lemma][0]
                token_matches.append((t, e["Name"], e["ID"]))
            else:
                sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
                token_matches.append((t, "KEIN TREFFER", "", sugg))
        combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
        logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
        return "KEIN TREFFER", "", combined_suggestions

def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

# =========================
# Generic request with retries & caching
# =========================
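# request_with_retries_generic() answers repeated calls from CACHE, retries failed requests with
# exponential backoff (backoff ** attempt seconds), and counts failed calls per API in
# FAIL_COUNTER (reset on any success); after 10 such failures the API is switched off in
# API_ACTIVE for the rest of the run.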
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
    cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
    if cache_key in CACHE:
        logger.debug(f"[Cache] {api_name}: {cache_key}")
        return CACHE[cache_key]
    retries = 0
    while retries < max_retries:
        try:
            r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
            if r.status_code == 200:
                try:
                    data = r.json()
                except Exception:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
                return data
            else:
                logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
                raise ValueError(f"HTTP {r.status_code}")
        except Exception as e:
            retries += 1
            wait = backoff ** retries
            logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
            time.sleep(wait)
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
        logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
    return None

# =========================
# GND / Wikidata (existing)
# =========================
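# Both lookups go through request_with_retries_generic: GND via the lobid.org search endpoint,
# Wikidata via the wbsearchentities API. For each term the best label is kept only if its
# SequenceMatcher similarity to the query reaches 0.75 (GND) or 0.70 (Wikidata).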
def batch_query_gnd(terms):
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
        return results
    logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "member" in data:
                cands = [(doc.get("preferredName", "") or doc.get("name", ""),
                          SequenceMatcher(None, t.lower(), (doc.get("preferredName", "") or doc.get("name", "")).lower()).ratio())
                         for doc in data["member"] if doc.get("preferredName", "") or doc.get("name", "")]
                cands = [c for c in cands if c[1] >= 0.75]
                if cands:
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
    return results

def batch_query_wikidata(terms):
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
        return results
    logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "search" in data:
                cands = [(e.get("label", ""), SequenceMatcher(None, t.lower(), e.get("label", "").lower()).ratio())
                         for e in data["search"] if e.get("label", "")]
                cands = [c for c in cands if c[1] >= 0.70]
                if cands:
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
    return results

# =========================
# Getty AAT query – robust & API-polite (requests)
# =========================
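# For each term a SPARQL query is built as a UNION of CONTAINS filters over German prefLabels,
# capped at 10 results; for a token like "kiste" the generated query looks roughly like this
# (illustrative, whitespace simplified):
#
#   PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
#   SELECT ?label ?concept WHERE {
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("kiste"))) }
#   } LIMIT 10
#
# A 1-second pause after every term keeps the load on vocab.getty.edu low, and after FAIL_LIMIT
# failed terms the remaining terms get empty results and AAT is disabled for the rest of the run.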
def batch_query_getty_aat(terms):
    results = {}
    if not API_ACTIVE.get("aat", False):
        for t in terms:
            results[t] = ""
        return results

    endpoint = "https://vocab.getty.edu/sparql"
    headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
    TIMEOUT = 8
    MAX_RETRIES = 3
    BACKOFF_FACTOR = 2
    FAIL_LIMIT = 5
    fail_counter_local = 0

    logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
    start_all = time.time()
    for idx, term in enumerate(terms, start=1):
        term_norm = lemmatize_term(normalize_text(term))
        tokens = compound_split(term_norm)
        logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")

        query_fragments = []
        for tkn in tokens:
            t_escaped = tkn.replace('"', '\\"')
            qf = f"""
            ?concept skos:prefLabel ?label .
            FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
            """
            query_fragments.append(f"{{ {qf} }}")
        query_body = " UNION ".join(query_fragments) if query_fragments else ""
        query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"

        retries = 0
        success = False
        start_term = time.time()
        while retries < MAX_RETRIES and not success:
            try:
                logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
                r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
                if r.status_code != 200:
                    raise ValueError(f"HTTP {r.status_code}")
                ret = r.json()
                candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
                if candidates:
                    scored = [
                        (c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
                        for c in candidates
                    ]
                    top = max(scored, key=lambda x: x[2])
                    results[term] = top[0]
                    logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
                else:
                    results[term] = ""
                    logger.debug(f"[AAT] Kein Treffer für '{term}'")
                success = True
            except Exception as e:
                retries += 1
                wait = BACKOFF_FACTOR ** retries
                logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} – warte {wait}s")
                time.sleep(wait)
                if retries == MAX_RETRIES:
                    results[term] = ""
                    fail_counter_local += 1
        # polite delay
        time.sleep(1.0)
        elapsed_term = time.time() - start_term
        logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")

        if fail_counter_local >= FAIL_LIMIT:
            logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
            for t_rem in terms[idx:]:
                results[t_rem] = ""
            FAIL_COUNTER["aat"] += fail_counter_local
            API_ACTIVE["aat"] = False
            break

    elapsed_all = time.time() - start_all
    logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
    return results

# =========================
# Highlighting / export (Excel/ODS)
# =========================
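# mark_norm_hits() colours the "Norm_Treffer" column of .xlsx/.xls outputs green (hit) or red
# (no hit) via openpyxl; .ods outputs instead get a textual "Norm_Status" column, since that file
# is round-tripped through pandas/odf rather than styled cell by cell.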
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    try:
        if ext in [".xlsx", ".xls"]:
            from openpyxl import load_workbook
            from openpyxl.styles import PatternFill
            wb = load_workbook(file_path)
            ws = wb.active
            col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
            norm_col = col_map.get("Norm_Treffer", None)
            if not norm_col:
                logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
                wb.save(file_path)
                return
            green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
            red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
            for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
                cell = row[0]
                if cell.value and cell.value != "KEIN TREFFER":
                    cell.fill = green_fill
                else:
                    cell.fill = red_fill
            wb.save(file_path)
        elif ext == ".ods":
            df = pd.read_excel(file_path, engine="odf")
            df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
            df.to_excel(file_path, index=False, engine="odf")
    except Exception as e:
        logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")

# =========================
# Missing terms -> separate file
# =========================
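# "Missing" means: no Norm_Treffer AND no Norm_Vorschlag. The terms are deduplicated and written
# next to the main output with a "_fehlende_Begriffe" suffix; a version counter is appended so an
# existing file is never overwritten.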
def export_missing_terms(out_df, output_file):
    missing_df = out_df[
        (out_df["Norm_Treffer"] == "KEIN TREFFER") &
        (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
    ][["Begriff"]].drop_duplicates()

    count_missing = len(missing_df)
    logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")

    if count_missing == 0:
        return

    ext = output_file.suffix.lower()
    base_name = output_file.stem
    missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
    version = 1
    while missing_file.exists():
        missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
        version += 1

    try:
        if ext in [".xlsx", ".xls"]:
            missing_df.to_excel(missing_file, index=False, engine="openpyxl")
        elif ext == ".ods":
            missing_df.to_excel(missing_file, index=False, engine="odf")
        else:
            missing_df.to_csv(missing_file, index=False, sep=";")
        logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
    except Exception as e:
        logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")

# =========================
# Main loop: process input files
# =========================
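# Per input file the pipeline is: read the table -> pull terms out of the "Objektbeschreibung"
# column (split on commas, then whitespace; drop STOPWORDS and pure numbers) -> query GND,
# Wikidata and Getty AAT once per unique term -> map every term occurrence against the
# Normvokabular -> write a versioned "Auswertung_..." file, export unresolved terms and
# highlight the hits.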
def process_files():
    overall_start = time.time()
    try:
        norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    except Exception:
        logger.error("Normvokabular konnte nicht geladen werden. Beende.")
        raise

    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
        raise SystemExit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        logger.info("Keine Dateien gefunden")
        return

    logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
    for file_idx, file_path in enumerate(files, start=1):
        if file_path.suffix.lower() not in [".csv", ".ods", ".xls", ".xlsx"]:
            logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
            continue
        logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
        file_start = time.time()
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower() == ".ods" else None)
        except Exception as e:
            logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
            continue

        row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            try:
                besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
            except Exception:
                # positional lookup failed for this row; skip its description
                besch = ""
            if not besch:
                continue
            obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
            urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))
            if (r_idx % 200) == 0:
                logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")

        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
        total_unique_terms = len(all_terms)
        # API queries
        t0 = time.time()
        gnd_results = batch_query_gnd(all_terms)
        t1 = time.time()
        logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
        wd_results = batch_query_wikidata(all_terms)
        t2 = time.time()
        logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
        aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t: "" for t in all_terms}
        t3 = time.time()
        logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")

        # Build output rows
        output_rows = []
        processed_count = 0
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, ""),
                    "AAT_Top1": aat_results.get(term, "")
                }
                output_rows.append(out_row)
                processed_count += 1
                if (processed_count % 200) == 0:
                    logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")

        out_df = pd.DataFrame(output_rows)
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1
        engine = "odf" if output_file.suffix.lower() == ".ods" else None

        try:
            if output_file.suffix.lower() == ".csv":
                # to_excel cannot write .csv; keep CSV inputs as CSV output
                out_df.to_csv(output_file, index=False, sep=";")
            else:
                out_df.to_excel(output_file, index=False, engine=engine)
            logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
        except Exception as e:
            logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
            continue

        export_missing_terms(out_df, output_file)
        mark_norm_hits(output_file)

        file_elapsed = time.time() - file_start
        logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")

    overall_elapsed = time.time() - overall_start
    logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")

# =========================
# Main
# =========================
if __name__ == "__main__":
    try:
        process_files()
    except KeyboardInterrupt:
        logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
    except SystemExit:
        logger.warning("SystemExit aufgetreten.")
    except Exception as e:
        logger.exception(f"Ungefangener Fehler: {e}")
    finally:
        # Persist the cache, then stop the logger (flushes remaining logs)
        try:
            save_cache()
        except Exception:
            pass
        try:
            logger.info("Beende.")
            logger.stop()
        except Exception:
            pass
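
# Expected working-directory layout (derived from the constants at the top of this file):
#   Input CSV/                                   <- source tables (.csv/.ods/.xls/.xlsx)
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods <- controlled vocabulary
#   Auswertung Ergebnisse/                       <- created automatically; results + mapper_log.txt
#   api_cache.json                               <- created/updated next to the script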