GND_Skript_Test/NormVokabular_Mapper_1.4.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper – Version 1.4.2

Dieses Skript normalisiert und mappt Begriffe aus Input-Dateien auf ein zentrales Normvokabular
und führt optional API-Abgleiche mit GND und Wikidata durch. Ergebnisse werden in Excel/ODS gespeichert.
"""

from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime

# Optional Libraries
try:
    from rapidfuzz import fuzz  # für schnellere String-Similarity
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")  # deutsche Lemmatization
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# =========================
# Konfiguration & Pfade
# =========================
INPUT_DIR = Path("Input CSV")                # Eingabeverzeichnis
OUTPUT_DIR = Path("Auswertung Ergebnisse")  # Ausgabeordner
OUTPUT_DIR.mkdir(exist_ok=True)             # Verzeichnis erstellen, falls nicht vorhanden
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")  # Normvokabular-Datei
CACHE_FILE = "api_cache.json"               # Cache für API-Antworten
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75                       # Threshold für Vorschläge
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}  # API-Verfügbarkeit
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# Logging-Parameter
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100        # Anzahl Logs vor Flush
LOG_FLUSH_INTERVAL = 5.0    # Sekunden zwischen Flushes
LOG_LEVEL = "DEBUG"          # Logging-Level

# =========================
# Batch/Buffered Logger
# =========================
class BatchLogger:
    """
    Buffered Logger: Speichert Logs in einem Queue-Buffer und schreibt sie periodisch in Datei und Konsole.
    Reduziert I/O-Aufwand bei vielen Logs.
    """
    def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
        self.logfile = logfile
        self.flush_interval = flush_interval
        self.batch_size = batch_size
        self.level = level
        self.q = queue.Queue()
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
        # Sicherstellen, dass die Log-Datei existiert
        try:
            logfile.parent.mkdir(parents=True, exist_ok=True)
            logfile.touch(exist_ok=True)
        except Exception:
            pass
        self._thread.start()

    def _format(self, level: str, msg: str) -> str:
        """Formatiert Logeinträge mit Timestamp"""
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{ts} - {level} - {msg}"

    def log(self, level: str, msg: str):
        """Fügt Log dem Queue hinzu und löst Flush aus, falls Batchgröße erreicht"""
        if self._stop_event.is_set():
            return
        formatted = self._format(level, msg)
        self.q.put((level, formatted))
        if self.q.qsize() >= self.batch_size:
            self.q.put(("__FLUSH__", "__FLUSH__"))

    def debug(self, msg: str):
        if LOG_LEVEL in ("DEBUG",):
            self.log("DEBUG", msg)
    def info(self, msg: str):
        self.log("INFO", msg)
    def warning(self, msg: str):
        self.log("WARNING", msg)
    def error(self, msg: str):
        self.log("ERROR", msg)
    def exception(self, msg: str):
        self.log("EXCEPTION", msg)

    def _worker(self):
        """Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch"""
        buffer = []
        last_flush = time.time()
        while not self._stop_event.is_set() or not self.q.empty():
            try:
                item = None
                try:
                    item = self.q.get(timeout=self.flush_interval)
                except queue.Empty:
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                        last_flush = time.time()
                    continue

                if item is None:
                    continue
                level, formatted = item
                if level == "__FLUSH__":
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                        last_flush = time.time()
                    continue
                buffer.append((level, formatted))

                if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
                    self._flush_buffer(buffer)
                    buffer = []
                    last_flush = time.time()
            except Exception as e:
                try:
                    sys.stderr.write(f"BatchLogger worker error: {e}\n")
                except Exception:
                    pass
                time.sleep(0.5)
        if buffer:
            self._flush_buffer(buffer)

    def _flush_buffer(self, buffer):
        """Schreibt Puffer in Datei und Konsole"""
        if not buffer:
            return
        try:
            out_lines = [f"{line}\n" for _, line in buffer]
            try:
                sys.stdout.writelines(out_lines)
                sys.stdout.flush()
            except Exception:
                pass
            try:
                with open(self.logfile, "a", encoding="utf-8") as f:
                    f.writelines(out_lines)
            except Exception as e:
                try:
                    sys.stderr.write(f"BatchLogger file write error: {e}\n")
                except Exception:
                    pass
        except Exception:
            pass

    def stop(self):
        """Stoppt Logger-Thread"""
        self._stop_event.set()
        try:
            self.q.put(("__FLUSH__", "__FLUSH__"))
        except Exception:
            pass
        self._thread.join(timeout=5.0)

# Logger-Instanz erstellen
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)")

# =========================
# Cache laden/speichern
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE,"r",encoding="utf-8") as f:
            CACHE = json.load(f)
        logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
    except Exception as e:
        logger.warning(f"Cache konnte nicht geladen werden: {e}")
        CACHE = {}
else:
    CACHE = {}

def save_cache():
    """Speichert aktuellen Cache in JSON"""
    try:
        with open(CACHE_FILE,"w",encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        logger.debug("Cache gespeichert.")
    except Exception as e:
        logger.error(f"Cache konnte nicht gespeichert werden: {e}")

# =========================
# Normalisierung / Lemma / Tokenization
# =========================
def normalize_text(s):
    """Text in Kleinbuchstaben, Sonderzeichen entfernen, Trim"""
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+"," ",s)
    return s

lemma_cache = {}
def lemmatize_term(term):
    """Lemmatize mit spaCy, Cache für Performance"""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

def compound_split(term):
    """Splittet Komposita nach -, _, / oder Leerzeichen"""
    if not term:
        return []
    parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
    return parts if parts else [term]

# =========================
# Normvokabular laden & Index
# =========================
def load_normvokabular(file_path):
    """Lädt Normvokabular aus Excel/ODS, erstellt Dictionarys für Mapping"""
    try:
        sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
    except Exception as e:
        logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
        raise

    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue  # Übersichtsblätter ignorieren
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]

        # ID- und Wort-Spalte finden
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
        if not id_col or not word_col:
            continue

        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
    return norm_dict, stem_index, lemma_norm_map

# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    """
    Mappt einen Begriff auf Normvokabular.
    Prüft exakte Treffer, Lemma-Treffer, Komposita und generiert Vorschläge.
    """
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    tokens = compound_split(term_norm)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
        return "KEIN TREFFER", "", suggestions
    else:
        token_matches = []
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in stem_index:
                e = stem_index[t_lemma][0]
                token_matches.append((t, e["Name"], e["ID"]))
            else:
                sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
                token_matches.append((t, "KEIN TREFFER", "", sugg))
        combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
        logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
        return "KEIN TREFFER", "", combined_suggestions

def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    """Ermittelt Vorschläge basierend auf Similarity"""
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

# =========================
# Generic request with retries & caching
# =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
    """
    Sendet GET-Requests mit Retry-Logik, Backoff und Caching
    """
    cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
    if cache_key in CACHE:
        logger.debug(f"[Cache] {api_name}: {cache_key}")
        return CACHE[cache_key]

    retries = 0
    while retries < max_retries:
        try:
            r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
            if r.status_code == 200:
                try:
                    data = r.json()
                except Exception:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
                return data
            else:
                logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
                raise ValueError(f"HTTP {r.status_code}")
        except Exception as e:
            retries += 1
            wait = backoff ** retries
            logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
            time.sleep(wait)
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
        logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
    return None

# =========================
# GND / Wikidata Batch Queries
# =========================
def batch_query_gnd(terms):
    """Batch-Abfrage der Begriffe bei GND"""
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms: results[t] = ""
        return results
    logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "member" in data:
                cands = [(doc.get("preferredName","") or doc.get("name",""),
                          SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
                         for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
                cands = [c for c in cands if c[1] >= 0.75]
                if cands:
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
    return results

def batch_query_wikidata(terms):
    """Batch-Abfrage der Begriffe bei Wikidata"""
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms: results[t] = ""
        return results
    logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "search" in data:
                # Ermittlung der Kandidaten mit Ähnlichkeitsbewertung
                cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
                         for e in data["search"] if e.get("label","")]
                # Filterung nach Mindestähnlichkeit (0.70)
                cands = [c for c in cands if c[1] >= 0.70]
                if cands:
                    # Bestes Ergebnis nach Ähnlichkeit auswählen
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
    return results

# =========================
# Markierung / Export (Excel/ODS)
# =========================
def mark_norm_hits(file_path):
    """
    Markiert Treffer in Excel/ODS farblich:
    Grün = Treffer, Rot = KEIN TREFFER
    """
    ext = file_path.suffix.lower()
    try:
        if ext in [".xlsx", ".xls"]:
            from openpyxl import load_workbook
            from openpyxl.styles import PatternFill
            wb = load_workbook(file_path)
            ws = wb.active
            # Spaltenmapping anhand der Kopfzeile
            col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
            norm_col = col_map.get("Norm_Treffer", None)
            if not norm_col:
                logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
                wb.save(file_path)
                return
            # Farben definieren
            green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
            red_fill   = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
            for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
                cell = row[0]
                if cell.value and cell.value != "KEIN TREFFER":
                    cell.fill = green_fill
                else:
                    cell.fill = red_fill
            wb.save(file_path)
        elif ext == ".ods":
            # ODS: kein Zell-Fill, stattdessen Status-Spalte
            df = pd.read_excel(file_path, engine="odf")
            df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
            df.to_excel(file_path, index=False, engine="odf")
    except Exception as e:
        logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")

# =========================
# Fehlende Begriffe -> separate Datei
# =========================
def export_missing_terms(out_df, output_file):
    """
    Speichert Begriffe ohne Treffer oder Vorschläge in separater Datei
    """
    missing_df = out_df[
        (out_df["Norm_Treffer"] == "KEIN TREFFER") &
        (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
    ][["Begriff"]].drop_duplicates()

    count_missing = len(missing_df)
    logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
    if count_missing == 0:
        return

    ext = output_file.suffix.lower()
    base_name = output_file.stem
    missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
    version = 1
    while missing_file.exists():
        missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
        version += 1

    try:
        if ext in [".xlsx", ".xls"]:
            missing_df.to_excel(missing_file, index=False, engine="openpyxl")
        elif ext == ".ods":
            missing_df.to_excel(missing_file, index=False, engine="odf")
        else:
            missing_df.to_csv(missing_file, index=False, sep=";")
        logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
    except Exception as e:
        logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")

# =========================
# Haupt-Loop: Verarbeitung Input-Dateien
# =========================
def process_files():
    """Verarbeitet alle Dateien im Input-Ordner, mappt Begriffe und speichert Ergebnisse"""
    overall_start = time.time()
    try:
        # Normvokabular laden
        norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    except Exception as e:
        logger.error("Normvokabular konnte nicht geladen werden. Beende.")
        raise

    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
        raise SystemExit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        logger.info("Keine Dateien gefunden")
        return

    logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
    for file_idx, file_path in enumerate(files, start=1):
        if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
            logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
            continue
        logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
        file_start = time.time()
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
        except Exception as e:
            logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        # Spalten identifizieren
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
            continue

        # Begriffe extrahieren
        row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            try:
                besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
            except Exception:
                besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch:
                continue
            obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
            urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))
            if (r_idx % 200) == 0:
                logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")

        # Alle einzigartigen Terme für API-Abfragen
        all_terms = list({t for _,_,terms in row_terms_map for t in terms})
        logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
        total_unique_terms = len(all_terms)

        # API-Abfragen
        t0 = time.time()
        gnd_results = batch_query_gnd(all_terms)
        t1 = time.time()
        logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
        wd_results = batch_query_wikidata(all_terms)
        t2 = time.time()
        logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")

        # Build output rows
        output_rows = []
        processed_count = 0
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term,""),
                    "WD_Top1": wd_results.get(term,"")
                }
                output_rows.append(out_row)
                processed_count += 1
                if (processed_count % 200) == 0:
                    logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")

        # Save output
        out_df = pd.DataFrame(output_rows)
        out_file = OUTPUT_DIR / f"{file_path.stem}_mapped.xlsx"
        try:
            out_df.to_excel(out_file, index=False, engine="openpyxl")
            logger.info(f"Ergebnisse gespeichert: {out_file}")
            mark_norm_hits(out_file)
            export_missing_terms(out_df, out_file)
        except Exception as e:
            logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}")

    elapsed_total = time.time() - overall_start
    logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s")
    logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "")

    save_cache()
    logger.stop()

if __name__ == "__main__":
    process_files()