#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper Version 1.4.2
Dieses Skript normalisiert und mappt Begriffe aus Input-Dateien auf ein zentrales Normvokabular
und führt optional API-Abgleiche mit GND und Wikidata durch. Ergebnisse werden in Excel/ODS gespeichert.
"""
from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime
# Optional libraries
try:
    from rapidfuzz import fuzz  # faster string similarity
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")  # German lemmatization model
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
# =========================
# Configuration & paths
# =========================
INPUT_DIR = Path("Input CSV")  # input directory
OUTPUT_DIR = Path("Auswertung Ergebnisse")  # output directory
OUTPUT_DIR.mkdir(exist_ok=True)  # create the directory if it does not exist
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")  # controlled-vocabulary file
CACHE_FILE = "api_cache.json"  # cache for API responses
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75  # similarity threshold for suggestions
TIMEOUT_DEFAULT = 5  # request timeout in seconds
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}  # API availability flags
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Logging parameters
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100  # number of buffered log entries before a flush
LOG_FLUSH_INTERVAL = 5.0  # seconds between flushes
LOG_LEVEL = "DEBUG"  # logging level
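# Note: BatchLogger.debug() only emits output while the configured level is "DEBUG";
# any other LOG_LEVEL value suppresses debug messages.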
# =========================
# Batch/Buffered Logger
# =========================
class BatchLogger:
"""
Buffered Logger: Speichert Logs in einem Queue-Buffer und schreibt sie periodisch in Datei und Konsole.
Reduziert I/O-Aufwand bei vielen Logs.
"""
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
self.logfile = logfile
self.flush_interval = flush_interval
self.batch_size = batch_size
self.level = level
self.q = queue.Queue()
self._stop_event = threading.Event()
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
# Sicherstellen, dass die Log-Datei existiert
try:
logfile.parent.mkdir(parents=True, exist_ok=True)
logfile.touch(exist_ok=True)
except Exception:
pass
self._thread.start()
    def _format(self, level: str, msg: str) -> str:
        """Format a log entry with a timestamp."""
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{ts} - {level} - {msg}"
    def log(self, level: str, msg: str):
        """Add an entry to the queue; trigger a flush once the batch size is reached."""
        if self._stop_event.is_set():
            return
        formatted = self._format(level, msg)
        self.q.put((level, formatted))
        if self.q.qsize() >= self.batch_size:
            self.q.put(("__FLUSH__", "__FLUSH__"))
    def debug(self, msg: str):
        # use the level passed to the constructor instead of the module-level LOG_LEVEL
        if self.level == "DEBUG":
            self.log("DEBUG", msg)
def info(self, msg: str):
self.log("INFO", msg)
def warning(self, msg: str):
self.log("WARNING", msg)
def error(self, msg: str):
self.log("ERROR", msg)
def exception(self, msg: str):
self.log("EXCEPTION", msg)
def _worker(self):
"""Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch"""
buffer = []
last_flush = time.time()
while not self._stop_event.is_set() or not self.q.empty():
try:
item = None
try:
item = self.q.get(timeout=self.flush_interval)
except queue.Empty:
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
if item is None:
continue
level, formatted = item
if level == "__FLUSH__":
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
buffer.append((level, formatted))
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
except Exception as e:
try:
sys.stderr.write(f"BatchLogger worker error: {e}\n")
except Exception:
pass
time.sleep(0.5)
if buffer:
self._flush_buffer(buffer)
def _flush_buffer(self, buffer):
"""Schreibt Puffer in Datei und Konsole"""
if not buffer:
return
try:
out_lines = [f"{line}\n" for _, line in buffer]
try:
sys.stdout.writelines(out_lines)
sys.stdout.flush()
except Exception:
pass
try:
with open(self.logfile, "a", encoding="utf-8") as f:
f.writelines(out_lines)
except Exception as e:
try:
sys.stderr.write(f"BatchLogger file write error: {e}\n")
except Exception:
pass
except Exception:
pass
def stop(self):
"""Stoppt Logger-Thread"""
self._stop_event.set()
try:
self.q.put(("__FLUSH__", "__FLUSH__"))
except Exception:
pass
self._thread.join(timeout=5.0)
# Create the logger instance
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)")
# =========================
# Load / save cache
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
except Exception as e:
logger.warning(f"Cache konnte nicht geladen werden: {e}")
CACHE = {}
else:
CACHE = {}
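# Cache layout: a flat JSON object mapping "<url> + <params serialized as JSON>" keys to the
# parsed API response (or raw text); see request_with_retries_generic() below.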
def save_cache():
"""Speichert aktuellen Cache in JSON"""
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
logger.debug("Cache gespeichert.")
except Exception as e:
logger.error(f"Cache konnte nicht gespeichert werden: {e}")
# =========================
# Normalization / lemmatization / tokenization
# =========================
def normalize_text(s):
    """Lowercase the text, strip punctuation, and collapse whitespace."""
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
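# Example (illustrative): normalize_text(" Brief, (Entwurf) ") -> "brief entwurf"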
lemma_cache = {}
def lemmatize_term(term):
"""Lemmatize mit spaCy, Cache für Performance"""
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
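# Note: the term is normalized first; if spaCy/de_core_news_sm is unavailable or fails,
# the normalized form itself is used as the "lemma", so matching degrades gracefully.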
def compound_split(term):
"""Splittet Komposita nach -, _, / oder Leerzeichen"""
if not term:
return []
parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
return parts if parts else [term]
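# Example (illustrative): compound_split("Feder-Zeichnung/Entwurf") -> ["Feder", "Zeichnung", "Entwurf"]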
# =========================
# Load Normvokabular & build index
# =========================
def load_normvokabular(file_path):
    """Load the controlled vocabulary from Excel/ODS and build the lookup dictionaries used for mapping."""
try:
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
raise
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
            continue  # skip overview sheets
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
        # locate the ID and word columns
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
return norm_dict, stem_index, lemma_norm_map
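# Expected NV_MASTER.ods layout (sketch, inferred from the parsing above): one sheet per
# vocabulary group, each with an ID column and a word column ("Wort"/"Vokabel"/"Begriff");
# rows with an empty ID inherit the most recent ID above them. Sheets named "Master" or
# "Übersicht" are skipped.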
# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    """
    Map a term onto the controlled vocabulary.
    Checks exact matches, lemma matches and compound parts; otherwise returns suggestions.
    """
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_norm in norm_dict:
e = norm_dict[term_norm]
logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
tokens = compound_split(term_norm)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
"""Ermittelt Vorschläge basierend auf Similarity"""
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Generic request with retries & caching
# =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
"""
Sendet GET-Requests mit Retry-Logik, Backoff und Caching
"""
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
if cache_key in CACHE:
logger.debug(f"[Cache] {api_name}: {cache_key}")
return CACHE[cache_key]
retries = 0
while retries < max_retries:
try:
r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
if r.status_code == 200:
try:
data = r.json()
except Exception:
data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
return data
else:
logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
raise ValueError(f"HTTP {r.status_code}")
except Exception as e:
retries += 1
wait = backoff ** retries
logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
time.sleep(wait)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
return None
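# With the defaults (MAX_RETRIES_DEFAULT=3, BACKOFF_FACTOR_DEFAULT=2) a failing request waits
# 2s, 4s and 8s after the failed attempts. After 10 consecutive exhausted calls the API is
# deactivated via API_ACTIVE, and the batch functions below then return empty results for every term.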
# =========================
# GND / Wikidata Batch Queries
# =========================
def batch_query_gnd(terms):
"""Batch-Abfrage der Begriffe bei GND"""
results = {}
if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
return results
logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://lobid.org/gnd/search"
params = {"q": t, "format": "json"}
data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""),
SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1] >= 0.75]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
return results
def batch_query_wikidata(terms):
"""Batch-Abfrage der Begriffe bei Wikidata"""
results = {}
if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
return results
logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://www.wikidata.org/w/api.php"
params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "search" in data:
                # build candidate labels with a similarity score
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
for e in data["search"] if e.get("label","")]
                # keep only candidates above the minimum similarity (0.70)
cands = [c for c in cands if c[1] >= 0.70]
if cands:
                    # pick the best candidate by similarity
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
return results
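# Both batch functions return {term: best_label_or_""}. The Wikidata lookup uses the public
# wbsearchentities action, the GND lookup the lobid.org /gnd/search endpoint; the response
# fields assumed above are "search"/"label" and "member"/"preferredName"/"name" respectively.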
# =========================
# Marking / export (Excel/ODS)
# =========================
def mark_norm_hits(file_path):
    """
    Colour-code matches in Excel/ODS:
    green = match, red = KEIN TREFFER
    """
ext = file_path.suffix.lower()
try:
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
            # map column names from the header row
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
wb.save(file_path)
return
            # define fill colours
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext == ".ods":
            # ODS: no cell fill here; add a status column instead
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
except Exception as e:
logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
# =========================
# Missing terms -> separate file
# =========================
def export_missing_terms(out_df, output_file):
    """
    Save terms with neither a match nor a suggestion to a separate file.
    """
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
try:
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
missing_df.to_csv(missing_file, index=False, sep=";")
logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
except Exception as e:
logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")
# =========================
# Main loop: process input files
# =========================
def process_files():
    """Process all files in the input folder, map their terms and save the results."""
overall_start = time.time()
try:
        # load the controlled vocabulary
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
except Exception as e:
logger.error("Normvokabular konnte nicht geladen werden. Beende.")
raise
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
raise SystemExit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
logger.info("Keine Dateien gefunden")
return
logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
for file_idx, file_path in enumerate(files, start=1):
        if file_path.suffix.lower() not in [".csv",".ods",".xls",".xlsx"]:
logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
continue
logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
file_start = time.time()
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
        # identify the relevant columns
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col:
logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
continue
        # extract terms
row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            # positional access: itertuples() may rename columns, so name-based indexing is unreliable
            val = row[df.columns.get_loc(besch_col)]
            besch = str(val).strip() if pd.notna(val) else ""
if not besch:
continue
obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
if (r_idx % 200) == 0:
logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
        # collect all unique terms for the API queries
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
total_unique_terms = len(all_terms)
        # API queries
t0 = time.time()
gnd_results = batch_query_gnd(all_terms)
t1 = time.time()
logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
wd_results = batch_query_wikidata(all_terms)
t2 = time.time()
logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
# Build output rows
output_rows = []
processed_count = 0
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
processed_count += 1
if (processed_count % 200) == 0:
logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")
# Save output
out_df = pd.DataFrame(output_rows)
out_file = OUTPUT_DIR / f"{file_path.stem}_mapped.xlsx"
try:
out_df.to_excel(out_file, index=False, engine="openpyxl")
logger.info(f"Ergebnisse gespeichert: {out_file}")
mark_norm_hits(out_file)
export_missing_terms(out_df, out_file)
except Exception as e:
logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}")
elapsed_total = time.time() - overall_start
logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s")
logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "")
save_cache()
logger.stop()
if __name__ == "__main__":
process_files()
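# Expected working-directory layout (sketch, matching the paths configured above):
#   Input CSV/                                    <- input files (*.csv, *.ods, *.xls, *.xlsx)
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods  <- controlled vocabulary
#   Auswertung Ergebnisse/                        <- created automatically; receives <name>_mapped.xlsx,
#                                                    the "fehlende_Begriffe" files and mapper_log.txt
#   api_cache.json                                <- API cache, written to the working directory
# Run with:  python NormVokabular_Mapper_1.4.py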