#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper, version 1.4.1
- Detailed (DEBUG) batch logging: buffered log entries are periodically written to console + file
- Getty AAT (SPARQL via requests), API-polite with timeout/retries/backoff
- Fault tolerance: API outages do not bring the whole run down
- Missing terms -> separate file (same format as the regular output)
- The existing normalization/lemmatization/stemming is reused
- Batch-logging mode (configurable)
"""
from __future__ import annotations
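# Minimal usage sketch (assumption: the folder names below match the defaults configured
# further down; adjust INPUT_DIR / OUTPUT_DIR / NORMVOC_FILE if your layout differs):
#   Input CSV/                                     <- CSV/ODS/XLS(X) files to be mapped
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods   <- controlled vocabulary (one sheet per topic)
#   Auswertung Ergebnisse/                         <- results, missing-term files and mapper_log.txt
# Run with:  python NormVokabular_Mapper_1.4.py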
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime
# Optional libs
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
# =========================
# Config & paths
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0}
# Logging file
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
# Batch logging parameters
LOG_BATCH_SIZE = 100 # flush once this many entries are buffered
LOG_FLUSH_INTERVAL = 5.0 # seconds between flushes (batch logging)
LOG_LEVEL = "DEBUG" # verbose logging requested
# =========================
# Buffered/Batched Logger
# =========================
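# How the batched logger works: callers enqueue formatted records via log()/debug()/info()/...,
# and a daemon worker thread drains the queue, writing the buffered lines to stdout and to the
# log file once batch_size entries have accumulated or flush_interval seconds have passed.
# A "__FLUSH__" sentinel is enqueued to force an early flush (also used by stop()).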
class BatchLogger:
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
self.logfile = logfile
self.flush_interval = flush_interval
self.batch_size = batch_size
self.level = level
self.q = queue.Queue()
self._stop_event = threading.Event()
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
# Ensure logfile exists
try:
logfile.parent.mkdir(parents=True, exist_ok=True)
logfile.touch(exist_ok=True)
except Exception:
pass
self._thread.start()
def _format(self, level: str, msg: str) -> str:
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"{ts} - {level} - {msg}"
def log(self, level: str, msg: str):
if self._stop_event.is_set():
return
formatted = self._format(level, msg)
self.q.put((level, formatted))
# If queue too big, trigger immediate flush by putting a special token
if self.q.qsize() >= self.batch_size:
self.q.put(("__FLUSH__", "__FLUSH__"))
    def debug(self, msg: str):
        # honour the level passed to the constructor instead of the module-level constant
        if self.level == "DEBUG":
            self.log("DEBUG", msg)
def info(self, msg: str):
self.log("INFO", msg)
def warning(self, msg: str):
self.log("WARNING", msg)
def error(self, msg: str):
self.log("ERROR", msg)
def exception(self, msg: str):
self.log("EXCEPTION", msg)
def _worker(self):
buffer = []
last_flush = time.time()
while not self._stop_event.is_set() or not self.q.empty():
try:
item = None
try:
item = self.q.get(timeout=self.flush_interval)
except queue.Empty:
# time-based flush
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
if item is None:
continue
level, formatted = item
if level == "__FLUSH__":
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
buffer.append((level, formatted))
# flush conditions
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
except Exception as e:
# As a last resort, write error immediately to stderr
try:
sys.stderr.write(f"BatchLogger worker error: {e}\n")
except Exception:
pass
time.sleep(0.5)
# final flush
if buffer:
self._flush_buffer(buffer)
def _flush_buffer(self, buffer):
if not buffer:
return
# write to console and file
try:
# console
out_lines = [f"{line}\n" for _, line in buffer]
# write to stdout
try:
sys.stdout.writelines(out_lines)
sys.stdout.flush()
except Exception:
pass
# append to file
try:
with open(self.logfile, "a", encoding="utf-8") as f:
f.writelines(out_lines)
except Exception as e:
try:
sys.stderr.write(f"BatchLogger file write error: {e}\n")
except Exception:
pass
except Exception:
pass
def stop(self):
self._stop_event.set()
# put sentinel to wake worker
try:
self.q.put(("__FLUSH__", "__FLUSH__"))
except Exception:
pass
self._thread.join(timeout=5.0)
# Instantiate logger
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)")
# =========================
# Load / save cache
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
except Exception as e:
logger.warning(f"Cache konnte nicht geladen werden: {e}")
CACHE = {}
else:
CACHE = {}
def save_cache():
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
logger.debug("Cache gespeichert.")
except Exception as e:
logger.error(f"Cache konnte nicht gespeichert werden: {e}")
# =========================
# Normalization / lemmatization / tokenization
# =========================
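# Text preparation pipeline: normalize_text() lowercases, strips punctuation and collapses
# whitespace; lemmatize_term() additionally lemmatizes via spaCy (de_core_news_sm) when it is
# available, falls back to the normalized form otherwise, and memoizes results in lemma_cache;
# compound_split() splits a term on whitespace, hyphens, underscores and slashes.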
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
if not term:
return []
parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
return parts if parts else [term]
# =========================
# Load Normvokabular & build index
# =========================
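# Expected NV_MASTER layout (as read below): one sheet per category ("master"/"übersicht"
# sheets are skipped), with an ID column and a term column whose header contains "Wort",
# "Vokabel" or "Begriff". Rows without their own ID inherit the most recently seen parent ID.
# Returns (norm_dict, stem_index, lemma_norm_map) keyed by normalized text resp. lemma.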
def load_normvokabular(file_path):
try:
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
raise
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & suggestions
# =========================
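# Matching cascade: 1) exact match on the normalized term, 2) match on the lemma, 3) for
# compounds, a per-token lemma lookup, otherwise 4) fuzzy suggestions from get_suggestions()
# (rapidfuzz token_set_ratio if installed, difflib's SequenceMatcher otherwise), kept only
# above CONF_THRESHOLD and boosted slightly for prefix matches.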
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_norm in norm_dict:
e = norm_dict[term_norm]
logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
tokens = compound_split(term_norm)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Generic request with retries & caching
# =========================
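# Shared HTTP helper: responses are cached in CACHE keyed by URL + sorted params, failed
# requests are retried with exponential backoff (backoff ** retry seconds), and once the
# retries are exhausted the per-API FAIL_COUNTER is increased; ten exhausted requests in a
# row without a success disable that API via API_ACTIVE.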
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
if cache_key in CACHE:
logger.debug(f"[Cache] {api_name}: {cache_key}")
return CACHE[cache_key]
retries = 0
while retries < max_retries:
try:
r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
if r.status_code == 200:
try:
data = r.json()
except Exception:
data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
return data
else:
logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
raise ValueError(f"HTTP {r.status_code}")
except Exception as e:
retries += 1
wait = backoff ** retries
logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
time.sleep(wait)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
return None
# =========================
# GND / Wikidata (existing)
# =========================
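# GND via the lobid.org search API and Wikidata via wbsearchentities: each unique term is
# queried individually, candidate labels are scored with SequenceMatcher against the term,
# and the best label above the similarity cut-off (0.75 for GND, 0.70 for Wikidata) is kept
# as the Top-1 value.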
def batch_query_gnd(terms):
results = {}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://lobid.org/gnd/search"
params = {"q": t, "format": "json"}
data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""),
SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1] >= 0.75]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
return results
def batch_query_wikidata(terms):
results = {}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://www.wikidata.org/w/api.php"
params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1] >= 0.70]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
return results
# =========================
# Getty AAT query, robust & API-polite (requests)
# =========================
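# For each term, its tokens are OR-ed into a single SPARQL query against the Getty endpoint
# https://vocab.getty.edu/sparql. With the skos prefix declared as in the code below, the
# query has roughly this shape (illustration for the hypothetical tokens "holz" and "schale"):
#   SELECT ?label ?concept WHERE {
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("holz"))) }
#     UNION
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("schale"))) }
#   } LIMIT 10
# The best German prefLabel (by SequenceMatcher similarity to the lemmatized term) is kept.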
def batch_query_getty_aat(terms):
results = {}
if not API_ACTIVE.get("aat", False):
for t in terms: results[t] = ""
return results
endpoint = "https://vocab.getty.edu/sparql"
headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
TIMEOUT = 8
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
FAIL_LIMIT = 5
fail_counter_local = 0
logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
start_all = time.time()
for idx, term in enumerate(terms, start=1):
term_norm = lemmatize_term(normalize_text(term))
tokens = compound_split(term_norm)
logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")
query_fragments = []
for tkn in tokens:
t_escaped = tkn.replace('"', '\\"')
qf = f"""
?concept skos:prefLabel ?label .
FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
"""
query_fragments.append(f"{{ {qf} }}")
query_body = " UNION ".join(query_fragments) if query_fragments else ""
query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"
retries = 0
success = False
start_term = time.time()
while retries < MAX_RETRIES and not success:
try:
logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
if r.status_code != 200:
raise ValueError(f"HTTP {r.status_code}")
ret = r.json()
candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
if candidates:
scored = [
(c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
for c in candidates
]
top = max(scored, key=lambda x: x[2])
results[term] = top[0]
logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
else:
results[term] = ""
logger.debug(f"[AAT] Kein Treffer für '{term}'")
success = True
except Exception as e:
retries += 1
wait = BACKOFF_FACTOR ** retries
logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} warte {wait}s")
time.sleep(wait)
if retries == MAX_RETRIES:
results[term] = ""
fail_counter_local += 1
# polite delay
time.sleep(1.0)
elapsed_term = time.time() - start_term
logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")
if fail_counter_local >= FAIL_LIMIT:
logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
for t_rem in terms[idx:]:
results[t_rem] = ""
FAIL_COUNTER["aat"] += fail_counter_local
API_ACTIVE["aat"] = False
break
elapsed_all = time.time() - start_all
logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
return results
# =========================
# Highlighting / export (Excel/ODS)
# =========================
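# Result highlighting: for .xlsx/.xls the "Norm_Treffer" column is coloured via openpyxl
# (green fill for hits, red fill for "KEIN TREFFER" or empty cells); for .ods no cell styling
# is applied and a textual "Norm_Status" column is appended instead.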
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
try:
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
wb.save(file_path)
return
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
except Exception as e:
logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
# =========================
# Missing terms -> separate file
# =========================
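# Terms with neither a Normvokabular hit nor any suggestion are exported (deduplicated) to a
# "*_fehlende_Begriffe" file in the same format as the main output, with a version suffix
# appended if the file already exists.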
def export_missing_terms(out_df, output_file):
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
try:
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
missing_df.to_csv(missing_file, index=False, sep=";")
logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
except Exception as e:
logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")
# =========================
# Main loop: processing of the input files
# =========================
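# Per-file pipeline: read the table, split each "Objektbeschreibung" into candidate terms
# (comma-separated clauses, then whitespace tokens, dropping STOPWORDS and pure numbers),
# query GND/Wikidata/AAT for the unique terms, map every term to the Normvokabular, write a
# versioned "Auswertung_..." output file, export the missing terms and highlight the hits.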
def process_files():
overall_start = time.time()
try:
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
except Exception as e:
logger.error("Normvokabular konnte nicht geladen werden. Beende.")
raise
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
raise SystemExit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
logger.info("Keine Dateien gefunden")
return
logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
for file_idx, file_path in enumerate(files, start=1):
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
continue
logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
file_start = time.time()
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col:
logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
continue
row_terms_map = []
for r_idx, row in enumerate(df.itertuples(index=False), start=1):
try:
besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
            except Exception:
                # a string index on a namedtuple would raise again, so fall back to an empty description
                besch = ""
if not besch:
continue
obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
if (r_idx % 200) == 0:
logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
total_unique_terms = len(all_terms)
        # API queries
t0 = time.time()
gnd_results = batch_query_gnd(all_terms)
t1 = time.time()
logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
wd_results = batch_query_wikidata(all_terms)
t2 = time.time()
logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t:"" for t in all_terms}
t3 = time.time()
logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")
# Build output rows
output_rows = []
processed_count = 0
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,""),
"AAT_Top1": aat_results.get(term,"")
}
output_rows.append(out_row)
processed_count += 1
if (processed_count % 200) == 0:
                    logger.debug(f"[{file_path.name}] {processed_count} Begriffe verarbeitet")
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
try:
out_df.to_excel(output_file, index=False, engine=engine)
logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
except Exception as e:
logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
continue
export_missing_terms(out_df, output_file)
mark_norm_hits(output_file)
file_elapsed = time.time() - file_start
logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")
overall_elapsed = time.time() - overall_start
logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")
# =========================
# Main
# =========================
if __name__ == "__main__":
try:
process_files()
except KeyboardInterrupt:
logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
except SystemExit:
logger.warning("SystemExit aufgetreten.")
except Exception as e:
logger.exception(f"Ungefangener Fehler: {e}")
finally:
# Stop logger (flush remaining logs)
try:
save_cache()
except Exception:
pass
try:
logger.info("Beende.")
logger.stop()
except Exception:
pass