#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper – Version 1.4.1
- Detailed (DEBUG) batch logging: buffered log records are written periodically to console + file
- Getty AAT (SPARQL via requests) – API-polite, with timeout/retries/backoff
- Fault tolerance: API outages do not crash the whole run
- Unmatched terms -> separate file (same format as the output)
- Existing normalization/lemmatization/stemming is reused
- Batch-logging mode (configurable)
"""

from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime

# Optional libs
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
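
# Both dependencies above are optional: without rapidfuzz the fuzzy scoring falls back to
# difflib.SequenceMatcher, and without spaCy (or its "de_core_news_sm" model) lemmatization
# degrades to the plain normalized string.
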
# =========================
# Config & paths
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0}

# Logging file
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"

# Batch logging parameters
LOG_BATCH_SIZE = 100        # flush once at least this many entries are buffered
LOG_FLUSH_INTERVAL = 5.0    # seconds between flushes (batch logging)
LOG_LEVEL = "DEBUG"         # verbose logging requested
# =========================
# Buffered/Batched Logger
# =========================
class BatchLogger:
    def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
        self.logfile = logfile
        self.flush_interval = flush_interval
        self.batch_size = batch_size
        self.level = level
        self.q = queue.Queue()
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
        # Ensure logfile exists
        try:
            logfile.parent.mkdir(parents=True, exist_ok=True)
            logfile.touch(exist_ok=True)
        except Exception:
            pass
        self._thread.start()

    def _format(self, level: str, msg: str) -> str:
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{ts} - {level} - {msg}"

    def log(self, level: str, msg: str):
        if self._stop_event.is_set():
            return
        formatted = self._format(level, msg)
        self.q.put((level, formatted))
        # If the queue grows too large, trigger an immediate flush by enqueueing a special token
        if self.q.qsize() >= self.batch_size:
            self.q.put(("__FLUSH__", "__FLUSH__"))

    def debug(self, msg: str):
        # Use the level this instance was configured with (previously checked the global LOG_LEVEL)
        if self.level == "DEBUG":
            self.log("DEBUG", msg)

    def info(self, msg: str):
        self.log("INFO", msg)

    def warning(self, msg: str):
        self.log("WARNING", msg)

    def error(self, msg: str):
        self.log("ERROR", msg)

    def exception(self, msg: str):
        self.log("EXCEPTION", msg)

    def _worker(self):
        buffer = []
        last_flush = time.time()
        while not self._stop_event.is_set() or not self.q.empty():
            try:
                item = None
                try:
                    item = self.q.get(timeout=self.flush_interval)
                except queue.Empty:
                    # time-based flush
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                        last_flush = time.time()
                    continue

                if item is None:
                    continue
                level, formatted = item
                if level == "__FLUSH__":
                    if buffer:
                        self._flush_buffer(buffer)
                        buffer = []
                    last_flush = time.time()
                    continue
                buffer.append((level, formatted))

                # flush conditions
                if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
                    self._flush_buffer(buffer)
                    buffer = []
                    last_flush = time.time()
            except Exception as e:
                # As a last resort, write the error immediately to stderr
                try:
                    sys.stderr.write(f"BatchLogger worker error: {e}\n")
                except Exception:
                    pass
                time.sleep(0.5)
        # final flush
        if buffer:
            self._flush_buffer(buffer)

    def _flush_buffer(self, buffer):
        if not buffer:
            return
        # write to console and file
        try:
            # console
            out_lines = [f"{line}\n" for _, line in buffer]
            # write to stdout
            try:
                sys.stdout.writelines(out_lines)
                sys.stdout.flush()
            except Exception:
                pass
            # append to file
            try:
                with open(self.logfile, "a", encoding="utf-8") as f:
                    f.writelines(out_lines)
            except Exception as e:
                try:
                    sys.stderr.write(f"BatchLogger file write error: {e}\n")
                except Exception:
                    pass
        except Exception:
            pass

    def stop(self):
        self._stop_event.set()
        # put sentinel to wake worker
        try:
            self.q.put(("__FLUSH__", "__FLUSH__"))
        except Exception:
            pass
        self._thread.join(timeout=5.0)
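
# How the logger flushes: log() enqueues pre-formatted records; the daemon worker thread writes
# them to stdout and to LOG_FILE whenever batch_size entries have accumulated, flush_interval
# seconds have passed, or a "__FLUSH__" token arrives (also sent by stop(), which is called in the
# __main__ finally block so pending records are not lost on exit).
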
# Instantiate logger
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)")

# =========================
# Load/save cache
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
        logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
    except Exception as e:
        logger.warning(f"Cache konnte nicht geladen werden: {e}")
        CACHE = {}
else:
    CACHE = {}

def save_cache():
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        logger.debug("Cache gespeichert.")
    except Exception as e:
        logger.error(f"Cache konnte nicht gespeichert werden: {e}")
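
# Cache notes: entries are keyed by URL plus the JSON-serialized, sorted request parameters
# (see request_with_retries_generic), so identical API calls are answered from api_cache.json.
# save_cache() is only invoked once, in the finally block of the __main__ section.
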
# =========================
# Normalization / lemmatization / tokenization
# =========================
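# The three helpers below feed every lookup: normalize_text() lowercases and strips punctuation,
# lemmatize_term() memoizes spaCy lemmas in lemma_cache (falling back to the normalized string),
# and compound_split() breaks terms on whitespace, hyphens, underscores and slashes.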
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

def compound_split(term):
    if not term:
        return []
    parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
    return parts if parts else [term]

# =========================
# Load Normvokabular & build index
# =========================
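# load_normvokabular() reads every sheet of the master file except "Master"/"Übersicht". Rows that
# carry an ID start a new parent entry; subsequent rows without an ID inherit that parent ID
# (current_parent_id), so sub-terms resolve to their parent's ID. The function returns an
# exact-match dict, a lemma index, and a lemma->entry map used for fuzzy suggestions.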
def load_normvokabular(file_path):
    try:
        sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower() == ".ods" else None)
    except Exception as e:
        logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
        raise
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
        if not id_col or not word_col:
            continue
        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
    return norm_dict, stem_index, lemma_norm_map

# =========================
# Mapping & suggestions
# =========================
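# Matching cascade in map_to_norm(): exact normalized match -> lemma match -> (for multi-token
# terms) per-token lemma matches -> fuzzy suggestions. get_suggestions() scores candidates with
# rapidfuzz token_set_ratio (scaled to 0..1) or difflib, adds a 0.1 prefix bonus capped at 1.0,
# and keeps the top_n candidates at or above CONF_THRESHOLD.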
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
        return e["Name"], e["ID"], []

    tokens = compound_split(term_norm)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
        return "KEIN TREFFER", "", suggestions
    else:
        token_matches = []
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in stem_index:
                e = stem_index[t_lemma][0]
                token_matches.append((t, e["Name"], e["ID"]))
            else:
                sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
                token_matches.append((t, "KEIN TREFFER", "", sugg))
        combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
        logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
        return "KEIN TREFFER", "", combined_suggestions

def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

# =========================
# Generic request with retries & caching
# =========================
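# request_with_retries_generic() answers repeated calls from CACHE, retries failed requests with
# exponential backoff (backoff ** attempt seconds), and counts failed calls per API in
# FAIL_COUNTER (reset on any success); after 10 such failures the API is switched off in
# API_ACTIVE for the rest of the run.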
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
    cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
    if cache_key in CACHE:
        logger.debug(f"[Cache] {api_name}: {cache_key}")
        return CACHE[cache_key]
    retries = 0
    while retries < max_retries:
        try:
            r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
            if r.status_code == 200:
                try:
                    data = r.json()
                except Exception:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
                return data
            else:
                logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
                raise ValueError(f"HTTP {r.status_code}")
        except Exception as e:
            retries += 1
            wait = backoff ** retries
            logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
            time.sleep(wait)
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
        logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
    return None

# =========================
# GND / Wikidata (existing)
# =========================
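# Both lookups go through request_with_retries_generic: GND via the lobid.org search endpoint,
# Wikidata via the wbsearchentities API. For each term the best label is kept only if its
# SequenceMatcher similarity to the query reaches 0.75 (GND) or 0.70 (Wikidata).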
def batch_query_gnd(terms):
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
        return results
    logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "member" in data:
                cands = [(doc.get("preferredName", "") or doc.get("name", ""),
                          SequenceMatcher(None, t.lower(), (doc.get("preferredName", "") or doc.get("name", "")).lower()).ratio())
                         for doc in data["member"] if doc.get("preferredName", "") or doc.get("name", "")]
                cands = [c for c in cands if c[1] >= 0.75]
                if cands:
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
    return results

def batch_query_wikidata(terms):
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
        return results
    logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
    start = time.time()
    for idx, t in enumerate(terms, start=1):
        logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
        top = ""
        try:
            if data and "search" in data:
                cands = [(e.get("label", ""), SequenceMatcher(None, t.lower(), e.get("label", "").lower()).ratio())
                         for e in data["search"] if e.get("label", "")]
                cands = [c for c in cands if c[1] >= 0.70]
                if cands:
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
        results[t] = top
    elapsed = time.time() - start
    logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
    return results

# =========================
# Getty AAT query – robust & API-polite (requests)
# =========================
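# For each term a SPARQL query is built as a UNION of CONTAINS filters over German prefLabels,
# capped at 10 results; for a token like "kiste" the generated query looks roughly like this
# (illustrative, whitespace simplified):
#
#   PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
#   SELECT ?label ?concept WHERE {
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("kiste"))) }
#   } LIMIT 10
#
# A 1-second pause after every term keeps the load on vocab.getty.edu low, and after FAIL_LIMIT
# failed terms the remaining terms get empty results and AAT is disabled for the rest of the run.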
def batch_query_getty_aat(terms):
    results = {}
    if not API_ACTIVE.get("aat", False):
        for t in terms:
            results[t] = ""
        return results

    endpoint = "https://vocab.getty.edu/sparql"
    headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
    TIMEOUT = 8
    MAX_RETRIES = 3
    BACKOFF_FACTOR = 2
    FAIL_LIMIT = 5
    fail_counter_local = 0

    logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
    start_all = time.time()
    for idx, term in enumerate(terms, start=1):
        term_norm = lemmatize_term(normalize_text(term))
        tokens = compound_split(term_norm)
        logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")

        query_fragments = []
        for tkn in tokens:
            t_escaped = tkn.replace('"', '\\"')
            qf = f"""
            ?concept skos:prefLabel ?label .
            FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
            """
            query_fragments.append(f"{{ {qf} }}")
        query_body = " UNION ".join(query_fragments) if query_fragments else ""
        query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"

        retries = 0
        success = False
        start_term = time.time()
        while retries < MAX_RETRIES and not success:
            try:
                logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
                r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
                if r.status_code != 200:
                    raise ValueError(f"HTTP {r.status_code}")
                ret = r.json()
                candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
                if candidates:
                    scored = [
                        (c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
                        for c in candidates
                    ]
                    top = max(scored, key=lambda x: x[2])
                    results[term] = top[0]
                    logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
                else:
                    results[term] = ""
                    logger.debug(f"[AAT] Kein Treffer für '{term}'")
                success = True
            except Exception as e:
                retries += 1
                wait = BACKOFF_FACTOR ** retries
                logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} – warte {wait}s")
                time.sleep(wait)
                if retries == MAX_RETRIES:
                    results[term] = ""
                    fail_counter_local += 1
        # polite delay
        time.sleep(1.0)
        elapsed_term = time.time() - start_term
        logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")

        if fail_counter_local >= FAIL_LIMIT:
            logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
            for t_rem in terms[idx:]:
                results[t_rem] = ""
            FAIL_COUNTER["aat"] += fail_counter_local
            API_ACTIVE["aat"] = False
            break

    elapsed_all = time.time() - start_all
    logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
    return results

# =========================
# Highlighting / export (Excel/ODS)
# =========================
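# mark_norm_hits() colours the "Norm_Treffer" column of .xlsx/.xls outputs green (hit) or red
# (no hit) via openpyxl; .ods outputs instead get a textual "Norm_Status" column, since that file
# is round-tripped through pandas/odf rather than styled cell by cell.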
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    try:
        if ext in [".xlsx", ".xls"]:
            from openpyxl import load_workbook
            from openpyxl.styles import PatternFill
            wb = load_workbook(file_path)
            ws = wb.active
            col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
            norm_col = col_map.get("Norm_Treffer", None)
            if not norm_col:
                logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
                wb.save(file_path)
                return
            green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
            red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
            for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
                cell = row[0]
                if cell.value and cell.value != "KEIN TREFFER":
                    cell.fill = green_fill
                else:
                    cell.fill = red_fill
            wb.save(file_path)
        elif ext == ".ods":
            df = pd.read_excel(file_path, engine="odf")
            df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
            df.to_excel(file_path, index=False, engine="odf")
    except Exception as e:
        logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")

# =========================
# Missing terms -> separate file
# =========================
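# "Missing" means: no Norm_Treffer AND no Norm_Vorschlag. The terms are deduplicated and written
# next to the main output with a "_fehlende_Begriffe" suffix; a version counter is appended so an
# existing file is never overwritten.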
def export_missing_terms(out_df, output_file):
    missing_df = out_df[
        (out_df["Norm_Treffer"] == "KEIN TREFFER") &
        (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
    ][["Begriff"]].drop_duplicates()

    count_missing = len(missing_df)
    logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")

    if count_missing == 0:
        return

    ext = output_file.suffix.lower()
    base_name = output_file.stem
    missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
    version = 1
    while missing_file.exists():
        missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
        version += 1

    try:
        if ext in [".xlsx", ".xls"]:
            missing_df.to_excel(missing_file, index=False, engine="openpyxl")
        elif ext == ".ods":
            missing_df.to_excel(missing_file, index=False, engine="odf")
        else:
            missing_df.to_csv(missing_file, index=False, sep=";")
        logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
    except Exception as e:
        logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")

# =========================
# Main loop: process input files
# =========================
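# Per input file the pipeline is: read the table -> pull terms out of the "Objektbeschreibung"
# column (split on commas, then whitespace; drop STOPWORDS and pure numbers) -> query GND,
# Wikidata and Getty AAT once per unique term -> map every term occurrence against the
# Normvokabular -> write a versioned "Auswertung_..." file, export unresolved terms and
# highlight the hits.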
def process_files():
    overall_start = time.time()
    try:
        norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    except Exception:
        logger.error("Normvokabular konnte nicht geladen werden. Beende.")
        raise

    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
        raise SystemExit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        logger.info("Keine Dateien gefunden")
        return

    logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
    for file_idx, file_path in enumerate(files, start=1):
        if file_path.suffix.lower() not in [".csv", ".ods", ".xls", ".xlsx"]:
            logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
            continue
        logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
        file_start = time.time()
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower() == ".ods" else None)
        except Exception as e:
            logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
            continue

        row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            try:
                besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
            except Exception:
                # positional lookup failed for this row; skip its description
                besch = ""
            if not besch:
                continue
            obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
            urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))
            if (r_idx % 200) == 0:
                logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")

        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
        total_unique_terms = len(all_terms)
        # API queries
        t0 = time.time()
        gnd_results = batch_query_gnd(all_terms)
        t1 = time.time()
        logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
        wd_results = batch_query_wikidata(all_terms)
        t2 = time.time()
        logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
        aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t: "" for t in all_terms}
        t3 = time.time()
        logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")

        # Build output rows
        output_rows = []
        processed_count = 0
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, ""),
                    "AAT_Top1": aat_results.get(term, "")
                }
                output_rows.append(out_row)
                processed_count += 1
                if (processed_count % 200) == 0:
                    logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")

        out_df = pd.DataFrame(output_rows)
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1
        engine = "odf" if output_file.suffix.lower() == ".ods" else None

        try:
            if output_file.suffix.lower() == ".csv":
                # to_excel cannot write .csv; keep CSV inputs as CSV output
                out_df.to_csv(output_file, index=False, sep=";")
            else:
                out_df.to_excel(output_file, index=False, engine=engine)
            logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
        except Exception as e:
            logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
            continue

        export_missing_terms(out_df, output_file)
        mark_norm_hits(output_file)

        file_elapsed = time.time() - file_start
        logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")

    overall_elapsed = time.time() - overall_start
    logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")

# =========================
# Main
# =========================
if __name__ == "__main__":
    try:
        process_files()
    except KeyboardInterrupt:
        logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
    except SystemExit:
        logger.warning("SystemExit aufgetreten.")
    except Exception as e:
        logger.exception(f"Ungefangener Fehler: {e}")
    finally:
        # Persist the cache, then stop the logger (flushes remaining logs)
        try:
            save_cache()
        except Exception:
            pass
        try:
            logger.info("Beende.")
            logger.stop()
        except Exception:
            pass
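
# Expected working-directory layout (derived from the constants at the top of this file):
#   Input CSV/                                   <- source tables (.csv/.ods/.xls/.xlsx)
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods <- controlled vocabulary
#   Auswertung Ergebnisse/                       <- created automatically; results + mapper_log.txt
#   api_cache.json                               <- created/updated next to the script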