#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper – Version 1.4.2

This script normalizes terms from input files, maps them onto a central
controlled vocabulary (Normvokabular), and optionally performs API lookups
against GND and Wikidata. Results are written to Excel/ODS.
"""

from __future__ import annotations

import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime

# Optional libraries
try:
    from rapidfuzz import fuzz  # faster string similarity
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")  # German lemmatization
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# =========================
# Configuration & paths
# =========================
INPUT_DIR = Path("Input CSV")                    # input directory
OUTPUT_DIR = Path("Auswertung Ergebnisse")       # output directory
OUTPUT_DIR.mkdir(exist_ok=True)                  # create the directory if it does not exist
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")  # controlled-vocabulary file
CACHE_FILE = "api_cache.json"                    # cache for API responses

STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in",
             "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

CONF_THRESHOLD = 0.75          # threshold for suggestions
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}     # API availability
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# Logging parameters
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100           # number of log entries before a flush
LOG_FLUSH_INTERVAL = 5.0       # seconds between flushes
LOG_LEVEL = "DEBUG"            # logging level
""" def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"): self.logfile = logfile self.flush_interval = flush_interval self.batch_size = batch_size self.level = level self.q = queue.Queue() self._stop_event = threading.Event() self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread") # Sicherstellen, dass die Log-Datei existiert try: logfile.parent.mkdir(parents=True, exist_ok=True) logfile.touch(exist_ok=True) except Exception: pass self._thread.start() def _format(self, level: str, msg: str) -> str: """Formatiert Logeinträge mit Timestamp""" ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") return f"{ts} - {level} - {msg}" def log(self, level: str, msg: str): """Fügt Log dem Queue hinzu und löst Flush aus, falls Batchgröße erreicht""" if self._stop_event.is_set(): return formatted = self._format(level, msg) self.q.put((level, formatted)) if self.q.qsize() >= self.batch_size: self.q.put(("__FLUSH__", "__FLUSH__")) def debug(self, msg: str): if LOG_LEVEL in ("DEBUG",): self.log("DEBUG", msg) def info(self, msg: str): self.log("INFO", msg) def warning(self, msg: str): self.log("WARNING", msg) def error(self, msg: str): self.log("ERROR", msg) def exception(self, msg: str): self.log("EXCEPTION", msg) def _worker(self): """Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch""" buffer = [] last_flush = time.time() while not self._stop_event.is_set() or not self.q.empty(): try: item = None try: item = self.q.get(timeout=self.flush_interval) except queue.Empty: if buffer: self._flush_buffer(buffer) buffer = [] last_flush = time.time() continue if item is None: continue level, formatted = item if level == "__FLUSH__": if buffer: self._flush_buffer(buffer) buffer = [] last_flush = time.time() continue buffer.append((level, formatted)) if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval: self._flush_buffer(buffer) buffer = [] last_flush = time.time() except Exception as e: try: sys.stderr.write(f"BatchLogger worker error: {e}\n") except Exception: pass time.sleep(0.5) if buffer: self._flush_buffer(buffer) def _flush_buffer(self, buffer): """Schreibt Puffer in Datei und Konsole""" if not buffer: return try: out_lines = [f"{line}\n" for _, line in buffer] try: sys.stdout.writelines(out_lines) sys.stdout.flush() except Exception: pass try: with open(self.logfile, "a", encoding="utf-8") as f: f.writelines(out_lines) except Exception as e: try: sys.stderr.write(f"BatchLogger file write error: {e}\n") except Exception: pass except Exception: pass def stop(self): """Stoppt Logger-Thread""" self._stop_event.set() try: self.q.put(("__FLUSH__", "__FLUSH__")) except Exception: pass self._thread.join(timeout=5.0) # Logger-Instanz erstellen logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL) logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)") # ========================= # Cache laden/speichern # ========================= if os.path.exists(CACHE_FILE): try: with open(CACHE_FILE,"r",encoding="utf-8") as f: CACHE = json.load(f) logger.debug(f"Cache geladen ({len(CACHE)} Einträge).") except Exception as e: logger.warning(f"Cache konnte nicht geladen werden: {e}") CACHE = {} else: CACHE = {} def save_cache(): """Speichert aktuellen Cache in JSON""" try: with open(CACHE_FILE,"w",encoding="utf-8") as f: json.dump(CACHE, f, indent=2, ensure_ascii=False) logger.debug("Cache gespeichert.") except Exception as e: 
logger.error(f"Cache konnte nicht gespeichert werden: {e}") # ========================= # Normalisierung / Lemma / Tokenization # ========================= def normalize_text(s): """Text in Kleinbuchstaben, Sonderzeichen entfernen, Trim""" if not s: return "" s = str(s).lower().strip() s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s) s = re.sub(r"\s+"," ",s) return s lemma_cache = {} def lemmatize_term(term): """Lemmatize mit spaCy, Cache für Performance""" term_norm = normalize_text(term) if term_norm in lemma_cache: return lemma_cache[term_norm] if SPACY_AVAILABLE and nlp: try: doc = nlp(term_norm) lemma = " ".join([token.lemma_ for token in doc]) except Exception: lemma = term_norm else: lemma = term_norm lemma_cache[term_norm] = lemma return lemma def compound_split(term): """Splittet Komposita nach -, _, / oder Leerzeichen""" if not term: return [] parts = [p for p in re.split(r"[\s\-_/]+", term) if p] return parts if parts else [term] # ========================= # Normvokabular laden & Index # ========================= def load_normvokabular(file_path): """Lädt Normvokabular aus Excel/ODS, erstellt Dictionarys für Mapping""" try: sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None) except Exception as e: logger.error(f"Normvokabular konnte nicht geladen werden: {e}") raise norm_dict = {} stem_index = defaultdict(list) lemma_norm_map = {} for sheet_name, df in sheets.items(): if sheet_name.lower() in ["master", "übersicht"]: continue # Übersichtsblätter ignorieren df = df.dropna(how="all", axis=1) df.columns = [str(c).strip() for c in df.columns] # ID- und Wort-Spalte finden id_col = next((c for c in df.columns if "ID" in c), None) word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None) if not id_col or not word_col: continue current_parent_id = None for _, row in df.iterrows(): row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None if row_id: current_parent_id = row_id if not row_word: continue assigned_parent_id = current_parent_id entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""} key = normalize_text(row_word) norm_dict[key] = entry lemma = lemmatize_term(key) stem_index[lemma].append(entry) if lemma not in lemma_norm_map: lemma_norm_map[lemma] = entry logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems") return norm_dict, stem_index, lemma_norm_map # ========================= # Mapping & Vorschläge # ========================= def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3): """ Mappt einen Begriff auf Normvokabular. Prüft exakte Treffer, Lemma-Treffer, Komposita und generiert Vorschläge. 
""" term_norm = normalize_text(term) term_lemma = lemmatize_term(term) if term_norm in norm_dict: e = norm_dict[term_norm] logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}") return e["Name"], e["ID"], [] if term_lemma in stem_index: e = stem_index[term_lemma][0] logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}") return e["Name"], e["ID"], [] tokens = compound_split(term_norm) if len(tokens) == 1: suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n) logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}") return "KEIN TREFFER", "", suggestions else: token_matches = [] for t in tokens: t_lemma = lemmatize_term(t) if t_lemma in stem_index: e = stem_index[t_lemma][0] token_matches.append((t, e["Name"], e["ID"])) else: sugg = get_suggestions(t_lemma, lemma_norm_map, top_n) token_matches.append((t, "KEIN TREFFER", "", sugg)) combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"] logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}") return "KEIN TREFFER", "", combined_suggestions def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD): """Ermittelt Vorschläge basierend auf Similarity""" candidates = [] for key_lemma, entry in lemma_norm_map.items(): if RAPIDFUZZ_AVAILABLE: score = fuzz.token_set_ratio(term_lemma, key_lemma)/100 else: score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio() if key_lemma.lower().startswith(term_lemma.lower()): score = min(score + 0.1, 1.0) if score >= threshold: candidates.append((score, entry["Name"], entry["ID"])) candidates.sort(reverse=True) return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]] # ========================= # Generic request with retries & caching # ========================= def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT): """ Sendet GET-Requests mit Retry-Logik, Backoff und Caching """ cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "") if cache_key in CACHE: logger.debug(f"[Cache] {api_name}: {cache_key}") return CACHE[cache_key] retries = 0 while retries < max_retries: try: r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout) if r.status_code == 200: try: data = r.json() except Exception: data = r.text CACHE[cache_key] = data FAIL_COUNTER[api_name] = 0 logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}") return data else: logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}") raise ValueError(f"HTTP {r.status_code}") except Exception as e: retries += 1 wait = backoff ** retries logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. 
Warte {wait}s") time.sleep(wait) FAIL_COUNTER[api_name] += 1 if FAIL_COUNTER[api_name] >= 10: API_ACTIVE[api_name] = False logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.") return None # ========================= # GND / Wikidata Batch Queries # ========================= def batch_query_gnd(terms): """Batch-Abfrage der Begriffe bei GND""" results = {} if not API_ACTIVE.get("gnd", False): for t in terms: results[t] = "" return results logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme") start = time.time() for idx, t in enumerate(terms, start=1): logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'") url = "https://lobid.org/gnd/search" params = {"q": t, "format": "json"} data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT) top = "" try: if data and "member" in data: cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")] cands = [c for c in cands if c[1] >= 0.75] if cands: top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0] except Exception as e: logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}") results[t] = top elapsed = time.time() - start logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s") return results def batch_query_wikidata(terms): """Batch-Abfrage der Begriffe bei Wikidata""" results = {} if not API_ACTIVE.get("wikidata", False): for t in terms: results[t] = "" return results logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme") start = time.time() for idx, t in enumerate(terms, start=1): logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'") url = "https://www.wikidata.org/w/api.php" params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"} data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT) top = "" try: if data and "search" in data: # Ermittlung der Kandidaten mit Ähnlichkeitsbewertung cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")] # Filterung nach Mindestähnlichkeit (0.70) cands = [c for c in cands if c[1] >= 0.70] if cands: # Bestes Ergebnis nach Ähnlichkeit auswählen top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0] except Exception as e: logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}") results[t] = top elapsed = time.time() - start logger.info(f"[WD] Fertig. 
Dauer: {elapsed:.1f}s") return results # ========================= # Markierung / Export (Excel/ODS) # ========================= def mark_norm_hits(file_path): """ Markiert Treffer in Excel/ODS farblich: Grün = Treffer, Rot = KEIN TREFFER """ ext = file_path.suffix.lower() try: if ext in [".xlsx", ".xls"]: from openpyxl import load_workbook from openpyxl.styles import PatternFill wb = load_workbook(file_path) ws = wb.active # Spaltenmapping anhand der Kopfzeile col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])} norm_col = col_map.get("Norm_Treffer", None) if not norm_col: logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).") wb.save(file_path) return # Farben definieren green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col): cell = row[0] if cell.value and cell.value != "KEIN TREFFER": cell.fill = green_fill else: cell.fill = red_fill wb.save(file_path) elif ext == ".ods": # ODS: kein Zell-Fill, stattdessen Status-Spalte df = pd.read_excel(file_path, engine="odf") df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer") df.to_excel(file_path, index=False, engine="odf") except Exception as e: logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}") # ========================= # Fehlende Begriffe -> separate Datei # ========================= def export_missing_terms(out_df, output_file): """ Speichert Begriffe ohne Treffer oder Vorschläge in separater Datei """ missing_df = out_df[ (out_df["Norm_Treffer"] == "KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == "")) ][["Begriff"]].drop_duplicates() count_missing = len(missing_df) logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}") if count_missing == 0: return ext = output_file.suffix.lower() base_name = output_file.stem missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}" version = 1 while missing_file.exists(): missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}" version += 1 try: if ext in [".xlsx", ".xls"]: missing_df.to_excel(missing_file, index=False, engine="openpyxl") elif ext == ".ods": missing_df.to_excel(missing_file, index=False, engine="odf") else: missing_df.to_csv(missing_file, index=False, sep=";") logger.info(f"Fehlende Begriffe gespeichert: {missing_file}") except Exception as e: logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}") # ========================= # Haupt-Loop: Verarbeitung Input-Dateien # ========================= def process_files(): """Verarbeitet alle Dateien im Input-Ordner, mappt Begriffe und speichert Ergebnisse""" overall_start = time.time() try: # Normvokabular laden norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE) except Exception as e: logger.error("Normvokabular konnte nicht geladen werden. 
Beende.") raise total_terms = 0 total_hits = 0 if not INPUT_DIR.exists(): logger.error(f"Eingabeordner {INPUT_DIR} fehlt") raise SystemExit(1) files = list(INPUT_DIR.glob("*")) if not files: logger.info("Keine Dateien gefunden") return logger.info(f"Starte Verarbeitung von {len(files)} Dateien") for file_idx, file_path in enumerate(files, start=1): if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]: logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}") continue logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}") file_start = time.time() try: if file_path.suffix.lower() == ".csv": df = pd.read_csv(file_path) else: df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None) except Exception as e: logger.error(f"Fehler beim Lesen von {file_path.name}: {e}") continue df = df.dropna(how="all") df.columns = [str(c).strip() for c in df.columns] # Spalten identifizieren besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None) box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None) urh_col = next((c for c in df.columns if "Urheber" in c), None) if not besch_col: logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.") continue # Begriffe extrahieren row_terms_map = [] for r_idx, row in enumerate(df.itertuples(index=False), start=1): try: besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else "" except Exception: besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else "" if not besch: continue obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else "" urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else "" clauses = [c.strip() for c in re.split(r",", besch) if c.strip()] terms = [] for clause in clauses: parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()] for p in parts: if p.lower() in STOPWORDS: continue if re.fullmatch(r"\d+", p): continue terms.append(p) row_terms_map.append((obj_box, urheber, terms)) if (r_idx % 200) == 0: logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet") # Alle einzigartigen Terme für API-Abfragen all_terms = list({t for _,_,terms in row_terms_map for t in terms}) logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}") total_unique_terms = len(all_terms) # API-Abfragen t0 = time.time() gnd_results = batch_query_gnd(all_terms) t1 = time.time() logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s") wd_results = batch_query_wikidata(all_terms) t2 = time.time() logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s") # Build output rows output_rows = [] processed_count = 0 for obj_box, urheber, terms in row_terms_map: for term in terms: norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map) total_terms += 1 if norm_name != "KEIN TREFFER": total_hits += 1 out_row = { "Box": obj_box, "Objekt/Ebene": obj_box, "Urheber": urheber, "Begriff": term, "Norm_Treffer": norm_name, "Norm_ID": norm_id, "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "", "GND_Top1": gnd_results.get(term,""), "WD_Top1": wd_results.get(term,"") } output_rows.append(out_row) processed_count += 1 if (processed_count % 200) == 0: logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet") # Save output out_df = pd.DataFrame(output_rows) out_file = OUTPUT_DIR / 
f"{file_path.stem}_mapped.xlsx" try: out_df.to_excel(out_file, index=False, engine="openpyxl") logger.info(f"Ergebnisse gespeichert: {out_file}") mark_norm_hits(out_file) export_missing_terms(out_df, out_file) except Exception as e: logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}") elapsed_total = time.time() - overall_start logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s") logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "") save_cache() logger.stop() if __name__ == "__main__": process_files()