""" ======================================================================== NormVokabular Mapper – Übersicht ======================================================================== Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer, gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional einen Abgleich mit externen APIs (GND, Wikidata). Hauptfunktionen: 1. **Input verarbeiten** - Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV". - Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung", filtert Stopwords und Zahlen. 2. **Normvokabular laden** - Liest die Masterdatei NV_MASTER.ods ein. - Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können. - Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen. 3. **Mapping auf Normvokabular** - Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt. - Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert. 4. **API-Abgleich (optional)** - Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln. - Nutzt einen Cache, um wiederholte Requests zu vermeiden. - Bietet einen Dry-Run-Modus für Tests ohne Internetzugang. 5. **Ergebnis speichern** - Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse". - Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel), bzw. fügt Statusspalte bei ODS-Dateien hinzu. - Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff, Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer. 6. **Logging** - Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler. 
**Nutzung:** ```bash python normvokabular_mapper.py python normvokabular_mapper.py --dry-run # nur Simulation der API-Abfragen """ import os import sys import time import json import re import requests import pandas as pd from pathlib import Path from difflib import SequenceMatcher import argparse from collections import defaultdict # ========================= # Argumente / Dry-Run # ========================= parser = argparse.ArgumentParser() parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren') args = parser.parse_args() DRY_RUN = args.dry_run # ========================= # Konfiguration # ========================= INPUT_DIR = Path("Input CSV") OUTPUT_DIR = Path("Auswertung Ergebnisse") OUTPUT_DIR.mkdir(exist_ok=True) NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") TIMEOUT = 5 MAX_RETRIES = 3 BACKOFF_FACTOR = 2 CACHE_FILE = "api_cache.json" STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"} API_ACTIVE = {"gnd": True, "wikidata": True} FAIL_COUNTER = {"gnd":0, "wikidata":0} HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"} CONF_THRESHOLD = 0.75 # für Vorschläge # ========================= # Logging # ========================= def log(level, msg): ts = time.strftime("%Y-%m-%d %H:%M:%S") print(f"[{ts}] [{level}] {msg}") # ========================= # Cache laden / speichern # ========================= if os.path.exists(CACHE_FILE): try: with open(CACHE_FILE,"r",encoding="utf-8") as f: CACHE = json.load(f) log("INFO", f"Cache geladen: {CACHE_FILE}") except: CACHE = {} else: CACHE = {} def save_cache(): try: with open(CACHE_FILE,"w",encoding="utf-8") as f: json.dump(CACHE, f, indent=2, ensure_ascii=False) log("DEBUG","Cache gespeichert") except Exception as e: log("ERROR", f"Cache speichern fehlgeschlagen: {e}") # ========================= # Normalisierung / Stemming # ========================= try: 
from nltk.stem.snowball import GermanStemmer STEMMER = GermanStemmer() log("INFO","NLTK GermanStemmer verfügbar") except: STEMMER = None log("WARNING","NLTK nicht verfügbar, naive Pluralreduktion wird genutzt") def normalize_text(s): if s is None: return "" s = str(s).lower().strip() s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s) s = re.sub(r"\s+"," ",s) return s def naive_stem(w): for ending in ("ern","nen","en","er","e","n","s"): if w.endswith(ending) and len(w)-len(ending)>=3: return w[:-len(ending)] return w def stem_word(word): w = normalize_text(word) try: return STEMMER.stem(w) if STEMMER else naive_stem(w) except: return naive_stem(w) from collections import defaultdict from difflib import SequenceMatcher CONF_THRESHOLD = 0.75 # Confidence für Vorschläge # ========================= # Normvokabular laden (NV_MASTER) mit Parent-ID & Stem-Index # ========================= def load_normvokabular(file_path): import pandas as pd import re log("INFO", f"Normvokabular laden: {file_path}") engine = "odf" if file_path.suffix.lower() == ".ods" else None sheets = pd.read_excel(file_path, sheet_name=None, engine=engine) norm_dict = {} stem_index = defaultdict(list) count = 0 for sheet_name, df in sheets.items(): df.columns = [str(c).strip() for c in df.columns] current_parent_id = None for _, row in df.iterrows(): # Spaltennamen flexibel anpassen id_val = str(row.get("ID","")).strip() if "ID" in df.columns else "" wort = str(row.get("Wort/Vokabel","")).strip() if "Wort/Vokabel" in df.columns else "" # Zeilen mit ID, aber ohne Vokabel → Update Parent-ID if id_val: current_parent_id = id_val # Skip leere Vokabeln if not wort: continue assigned_id = current_parent_id # Parent-ID übernehmen key = normalize_text(wort) entry = { "Name": wort, "ID": assigned_id, "Sheet": sheet_name } norm_dict[key] = entry stem_index[stem_word(key)].append(entry) count += 1 log("INFO", f"{count} Begriffe aus Normvokabular geladen") return norm_dict, stem_index # ========================= # 
# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index):
    """Map *term* onto the controlled vocabulary.

    Returns (norm_name, norm_id, suggestions). On a miss, norm_name is
    the literal "KEIN TREFFER" and suggestions holds similar entries.
    """
    tnorm = normalize_text(term)
    tstem = stem_word(tnorm)

    # Exact match on the normalized form.
    if tnorm in norm_dict:
        e = norm_dict[tnorm]
        return e["Name"], e["ID"], []

    # Stemmed match; on collisions the first indexed entry wins.
    if tstem in stem_index:
        e = stem_index[tstem][0]
        return e["Name"], e["ID"], []

    # No match -> offer similarity-based suggestions.
    return "KEIN TREFFER", "", get_suggestions(tnorm, norm_dict)


def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    """Return up to *top_n* "Name (ID)" strings with similarity >= threshold."""
    t = term.lower()
    scores = []
    for key, val in norm_dict.items():
        score = SequenceMatcher(None, t, key).ratio()
        if score >= threshold:
            scores.append((score, val["Name"], val["ID"]))
    scores.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]


# =========================
# API lookup (top-1)
# =========================
def request_with_retries(api_name, url, params=None):
    """GET *url* as JSON with caching, retries and exponential backoff.

    NOTE(review): the body of this function was garbled in the source;
    it has been reconstructed from the surrounding configuration
    (MAX_RETRIES, BACKOFF_FACTOR, FAIL_COUNTER, API_ACTIVE, CACHE) -
    confirm against the original implementation.

    Returns the parsed JSON payload, or None on dry run / failure.
    After 10 exhausted-retry failures the API is disabled for this run.
    """
    if DRY_RUN:
        return None
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            data = r.json()
            CACHE[cache_key] = data
            return data
        except Exception as e:
            retries += 1
            log("WARNING", f"{api_name}: Request fehlgeschlagen ({e}), Versuch {retries}/{MAX_RETRIES}")
            time.sleep(BACKOFF_FACTOR ** retries)

    # All retries exhausted: count the failure and possibly disable the API.
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None


def compute_min_conf(term, api_name):
    """Minimum similarity an API candidate must reach.

    Short terms need a stricter threshold; GND is held to a slightly
    higher bar than Wikidata.
    """
    length = len(term.strip())
    if length <= 3:
        return 0.90
    if length <= 6:
        return 0.85 if api_name == 'gnd' else 0.80
    return 0.75 if api_name == 'gnd' else 0.70


def _top_label(term, labels, api_name):
    """Return the best-matching label above the per-term confidence floor, or ""."""
    min_conf = compute_min_conf(term, api_name)
    tl = term.lower()
    top, top_conf = "", -1.0
    for label in labels:
        if not label:
            continue
        conf = SequenceMatcher(None, tl, label.lower()).ratio()
        # Strict '>' keeps the first candidate on ties, matching the
        # stable sort of the original implementation.
        if conf >= min_conf and conf > top_conf:
            top, top_conf = label, conf
    return top


def batch_query_gnd(terms):
    """Query lobid-GND for each term; return {term: top-1 name or ""}."""
    if DRY_RUN or not API_ACTIVE.get("gnd", False):
        return {t: "TEST_GND" for t in terms}
    results = {}
    for t in terms:
        data = request_with_retries("gnd", "https://lobid.org/gnd/search",
                                    {"q": t, "format": "json"})
        labels = []
        if data and "member" in data:
            labels = [doc.get("preferredName", "") or doc.get("name", "")
                      for doc in data["member"]]
        results[t] = _top_label(t, labels, "gnd")
    return results


def batch_query_wikidata(terms):
    """Query Wikidata entity search for each term; return {term: top-1 label or ""}."""
    if DRY_RUN or not API_ACTIVE.get("wikidata", False):
        return {t: "TEST_WD" for t in terms}
    results = {}
    for t in terms:
        data = request_with_retries(
            "wikidata", "https://www.wikidata.org/w/api.php",
            {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"})
        labels = []
        if data and "search" in data:
            labels = [e.get("label", "") for e in data["search"]]
        results[t] = _top_label(t, labels, "wikidata")
    return results


# =========================
# Format-dependent marking / status
# =========================
def mark_norm_hits(file_path):
    """Visually mark hits in the result file, depending on its format.

    Excel: fill the "Norm_Treffer" column green (hit) or red (miss).
    ODS: append a "Norm_Status" column instead. Other formats: warn only.
    """
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        # Map header names (row 1) to 1-based column indices.
        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            log("WARNING", "Spalte 'Norm_Treffer' nicht gefunden, keine Markierung möglich")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
        log("INFO", "Excel: Treffer farblich markiert (grün=Treffer, rot=kein Treffer)")
    elif ext == ".ods":
        # ODS has no cell styling via pandas; append a status column instead.
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER"
            else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")
        log("INFO", "ODS: Spalte 'Norm_Status' eingefügt (Treffer / Kein Treffer)")
    else:
        log("WARNING", "Unbekanntes Dateiformat, keine Markierung durchgeführt")


# =========================
# Process input files
# =========================
def process_files():
    """Run the full pipeline over every supported file in INPUT_DIR.

    Reads each input file, extracts terms from "Objektbeschreibung",
    maps them onto the controlled vocabulary, queries GND/Wikidata,
    and writes one versioned result file per input into OUTPUT_DIR.
    """
    norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_norm_hits = 0

    if not INPUT_DIR.exists():
        log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)

    files = list(INPUT_DIR.glob("*"))
    if not files:
        log("WARNING", "Keine Dateien gefunden")

    for file_path in files:
        if not file_path.suffix.lower() in [".ods", ".xlsx", ".csv", ".xls"]:
            continue
        log("INFO", f"Verarbeite Datei: {file_path.name}")

        # Pick a result file name; never overwrite an existing evaluation.
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            elif file_path.suffix.lower() == ".ods":
                df = pd.read_excel(file_path, engine="odf")
            else:
                df = pd.read_excel(file_path)
        except Exception as e:
            log("ERROR", f"Datei {file_path.name} konnte nicht gelesen werden: {e}")
            continue

        df.columns = [str(c).strip() for c in df.columns]

        # Extract candidate terms per row from "Objektbeschreibung":
        # split on commas into clauses, then on whitespace into words,
        # dropping stopwords and pure numbers.
        row_terms_map = []
        for _, row in df.iterrows():
            besch = row.get("Objektbeschreibung", "")
            if pd.isna(besch) or not str(besch).strip():
                continue
            besch = str(besch).strip()
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                for p in (w.strip() for w in re.split(r"\s+", clause) if w.strip()):
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            obj_box = row.get("Objekt/Ebene", "")
            urheber = row.get("Urheber", "")
            row_terms_map.append((obj_box, urheber, terms))

        # Query APIs once per unique term.
        all_terms = []
        for _, _, terms in row_terms_map:
            all_terms.extend(terms)
        all_terms = list(set(all_terms))
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)

        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_norm_hits += 1
                output_rows.append({
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, "")
                })

        out_df = pd.DataFrame(output_rows)
        # to_excel rejects a .csv extension, so CSV inputs must be saved
        # via to_csv (previously this raised for every CSV input file).
        if output_file.suffix.lower() == ".csv":
            out_df.to_csv(output_file, index=False)
        else:
            engine = "odf" if output_file.suffix.lower() == ".ods" else None
            out_df.to_excel(output_file, index=False, engine=engine)
        log("INFO", f"Auswertung gespeichert: {output_file}")
        mark_norm_hits(output_file)

    save_cache()
    log("INFO", f"Gesamt: {total_terms} Begriffe, {total_norm_hits} Treffer im Normvokabular")


# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    log("INFO", "Fertig")