""" ======================================================================== NormVokabular Mapper – Übersicht ======================================================================== Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer, gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional einen Abgleich mit externen APIs (GND, Wikidata). Hauptfunktionen: 1. **Input verarbeiten** - Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV". - Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung", filtert Stopwords und Zahlen. 2. **Normvokabular laden** - Liest die Masterdatei NV_MASTER.ods ein. - Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können. - Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen. 3. **Mapping auf Normvokabular** - Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt. - Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert. 4. **API-Abgleich (optional)** - Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln. - Nutzt einen Cache, um wiederholte Requests zu vermeiden. - Bietet einen Dry-Run-Modus für Tests ohne Internetzugang. 5. **Ergebnis speichern** - Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse". - Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel), bzw. fügt Statusspalte bei ODS-Dateien hinzu. - Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff, Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer. 6. **Logging** - Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler. """ import os import sys import re import time import json import pandas as pd import requests from pathlib import Path from collections import defaultdict from difflib import SequenceMatcher # RapidFuzz für Token-basierte Fuzzy-Suche try: from rapidfuzz import fuzz RAPIDFUZZ_AVAILABLE = True print("RapidFuzz verfügbar") except ImportError: RAPIDFUZZ_AVAILABLE = False print("RapidFuzz nicht verfügbar – nutze SequenceMatcher") # Spacy Lemmatizer try: import spacy nlp = spacy.load("de_core_news_sm") SPACY_AVAILABLE = True print("Spacy Lemmatizer aktiviert") except: SPACY_AVAILABLE = False nlp = None print("Spacy nicht verfügbar – nutze naive Stemmer") # ========================= # Pfade & Config # ========================= INPUT_DIR = Path("Input CSV") OUTPUT_DIR = Path("Auswertung Ergebnisse") OUTPUT_DIR.mkdir(exist_ok=True) NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") CACHE_FILE = "api_cache.json" STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"} CONF_THRESHOLD = 0.75 TIMEOUT = 5 MAX_RETRIES = 3 BACKOFF_FACTOR = 2 HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"} API_ACTIVE = {"gnd": True, "wikidata": True} FAIL_COUNTER = {"gnd": 0, "wikidata": 0} # Cache if os.path.exists(CACHE_FILE): with open(CACHE_FILE,"r",encoding="utf-8") as f: CACHE = json.load(f) else: CACHE = {} def save_cache(): with open(CACHE_FILE,"w",encoding="utf-8") as f: json.dump(CACHE, f, indent=2, ensure_ascii=False) # ========================= # Normalisierung / Lemma # ========================= def normalize_text(s): if not s: return "" s = str(s).lower().strip() s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s) s = re.sub(r"\s+"," ",s) return s # Lemma-Cache lemma_cache = {} def 


# Lemma cache
lemma_cache = {}


def lemmatize_term(term):
    """Return a (cached) lemmatized form of the term; falls back to the
    normalized text when spaCy is not available."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join(token.lemma_ for token in doc)
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma


# =========================
# Compound splitting (simple approach)
# =========================
def compound_split(term):
    """Split on capitalized sub-words; return the term itself if nothing matches."""
    parts = re.findall(r"[A-ZÄÖÜ][a-zäöü]+", term)
    return parts if parts else [term]


# =========================
# Load controlled vocabulary & prepare lemmas
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None,
                           engine="odf" if file_path.suffix.lower() == ".ods" else None)
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}  # preprocessed lookup for fuzzy suggestions

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
        if not id_col or not word_col:
            continue

        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue

            entry = {
                "Name": row_word,
                "ID": current_parent_id,   # parent ID
                "Sheet": sheet_name,
                "Own_ID": row_id or ""     # own ID, if present
            }
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry

    return norm_dict, stem_index, lemma_norm_map


# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    # Exact hit
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        return e["Name"], e["ID"], []

    # Lemma hit
    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        return e["Name"], e["ID"], []

    # No hit -> try compound split
    tokens = compound_split(term)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        return "KEIN TREFFER", "", suggestions

    combined_suggestions = []
    for t in tokens:
        t_lemma = lemmatize_term(t)
        if t_lemma in stem_index:
            e = stem_index[t_lemma][0]
            combined_suggestions.append(f"{e['Name']} ({e['ID']})")
        else:
            # keep fuzzy suggestions for unmatched parts as well
            combined_suggestions.extend(get_suggestions(t_lemma, lemma_norm_map, top_n))
    return "KEIN TREFFER", "", combined_suggestions


def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        # small bonus for prefix matches
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
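

# ----------------------------------------------------------------------
# Minimal sketch (illustrative only, never called by the pipeline): walks
# through the lookup cascade of map_to_norm with a tiny hand-built
# vocabulary. The entry "Schachtel" with ID "N-01" is a made-up assumption.
# ----------------------------------------------------------------------
def _demo_map_to_norm():
    entry = {"Name": "Schachtel", "ID": "N-01", "Sheet": "Demo", "Own_ID": "N-01"}
    demo_dict = {normalize_text("Schachtel"): entry}
    demo_stem = defaultdict(list)
    demo_lemma_map = {}
    lemma = lemmatize_term("Schachtel")
    demo_stem[lemma].append(entry)
    demo_lemma_map[lemma] = entry

    # Exact hit: returns ("Schachtel", "N-01", [])
    print(map_to_norm("Schachtel", demo_dict, demo_stem, demo_lemma_map))
    # Unknown compound: no hit, but fuzzy suggestions may point to "Schachtel (N-01)"
    print(map_to_norm("Holzschachtel", demo_dict, demo_stem, demo_lemma_map))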


# =========================
# API queries
# =========================
def request_with_retries(api_name, url, params=None):
    """GET with caching, retries with exponential backoff, and a simple
    circuit breaker that disables an API after repeated failures."""
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except requests.RequestException:
            pass
        retries += 1
        time.sleep(min(BACKOFF_FACTOR ** retries, 30))

    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None


def batch_query_gnd(terms):
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
        return results

    for t in terms:
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            cands = [
                (name, SequenceMatcher(None, t.lower(), name.lower()).ratio())
                for doc in data["member"]
                for name in [doc.get("preferredName", "") or doc.get("name", "")]
                if name
            ]
            cands = [c for c in cands if c[1] >= 0.75]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results


def batch_query_wikidata(terms):
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
        return results

    for t in terms:
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            cands = [
                (e.get("label", ""), SequenceMatcher(None, t.lower(), e.get("label", "").lower()).ratio())
                for e in data["search"]
                if e.get("label", "")
            ]
            cands = [c for c in cands if c[1] >= 0.70]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
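

# ----------------------------------------------------------------------
# Sketch (illustrative, never called, no network access): demonstrates that
# request_with_retries returns a cached value without issuing a request when
# the (url + params) key is already present in CACHE. The URL and parameters
# below are placeholder assumptions.
# ----------------------------------------------------------------------
def _demo_cache_hit():
    url = "https://example.org/demo"
    params = {"q": "Schachtel"}
    CACHE[url + str(params)] = {"demo": True}         # pre-filled cache entry
    print(request_with_retries("gnd", url, params))   # -> {'demo': True}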


# =========================
# Marking / export
# =========================
def mark_norm_hits(file_path):
    """Color-code hits in Excel output; add a status column for ODS output."""
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            print("Spalte 'Norm_Treffer' nicht gefunden")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
    elif ext == ".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer"
        )
        df.to_excel(file_path, index=False, engine="odf")


# =========================
# Export terms without hits and without suggestions to a separate file
# =========================
def export_missing_terms(out_df, output_file):
    # Filter: no hit and no suggestion
    missing_df = out_df[
        (out_df["Norm_Treffer"] == "KEIN TREFFER")
        & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
    ][["Begriff"]].drop_duplicates()

    count_missing = len(missing_df)
    print(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
    if count_missing == 0:
        return

    # Build the output file name; append a version number if the file exists
    ext = output_file.suffix.lower()
    base_name = output_file.stem
    missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
    version = 1
    while missing_file.exists():
        missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
        version += 1

    if ext in [".xlsx", ".xls"]:
        missing_df.to_excel(missing_file, index=False, engine="openpyxl")
    elif ext == ".ods":
        missing_df.to_excel(missing_file, index=False, engine="odf")
    else:
        # CSV fallback
        missing_df.to_csv(missing_file, index=False, sep=";")

    print(f"Fehlende Begriffe gespeichert: {missing_file}")
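

# ----------------------------------------------------------------------
# Minimal sketch (illustrative only, never called): shows which rows the
# filter in export_missing_terms treats as "missing" (no hit and no
# suggestion) without writing any file. The sample terms and the ID "N-07"
# are made-up assumptions.
# ----------------------------------------------------------------------
def _demo_missing_filter():
    sample = pd.DataFrame({
        "Begriff":        ["Schachtel", "Zylinderhut", "Brosche"],
        "Norm_Treffer":   ["Schachtel", "KEIN TREFFER", "KEIN TREFFER"],
        "Norm_Vorschlag": ["",          "Hut (N-07)",   ""],
    })
    missing = sample[
        (sample["Norm_Treffer"] == "KEIN TREFFER")
        & (sample["Norm_Vorschlag"].isna() | (sample["Norm_Vorschlag"].str.strip() == ""))
    ][["Begriff"]]
    print(missing["Begriff"].tolist())  # -> ['Brosche']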


# =========================
# Processing of input files
# =========================
def process_files():
    norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        print(f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)

    files = list(INPUT_DIR.glob("*"))
    if not files:
        print("Keine Dateien gefunden")
        return

    for file_path in files:
        if file_path.suffix.lower() not in [".csv", ".ods", ".xls", ".xlsx"]:
            continue
        print(f"Verarbeite Datei: {file_path.name}")

        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path,
                                   engine="odf" if file_path.suffix.lower() == ".ods" else None)
        except Exception as e:
            print(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            continue

        # Collect terms per row: split the description at commas, then at
        # whitespace, and drop stopwords and pure numbers
        row_terms_map = []
        for _, row in df.iterrows():
            besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch:
                continue
            obj_box = row[box_col] if box_col else ""
            urheber = row[urh_col] if urh_col else ""

            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))

        # Query the external APIs once per distinct term
        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)

        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                output_rows.append({
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, "")
                })

        out_df = pd.DataFrame(output_rows)
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        # CSV input produces CSV output; everything else is written via to_excel
        if output_file.suffix.lower() == ".csv":
            out_df.to_csv(output_file, index=False, sep=";")
        else:
            engine = "odf" if output_file.suffix.lower() == ".ods" else None
            out_df.to_excel(output_file, index=False, engine=engine)

        # Export terms without hits and suggestions to a separate file
        export_missing_terms(out_df, output_file)

        mark_norm_hits(output_file)
        print(f"Auswertung gespeichert: {output_file}")

    save_cache()
    print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")


# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    print("Fertig")