import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher

# RapidFuzz for token-based fuzzy matching
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
    print("RapidFuzz verfügbar")
except ImportError:
    RAPIDFUZZ_AVAILABLE = False
    print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")

# spaCy lemmatizer (optional; without it terms are only normalised)
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
    print("Spacy Lemmatizer aktiviert")
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
    print("Spacy nicht verfügbar – nutze naive Stemmer")

# =========================
# Paths & config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"

STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in",
             "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# API response cache (persisted as JSON between runs)
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}


def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)


# =========================
# Normalisation / lemma
# =========================
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


# Lemma cache
lemma_cache = {}


def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join([token.lemma_ for token in doc])
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma


# =========================
# Compound splitting (simple approach)
# =========================
def compound_split(term):
    # Naive split on capitalised components; returns the term itself if nothing matches.
    parts = re.findall(r"[A-ZÄÖÜ][a-zäöü]+", term)
    return parts if parts else [term]


# =========================
# Load controlled vocabulary & prepare lemmas
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None,
                           engine="odf" if file_path.suffix.lower() == ".ods" else None)
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}  # one representative entry per lemma (used for fuzzy suggestions)
    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
        if not id_col or not word_col:
            continue
        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            # Rows without an own ID inherit the last seen ID as their parent.
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {
                "Name": row_word,
                "ID": assigned_parent_id,   # parent ID
                "Sheet": sheet_name,
                "Own_ID": row_id or ""      # own ID, if present
            }
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    return norm_dict, stem_index, lemma_norm_map
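
# --- Illustration only: parent-ID inheritance in load_normvokabular ---------
# A minimal sketch with hypothetical sheet data (the IDs, words and the helper
# name _demo_parent_id_inheritance are invented for this example). It mirrors
# the loop above: a row whose ID cell is empty inherits the last seen ID as
# its parent. The function is never called by the pipeline.
def _demo_parent_id_inheritance():
    demo_sheet = pd.DataFrame({
        "ID":   ["M-01", None,         "M-02"],
        "Wort": ["Holz", "Eichenholz", "Metall"],
    })
    current_parent_id = None
    entries = []
    for _, row in demo_sheet.iterrows():
        row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
        if row_id:
            current_parent_id = row_id
        entries.append({"Name": row["Wort"], "ID": current_parent_id, "Own_ID": row_id or ""})
    # "Eichenholz" ends up with ID "M-01" (inherited) and an empty Own_ID.
    return entries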
# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    # Exact match
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        return e["Name"], e["ID"], []
    # Lemma match
    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        return e["Name"], e["ID"], []
    # No hit → try a compound split
    tokens = compound_split(term)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        return "KEIN TREFFER", "", suggestions
    else:
        token_matches = []
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in stem_index:
                e = stem_index[t_lemma][0]
                token_matches.append((t, e["Name"], e["ID"]))
            else:
                sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
                token_matches.append((t, "KEIN TREFFER", "", sugg))
        # Matched components are reported as suggestions for the full term.
        combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
        return "KEIN TREFFER", "", combined_suggestions


def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        # Small bonus if the vocabulary lemma starts with the query lemma.
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]


# =========================
# API queries
# =========================
def request_with_retries(api_name, url, params=None):
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except requests.RequestException:
            pass
        retries += 1
        # Exponential backoff, capped at 30 seconds.
        time.sleep(min(BACKOFF_FACTOR ** retries, 30))
    # After 10 consecutive failed requests the API is disabled for this run.
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None


def batch_query_gnd(terms):
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
        return results
    for t in terms:
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            cands = [(doc.get("preferredName", "") or doc.get("name", ""),
                      SequenceMatcher(None, t.lower(),
                                      (doc.get("preferredName", "") or doc.get("name", "")).lower()).ratio())
                     for doc in data["member"]
                     if doc.get("preferredName", "") or doc.get("name", "")]
            cands = [c for c in cands if c[1] >= 0.75]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results


def batch_query_wikidata(terms):
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
        return results
    for t in terms:
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            cands = [(e.get("label", ""),
                      SequenceMatcher(None, t.lower(), e.get("label", "").lower()).ratio())
                     for e in data["search"] if e.get("label", "")]
            cands = [c for c in cands if c[1] >= 0.70]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
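
# --- Illustration only: the matching cascade of map_to_norm -----------------
# A minimal sketch with a hypothetical one-entry vocabulary (the entry
# "Schachtel", ID "V-07", and the helper name _demo_matching_cascade are
# invented for this example). Exact and lemma matches return the entry
# directly; a term like "Holzschachtel" has no internal capital, so
# compound_split() returns it unchanged and get_suggestions() may propose
# "Schachtel (V-07)" if the fuzzy score reaches CONF_THRESHOLD. The function
# is never called by the pipeline.
def _demo_matching_cascade():
    entry = {"Name": "Schachtel", "ID": "V-07", "Sheet": "Demo", "Own_ID": "V-07"}
    key = normalize_text(entry["Name"])
    demo_norm_dict = {key: entry}
    demo_stem_index = defaultdict(list)
    demo_stem_index[lemmatize_term(key)].append(entry)
    demo_lemma_map = {lemmatize_term(key): entry}
    exact_hit = map_to_norm("Schachtel", demo_norm_dict, demo_stem_index, demo_lemma_map)
    fuzzy_only = map_to_norm("Holzschachtel", demo_norm_dict, demo_stem_index, demo_lemma_map)
    return exact_hit, fuzzy_only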
# =========================
# Highlighting / export
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        # Excel output: colour the "Norm_Treffer" column green (hit) or red (no hit).
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            print("Spalte 'Norm_Treffer' nicht gefunden")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
    elif ext == ".ods":
        # ODS output: no cell fills are written here; add a textual status column instead.
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")
# =========================
# Processing of input files
# =========================
def process_files():
    norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_hits = 0
    if not INPUT_DIR.exists():
        print(f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        print("Keine Dateien gefunden")
        return
    for file_path in files:
        if file_path.suffix.lower() not in [".csv", ".ods", ".xls", ".xlsx"]:
            continue
        print(f"Verarbeite Datei: {file_path.name}")
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path,
                                   engine="odf" if file_path.suffix.lower() == ".ods" else None)
        except Exception as e:
            print(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue
        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            continue
        # Split each description at commas, then at whitespace; drop stopwords and pure numbers.
        row_terms_map = []
        for _, row in df.iterrows():
            besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch:
                continue
            obj_box = row[box_col] if box_col else ""
            urheber = row[urh_col] if urh_col else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))
        # Query GND and Wikidata once per unique term.
        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)
        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, "")
                }
                output_rows.append(out_row)
        out_df = pd.DataFrame(output_rows)
        # to_excel cannot write real .csv/.xls files, so fall back to .xlsx for those inputs.
        out_suffix = file_path.suffix.lower() if file_path.suffix.lower() in (".ods", ".xlsx") else ".xlsx"
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{out_suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){out_suffix}"
            version += 1
        engine = "odf" if output_file.suffix.lower() == ".ods" else None
        out_df.to_excel(output_file, index=False, engine=engine)
        mark_norm_hits(output_file)
        print(f"Auswertung gespeichert: {output_file}")
    save_cache()
    print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")


# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    print("Fertig")
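
# --- Usage notes (sketch; paths are taken from the config block above) ------
# Expected layout relative to the working directory:
#   Input CSV/                                    input files (.csv, .ods, .xls, .xlsx)
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods  controlled vocabulary (sheets "Master"/"Übersicht" are skipped)
#   Auswertung Ergebnisse/                        created automatically, receives the results
# Input files need a column whose name contains "Objektbeschreibung"; columns
# containing "Objekt/Ebene" and "Urheber" are optional. GND/Wikidata responses
# are cached in api_cache.json next to the script, so repeated runs stay fast.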