"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
**Nutzung:**
```bash
python normvokabular_mapper.py
python normvokabular_mapper.py --dry-run # nur Simulation der API-Abfragen
"""
import os
import sys
import time
import json
import re
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
from collections import defaultdict
# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='simulate API queries')
args = parser.parse_args()
DRY_RUN = args.dry_run
# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd":0, "wikidata":0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
CONF_THRESHOLD = 0.75  # confidence threshold for suggestions
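# Notes on the configuration above (derived from how the values are used below):
# - TIMEOUT, MAX_RETRIES and BACKOFF_FACTOR drive request_with_retries(); each retry
#   sleeps min(BACKOFF_FACTOR**retries, 30) seconds before trying again.
# - API_ACTIVE acts as a simple circuit breaker: an API is switched off once its
#   FAIL_COUNTER reaches 10 consecutive failed lookups.
# - CONF_THRESHOLD is the minimum SequenceMatcher ratio for fuzzy suggestions.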
# =========================
# Logging
# =========================
def log(level, msg):
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{ts}] [{level}] {msg}")
# =========================
# Load / save cache
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
        log("INFO", f"Cache loaded: {CACHE_FILE}")
    except Exception as e:
        log("WARNING", f"Cache could not be read ({e}), starting with an empty cache")
        CACHE = {}
else:
    CACHE = {}
def save_cache():
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        log("DEBUG", "Cache saved")
    except Exception as e:
        log("ERROR", f"Saving cache failed: {e}")
# =========================
# Normalization / stemming
# =========================
try:
    from nltk.stem.snowball import GermanStemmer
    STEMMER = GermanStemmer()
    log("INFO", "NLTK GermanStemmer available")
except ImportError:
    STEMMER = None
    log("WARNING", "NLTK not available, using naive plural reduction instead")
def normalize_text(s):
    if s is None:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s
def naive_stem(w):
    for ending in ("ern", "nen", "en", "er", "e", "n", "s"):
        if w.endswith(ending) and len(w) - len(ending) >= 3:
            return w[:-len(ending)]
    return w
def stem_word(word):
    w = normalize_text(word)
    try:
        return STEMMER.stem(w) if STEMMER else naive_stem(w)
    except Exception:
        return naive_stem(w)
# =========================
# Load controlled vocabulary (NV_MASTER) with parent ID & stem index
# =========================
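# Expected NV_MASTER layout (as implied by the parsing logic below): each sheet has an
# "ID" column and a "Wort/Vokabel" column; a row that carries an ID opens a group, and
# the following rows without an ID inherit that ID as their parent.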
def load_normvokabular(file_path):
    log("INFO", f"Loading controlled vocabulary: {file_path}")
    engine = "odf" if file_path.suffix.lower() == ".ods" else None
    sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)
    norm_dict = {}
    stem_index = defaultdict(list)
    count = 0
    for sheet_name, df in sheets.items():
        df.columns = [str(c).strip() for c in df.columns]
        current_parent_id = None
        for _, row in df.iterrows():
            # Read the columns defensively; empty cells arrive as NaN
            id_raw = row.get("ID", "") if "ID" in df.columns else ""
            wort_raw = row.get("Wort/Vokabel", "") if "Wort/Vokabel" in df.columns else ""
            id_val = "" if pd.isna(id_raw) else str(id_raw).strip()
            wort = "" if pd.isna(wort_raw) else str(wort_raw).strip()
            # A row that carries an ID sets the current parent ID for the rows below it
            if id_val:
                current_parent_id = id_val
            # Skip rows without a vocabulary term
            if not wort:
                continue
            assigned_id = current_parent_id  # inherit the parent ID
            key = normalize_text(wort)
            entry = {
                "Name": wort,
                "ID": assigned_id,
                "Sheet": sheet_name
            }
            norm_dict[key] = entry
            stem_index[stem_word(key)].append(entry)
            count += 1
    log("INFO", f"{count} terms loaded from the controlled vocabulary")
    return norm_dict, stem_index
# =========================
# Mapping & suggestions
# =========================
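# map_to_norm() tries, in order: an exact match on the normalized term, a match on the
# stemmed term, and finally fuzzy suggestions; it returns (name, ID, suggestions),
# with name == "KEIN TREFFER" and an empty ID when nothing matched.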
def map_to_norm(term, norm_dict, stem_index):
    tnorm = normalize_text(term)
    tstem = stem_word(tnorm)
    # Exact hit
    if tnorm in norm_dict:
        e = norm_dict[tnorm]
        return e["Name"], e["ID"], []
    # Stemmed hit
    if tstem in stem_index:
        e = stem_index[tstem][0]
        return e["Name"], e["ID"], []
    # No hit -> generate suggestions
    suggestions = get_suggestions(tnorm, norm_dict)
    return "KEIN TREFFER", "", suggestions
def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    t = term.lower()
    scores = []
    for key, val in norm_dict.items():
        score = SequenceMatcher(None, t, key).ratio()
        if score >= threshold:
            scores.append((score, val["Name"], val["ID"]))
    # Sort by score only; avoids comparing names/IDs when scores are tied
    scores.sort(key=lambda x: x[0], reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]
# =========================
# API lookup (top-1 hit)
# =========================
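# request_with_retries() caches responses by URL + params in CACHE, retries failed
# requests up to MAX_RETRIES times with exponential backoff (capped at 30 s), and
# increments FAIL_COUNTER so that a repeatedly failing API gets disabled.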
def request_with_retries(api_name, url, params=None):
    if DRY_RUN:
        return None
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except requests.RequestException:
            pass
        retries += 1
        time.sleep(min(BACKOFF_FACTOR ** retries, 30))
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None
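# compute_min_conf() returns the minimum similarity a candidate label must reach,
# depending on term length and API:
#   length <= 3   -> 0.90 (both APIs)
#   length <= 6   -> 0.85 (GND) / 0.80 (Wikidata)
#   longer terms  -> 0.75 (GND) / 0.70 (Wikidata)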
def compute_min_conf(term, api_name):
    l = len(term.strip())
    if l <= 3:
        return 0.90
    if l <= 6:
        return 0.85 if api_name == 'gnd' else 0.80
    return 0.75 if api_name == 'gnd' else 0.70
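# batch_query_gnd() asks the lobid.org GND search endpoint for each term, scores the
# "preferredName"/"name" of every returned "member" against the term and keeps the best
# candidate above compute_min_conf(); in dry-run mode every term maps to "TEST_GND".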
def batch_query_gnd(terms):
    results = {}
    if DRY_RUN or not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = "TEST_GND"
        return results
    for t in terms:
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            min_conf = compute_min_conf(t, 'gnd')
            cands = []
            for doc in data["member"]:
                name = doc.get("preferredName", "") or doc.get("name", "")
                if not name:
                    continue
                conf = SequenceMatcher(None, t.lower(), name.lower()).ratio()
                if conf >= min_conf:
                    cands.append((name, conf))
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
def batch_query_wikidata(terms):
    results = {}
    if DRY_RUN or not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = "TEST_WD"
        return results
    for t in terms:
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            min_conf = compute_min_conf(t, 'wikidata')
            cands = []
            for e in data["search"]:
                label = e.get("label", "")
                if not label:
                    continue
                conf = SequenceMatcher(None, t.lower(), label.lower()).ratio()
                if conf >= min_conf:
                    cands.append((label, conf))
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
# =========================
# Format-dependent marking / status column
# =========================
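# mark_norm_hits() re-opens the output file that was just written: Excel files get
# green/red cell fills on the "Norm_Treffer" column, ODS files get an additional
# "Norm_Status" text column instead (no cell styling is applied there).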
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            log("WARNING", "Column 'Norm_Treffer' not found, no marking possible")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
        log("INFO", "Excel: hits marked in colour (green = hit, red = no hit)")
    elif ext == ".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")
        log("INFO", "ODS: column 'Norm_Status' added (Treffer / Kein Treffer)")
    else:
        log("WARNING", "Unknown file format, no marking applied")
# =========================
# Process input files
# =========================
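# process_files() loads the vocabulary once, then for each input file: splits
# "Objektbeschreibung" on commas and whitespace, drops stopwords and pure numbers,
# maps every term to the vocabulary, queries GND/Wikidata once per unique term and
# writes an "Auswertung_..." file with one row per extracted term.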
def process_files():
    norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_norm_hits = 0
    if not INPUT_DIR.exists():
        log("CRITICAL", f"Input folder {INPUT_DIR} is missing")
        sys.exit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        log("WARNING", "No files found")
    for file_path in files:
        if not file_path.suffix.lower() in [".ods", ".xlsx", ".csv", ".xls"]:
            continue
        log("INFO", f"Processing file: {file_path.name}")
        # Create a versioned output file for this input file
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            elif file_path.suffix.lower() == ".ods":
                df = pd.read_excel(file_path, engine="odf")
            else:
                df = pd.read_excel(file_path)
        except Exception as e:
            log("ERROR", f"File {file_path.name} could not be read: {e}")
            continue
        df.columns = [str(c).strip() for c in df.columns]
        row_terms_map = []
        for _, row in df.iterrows():
            besch = row.get("Objektbeschreibung", "")
            if pd.isna(besch) or not str(besch).strip():
                continue
            besch = str(besch).strip()
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            obj_box = row.get("Objekt/Ebene", "")
            urheber = row.get("Urheber", "")
            row_terms_map.append((obj_box, urheber, terms))
        all_terms = []
        for _, _, terms in row_terms_map:
            all_terms.extend(terms)
        all_terms = list(set(all_terms))  # unique
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)
        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_norm_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, "")
                }
                output_rows.append(out_row)
        out_df = pd.DataFrame(output_rows)
        # CSV output cannot be written via to_excel; fall back to to_csv for .csv files
        if output_file.suffix.lower() == ".csv":
            out_df.to_csv(output_file, index=False)
        else:
            engine = "odf" if output_file.suffix.lower() == ".ods" else None
            out_df.to_excel(output_file, index=False, engine=engine)
        log("INFO", f"Evaluation saved: {output_file}")
        mark_norm_hits(output_file)
    save_cache()
    log("INFO", f"Total: {total_terms} terms, {total_norm_hits} hits in the controlled vocabulary")
# =========================
# Main
# =========================
if __name__=="__main__":
process_files()
log("INFO","Fertig")