"""
========================================================================
NormVokabular Mapper – Overview
========================================================================

This script matches terms from data sets against a predefined controlled
vocabulary (Normvokabular). It identifies hits, generates suggestions when
no hit is found, and can optionally cross-check terms against external
APIs (GND, Wikidata).

Main features:

1. **Process input**
   - Reads CSV, Excel and ODS files from the "Input CSV" folder.
   - Extracts relevant terms from columns such as "Objektbeschreibung",
     filtering out stopwords and numbers.

2. **Load the controlled vocabulary**
   - Reads the master file NV_MASTER.ods.
   - Takes hierarchy IDs into account so that parent terms can be assigned.
   - Builds an index of stemmed terms so that similar spellings are recognized.

3. **Map terms to the controlled vocabulary**
   - Checks whether a term occurs in the vocabulary exactly or in stemmed form.
   - If there is no hit, alternative suggestions are generated.

4. **API lookup (optional)**
   - Queries GND and Wikidata to determine the top-1 hit for each term.
   - Uses a cache to avoid repeated requests.
   - Offers a dry-run mode for testing without internet access.

5. **Save results**
   - Writes the evaluation to a dedicated folder, "Auswertung Ergebnisse".
   - Marks hits visually: green = hit, red = no hit (for Excel files),
     or adds a status column for ODS files.
   - Records all relevant information per term: original term, vocabulary
     term, vocabulary ID, suggestions, GND/Wikidata top-1 hits.

6. **Logging**
   - Reports progress, the number of terms, hits and possible errors.


**Usage:**
```bash
python normvokabular_mapper.py
python normvokabular_mapper.py --dry-run   # only simulate the API calls
```

The expected folder layout is sketched in the comment block directly below
this docstring.
"""
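
# Expected folder layout (a sketch derived from the paths configured below;
# the input file name is a hypothetical example):
#
#   Input CSV/
#       Objektliste.xlsx                 <- any CSV/Excel/ODS file to be mapped
#       Normvokabular_INTERN/
#           NV_MASTER.ods                <- controlled vocabulary master file
#   Auswertung Ergebnisse/               <- created automatically, receives the results
#   api_cache.json                       <- created automatically, caches API responses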

import os
import sys
import time
import json
import re
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
from collections import defaultdict

# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='simulate API requests')
args = parser.parse_args()
DRY_RUN = args.dry_run

# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")

TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
CACHE_FILE = "api_cache.json"
STOPWORDS = {
    "mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von",
    "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den",
    "des", "eines", "einer",
}

API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

CONF_THRESHOLD = 0.75  # minimum similarity for suggestions

# =========================
# Logging
# =========================
def log(level, msg):
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{ts}] [{level}] {msg}")


# =========================
# Load / save cache
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
        log("INFO", f"Cache loaded: {CACHE_FILE}")
    except Exception:
        CACHE = {}
else:
    CACHE = {}


def save_cache():
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        log("DEBUG", "Cache saved")
    except Exception as e:
        log("ERROR", f"Failed to save cache: {e}")
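
# Note: the cache is a flat dict persisted to api_cache.json. request_with_retries()
# below keys it by url + str(params) and stores the parsed JSON response (or raw
# text) as the value, so deleting the file simply forces fresh API requests.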

# =========================
# Normalization / stemming
# =========================
try:
    from nltk.stem.snowball import GermanStemmer
    STEMMER = GermanStemmer()
    log("INFO", "NLTK GermanStemmer available")
except ImportError:
    STEMMER = None
    log("WARNING", "NLTK not available, using naive plural reduction")


def normalize_text(s):
    """Lowercase, strip punctuation and collapse whitespace."""
    if s is None:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


def naive_stem(w):
    """Very rough German suffix stripping, used when NLTK is missing."""
    for ending in ("ern", "nen", "en", "er", "e", "n", "s"):
        if w.endswith(ending) and len(w) - len(ending) >= 3:
            return w[:-len(ending)]
    return w


def stem_word(word):
    w = normalize_text(word)
    try:
        return STEMMER.stem(w) if STEMMER else naive_stem(w)
    except Exception:
        return naive_stem(w)
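
# Example of the fallback path (values checked against naive_stem above; the
# NLTK GermanStemmer, when installed, may return a slightly different stem):
#   normalize_text("Karten,") -> "karten"
#   naive_stem("karten")      -> "kart"   (strips the "en" ending)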

# =========================
# Load controlled vocabulary (NV_MASTER) with parent IDs & stem index
# =========================
def load_normvokabular(file_path):
    log("INFO", f"Loading controlled vocabulary: {file_path}")

    engine = "odf" if file_path.suffix.lower() == ".ods" else None
    sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)

    norm_dict = {}
    stem_index = defaultdict(list)
    count = 0

    for sheet_name, df in sheets.items():
        df.columns = [str(c).strip() for c in df.columns]
        current_parent_id = None

        for _, row in df.iterrows():
            # Read ID and term defensively; empty cells come back as NaN.
            raw_id = row.get("ID", "") if "ID" in df.columns else ""
            raw_wort = row.get("Wort/Vokabel", "") if "Wort/Vokabel" in df.columns else ""
            id_val = "" if pd.isna(raw_id) else str(raw_id).strip()
            wort = "" if pd.isna(raw_wort) else str(raw_wort).strip()

            # A row carrying an ID sets the current parent ID; rows without
            # an ID inherit the most recent one.
            if id_val:
                current_parent_id = id_val

            # Skip rows without a vocabulary term
            if not wort:
                continue

            assigned_id = current_parent_id  # inherit the parent ID
            key = normalize_text(wort)
            entry = {
                "Name": wort,
                "ID": assigned_id,
                "Sheet": sheet_name,
            }
            norm_dict[key] = entry
            stem_index[stem_word(key)].append(entry)
            count += 1

    log("INFO", f"{count} terms loaded from the controlled vocabulary")
    return norm_dict, stem_index
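
# Shape of the returned structures (a sketch; "Fotografie", "3.2" and "Objekte"
# are hypothetical values):
#   norm_dict["fotografie"] == {"Name": "Fotografie", "ID": "3.2", "Sheet": "Objekte"}
#   stem_index["fotografi"] == [<all entries whose stemmed key is "fotografi">]
# norm_dict is keyed by the normalized term, stem_index by its stem, so lookups
# can fall back from exact spelling to stemmed spelling.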

# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index):
    tnorm = normalize_text(term)
    tstem = stem_word(tnorm)

    # Exact hit
    if tnorm in norm_dict:
        e = norm_dict[tnorm]
        return e["Name"], e["ID"], []

    # Stemmed hit
    if tstem in stem_index:
        e = stem_index[tstem][0]
        return e["Name"], e["ID"], []

    # No hit -> generate suggestions
    suggestions = get_suggestions(tnorm, norm_dict)
    return "KEIN TREFFER", "", suggestions


def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    t = term.lower()
    scores = []
    for key, val in norm_dict.items():
        score = SequenceMatcher(None, t, key).ratio()
        if score >= threshold:
            scores.append((score, val["Name"], val["ID"]))
    scores.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]
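
# Usage sketch (not executed here; the term and the returned values are purely
# illustrative):
#   norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
#   name, norm_id, suggestions = map_to_norm("Fotografien", norm_dict, stem_index)
#   # -> a vocabulary entry (name, ID, []) on an exact or stemmed hit, otherwise
#   #    ("KEIN TREFFER", "", [up to three "Name (ID)" suggestions]).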

# =========================
# API lookup (top-1 hit)
# =========================
def request_with_retries(api_name, url, params=None):
    if DRY_RUN:
        return None
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except requests.RequestException:
            pass
        retries += 1
        time.sleep(min(BACKOFF_FACTOR ** retries, 30))
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        # Too many consecutive failures: disable this API for the rest of the run.
        API_ACTIVE[api_name] = False
    return None


def compute_min_conf(term, api_name):
    # Shorter terms require a stricter similarity threshold to count as a hit.
    l = len(term.strip())
    if l <= 3:
        return 0.90
    if l <= 6:
        return 0.85 if api_name == 'gnd' else 0.80
    return 0.75 if api_name == 'gnd' else 0.70
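
# Thresholds produced by compute_min_conf (read directly off the code above):
#   len(term) <= 3  -> 0.90 for both APIs
#   len(term) <= 6  -> 0.85 (GND) / 0.80 (Wikidata)
#   longer terms    -> 0.75 (GND) / 0.70 (Wikidata)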

def batch_query_gnd(terms):
    results = {}
    if DRY_RUN or not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = "TEST_GND"
        return results
    for t in terms:
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            min_conf = compute_min_conf(t, 'gnd')
            cands = []
            for doc in data["member"]:
                name = doc.get("preferredName", "") or doc.get("name", "")
                if not name:
                    continue
                conf = SequenceMatcher(None, t.lower(), name.lower()).ratio()
                if conf >= min_conf:
                    cands.append((name, conf))
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results


def batch_query_wikidata(terms):
    results = {}
    if DRY_RUN or not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = "TEST_WD"
        return results
    for t in terms:
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            min_conf = compute_min_conf(t, 'wikidata')
            cands = []
            for e in data["search"]:
                label = e.get("label", "")
                if not label:
                    continue
                conf = SequenceMatcher(None, t.lower(), label.lower()).ratio()
                if conf >= min_conf:
                    cands.append((label, conf))
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
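
# Request shapes used above, for reference (both endpoints are queried via GET):
#   GND:      https://lobid.org/gnd/search?q=<term>&format=json
#   Wikidata: https://www.wikidata.org/w/api.php?action=wbsearchentities&search=<term>&language=de&format=json
# In dry-run mode every term is answered with the placeholders "TEST_GND" / "TEST_WD".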

# =========================
# Format-dependent marking / status column
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill

        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer")
        if not norm_col:
            log("WARNING", "Column 'Norm_Treffer' not found, cannot mark hits")
            return

        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
        log("INFO", "Excel: hits marked in color (green = hit, red = no hit)")

    elif ext == ".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer"
        )
        df.to_excel(file_path, index=False, engine="odf")
        log("INFO", "ODS: column 'Norm_Status' added (Treffer / Kein Treffer)")
    else:
        log("WARNING", "Unknown file format, no marking applied")

# =========================
# Process input files
# =========================
def process_files():
    norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_norm_hits = 0

    if not INPUT_DIR.exists():
        log("CRITICAL", f"Input folder {INPUT_DIR} is missing")
        sys.exit(1)

    files = list(INPUT_DIR.glob("*"))
    if not files:
        log("WARNING", "No files found")

    for file_path in files:
        if file_path.suffix.lower() not in [".ods", ".xlsx", ".csv", ".xls"]:
            continue
        log("INFO", f"Processing file: {file_path.name}")

        # Build a versioned output file name for this input file
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            elif file_path.suffix.lower() == ".ods":
                df = pd.read_excel(file_path, engine="odf")
            else:
                df = pd.read_excel(file_path)
        except Exception as e:
            log("ERROR", f"File {file_path.name} could not be read: {e}")
            continue

        df.columns = [str(c).strip() for c in df.columns]

        # Extract candidate terms from the "Objektbeschreibung" column
        row_terms_map = []
        for _, row in df.iterrows():
            besch = row.get("Objektbeschreibung", "")
            if pd.isna(besch) or not str(besch).strip():
                continue
            besch = str(besch).strip()
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            obj_box = row.get("Objekt/Ebene", "")
            urheber = row.get("Urheber", "")
            row_terms_map.append((obj_box, urheber, terms))

        # Query the APIs once per unique term
        all_terms = []
        for _, _, terms in row_terms_map:
            all_terms.extend(terms)
        all_terms = list(set(all_terms))
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)

        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_norm_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, ""),
                }
                output_rows.append(out_row)

        out_df = pd.DataFrame(output_rows)
        # CSV inputs get a CSV result; everything else goes through to_excel
        # (pandas cannot open an Excel/ODS writer on a .csv path).
        if output_file.suffix.lower() == ".csv":
            out_df.to_csv(output_file, index=False)
        else:
            engine = "odf" if output_file.suffix.lower() == ".ods" else None
            out_df.to_excel(output_file, index=False, engine=engine)
        log("INFO", f"Evaluation saved: {output_file}")
        mark_norm_hits(output_file)

    save_cache()
    log("INFO", f"Total: {total_terms} terms, {total_norm_hits} hits in the controlled vocabulary")
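
# Columns written per term (as assembled in out_row above):
#   Box, Objekt/Ebene, Urheber, Begriff, Norm_Treffer, Norm_ID,
#   Norm_Vorschlag, GND_Top1, WD_Top1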

# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    log("INFO", "Done")