GND_Skript_Test/NormVokabular_Mapper_1.2.py

"""
========================================================================
NormVokabular Mapper – Übersicht
========================================================================

Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).

Hauptfunktionen:

1. **Input verarbeiten**
   - Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
   - Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
     filtert Stopwords und Zahlen.

2. **Normvokabular laden**
   - Liest die Masterdatei NV_MASTER.ods ein.
   - Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
   - Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.

3. **Mapping auf Normvokabular**
   - Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
   - Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.

4. **API-Abgleich (optional)**
   - Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
   - Nutzt einen Cache, um wiederholte Requests zu vermeiden.
   - Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.

5. **Ergebnis speichern**
   - Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
   - Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
     bzw. fügt Statusspalte bei ODS-Dateien hinzu.
   - Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
     Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.

6. **Logging**
   - Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.

"""


import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher

# RapidFuzz für Token-basierte Fuzzy-Suche
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
    print("RapidFuzz verfügbar")
except ImportError:
    RAPIDFUZZ_AVAILABLE = False
    print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")

# Spacy Lemmatizer
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
    print("Spacy Lemmatizer aktiviert")
except:
    SPACY_AVAILABLE = False
    nlp = None
    print("Spacy nicht verfügbar – nutze naive Stemmer")

# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# Cache
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE,"r",encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

def save_cache():
    with open(CACHE_FILE,"w",encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)

# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+"," ",s)
    return s

lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join([token.lemma_ for token in doc])
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

# =========================
# Kompositum-Zerlegung (erweitert)
# =========================
def compound_split(term, norm_dict):
    """
    Zerlegt Komposita durch Prüfen auf Substrings, die im Normvokabular vorkommen.
    """
    term_norm = normalize_text(term)
    matches = []
    for i in range(len(term_norm)):
        for j in range(i+3, len(term_norm)+1):
            sub = term_norm[i:j]
            if sub in norm_dict and sub not in matches:
                matches.append(sub)
    if not matches:
        matches = [term_norm]
    return matches

# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
        if not id_col or not word_col:
            continue

        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {
                "Name": row_word,
                "ID": assigned_parent_id,
                "Sheet": sheet_name,
                "Own_ID": row_id or ""
            }
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    return norm_dict, stem_index, lemma_norm_map

# =========================
# Vorschläge & Fuzzy Matching
# =========================
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score_token = fuzz.token_set_ratio(term_lemma, key_lemma)/100
            score_partial = fuzz.partial_ratio(term_lemma, key_lemma)/100
            score = max(score_token, score_partial)
        else:
            score_seq = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
            score = score_seq

        # Substring-Boost
        if term_lemma in key_lemma or key_lemma in term_lemma:
            score = max(score, 0.9)

        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))

    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

# =========================
# Mapping auf Normvokabular
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    # Exakter Treffer
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        return e["Name"], e["ID"], []

    # Lemma-Treffer
    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        return e["Name"], e["ID"], []

    # KEIN TREFFER → Kompositum-Split & Teilbegriffe prüfen
    tokens = compound_split(term, norm_dict)
    token_matches = []
    all_suggestions = []
    for t in tokens:
        t_lemma = lemmatize_term(t)
        if t_lemma in stem_index:
            e = stem_index[t_lemma][0]
            token_matches.append((t, e["Name"], e["ID"]))
        else:
            sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
            all_suggestions.extend(sugg)
            token_matches.append((t, "KEIN TREFFER", "", sugg))

    combined_matches = [m[1] for m in token_matches if m[1] != "KEIN TREFFER"]

    if combined_matches:
        return "KEIN TREFFER", "", combined_matches
    elif all_suggestions:
        return "KEIN TREFFER", "", all_suggestions
    else:
        return "KEIN TREFFER", "", []

# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try: data = r.json()
                except: data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except:
            pass
        retries += 1
        time.sleep(min(BACKOFF_FACTOR**retries,30))
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None

def batch_query_gnd(terms):
    results={}
    if not API_ACTIVE.get("gnd", False):
        for t in terms: results[t] = ""
        return results
    for t in terms:
        url="https://lobid.org/gnd/search"
        params={"q":t,"format":"json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
            cands = [c for c in cands if c[1]>=0.75]
            if cands:
                top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t] = top
    return results

def batch_query_wikidata(terms):
    results={}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms: results[t] = ""
        return results
    for t in terms:
        url="https://www.wikidata.org/w/api.php"
        params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
            cands = [c for c in cands if c[1]>=0.70]
            if cands:
                top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t] = top
    return results

# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill   = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            print("Spalte 'Norm_Treffer' nicht gefunden")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
    elif ext==".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")

# =========================
# Export mit zweitem Sheet für Begriffe ohne Treffer und Vorschlag
# =========================
def export_results_with_no_hits(out_df, output_file):
    """
    Exportiert das Mapping-Ergebnis und zusätzlich ein zweites Sheet
    mit allen Begriffen, deren Norm_Treffer == 'KEIN TREFFER' und Norm_Vorschlag leer ist.
    """
    # Begriffe ohne Treffer und ohne Vorschlag
    no_match_df = out_df[(out_df["Norm_Treffer"]=="KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip()==""))].copy()

    ext = output_file.suffix.lower()

    if ext in [".xlsx", ".xls"]:
        with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
            out_df.to_excel(writer, index=False, sheet_name="Mapping")
            no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
    elif ext == ".ods":
        # ODS-Export via odf-Engine
        with pd.ExcelWriter(output_file, engine="odf") as writer:
            out_df.to_excel(writer, index=False, sheet_name="Mapping")
            no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")


# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
    norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        print(f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        print("Keine Dateien gefunden")
        return

    for file_path in files:
        if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
            continue
        print(f"Verarbeite Datei: {file_path.name}")
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
        except Exception as e:
            print(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col: continue

        row_terms_map = []
        for _, row in df.iterrows():
            besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch: continue
            obj_box = row[box_col] if box_col else ""
            urheber = row[urh_col] if urh_col else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS: continue
                    if re.fullmatch(r"\d+", p): continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))

        all_terms = list({t for _,_,terms in row_terms_map for t in terms})
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)

        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term,""),
                    "WD_Top1": wd_results.get(term,"")
                }
                output_rows.append(out_row)

        out_df = pd.DataFrame(output_rows)
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        export_results_with_no_hits(out_df, output_file)
        mark_norm_hits(output_file)
        print(f"Auswertung gespeichert: {output_file}")

    save_cache()
    print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")

# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    print("Fertig")