# GND_Skript_Test/mapper_macro_1.1.py
# -*- coding: utf-8 -*-
# LibreOffice Calc macro: NV_MASTER matching with pandas + odf, caching, and cell colors
# Paths: BASE_DIR must point to the directory that contains NV_MASTER.ods and this macro.
# Save as: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
import os
import re
import json
import traceback
# The UNO scripting context (XSCRIPTCONTEXT) is provided at runtime by LibreOffice.
# Third-party libs: pandas, odfpy; optional: spacy, rapidfuzz
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

from difflib import SequenceMatcher

# ------------------------
# Configuration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}
CONF_THRESHOLD = 0.75  # base score threshold for fuzzy suggestions
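# Note: with rapidfuzz installed, a threshold of 0.75 corresponds to a token_set_ratio of 75%;
# candidates whose key starts with the search term's lemma get an extra +0.1 bonus in
# get_suggestions_for_term() below.
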
# ------------------------
# Utilities: Logging & safe I/O
# ------------------------
def log(msg):
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(msg + "\n")
    except Exception:
        pass

# ------------------------
# Load cache
# ------------------------
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception:
    CACHE = {}

# ------------------------
# Text normalization & lemmatization
# ------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma
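
# Illustration (not part of the macro): normalize_text() is deterministic, e.g.
#   normalize_text("Teller, (blau) mit Deckel!")  ->  "teller blau mit deckel"
# lemmatize_term() additionally runs the normalized string through spaCy when
# de_core_news_sm is available; the exact lemma depends on the installed model,
# otherwise the normalized string is returned unchanged.
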
# ------------------------
# Load NV_MASTER robustly (pandas + odf)
# ------------------------
def build_norm_index(nv_path):
    norm_dict = {}    # normalized_name -> list of entries (Name, ID, Sheet)
    lemma_index = {}  # lemma -> list of entries
    if not PANDAS_AVAILABLE:
        log("pandas not available. NV_MASTER cannot be read reliably.")
        return norm_dict, lemma_index
    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Error reading NV_MASTER with pandas: {e}")
        return norm_dict, lemma_index
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        # normalize column names to locate the ID and word columns
        df = df.fillna("")  # treat empty cells as ""
        cols = [str(c).strip().lower() for c in df.columns]
        # try to find the columns
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        # fallback: if not found, use the first/last column
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            # if the row defines an ID, remember it as the current parent
            if id_val:
                current_parent_id = id_val
            # skip empty word cells
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
            # index by normalized name (exact matching)
            norm_dict.setdefault(norm_name, []).append(entry)
            # index by lemma
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER loaded ({NV_MASTER_PATH}). Terms: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
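
# Shape of the indexes returned by build_norm_index(), using a hypothetical sheet
# "Gefaesse" and made-up values purely for illustration:
#   norm_dict   = {"teller": [{"Name": "Teller", "ID": "G-001", "Sheet": "Gefaesse"}], ...}
#   lemma_index = the same entry dicts, but keyed by the lemmatized form of each word
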
# ------------------------
# Matching: exact hits, lemma hits, fuzzy suggestions
# ------------------------
def fuzzy_score(a, b):
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_set_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        try:
            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
        except Exception:
            return 0.0
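
# Rough sanity check (illustrative only, assuming the module is imported outside LibreOffice).
# The two backends are not directly comparable: rapidfuzz's token_set_ratio works on token
# sets, difflib's SequenceMatcher on character sequences, but both paths return a score in
# the range 0.0-1.0:
#   >>> 0.0 <= fuzzy_score("teller", "tellerchen") <= 1.0
#   True
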
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
    # collect candidates from lemma_index keys and norm_dict keys
    candidates = []
    # iterate over lemma_index keys for candidate names
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # also check norm_dict keys (exact-normalized names) as additional candidates
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # sort by score descending
    candidates.sort(key=lambda t: t[0], reverse=True)
    # unique by (Name, ID), preserving score order
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append({"score": score, "name": name, "id": id_})
    # return all candidates (no limit) as "Name (ID)"
    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    # cache lookup
    if term_lemma in CACHE:
        return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
    hits = []
    suggestions = []
    ids = []
    # 1) exact normalized name match
    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    # 2) lemma match (only if there are no exact hits yet)
    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    # 3) fuzzy suggestions (always computed, even when exact hits exist)
    suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
    # Even with exact hits we still present suggestions (unlimited, per request); they remain secondary to hits.
    suggestions = suggs
    # deduplicate lists preserving order
    def unique_preserve(seq):
        seen = set()
        out = []
        for x in seq:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out
    hits = unique_preserve(hits)
    suggestions = unique_preserve(suggestions)
    ids = unique_preserve(ids)
    # cache result
    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids
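
# Usage sketch (illustrative only; assumes NV_MASTER.ods exists and pandas/odfpy are installed):
#   norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
#   hits, suggestions, ids = map_term_with_indexes("Teller", norm_dict, lemma_index)
# "hits" holds exact/lemma matches (Name), "suggestions" the fuzzy candidates formatted as
# "Name (ID)", and "ids" the parent IDs collected for the hits; results are memoized in CACHE
# keyed by the term's lemma.
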
# ------------------------
# Main macro
# ------------------------
def run_mapper_macro():
    try:
        # UNO doc/sheet
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
    except Exception as e:
        log("Error: could not open document/sheet: " + str(e))
        return
    # find the header row and the "Objektbeschreibung" column (search the first 5 rows)
    header_row = None
    objekt_col = None
    max_col = data_range.EndColumn
    for r in range(0, min(5, data_range.EndRow + 1)):
        for c in range(0, max_col + 1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            if val == "objektbeschreibung":
                header_row = r
                objekt_col = c
                break
        if objekt_col is not None:
            break
    if objekt_col is None:
        log("Column 'Objektbeschreibung' not found. Aborting.")
        return
    # determine or create result columns: reuse them if they already exist; otherwise append at the right end
    existing = {}
    for c in range(0, data_range.EndColumn + 1):
        try:
            h = str(sheet.getCellByPosition(c, header_row).String).strip()
        except Exception:
            h = ""
        if h == "Norm_Treffer":
            existing["Norm_Treffer"] = c
        if h == "Norm_Vorschlag":
            existing["Norm_Vorschlag"] = c
        if h == "Norm_ID":
            existing["Norm_ID"] = c
    # append missing columns at the right end
    last_col = data_range.EndColumn
    if "Norm_Treffer" not in existing:
        last_col += 1
        existing["Norm_Treffer"] = last_col
        try:
            sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
        except Exception:
            pass
    if "Norm_Vorschlag" not in existing:
        last_col += 1
        existing["Norm_Vorschlag"] = last_col
        try:
            sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
        except Exception:
            pass
    if "Norm_ID" not in existing:
        last_col += 1
        existing["Norm_ID"] = last_col
        try:
            sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
        except Exception:
            pass
    norm_tr_col = existing["Norm_Treffer"]
    norm_sug_col = existing["Norm_Vorschlag"]
    norm_id_col = existing["Norm_ID"]
    # build the normalization indexes
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    if not norm_dict and not lemma_index:
        log("NV_MASTER empty or not readable. Aborting.")
        return
    # colors (note: the value bound to YELLOW is an orange tone)
    GREEN = 0xADFF2F
    YELLOW = 0xFFA500
    RED = 0xCC0000
    # iterate over the data rows
    rows_processed = 0
    for r in range(header_row + 1, data_range.EndRow + 1):
        try:
            cell = sheet.getCellByPosition(objekt_col, r)
            txt = str(cell.String).strip()
            if not txt:
                # clear any previous outputs? keep existing per spec; skip empty
                continue
            # tokenize: split by commas first, then whitespace; filter stopwords and pure numbers
            clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
            terms = []
            for cl in clauses:
                parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            # for each term, get hits/suggestions/ids
            row_hits = []
            row_sugs = []
            row_ids = []
            any_unmapped = False  # at least one term without hit and without suggestion
            # record results for each term
            for term in terms:
                hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
                if hits:
                    row_hits.extend(hits)
                if sugs:
                    row_sugs.extend(sugs)
                if ids:
                    row_ids.extend(ids)
                if (not hits) and (not sugs):
                    any_unmapped = True
            # deduplicate preserving order
            def uniq(seq):
                seen = set()
                out = []
                for x in seq:
                    if x not in seen:
                        seen.add(x)
                        out.append(x)
                return out
            row_hits = uniq(row_hits)
            row_sugs = uniq(row_sugs)
            row_ids = uniq(row_ids)
            # write outputs (unlimited lists, joined with " | ")
            try:
                sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
                sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
                sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
            except Exception:
                pass
            # Coloring rules per the new spec:
            # - Objektbeschreibung cell: RED if any term is unmapped, otherwise unchanged (no green/yellow here)
            # - Norm_Treffer cell: GREEN if all terms matched (terms non-empty, no term unmapped, at least one hit per term)
            # - Norm_Vorschlag cell: YELLOW if at least one suggestion exists
            # "All matched" is approximated by checking any_unmapped and the overall hit count
            all_matched = False
            if terms:
                # all_matched if no term is without a hit and there is at least one hit overall
                if (not any_unmapped) and row_hits:
                    all_matched = True
            # apply colors
            try:
                if any_unmapped:
                    cell.CellBackColor = RED
                else:
                    # clear red if previously set? We leave it unchanged; optionally set to default white (0xFFFFFF)
                    pass
                # Norm_Treffer coloring
                tr_cell = sheet.getCellByPosition(norm_tr_col, r)
                if all_matched:
                    tr_cell.CellBackColor = GREEN
                else:
                    # clear color if needed -> set to white
                    tr_cell.CellBackColor = 0xFFFFFF
                # Norm_Vorschlag coloring
                sug_cell = sheet.getCellByPosition(norm_sug_col, r)
                if row_sugs:
                    sug_cell.CellBackColor = YELLOW
                else:
                    sug_cell.CellBackColor = 0xFFFFFF
            except Exception:
                pass
            rows_processed += 1
        except Exception as e:
            # continue processing the other rows; log once per failing row
            log(f"Error in row {r}: {e}")
    # persist the cache
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
    except Exception:
        pass
    log(f"run_mapper_macro finished. Rows processed: {rows_processed}")

# Export for LibreOffice
g_exportedScripts = (run_mapper_macro,)
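
# Optional self-test outside LibreOffice: a minimal sketch that only exercises the matching
# logic (run_mapper_macro itself needs XSCRIPTCONTEXT, which exists only inside LibreOffice).
# It assumes pandas/odfpy are installed and NV_MASTER.ods is present at NV_MASTER_PATH; the
# sample terms below are hypothetical.
if __name__ == "__main__":
    demo_norm_dict, demo_lemma_index = build_norm_index(NV_MASTER_PATH)
    for demo_term in ("Teller", "Schale", "Unbekanntes Wort"):
        demo_hits, demo_sugs, demo_ids = map_term_with_indexes(demo_term, demo_norm_dict, demo_lemma_index)
        print(demo_term, "->", demo_hits, "|", demo_sugs, "|", demo_ids)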