import uno  # not referenced directly; kept for the LibreOffice UNO runtime
import os
import re
import traceback
import json

# Optional: spaCy lemmatizer (German model); degrade gracefully if unavailable
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# Optional: rapidfuzz for fuzzy matching; fall back to difflib if unavailable
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False

# ODF access via the odfpy package
import odf.opendocument
import odf.table
import odf.text

# ------------------------
# Configuration: absolute paths
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")

# German stopwords skipped during matching
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}
CONF_THRESHOLD = 0.75  # minimum fuzzy-match score (0..1) for a suggestion

# ------------------------
# Logging
# ------------------------
def log(msg):
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# ------------------------
# Load cache
# ------------------------
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

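# Cache shape (illustrative values; the fields are written by map_word below):
# {"teller": {"Norm": "Teller", "Suggestion": "", "ID": "K-07"}}
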
# ------------------------
# Normalization / lemma
# ------------------------
def normalize_text(s):
    """Lowercase, strip common punctuation, and collapse whitespace."""
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

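# Illustrative example: normalize_text("Teller, (blau)!") returns "teller blau"
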
lemma_cache = {}

def lemmatize_term(term):
    """Return the normalized, lemmatized form of a term (memoized)."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join(token.lemma_ for token in doc)
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

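# Illustrative example (assuming de_core_news_sm is installed): an inflected
# form such as "Tellern" is normalized to "tellern" and may be reduced by the
# lemmatizer to a base form like "teller"; without spaCy the normalized string
# is returned unchanged.
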
# ------------------------
# Read NV_MASTER
# ------------------------
def load_nv_master(path):
    """Build a lookup of lemmatized terms to master-data entries from NV_MASTER.ods."""
    norm_dict = {}
    try:
        doc = odf.opendocument.load(path)
    except Exception as e:
        log(f"Error loading NV_MASTER: {e}")
        return norm_dict

    for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
        sheet_name = sheet.getAttribute("name")
        if sheet_name.lower() == "master":
            continue

        current_parent_id = None
        for row in sheet.getElementsByType(odf.table.TableRow):
            cells = row.getElementsByType(odf.table.TableCell)
            cell_values = []
            for cell in cells:
                texts = cell.getElementsByType(odf.text.P)
                if texts and texts[0].firstChild:
                    cell_values.append(str(texts[0].firstChild.data).strip())
                else:
                    cell_values.append("")
            if len(cell_values) < 4:
                continue
            id_val, unterk, unterunterk, word = cell_values[:4]
            # An ID cell starts a new group; subsequent rows inherit it
            if id_val:
                current_parent_id = id_val.strip()
            if not word:
                continue
            key = lemmatize_term(word)
            norm_dict[key] = {
                "Name": word.strip(),
                "ID": current_parent_id,
                "Sheet": sheet_name,
                "Unterkategorie": unterk.strip(),
                "Unterunterkategorie": unterunterk.strip()
            }
    log(f"NV_MASTER loaded: {len(norm_dict)} terms")
    return norm_dict

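# Assumed NV_MASTER layout (inferred from the parsing above): every category
# sheet carries four columns (ID, Unterkategorie, Unterunterkategorie, term),
# with the ID filled only on the first row of each group. Note that odfpy does
# not expand table:number-columns-repeated cells, so heavily compressed sheets
# may parse with fewer cells per row than expected.
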
# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    """Return up to top_n fuzzy-match suggestions formatted as "Name (ID)"."""
    candidates = []
    for key, entry in norm_dict.items():
        if RAPIDFUZZ_AVAILABLE:
            # rapidfuzz scores 0..100; scale to 0..1 to match difflib
            score = fuzz.token_set_ratio(term_lemma, key) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
        # Small bonus for prefix matches, capped at 1.0
        if key.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    # Sort by score only; comparing whole tuples could raise TypeError when a
    # tie falls through to an ID that is None
    candidates.sort(key=lambda c: c[0], reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

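# Illustrative example: with "Teller" in the master data, a near miss such as
# get_suggestions("tellers", norm_dict) scores roughly 0.92 under either
# backend and would return ["Teller (K-07)"], where "K-07" stands in for
# whatever ID the master data actually assigns.
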
def map_word(word, norm_dict):
    """Map one word to (hit, suggestions, ID), consulting the cache first."""
    key = lemmatize_term(word)
    if key in CACHE:
        cached = CACHE[key]
        return cached["Norm"], cached["Suggestion"], cached["ID"]

    if key in norm_dict:
        entry = norm_dict[key]
        tr, sug, wid = entry["Name"], "", entry["ID"]
    else:
        # "KEIN TREFFER" ("no hit") stays in German because it is written into
        # the sheet and the persisted cache
        suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
        if suggestions:
            tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
        else:
            tr, sug, wid = "KEIN TREFFER", "", ""

    CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
    return tr, sug, wid

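# Illustrative example: an exact lemma hit yields ("Teller", "", "<id>"), while
# a miss with nearby candidates yields ("KEIN TREFFER", "Teller (<id>)", "");
# "<id>" is whatever ID NV_MASTER assigns to the matched term.
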
# ------------------------
# Macro entry point
# ------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()  # XSCRIPTCONTEXT is injected by LibreOffice
        sheets = doc.getSheets()
        sheet = sheets.getByIndex(0)
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()

        header_row = 0
        objekt_col = None

        # Locate the "Objektbeschreibung" column in the header row
        for col in range(data_range.EndColumn + 1):
            val = sheet.getCellByPosition(col, header_row).String.strip().lower()
            if val == "objektbeschreibung":
                objekt_col = col
                break

        if objekt_col is None:
            log("Column 'Objektbeschreibung' not found")
            return

        # Create the result columns at the right edge of the table
        max_col = data_range.EndColumn
        norm_tr_col = max_col + 1
        norm_sug_col = max_col + 2
        norm_id_col = max_col + 3

        sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
        sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
        sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"

        norm_dict = load_nv_master(NV_MASTER_PATH)

        # Highlight colors (0xRRGGBB)
        GREEN = 0xC6EFCE   # at least one exact hit
        YELLOW = 0xFFEB9C  # suggestions only
        RED = 0xFFC7CE     # no hit, no suggestion

        for row in range(1, data_range.EndRow + 1):
            cell = sheet.getCellByPosition(objekt_col, row)
            val = cell.String.strip()
            if not val:
                continue
            words = [w.strip() for w in re.split(r"\s+", val)
                     if w.strip() and w.lower() not in STOPWORDS]
            tr_list, sug_list, id_list = [], [], []
            for w in words:
                tr, sug, wid = map_word(w, norm_dict)
                if tr != "KEIN TREFFER":
                    tr_list.append(tr)
                if sug:
                    sug_list.append(sug)
                if wid:
                    id_list.append(wid)
            sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
            sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
            sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
            # Color coding of the source cell
            if tr_list:
                cell.CellBackColor = GREEN
            elif sug_list:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED

        # Persist the cache
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)

        log("Macro executed successfully")

    except Exception:
        log("Error in run_mapper_macro:")
        log(traceback.format_exc())
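
# Standard LibreOffice convention: export only the entry point, so it is the
# sole function offered in the macro selector.
g_exportedScripts = (run_mapper_macro,)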