# GND_Skript_Test/Mapper_Makro_Alte_Versionen/mapper_macro_1.5.py

# -*- coding: utf-8 -*-
# mapper_macro 1.5 - korrigiert: Logging im Dokumentverzeichnis, stabile Button-Erstellung,
# keine Listener, optimiertes Mapping (ohne Listener-Teil)

import os
import re
import json
import datetime

# optionale Module (Pandas, Spacy, RapidFuzz)
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

from difflib import SequenceMatcher

# UNO (für Button/Paths)
try:
    import uno
except Exception:
    uno = None

# ------------------------
# Konfiguration (Fallback-BASE_DIR)
# ------------------------
# File names are kept apart from directories because set_paths_from_doc
# re-joins them with the directory of the active document.
NV_MASTER_FILENAME = "NV_MASTER.ods"
CACHE_FILENAME = "mapper_cache.json"
LOG_FILENAME = "mapper_macro.log"

# Fallback directory used whenever no document directory is available.
BASE_DIR = os.path.expanduser(
    "~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
)

# German function words that run_mapper_macro drops before mapping a text.
STOPWORDS = set(
    "mit ohne der die das ein eine und zu von im in auf an "
    "als bei für aus dem den des eines einer".split()
)

FUZZY_CUTOFF = 0.88    # default minimum score used by get_suggestions
CONF_THRESHOLD = 0.82  # not referenced by the code visible in this file

# Per-document paths: they start below BASE_DIR and are rebound by
# set_paths_from_doc.
DOC_DIR = BASE_DIR
NV_MASTER_PATH, CACHE_FILE, LOG_FILE = (
    os.path.join(DOC_DIR, file_name)
    for file_name in (NV_MASTER_FILENAME, CACHE_FILENAME, LOG_FILENAME)
)

# in-memory cache
# Load the persisted mapping cache from CACHE_FILE. Any problem (missing file,
# unreadable file, broken JSON) leaves the cache empty instead of failing the
# import of the macro module.
CACHE = {}
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            _loaded_cache = json.load(f)
        # map_term indexes CACHE by string key and assigns into it, so valid
        # JSON of any other shape (list, number, ...) must not be accepted.
        CACHE = _loaded_cache if isinstance(_loaded_cache, dict) else {}
except Exception:
    CACHE = {}

# ------------------------
# Pfade im Dokument setzen
# ------------------------
def set_paths_from_doc(doc):
    """Point the per-document paths at the directory that contains *doc*.

    ``doc.URL`` is converted to a local path; when the containing directory
    exists it becomes DOC_DIR. In every other case (no URL, non-file URL,
    missing directory, any error) DOC_DIR is reset to BASE_DIR, so a directory
    chosen for a previously processed document is never reused for this one.
    NV_MASTER_PATH, CACHE_FILE and LOG_FILE are then rebuilt from DOC_DIR.

    Args:
        doc: Object with an optional ``URL`` attribute (a ``file:///`` URL).
    """
    global DOC_DIR, NV_MASTER_PATH, CACHE_FILE, LOG_FILE
    doc_dir = BASE_DIR
    try:
        url = getattr(doc, "URL", "")
        if url and url.strip():
            # UNO delivers file:///... URLs
            try:
                system_path = uno.fileUrlToSystemPath(url)
            except Exception:
                # fallback when uno is unavailable or rejects the URL:
                # decode the path component of a file URL by hand
                from urllib.parse import unquote, urlparse
                parsed = urlparse(url)
                system_path = unquote(parsed.path) if parsed.scheme == "file" else ""
            if system_path:
                candidate = os.path.dirname(system_path)
                if os.path.isdir(candidate):
                    doc_dir = candidate
    except Exception:
        doc_dir = BASE_DIR
    DOC_DIR = doc_dir
    NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
    CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
    LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)

# ------------------------
# Logging (Dokumentdir, robust)
# ------------------------
def log(msg, level="INFO"):
    """Append one timestamped line to the log file without ever raising.

    The line is written to LOG_FILE. If that fails for any reason, the same
    line is written to LOG_FILENAME below BASE_DIR; if that fails as well, the
    message is dropped silently.

    Args:
        msg: Text of the log entry.
        level: Severity label placed in brackets in front of the text.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{level}] {msg}\n"

    def _append_to(path):
        # create the target directory on demand, then append the entry
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "a", encoding="utf-8") as handle:
            handle.write(entry)

    try:
        _append_to(LOG_FILE)
        return
    except Exception:
        # fall through to the BASE_DIR fallback below
        pass

    try:
        _append_to(os.path.join(BASE_DIR, LOG_FILENAME))
    except Exception:
        # last resort: stay silent, logging must not break the macro
        pass

# ------------------------
# Textvorbereitung & Helpers
# ------------------------
# Memo used by lemmatize_term: maps a normalized term to its lemma string.
lemma_cache = {}

def normalize_text(s):
    """Return *s* lower-cased, without punctuation and with tidy whitespace.

    The characters ``( ) [ ] " ' \\ ; : ? ! , .`` are removed, whitespace runs
    are collapsed to a single blank and the result is trimmed. Falsy input
    (``None``, ``""``, ``0``) yields ``""``.

    Args:
        s: Any value; it is converted with ``str``.

    Returns:
        The normalized string.
    """
    if not s:
        return ""
    text = str(s).lower()
    text = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", text)
    # Trim only after the punctuation is gone: removing it can expose blanks
    # at the edges ("Stuhl ( alt )"), which would break exact key lookups.
    return re.sub(r"\s+", " ", text).strip()

def lemmatize_term(term):
    """Return the lemma form of *term*, memoized in lemma_cache.

    The term is normalized with normalize_text first. When spaCy is available
    the lemmas of all tokens are joined with blanks; without spaCy, or when
    spaCy raises, the normalized term itself is the result.

    Args:
        term: Text to lemmatize.

    Returns:
        The lemma string for the normalized term.
    """
    key = normalize_text(term)
    try:
        return lemma_cache[key]
    except KeyError:
        pass

    lemma = key
    if SPACY_AVAILABLE and nlp:
        try:
            lemma = " ".join(token.lemma_ for token in nlp(key))
        except Exception:
            lemma = key

    lemma_cache[key] = lemma
    return lemma

def compound_split(term):
    """Split *term* into its component words.

    If the term contains capitalized segments (an upper-case letter followed
    by one or more lower-case letters), exactly those segments are returned.
    Otherwise the term is split on hyphens and whitespace; if that leaves
    nothing, the term itself is returned as the only part.

    Args:
        term: Word or compound to split.

    Returns:
        List of parts; empty for falsy input.
    """
    if not term:
        return []
    # 'ß' belongs to the lower-case class: without it a word such as
    # "Straße" is cut off at the 'ß' and only "Stra" is returned.
    parts = re.findall(r'[A-ZÄÖÜ][a-zäöüß]+', term)
    if parts:
        return parts
    parts = [p for p in re.split(r'[-\s]+', term) if p]
    return parts or [term]

# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
    """Read the NV_MASTER workbook and build two lookup tables.

    Every sheet except the one named "master" (case-insensitive) is read.
    The ID column is the last column whose lower-cased label contains "id",
    falling back to the first column; the word column is the last column
    whose label contains "wort" or "vokabel", falling back to the last
    column. A row with an empty ID inherits the most recent non-empty ID of
    the same sheet; rows without a word are skipped.

    Args:
        nv_path: Path of the ODS workbook.

    Returns:
        ``(norm_dict, lemma_index)``: the first is keyed by the normalized
        word, the second by its lemma; both map to lists of entry dicts with
        the keys "Name", "ID" and "Sheet". Both are empty when pandas is
        missing or the workbook cannot be read (the problem is logged).
    """
    by_norm = {}
    by_lemma = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
        return by_norm, by_lemma
    try:
        workbook = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
        return by_norm, by_lemma

    for sheet_name, frame in workbook.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        frame = frame.fillna("")

        # locate the id/word columns by label; the last matching column wins
        id_col = None
        word_col = None
        for column in frame.columns:
            label = str(column).strip().lower()
            if "id" in label:
                id_col = column
            if "wort" in label or "vokabel" in label:
                word_col = column
        # positional fallback when no label matched
        if len(frame.columns) >= 1:
            if word_col is None:
                word_col = frame.columns[-1]
            if id_col is None:
                id_col = frame.columns[0]
        has_id = id_col in frame.columns
        has_word = word_col in frame.columns

        parent_id = None
        for _, row in frame.iterrows():
            raw_id = str(row[id_col]).strip() if has_id else ""
            word = str(row[word_col]).strip() if has_word else ""
            if raw_id:
                parent_id = raw_id
            if not word:
                continue
            norm_key = normalize_text(word)
            lemma_key = lemmatize_term(word)
            entry = {"Name": word, "ID": parent_id or "", "Sheet": sheet_name}
            by_norm.setdefault(norm_key, []).append(entry)
            by_lemma.setdefault(lemma_key, []).append(entry)

    total = sum(len(entries) for entries in by_norm.values())
    log(f"NV_MASTER geladen. Begriffe: {total}", level="INFO")
    return by_norm, by_lemma

# ------------------------
# Fuzzy Matching
# ------------------------
def fuzzy_score(a, b):
    """Return a similarity score for two strings, compared case-insensitively.

    With RapidFuzz available the token-sort ratio is divided by 100; an error
    raised by RapidFuzz yields 0.0. Without RapidFuzz the ratio of
    ``difflib.SequenceMatcher`` is returned.

    Args:
        a: First string; ``None`` is treated as "".
        b: Second string; ``None`` is treated as "".

    Returns:
        The score as a float.
    """
    left = (a or "").lower()
    right = (b or "").lower()
    if not RAPIDFUZZ_AVAILABLE:
        return SequenceMatcher(None, left, right).ratio()
    try:
        return fuzz.token_sort_ratio(left, right) / 100.0
    except Exception:
        return 0.0

def get_suggestions(term_lemma, norm_dict, lemma_index, threshold=FUZZY_CUTOFF, max_sugs=6):
    """Collect fuzzy-match suggestions for a lemma from both indexes.

    Every non-empty key of *lemma_index* and every key of *norm_dict* is
    scored with fuzzy_score; a key that starts with the query gets a bonus of
    0.08 (capped at 1.0). Entries of keys scoring at least *threshold* are
    ranked by score, de-duplicated on (name, id) and formatted as
    ``"Name (ID)"``, or just ``"Name"`` when the ID is empty.

    Args:
        term_lemma: Query string; ``None`` is treated as "".
        norm_dict: Index keyed by normalized word.
        lemma_index: Index keyed by lemma.
        threshold: Minimum score for a key to contribute.
        max_sugs: Collection stops once this many suggestions are gathered.

    Returns:
        List of suggestion strings, best score first.
    """
    query = term_lemma or ""

    def _score(key):
        value = fuzzy_score(query, key)
        if key.startswith(query):
            value = min(value + 0.08, 1.0)
        return value

    ranked = []
    for lemma_key, entries in lemma_index.items():
        if not lemma_key:
            continue
        value = _score(lemma_key)
        if value >= threshold:
            ranked.extend((value, e["Name"], e["ID"]) for e in entries)
    # the normalized names are checked as well
    for norm_key, entries in norm_dict.items():
        value = _score(norm_key)
        if value >= threshold:
            ranked.extend((value, e["Name"], e["ID"]) for e in entries)

    # stable sort: equal scores keep lemma matches ahead of name matches
    ranked.sort(key=lambda item: item[0], reverse=True)

    labels = []
    emitted = set()
    for _, name, ident in ranked:
        pair = (name, ident)
        if pair in emitted:
            continue
        emitted.add(pair)
        labels.append(f"{name} ({ident})" if ident else name)
        if len(labels) >= max_sugs:
            break
    return labels

# ------------------------
# Mapping mit Cache
# ------------------------
def map_term(term, norm_dict, lemma_index):
    """Map one term to vocabulary hits, using CACHE for repeated terms.

    Lookup order: exact match of the normalized term in *norm_dict*, then a
    match of the lemma in *lemma_index*; fuzzy suggestions are computed only
    when neither produced a hit.

    The cache is keyed by the normalized term. The result depends on the
    normalized term (the exact match comes first), and the lemma is derived
    from it, so the normalized term identifies a result unambiguously. Keying
    by lemma would hand one term's result to every other term that merely
    shares its lemma.

    Args:
        term: Raw term text.
        norm_dict: Index keyed by normalized word.
        lemma_index: Index keyed by lemma.

    Returns:
        Dict with the keys "hits" (names), "suggestions" (strings) and "ids"
        (non-empty IDs of the hits). The same dict object is stored in CACHE,
        so callers must not modify it in place.
    """
    term_norm = normalize_text(term)
    if term_norm in CACHE:
        return CACHE[term_norm]
    term_lemma = lemmatize_term(term)

    # exact match first, lemma match only when the exact lookup found nothing
    matches = norm_dict.get(term_norm) or lemma_index.get(term_lemma) or []
    hits = [e["Name"] for e in matches]
    ids = [e["ID"] for e in matches if e["ID"]]

    # suggestions only if no hit
    suggestions = []
    if not hits:
        suggestions = get_suggestions(term_lemma, norm_dict, lemma_index)

    result = {"hits": hits, "suggestions": suggestions, "ids": ids}
    CACHE[term_norm] = result
    return result

# ------------------------
# Button erstellen (sicher)
# ------------------------
def add_macro_button(sheet):
    """Add a "Start Mapping" push button to *sheet* unless it already exists.

    The button is a control shape named "MapperStartButton" on the sheet's
    draw page. No macro is bound to it here; the user has to link the macro
    in the UI. Every failure is logged and swallowed so that the mapping run
    is never blocked by the button.

    Args:
        sheet: Spreadsheet object exposing ``DrawPage``.
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
    except Exception:
        log("add_macro_button: kein Dokument-Kontext", level="WARN")
        return
    try:
        draw_page = sheet.DrawPage
        # avoid creating a duplicate
        for existing in draw_page:
            try:
                if getattr(existing, "Name", "") == "MapperStartButton":
                    return
            except Exception:
                continue

        # PyUNO hands out struct-valued properties as copies, so writing
        # shape.Position.X only changes a temporary. Fill each struct first
        # and assign it to the shape as a whole.
        # NOTE(review): values are presumably in 1/100 mm - confirm.
        position = uno.createUnoStruct("com.sun.star.awt.Point")
        position.X = 1000
        position.Y = 200
        size = uno.createUnoStruct("com.sun.star.awt.Size")
        size.Width = 3000
        size.Height = 1000

        # create shape and button model
        shape = doc.createInstance("com.sun.star.drawing.ControlShape")
        shape.Name = "MapperStartButton"
        shape.Position = position
        shape.Size = size

        button_model = doc.createInstance("com.sun.star.form.component.CommandButton")
        button_model.Label = "Start Mapping"
        button_model.HelpText = "Startet das Mapping (run_mapper_macro)"
        # only the control and its label are added; the macro itself must be
        # linked by the user in the UI

        shape.Control = button_model
        draw_page.add(shape)
        log("Button 'MapperStartButton' erstellt.", level="INFO")
    except Exception as e:
        log(f"add_macro_button Fehler: {e}", level="ERROR")

# ------------------------
# Hauptlauf (ohne Listener)
# ------------------------
def _unique_in_order(seq):
    """Return the items of *seq* without duplicates, keeping first-seen order."""
    seen = set()
    out = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out


def _find_header_cell(sheet, end_row, end_col):
    """Return (row, col) of the "Objektbeschreibung" header, or (None, None).

    Only the first ten rows are searched. The comparison is case-insensitive.
    """
    for r in range(0, min(10, end_row + 1)):
        for c in range(0, end_col + 1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            # val is lower-cased, so the literal must be lower-case as well;
            # a capitalized literal can never match.
            if val == "objektbeschreibung":
                return r, c
    return None, None


def _map_cell_text(txt, norm_dict, lemma_index):
    """Map one cell text; return (hits, suggestions, ids, any_unmapped).

    The whole stop-word-free phrase is tried first. Only if it yields neither
    hits nor suggestions is the text split into tokens and compound parts,
    which are then mapped one by one. any_unmapped is True when the phrase is
    empty or when at least one part produced neither a hit nor a suggestion.
    """
    tokens = [t.strip() for t in re.split(r'\s+', normalize_text(txt)) if t and t not in STOPWORDS]
    phrase = " ".join(tokens).strip()

    hits, sugs, ids = [], [], []
    any_unmapped = False
    if not phrase:
        any_unmapped = True
    else:
        mapped_phrase = map_term(phrase, norm_dict, lemma_index)
        if mapped_phrase["hits"] or mapped_phrase["suggestions"]:
            # the phrase as a whole matched: use its result as is
            hits = mapped_phrase["hits"]
            sugs = mapped_phrase["suggestions"]
            ids = mapped_phrase["ids"]
        else:
            # fallback: token / compound processing on the raw text
            for p in re.split(r'[,\s]+', txt):
                if not p.strip():
                    continue
                if p.lower() in STOPWORDS or re.fullmatch(r'\d+', p):
                    continue
                for sp in compound_split(p):
                    if not (sp and sp.strip()):
                        continue
                    mapped = map_term(sp.strip(), norm_dict, lemma_index)
                    hits.extend(mapped["hits"])
                    sugs.extend(mapped["suggestions"])
                    ids.extend(mapped["ids"])
                    if not mapped["hits"] and not mapped["suggestions"]:
                        any_unmapped = True

    # de-duplicating builds new lists, so cached results are never modified
    return _unique_in_order(hits), _unique_in_order(sugs), _unique_in_order(ids), any_unmapped


def run_mapper_macro():
    """Map the "Objektbeschreibung" column of the active sheet to NV_MASTER.

    Steps: set the per-document paths, make sure the start button exists,
    locate the header cell, make sure the result columns "Norm_Treffer",
    "Norm_Vorschlag" and "Norm_ID" exist (appending missing ones behind the
    used area), build the vocabulary index and map every non-empty cell below
    the header. Results are written joined with " | " and cells are colored:
    red source cell when something stayed unmapped, green hit cell, yellow
    suggestion cell, white otherwise. Finally CACHE is written to CACHE_FILE.

    Errors are logged, never raised: a failing row is skipped, any other
    failure ends the run.
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        set_paths_from_doc(doc)
        log("=== mapper_macro gestartet ===", level="INFO")
        sheet = doc.CurrentController.ActiveSheet
        add_macro_button(sheet)

        # used area
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        dr = cursor.getRangeAddress()

        # find header row and source column
        header_row, objekt_col = _find_header_cell(sheet, dr.EndRow, dr.EndColumn)
        if objekt_col is None:
            log("run_mapper_macro: 'Objektbeschreibung' Header nicht gefunden.", level="ERROR")
            return

        # ensure result columns: reuse existing headers, append missing ones
        result_headers = ("Norm_Treffer", "Norm_Vorschlag", "Norm_ID")
        existing = {}
        for c in range(0, dr.EndColumn + 1):
            try:
                h = str(sheet.getCellByPosition(c, header_row).String).strip()
            except Exception:
                h = ""
            if h in result_headers:
                existing[h] = c
        last_col = dr.EndColumn
        for header in result_headers:
            if header not in existing:
                last_col += 1
                existing[header] = last_col
                sheet.getCellByPosition(last_col, header_row).String = header

        norm_tr_col = existing["Norm_Treffer"]
        norm_sug_col = existing["Norm_Vorschlag"]
        norm_id_col = existing["Norm_ID"]

        # build index
        norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
        if not norm_dict and not lemma_index:
            log("run_mapper_macro: NV_MASTER leer oder nicht lesbar.", level="ERROR")
            return

        GREEN, YELLOW, RED, WHITE = 0xADFF2F, 0xFFFF66, 0xFF9999, 0xFFFFFF
        rows_processed = 0

        for r in range(header_row + 1, dr.EndRow + 1):
            try:
                cell = sheet.getCellByPosition(objekt_col, r)
                txt = str(cell.String).strip()
                if not txt:
                    continue

                row_hits, row_sugs, row_ids, any_unmapped = _map_cell_text(
                    txt, norm_dict, lemma_index
                )

                # write results
                hit_cell = sheet.getCellByPosition(norm_tr_col, r)
                sug_cell = sheet.getCellByPosition(norm_sug_col, r)
                hit_cell.String = " | ".join(row_hits)
                sug_cell.String = " | ".join(row_sugs)
                sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)

                cell.CellBackColor = RED if any_unmapped else WHITE
                hit_cell.CellBackColor = GREEN if row_hits else WHITE
                sug_cell.CellBackColor = YELLOW if row_sugs else WHITE

                rows_processed += 1
            except Exception as e:
                log(f"Fehler in Zeile {r}: {e}", level="ERROR")
                continue

        # persist cache file to DOC_DIR
        try:
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(CACHE, f, ensure_ascii=False, indent=2)
        except Exception as e:
            log(f"Cache speichern fehlgeschlagen: {e}", level="WARN")

        log(f"=== mapper_macro fertig. Zeilen verarbeitet: {rows_processed} ===", level="INFO")
    except Exception as e:
        # top-level safety net: a macro must not raise into the office UI
        try:
            log(f"run_mapper_macro: Unhandled exception: {e}", level="ERROR")
        except Exception:
            pass

# ------------------------
# Export
# ------------------------
# Only run_mapper_macro is listed; all other functions here are helpers.
# NOTE(review): g_exportedScripts is presumably the name LibreOffice's Python
# script provider reads to decide which functions are offered as macros -
# confirm against the LibreOffice scripting documentation.
g_exportedScripts = (run_mapper_macro,)