import uno  # noqa: F401 -- conventionally imported in LibreOffice Python macros
import os
import re
import traceback
import json

# Optional: spaCy lemmatizer (spacy.load raises OSError if the model is missing)
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except (ImportError, OSError):
    SPACY_AVAILABLE = False
    nlp = None

# Optional: fuzzy matching via rapidfuzz, with a difflib fallback
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False

import odf.opendocument
import odf.table
import odf.text

# ------------------------
# Configuration: absolute paths
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")

STOPWORDS = {
    "mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von",
    "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den",
    "des", "eines", "einer",
}

CONF_THRESHOLD = 0.75

# ------------------------
# Logging
# ------------------------
def log(msg):
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# ------------------------
# Load cache
# ------------------------
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

# ------------------------
# Normalization / lemmatization
# ------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)  # strip punctuation
    s = re.sub(r"\s+", " ", s)                    # collapse whitespace
    return s

lemma_cache = {}

def lemmatize_term(term):
    """Return the lemmatized form of a term, memoized in lemma_cache."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join(token.lemma_ for token in doc)
    else:
        lemma = term_norm  # fallback: normalized form only
    lemma_cache[term_norm] = lemma
    return lemma

# ------------------------
# Read NV_MASTER
# ------------------------
def load_nv_master(path):
    """Build {lemma: entry} from every sheet of NV_MASTER.ods except 'Master'.

    Expected column layout per row: ID | Unterkategorie | Unterunterkategorie | Wort.
    A row with an empty ID cell inherits the last seen ID (its parent row).
    """
    norm_dict = {}
    try:
        doc = odf.opendocument.load(path)
    except Exception as e:
        log(f"Fehler beim Laden von NV_MASTER: {e}")
        return norm_dict
    for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
        sheet_name = sheet.getAttribute("name")
        if sheet_name.lower() == "master":
            continue
        current_parent_id = None
        for row in sheet.getElementsByType(odf.table.TableRow):
            cell_values = []
            for cell in row.getElementsByType(odf.table.TableCell):
                # ODS collapses runs of identical/empty cells into a single
                # element with a repeat count; expand it so columns stay aligned.
                repeat = int(cell.getAttribute("numbercolumnsrepeated") or 1)
                texts = cell.getElementsByType(odf.text.P)
                value = " ".join(str(t) for t in texts).strip()
                cell_values.extend([value] * repeat)
                if len(cell_values) >= 4:  # only the first four columns matter
                    break
            if len(cell_values) < 4:
                continue
            id_val, unterk, unterunterk, word = cell_values[:4]
            if id_val:
                current_parent_id = id_val.strip()
            if not word:
                continue
            key = lemmatize_term(word)
            norm_dict[key] = {
                "Name": word.strip(),
                "ID": current_parent_id,
                "Sheet": sheet_name,
                "Unterkategorie": unterk.strip(),
                "Unterunterkategorie": unterunterk.strip(),
            }
    log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
    return norm_dict

# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    """Return up to top_n fuzzy matches above threshold as 'Name (ID)' strings."""
    candidates = []
    for key, entry in norm_dict.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
        if key.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)  # small bonus for prefix matches
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    # Sort by score only: sorting the full tuples would raise TypeError on
    # equal scores whenever an ID is None.
    candidates.sort(key=lambda c: c[0], reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
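# ------------------------
# Example: exercising the matching layer outside LibreOffice
# ------------------------
# A minimal sketch for trying lemmatize_term()/get_suggestions() from a plain
# Python shell. The two sample entries below are hypothetical and only mimic
# the norm_dict shape that load_nv_master() produces.
def _demo_matching():
    sample_dict = {}
    for name, id_ in (("Teller", "K-001"), ("Tasse", "K-002")):  # hypothetical data
        sample_dict[lemmatize_term(name)] = {
            "Name": name, "ID": id_, "Sheet": "Demo",
            "Unterkategorie": "", "Unterunterkategorie": "",
        }
    # The inflected query "Tellern" should rank 'Teller (K-001)' first
    print(get_suggestions(lemmatize_term("Tellern"), sample_dict, threshold=0.5))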
def map_word(word, norm_dict):
    """Map a single word to its normalized entry; results are memoized in CACHE."""
    key = lemmatize_term(word)
    if key in CACHE:
        cached = CACHE[key]
        return cached["Norm"], cached["Suggestion"], cached["ID"]
    if key in norm_dict:
        entry = norm_dict[key]
        tr, sug, wid = entry["Name"], "", entry["ID"]
    else:
        suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
        if suggestions:
            tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
        else:
            tr, sug, wid = "KEIN TREFFER", "", ""
    CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
    return tr, sug, wid

# ------------------------
# Macro entry point
# ------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()  # injected by LibreOffice
        sheet = doc.getSheets().getByIndex(0)
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
        header_row = 0
        objekt_col = None

        # Locate the 'Objektbeschreibung' column in the header row
        for col in range(data_range.EndColumn + 1):
            val = sheet.getCellByPosition(col, header_row).String.strip().lower()
            if val == "objektbeschreibung":
                objekt_col = col
                break
        if objekt_col is None:
            log("Spalte 'Objektbeschreibung' nicht gefunden")
            return

        # Append result columns at the right edge of the used area.
        # Note: re-running the macro appends a fresh set of columns each time.
        max_col = data_range.EndColumn
        norm_tr_col = max_col + 1
        norm_sug_col = max_col + 2
        norm_id_col = max_col + 3
        sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
        sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
        sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"

        norm_dict = load_nv_master(NV_MASTER_PATH)

        # Traffic-light colors: direct hit / suggestion only / no match
        GREEN = 0xC6EFCE
        YELLOW = 0xFFEB9C
        RED = 0xFFC7CE

        for row in range(1, data_range.EndRow + 1):
            cell = sheet.getCellByPosition(objekt_col, row)
            val = cell.String.strip()
            if not val:
                continue
            words = [
                w.strip() for w in re.split(r"\s+", val)
                if w.strip() and w.lower() not in STOPWORDS
            ]
            tr_list, sug_list, id_list = [], [], []
            for w in words:
                tr, sug, wid = map_word(w, norm_dict)
                if tr != "KEIN TREFFER":
                    tr_list.append(tr)
                if sug:
                    sug_list.append(sug)
                if wid:
                    id_list.append(wid)
            sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
            sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
            sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)

            # Color the source cell by match quality
            if tr_list:
                cell.CellBackColor = GREEN
            elif sug_list:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED

        # Persist the cache
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
        log("Makro erfolgreich ausgeführt")
    except Exception:
        log("Fehler in run_mapper_macro:")
        log(traceback.format_exc())
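# ------------------------
# Script export / local testing
# ------------------------
# LibreOffice convention: g_exportedScripts restricts which functions the
# Tools > Macros dialog offers. run_mapper_macro needs the XSCRIPTCONTEXT that
# LibreOffice injects, so the __main__ block below is only a sketch for running
# the mapping logic from a plain shell; it assumes NV_MASTER.ods exists at
# NV_MASTER_PATH, and the sample terms are hypothetical.
g_exportedScripts = (run_mapper_macro,)

if __name__ == "__main__":
    nd = load_nv_master(NV_MASTER_PATH)
    for sample in ("Teller", "Vase"):
        print(sample, "->", map_word(sample, nd))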