From b2244b48168be9f895a7ea353bd3983b865597de Mon Sep 17 00:00:00 2001
From: gumuArnold
Date: Thu, 16 Oct 2025 13:35:08 +0000
Subject: [PATCH] Delete mapper_macro_2.2.py

---
 mapper_macro_2.2.py | 353 --------------------------------------------
 1 file changed, 353 deletions(-)
 delete mode 100644 mapper_macro_2.2.py

diff --git a/mapper_macro_2.2.py b/mapper_macro_2.2.py
deleted file mode 100644
index dbbe503..0000000
--- a/mapper_macro_2.2.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# -*- coding: utf-8 -*-
-# LibreOffice / Excel macro: NV_MASTER matching, pandas+odf, cache, color coding
-# Version: 2.2
-# Storage location (Linux/Windows detected automatically)
-
-"""
-Mapper Macro 2.2
-================
-This macro reads the 'Objektbeschreibung' column of the active sheet and tries to
-map each word to an entry in the controlled vocabulary (Normvokabular).
-
-Features:
-- Direct hits are listed under "Norm_Treffer" (with the ID in parentheses)
-- Suggestions (fuzzy matching) are listed under "Norm_Vorschlag"
-- Color rules:
-  * Green: every term in the row has a direct hit
-  * Yellow: at least one term has a hit, but not all of them
-  * Red: no term in the row has a hit
-- All steps are logged to mapper_macro_2.2.log (same directory as the macro)
-- Cache for terms that have already been matched
-- OS detection (Linux/Windows) with automatic path selection
-- Supports LibreOffice and Excel (pandas for .ods/.xlsx)
-"""
-
-import os
-import re
-import json
-import traceback
-import platform
-
-# ------------------------
-# OS-specific paths
-# ------------------------
-if platform.system().lower().startswith("win"):
-    BASE_DIR = os.path.join(os.environ["APPDATA"], "LibreOffice", "4", "user", "Scripts", "python", "Vokabular_Abgleich_Makro")
-else:
-    BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro")
-
-NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
-LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.2.log")
-CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.2.json")
-
-# Create the directory if it does not exist yet
-os.makedirs(BASE_DIR, exist_ok=True)
-
-# ------------------------
-# Dependencies
-# ------------------------
-try:
-    import pandas as pd
-    PANDAS_AVAILABLE = True
-except Exception:
-    PANDAS_AVAILABLE = False
-
-try:
-    import spacy
-    nlp = spacy.load("de_core_news_sm")
-    SPACY_AVAILABLE = True
-except Exception:
-    SPACY_AVAILABLE = False
-    nlp = None
-
-try:
-    from rapidfuzz import fuzz
-    RAPIDFUZZ_AVAILABLE = True
-except Exception:
-    RAPIDFUZZ_AVAILABLE = False
-    from difflib import SequenceMatcher
-
-# ------------------------
-# Configuration
-# ------------------------
-STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
-CONF_THRESHOLD = 0.75  # base threshold for suggestions
-
-# ------------------------
-# Logging
-# ------------------------
-def log(msg):
-    try:
-        with open(LOG_FILE, "a", encoding="utf-8") as f:
-            f.write(msg + "\n")
-    except Exception:
-        pass
-
-# ------------------------
-# Load cache
-# ------------------------
-try:
-    if os.path.exists(CACHE_FILE):
-        with open(CACHE_FILE, "r", encoding="utf-8") as f:
-            CACHE = json.load(f)
-    else:
-        CACHE = {}
-except Exception:
-    CACHE = {}
-
-# ------------------------
-# Text normalization & lemmatization
-# ------------------------
-def normalize_text(s):
-    if not s:
-        return ""
-    s = str(s).strip().lower()
-    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
-    s = re.sub(r"\s+", " ", s)
-    return s
-
-lemma_cache = {}
-def lemmatize_term(term):
-    term_norm = normalize_text(term)
-    if term_norm in lemma_cache:
-        return lemma_cache[term_norm]
-    if SPACY_AVAILABLE and nlp:
-        try:
-            doc = nlp(term_norm)
-            lemma = " ".join([token.lemma_ for token in doc])
-        except Exception:
-            lemma = term_norm
-    else:
-        lemma = term_norm
-    lemma_cache[term_norm] = lemma
-    return lemma
-
-# ------------------------
-# Load NV_MASTER
-# ------------------------
-def build_norm_index(nv_path):
-    norm_dict = {}
-    lemma_index = {}
-    if not PANDAS_AVAILABLE:
-        log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
-        return norm_dict, lemma_index
-    try:
-        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
-    except Exception as e:
-        log(f"Fehler beim Einlesen von NV_MASTER: {e}")
-        return norm_dict, lemma_index
-    for sheet_name, df in sheets.items():
-        if str(sheet_name).strip().lower() == "master":
-            continue
-        df = df.fillna("")
-        cols = [str(c).strip().lower() for c in df.columns]
-        id_col = None
-        word_col = None
-        for i, c in enumerate(cols):
-            if "id" in c: id_col = df.columns[i]
-            if "wort" in c or "vokabel" in c: word_col = df.columns[i]
-        if word_col is None and len(df.columns) >= 1: word_col = df.columns[-1]
-        if id_col is None and len(df.columns) >= 1: id_col = df.columns[0]
-        current_parent_id = None
-        for _, row in df.iterrows():
-            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
-            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
-            if id_val: current_parent_id = id_val
-            if not word_val: continue
-            norm_name = normalize_text(word_val)
-            lemma = lemmatize_term(word_val)
-            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
-            norm_dict.setdefault(norm_name, []).append(entry)
-            lemma_index.setdefault(lemma, []).append(entry)
-    log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
-    return norm_dict, lemma_index
-
-# ------------------------
-# Matching
-# ------------------------
-def fuzzy_score(a, b):
-    if RAPIDFUZZ_AVAILABLE:
-        try:
-            return fuzz.token_set_ratio(a, b) / 100.0
-        except Exception:
-            return 0.0
-    else:
-        try:
-            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
-        except Exception:
-            return 0.0
-
-def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
-    candidates = []
-    for key_lemma, entries in lemma_index.items():
-        score = fuzzy_score(term_lemma, key_lemma)
-        if key_lemma.startswith(term_lemma):
-            score = min(score + 0.1, 1.0)
-        if score >= threshold:
-            for e in entries: candidates.append((score, e["Name"], e["ID"]))
-    for norm_key, entries in norm_dict.items():
-        score = fuzzy_score(term_lemma, norm_key)
-        if norm_key.startswith(term_lemma):
-            score = min(score + 0.1, 1.0)
-        if score >= threshold:
-            for e in entries: candidates.append((score, e["Name"], e["ID"]))
-    candidates.sort(key=lambda t: t[0], reverse=True)
-    seen = set()
-    results = []
-    for score, name, id_ in candidates:
-        key = (name, id_)
-        if key in seen: continue
-        seen.add(key)
-        results.append({"score": score, "name": name, "id": id_})
-    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
-
-def map_term_with_indexes(term, norm_dict, lemma_index):
-    term_norm = normalize_text(term)
-    term_lemma = lemmatize_term(term)
-    if term_lemma in CACHE:
-        cache_entry = CACHE[term_lemma]
-        hits = cache_entry.get("hits", [])
-        suggestions = cache_entry.get("suggestions", [])
-        return hits, suggestions
-    hits = []
-    suggestions = []
-    if term_norm in norm_dict:
-        for e in norm_dict[term_norm]: hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
-    if not hits and term_lemma in lemma_index:
-        for e in lemma_index[term_lemma]: hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
-    if not hits:
-        suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
-    # deduplicate
-    hits = list(dict.fromkeys(hits))
-    suggestions = list(dict.fromkeys(suggestions))
-    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions}
-    return hits, suggestions
-
-# ------------------------
-# Main macro
-# ------------------------
-def run_mapper_macro():
-    try:
-        doc = XSCRIPTCONTEXT.getDocument()
-        sheet = doc.CurrentController.ActiveSheet
-        cursor = sheet.createCursor()
-        cursor.gotoStartOfUsedArea(False)
-        cursor.gotoEndOfUsedArea(True)
-        data_range = cursor.getRangeAddress()
-    except Exception as e:
-        log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
-        return
-
-    header_row = None
-    objekt_col = None
-    max_col = data_range.EndColumn
-    for r in range(0, min(5, data_range.EndRow+1)):
-        for c in range(0, max_col+1):
-            try:
-                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
-            except Exception:
-                val = ""
-            if val == "objektbeschreibung":
-                header_row = r
-                objekt_col = c
-                break
-        if objekt_col is not None:
-            break
-    if objekt_col is None:
-        log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
-        return
-
-    existing = {}
-    for c in range(0, data_range.EndColumn+1):
-        try:
-            h = str(sheet.getCellByPosition(c, header_row).String).strip()
-        except Exception:
-            h = ""
-        if h == "Norm_Treffer": existing["Norm_Treffer"] = c
-        if h == "Norm_Vorschlag": existing["Norm_Vorschlag"] = c
-
-    last_col = data_range.EndColumn
-    if "Norm_Treffer" not in existing:
-        last_col += 1
-        existing["Norm_Treffer"] = last_col
-        sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
-    if "Norm_Vorschlag" not in existing:
-        last_col += 1
-        existing["Norm_Vorschlag"] = last_col
-        sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
-
-    norm_tr_col = existing["Norm_Treffer"]
-    norm_sug_col = existing["Norm_Vorschlag"]
-
-    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
-    if not norm_dict and not lemma_index:
-        log("NV_MASTER leer oder nicht lesbar. Abbruch.")
-        return
-
-    GREEN = 0xADFF2F
-    YELLOW = 0xFFA500
-    RED = 0xCC0000
-    WHITE = 0xFFFFFF
-
-    rows_processed = 0
-    for r in range(header_row + 1, data_range.EndRow + 1):
-        try:
-            cell = sheet.getCellByPosition(objekt_col, r)
-            txt = str(cell.String).strip()
-            if not txt: continue
-
-            clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
-            terms = []
-            for cl in clauses:
-                parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
-                for p in parts:
-                    if p.lower() in STOPWORDS: continue
-                    if re.fullmatch(r"\d+", p): continue
-                    terms.append(p)
-
-            row_hits = []
-            row_sugs = []
-            unmapped_terms = []
-
-            for term in terms:
-                hits, sugs = map_term_with_indexes(term, norm_dict, lemma_index)
-                if hits: row_hits.extend(hits)
-                else:
-                    unmapped_terms.append(term)
-                    if sugs: row_sugs.extend(sugs)
-
-            row_hits = list(dict.fromkeys(row_hits))
-            row_sugs = list(dict.fromkeys(row_sugs))
-
-            # color logic for the Objektbeschreibung column
-            if terms and not unmapped_terms and row_hits:
-                cell.CellBackColor = GREEN
-                row_sugs = []
-            elif row_hits:
-                cell.CellBackColor = YELLOW
-            else:
-                cell.CellBackColor = RED
-
-            tr_cell = sheet.getCellByPosition(norm_tr_col, r)
-            tr_cell.String = " | ".join(row_hits)
-            tr_cell.CellBackColor = GREEN if row_hits else WHITE
-
-            sug_cell = sheet.getCellByPosition(norm_sug_col, r)
-            sug_cell.String = " | ".join(row_sugs)
-            sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
-
-            rows_processed += 1
-
-        except Exception as e:
-            log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
-
-    try:
-        with open(CACHE_FILE, "w", encoding="utf-8") as f:
-            json.dump(CACHE, f, ensure_ascii=False, indent=2)
-    except Exception:
-        pass
-
-    log(f"run_mapper_macro 2.2 fertig. Zeilen verarbeitet: {rows_processed}")
-
-# Export for LibreOffice
-g_exportedScripts = (run_mapper_macro,)
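
Reviewer note: the matching core of the deleted macro is an exact dictionary lookup followed by fuzzy scoring (rapidfuzz token_set_ratio scaled to 0.0-1.0, a +0.1 prefix bonus capped at 1.0, and a 0.75 suggestion threshold). The snippet below is a minimal stand-alone sketch of that scoring step for anyone who wants to keep the logic after this deletion; it assumes rapidfuzz is installed, and the helper name score_term and the sample vocabulary are illustrative only, not part of the deleted file.

# Minimal sketch: mirrors fuzzy_score() and the prefix bonus used in
# get_suggestions_for_term() of the deleted macro (assumption: rapidfuzz installed).
from rapidfuzz import fuzz

CONF_THRESHOLD = 0.75  # same suggestion threshold the macro used

def score_term(term, vocab_entry):
    # token_set_ratio returns 0-100; the macro scaled it to 0.0-1.0
    score = fuzz.token_set_ratio(term.lower(), vocab_entry.lower()) / 100.0
    # bonus when the vocabulary entry starts with the term, capped at 1.0
    if vocab_entry.lower().startswith(term.lower()):
        score = min(score + 0.1, 1.0)
    return score

if __name__ == "__main__":
    vocab = ["Holzschachtel", "Schachtel", "Deckel"]  # illustrative entries only
    for entry in vocab:
        s = score_term("schachtel", entry)
        print(entry, round(s, 2), "suggest" if s >= CONF_THRESHOLD else "skip")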