Delete mapper_macro_1.5.py

This commit is contained in:
gumuArnold 2025-10-16 13:34:54 +00:00
parent 41ad23e8df
commit 94e62cc267

View File

@ -1,508 +0,0 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - korrigiert: Logging im Dokumentverzeichnis, stabile Button-Erstellung,
# keine Listener, optimiertes Mapping (ohne Listener-Teil)
import os
import re
import json
import datetime
# optionale Module (Pandas, Spacy, RapidFuzz)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# UNO (für Button/Paths)
try:
import uno
except Exception:
uno = None
# ------------------------
# Konfiguration (Fallback-BASE_DIR)
# ------------------------
# Fallback directory, used until a document location is known.
BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro")
NV_MASTER_FILENAME = "NV_MASTER.ods"
CACHE_FILENAME = "mapper_cache.json"
LOG_FILENAME = "mapper_macro.log"
# Words that are dropped from a description before it is matched.
STOPWORDS = {
    "mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an",
    "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"
}
# NOTE(review): CONF_THRESHOLD is not referenced anywhere in the visible code.
CONF_THRESHOLD = 0.82
# Default minimum score a candidate needs to be offered as a suggestion.
FUZZY_CUTOFF = 0.88
# Per-document paths (re-pointed by set_paths_from_doc)
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)


def _load_cache(path):
    """Return the JSON object stored at *path*, or an empty dict.

    A missing, unreadable or malformed file yields ``{}``. So does valid JSON
    that is not an object (for example a list): the cache is used as a
    mapping, and accepting anything else would make every later
    ``CACHE[key] = ...`` fail.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        # Best effort: a broken cache must never prevent the macro from loading.
        return {}
    return data if isinstance(data, dict) else {}


# In-memory result cache, loaded once when the module is imported.
CACHE = _load_cache(CACHE_FILE)
# ------------------------
# Pfade im Dokument setzen
# ------------------------
def set_paths_from_doc(doc):
    """Point the per-document paths at the folder that contains *doc*.

    Rebinds the module globals ``DOC_DIR``, ``NV_MASTER_PATH``, ``CACHE_FILE``
    and ``LOG_FILE``. The folder is derived from ``doc.URL``; whenever that
    does not lead to an existing directory (no URL, non-file URL, conversion
    error) the paths fall back to ``BASE_DIR``, so a location left over from a
    previously processed document is never reused.

    Args:
        doc: Document object; only its ``URL`` attribute is read.
    """
    global DOC_DIR, NV_MASTER_PATH, CACHE_FILE, LOG_FILE
    target_dir = BASE_DIR
    try:
        url = getattr(doc, "URL", "")
        if url and url.strip():
            try:
                # Also raises when the uno module could not be imported
                # (uno is None), which sends us to the pure-Python fallback.
                system_path = uno.fileUrlToSystemPath(url)
            except Exception:
                from urllib.parse import urlparse
                from urllib.request import url2pathname
                parsed = urlparse(url)
                if parsed.scheme == "file":
                    # url2pathname unquotes and also handles Windows drive
                    # paths ("/C:/dir" -> "C:\\dir"), unlike a bare unquote().
                    system_path = url2pathname(parsed.path)
                else:
                    system_path = ""
            if system_path:
                folder = os.path.dirname(system_path)
                if os.path.isdir(folder):
                    target_dir = folder
    except Exception:
        target_dir = BASE_DIR
    DOC_DIR = target_dir
    NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
    CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
    LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)
# ------------------------
# Logging (Dokumentdir, robust)
# ------------------------
def log(msg, level="INFO"):
    """Append one timestamped line to the log file, never raising on I/O errors.

    The line is written to ``LOG_FILE``; if that fails for any reason the same
    line is written to the log file inside ``BASE_DIR`` instead. If that fails
    too, the message is dropped silently.

    Args:
        msg: Text to record.
        level: Severity label placed in brackets after the timestamp.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] [{level}] {msg}\n"

    def _append(path):
        # Create the target directory on demand, then append the entry.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "a", encoding="utf-8") as handle:
            handle.write(entry)

    try:
        _append(LOG_FILE)
        return
    except Exception:
        # Primary location unusable: fall through to the fallback location.
        pass
    try:
        _append(os.path.join(BASE_DIR, LOG_FILENAME))
    except Exception:
        # Last resort: logging must never break the macro.
        pass
# ------------------------
# Textvorbereitung & Helpers
# ------------------------
# Memo used by lemmatize_term: normalized term -> lemma.
lemma_cache = {}


def normalize_text(s):
    """Return *s* lower-cased, without punctuation and with tidy whitespace.

    The characters ``( ) [ ] " ' \\ ; : ? ! , .`` are removed, every run of
    whitespace becomes a single space, and leading/trailing whitespace is
    dropped. Stripping happens *after* punctuation removal so that input such
    as ``"Tisch (alt) ."`` does not keep a dangling space that would defeat
    exact dictionary lookups.

    Args:
        s: Any value; falsy values (``None``, ``""``, ``0``) give ``""``.

    Returns:
        The normalized string.
    """
    if not s:
        return ""
    text = str(s).lower()
    text = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", text)
    return re.sub(r"\s+", " ", text).strip()
def lemmatize_term(term):
    """Return the lemma form of *term*, memoized in ``lemma_cache``.

    The term is normalized first. When spaCy and its model were loaded, the
    lemmas of all tokens are joined with single spaces; otherwise, or when
    spaCy raises, the normalized term itself is used as the lemma.

    Args:
        term: Raw term text.

    Returns:
        The lemma string for the normalized term.
    """
    key = normalize_text(term)
    try:
        return lemma_cache[key]
    except KeyError:
        pass

    result = key
    if SPACY_AVAILABLE and nlp:
        try:
            result = " ".join(token.lemma_ for token in nlp(key))
        except Exception:
            # Any spaCy failure degrades to the plain normalized term.
            result = key
    lemma_cache[key] = result
    return result
def compound_split(term):
    """Split *term* into its component words.

    If the term contains capitalized runs (one upper-case letter followed by
    lower-case letters, German umlauts and ``ß`` included), those runs are
    returned, e.g. ``"HolzTisch" -> ["Holz", "Tisch"]``. Note that in this
    case everything outside such runs is dropped (``"USB-Kabel" -> ["Kabel"]``).
    Otherwise the term is split on hyphens and whitespace; if that leaves
    nothing, the term itself is returned as the only part.

    Args:
        term: Text to split; falsy input gives an empty list.

    Returns:
        List of parts.
    """
    if not term:
        return []
    # 'ß' belongs to the lower-case class; without it "Straße" was cut to "Stra".
    camel_parts = re.findall(r'[A-ZÄÖÜ][a-zäöüß]+', term)
    if camel_parts:
        return camel_parts
    pieces = [p for p in re.split(r'[-\s]+', term) if p]
    return pieces or [term]
# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
    """Read the NV_MASTER workbook and build the two lookup tables.

    Every sheet except the one named "master" (case-insensitive) is scanned.
    The ID column is the last column whose name contains "id", falling back to
    the first column; the word column is the last column whose name contains
    "wort" or "vokabel", falling back to the last column. Rows with an empty
    ID inherit the most recent non-empty ID seen in the same sheet; rows with
    an empty word are skipped.

    Args:
        nv_path: Path of the .ods workbook.

    Returns:
        ``(norm_dict, lemma_index)``: normalized name -> list of entries and
        lemma -> list of entries. An entry is a dict with the keys ``"Name"``,
        ``"ID"`` and ``"Sheet"``; the same entry object is stored in both
        tables. Both are empty when pandas is unavailable or reading fails.
    """
    by_norm = {}
    by_lemma = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
        return by_norm, by_lemma
    try:
        workbook = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as exc:
        log(f"Fehler beim Einlesen von NV_MASTER: {exc}", level="ERROR")
        return by_norm, by_lemma

    for title, frame in workbook.items():
        if str(title).strip().lower() == "master":
            continue
        frame = frame.fillna("")
        columns = frame.columns

        # Locate the ID / word columns by name; a later match overrides an earlier one.
        id_col = word_col = None
        for pos, raw_label in enumerate(columns):
            label = str(raw_label).strip().lower()
            if "id" in label:
                id_col = columns[pos]
            if "wort" in label or "vokabel" in label:
                word_col = columns[pos]
        if len(columns) >= 1:
            if word_col is None:
                word_col = columns[-1]
            if id_col is None:
                id_col = columns[0]
        has_id = id_col in columns
        has_word = word_col in columns

        parent_id = None
        for _, record in frame.iterrows():
            raw_id = str(record[id_col]).strip() if has_id else ""
            word = str(record[word_col]).strip() if has_word else ""
            if raw_id:
                parent_id = raw_id
            if word:
                norm_key = normalize_text(word)
                lemma_key = lemmatize_term(word)
                item = {"Name": word.strip(), "ID": parent_id or "", "Sheet": title}
                by_norm.setdefault(norm_key, []).append(item)
                by_lemma.setdefault(lemma_key, []).append(item)

    total = sum(len(bucket) for bucket in by_norm.values())
    log(f"NV_MASTER geladen. Begriffe: {total}", level="INFO")
    return by_norm, by_lemma
# ------------------------
# Fuzzy Matching
# ------------------------
def fuzzy_score(a, b):
    """Return a case-insensitive similarity score for two strings.

    Uses RapidFuzz's ``token_sort_ratio`` scaled by 1/100 when RapidFuzz was
    imported (0.0 if it raises), otherwise ``SequenceMatcher.ratio()``.

    Args:
        a: First string; falsy values are treated as "".
        b: Second string; falsy values are treated as "".

    Returns:
        The similarity score as a float.
    """
    left = (a or "").lower()
    right = (b or "").lower()
    if not RAPIDFUZZ_AVAILABLE:
        return SequenceMatcher(None, left, right).ratio()
    try:
        return fuzz.token_sort_ratio(left, right) / 100.0
    except Exception:
        return 0.0
def _score_candidates(term, index, threshold):
    """Return ``(score, name, id)`` for every entry of *index* whose key scores >= *threshold*."""
    found = []
    for key, entries in index.items():
        if not key:
            # An empty key carries no information to match against.
            continue
        score = fuzzy_score(term, key)
        if key.startswith(term):
            # Small bonus when the key begins with the searched term.
            score = min(score + 0.08, 1.0)
        if score >= threshold:
            found.extend((score, e["Name"], e["ID"]) for e in entries)
    return found


def get_suggestions(term_lemma, norm_dict, lemma_index, threshold=FUZZY_CUTOFF, max_sugs=6):
    """Return up to *max_sugs* vocabulary suggestions for *term_lemma*.

    Both the lemma index and the normalized-name index are scanned with the
    same rules (empty keys skipped, prefix bonus of 0.08 capped at 1.0).
    Candidates are ordered by descending score, lemma-index candidates first
    on ties, and duplicates of the same ``(name, id)`` pair are removed.

    Args:
        term_lemma: Lemma of the term to look up; empty/None yields ``[]``.
        norm_dict: Normalized name -> list of entries.
        lemma_index: Lemma -> list of entries.
        threshold: Minimum score a key must reach.
        max_sugs: Maximum number of suggestions; values <= 0 yield ``[]``.

    Returns:
        List of strings, ``"Name (ID)"`` or just ``"Name"`` when the ID is empty.
    """
    term = term_lemma or ""
    if not term:
        # Every key "starts with" the empty string, so scoring would be meaningless.
        return []

    candidates = _score_candidates(term, lemma_index, threshold)
    candidates += _score_candidates(term, norm_dict, threshold)
    # list.sort is stable, so insertion order breaks ties.
    candidates.sort(key=lambda cand: cand[0], reverse=True)

    seen = set()
    out = []
    for _score, name, id_ in candidates:
        if len(out) >= max_sugs:
            break
        if (name, id_) in seen:
            continue
        seen.add((name, id_))
        out.append(f"{name} ({id_})" if id_ else name)
    return out
# ------------------------
# Mapping mit Cache
# ------------------------
def map_term(term, norm_dict, lemma_index):
    """Map *term* onto the vocabulary, consulting and filling ``CACHE``.

    Lookup order: cached result for the term's lemma, exact match on the
    normalized term, match on the lemma, and finally fuzzy suggestions, which
    are computed only when there is no hit.

    Args:
        term: Raw term text.
        norm_dict: Normalized name -> list of entries.
        lemma_index: Lemma -> list of entries.

    Returns:
        Dict with the keys ``"hits"`` (matched names), ``"suggestions"`` and
        ``"ids"`` (non-empty IDs of the hits). The same dict object is stored
        in ``CACHE`` under the lemma.
    """
    norm_key = normalize_text(term)
    lemma_key = lemmatize_term(term)
    if lemma_key in CACHE:
        return CACHE[lemma_key]

    # Exact (normalized) match first, lemma match as second choice.
    entries = norm_dict.get(norm_key, [])
    if not entries:
        entries = lemma_index.get(lemma_key, [])

    hits = [entry["Name"] for entry in entries]
    ids = [entry["ID"] for entry in entries if entry["ID"]]

    # Suggestions exist only when there are no hits, so no suggestion can
    # duplicate a hit and no further filtering is required.
    suggestions = [] if hits else get_suggestions(lemma_key, norm_dict, lemma_index)

    result = {"hits": hits, "suggestions": suggestions, "ids": ids}
    CACHE[lemma_key] = result
    return result
# ------------------------
# Button erstellen (sicher)
# ------------------------
def add_macro_button(sheet):
    """Add a "Start Mapping" push button to *sheet* unless it already exists.

    The button is a control shape named "MapperStartButton" placed on the
    sheet's draw page. Nothing is done (apart from logging) when no document
    context is available or when any UNO call fails.

    Args:
        sheet: Spreadsheet object whose ``DrawPage`` receives the button.
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
    except Exception:
        log("add_macro_button: kein Dokument-Kontext", level="WARN")
        return
    try:
        draw_page = sheet.DrawPage
        # Avoid creating a duplicate button.
        for existing in draw_page:
            try:
                if getattr(existing, "Name", "") == "MapperStartButton":
                    return
            except Exception:
                continue
        # Create the shape and the button model.
        shape = doc.createInstance("com.sun.star.drawing.ControlShape")
        shape.Name = "MapperStartButton"
        # UNO structs have value semantics: reading shape.Position returns a
        # copy, so the struct must be filled completely and then assigned.
        position = uno.createUnoStruct("com.sun.star.awt.Point")
        position.X = 1000
        position.Y = 200
        shape.Position = position
        size = uno.createUnoStruct("com.sun.star.awt.Size")
        size.Width = 3000
        size.Height = 1000
        shape.Size = size
        button_model = doc.createInstance("com.sun.star.form.component.CommandButton")
        button_model.Label = "Start Mapping"
        button_model.HelpText = "Startet das Mapping (run_mapper_macro)"
        # Only the control and its label are added here; the macro itself
        # still has to be linked to the button by the user in the UI.
        shape.Control = button_model
        draw_page.add(shape)
        log("Button 'MapperStartButton' erstellt.", level="INFO")
    except Exception as e:
        log(f"add_macro_button Fehler: {e}", level="ERROR")
# ------------------------
# Hauptlauf (ohne Listener)
# ------------------------
def _unique(seq):
    """Return the items of *seq* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(seq))


def _find_header_cell(sheet, end_row, end_col, header="Objektbeschreibung", max_rows=10):
    """Return ``(row, col)`` of the first cell whose text equals *header*, else ``(None, None)``.

    Only the first *max_rows* rows are searched, row by row. The comparison
    ignores case and surrounding whitespace.
    """
    wanted = header.strip().lower()
    for r in range(min(max_rows, end_row + 1)):
        for c in range(end_col + 1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            if val == wanted:
                return r, c
    return None, None


def _ensure_result_columns(sheet, header_row, end_col):
    """Return header name -> column index for the three result columns, creating missing ones.

    Missing headers are appended to the right of *end_col* in the order
    Norm_Treffer, Norm_Vorschlag, Norm_ID.
    """
    wanted = ("Norm_Treffer", "Norm_Vorschlag", "Norm_ID")
    found = {}
    for c in range(end_col + 1):
        try:
            h = str(sheet.getCellByPosition(c, header_row).String).strip()
        except Exception:
            h = ""
        if h in wanted:
            found[h] = c
    next_col = end_col
    for name in wanted:
        if name not in found:
            next_col += 1
            found[name] = next_col
            sheet.getCellByPosition(next_col, header_row).String = name
    return found


def _map_description(txt, norm_dict, lemma_index):
    """Map one description text; return ``(hits, suggestions, ids, any_unmapped)``.

    The whole phrase (normalized, stopwords removed) is tried first. Only if
    that yields neither hits nor suggestions is the raw text split into single
    terms (stopwords and pure numbers skipped, compounds split) that are
    mapped one by one; ``any_unmapped`` is then True as soon as one term has
    neither a hit nor a suggestion.
    """
    tokens = [t.strip() for t in re.split(r'\s+', normalize_text(txt)) if t and t not in STOPWORDS]
    phrase = " ".join(tokens).strip()
    if not phrase:
        # Nothing but stopwords/punctuation: treated as unmapped.
        return [], [], [], True

    mapped_phrase = map_term(phrase, norm_dict, lemma_index)
    if mapped_phrase["hits"] or mapped_phrase["suggestions"]:
        return mapped_phrase["hits"], mapped_phrase["suggestions"], mapped_phrase["ids"], False

    # Fallback: token / compound processing.
    terms = []
    for p in [p for p in re.split(r'[,\s]+', txt) if p.strip()]:
        if p.lower() in STOPWORDS or re.fullmatch(r'\d+', p):
            continue
        for sp in compound_split(p):
            if sp and sp.strip():
                terms.append(sp.strip())

    row_hits, row_sugs, row_ids = [], [], []
    any_unmapped = False
    for term in terms:
        mapped = map_term(term, norm_dict, lemma_index)
        hits, sugs, ids = mapped["hits"], mapped["suggestions"], mapped["ids"]
        row_hits.extend(hits)
        row_sugs.extend(sugs)
        row_ids.extend(ids)
        if not hits and not sugs:
            any_unmapped = True
    return row_hits, row_sugs, row_ids, any_unmapped


def run_mapper_macro():
    """Map every description in the active sheet against NV_MASTER (no listeners).

    Steps: resolve the per-document paths, make sure the start button exists,
    locate the "Objektbeschreibung" header within the first ten rows, make
    sure the result columns Norm_Treffer / Norm_Vorschlag / Norm_ID exist,
    build the vocabulary index, then write hits, suggestions and IDs for each
    non-empty description row and colour the cells. Finally the result cache
    is written to ``CACHE_FILE``.

    Errors are logged and never propagate: a failing row is skipped, and any
    other exception ends the run.
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        set_paths_from_doc(doc)
        log("=== mapper_macro gestartet ===", level="INFO")
        sheet = doc.CurrentController.ActiveSheet
        add_macro_button(sheet)

        # Determine the used area of the sheet.
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        dr = cursor.getRangeAddress()

        # Find the header row and the description column. The cell text is
        # lower-cased for the comparison, so the wanted header must be too;
        # comparing against the capitalized literal could never match.
        header_row, objekt_col = _find_header_cell(sheet, dr.EndRow, dr.EndColumn)
        if objekt_col is None:
            log("run_mapper_macro: 'Objektbeschreibung' Header nicht gefunden.", level="ERROR")
            return

        # Make sure the result columns exist.
        result_cols = _ensure_result_columns(sheet, header_row, dr.EndColumn)
        norm_tr_col = result_cols["Norm_Treffer"]
        norm_sug_col = result_cols["Norm_Vorschlag"]
        norm_id_col = result_cols["Norm_ID"]

        # Build the vocabulary index.
        norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
        if not norm_dict and not lemma_index:
            log("run_mapper_macro: NV_MASTER leer oder nicht lesbar.", level="ERROR")
            return

        GREEN, YELLOW, RED = 0xADFF2F, 0xFFFF66, 0xFF9999
        WHITE = 0xFFFFFF
        rows_processed = 0
        for r in range(header_row + 1, dr.EndRow + 1):
            try:
                cell = sheet.getCellByPosition(objekt_col, r)
                txt = str(cell.String).strip()
                if not txt:
                    continue
                row_hits, row_sugs, row_ids, any_unmapped = _map_description(
                    txt, norm_dict, lemma_index
                )
                # De-duplicate while preserving order.
                row_hits = _unique(row_hits)
                row_sugs = _unique(row_sugs)
                row_ids = _unique(row_ids)

                # Write the results and colour the cells.
                hit_cell = sheet.getCellByPosition(norm_tr_col, r)
                sug_cell = sheet.getCellByPosition(norm_sug_col, r)
                hit_cell.String = " | ".join(row_hits)
                sug_cell.String = " | ".join(row_sugs)
                sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
                cell.CellBackColor = RED if any_unmapped else WHITE
                hit_cell.CellBackColor = GREEN if row_hits else WHITE
                sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
                rows_processed += 1
            except Exception as e:
                log(f"Fehler in Zeile {r}: {e}", level="ERROR")
                continue

        # Persist the cache next to the document.
        try:
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(CACHE, f, ensure_ascii=False, indent=2)
        except Exception as e:
            log(f"Cache speichern fehlgeschlagen: {e}", level="WARN")
        log(f"=== mapper_macro fertig. Zeilen verarbeitet: {rows_processed} ===", level="INFO")
    except Exception as e:
        # Top-level safety net: the macro must never raise into LibreOffice.
        try:
            log(f"run_mapper_macro: Unhandled exception: {e}", level="ERROR")
        except Exception:
            pass
# ------------------------
# Export
# ------------------------
# Only run_mapper_macro is listed for export; the other functions are helpers.
# NOTE(review): g_exportedScripts is presumably the name LibreOffice's Python
# script provider reads to decide which functions are offered as macros - confirm.
g_exportedScripts = (run_mapper_macro,)