# -*- coding: utf-8 -*-
# LibreOffice Calc macro: NV_MASTER reconciliation (pandas + odf, cache, cell colors)
# Paths: BASE_DIR must point to the directory that contains NV_MASTER.ods and this macro.
# Save as: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import traceback
|
|
|
|
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
|
|
# Third-party libs: pandas, odfpy, optional: spacy, rapidfuzz
|
|
# Optional third-party dependencies. Each guard records in a module-level
# flag whether the library could be loaded, so the rest of the macro can
# degrade gracefully instead of failing at import time.

# pandas is used to read the NV_MASTER spreadsheet.
try:
    import pandas as pd
except Exception:
    PANDAS_AVAILABLE = False
else:
    PANDAS_AVAILABLE = True

# spaCy with the German model is used for lemmatization. A failure of
# either the import or the model load disables it and leaves nlp as None.
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
except Exception:
    nlp = None
    SPACY_AVAILABLE = False
else:
    SPACY_AVAILABLE = True

# rapidfuzz is the preferred fuzzy matcher; difflib from the standard
# library is imported as the fallback when rapidfuzz cannot be loaded.
try:
    from rapidfuzz import fuzz
except Exception:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False
else:
    RAPIDFUZZ_AVAILABLE = True
|
|
|
|
# ------------------------
|
|
# Konfiguration
|
|
# ------------------------
|
|
# Directory that holds the macro, the NV_MASTER spreadsheet, the log file
# and the JSON cache.
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"

# Files located directly inside BASE_DIR.
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")

# German function words that are dropped from a description before matching.
STOPWORDS = {
    "mit", "ohne",
    "der", "die", "das", "dem", "den", "des",
    "ein", "eine", "eines", "einer",
    "und", "zu", "von", "im", "in", "auf", "an", "als", "bei", "für", "aus",
}

# Base threshold (0..1) a fuzzy score must reach to be offered as a suggestion.
CONF_THRESHOLD = 0.75
|
|
|
|
# ------------------------
|
|
# Utilities: Logging & safe I/O
|
|
# ------------------------
|
|
def log(msg):
    """Append *msg* as a single line to LOG_FILE.

    Logging is best-effort: any error raised while opening or writing the
    file is suppressed so that a logging problem never aborts the macro.
    """
    try:
        with open(LOG_FILE, mode="a", encoding="utf-8") as log_handle:
            line = msg + "\n"
            log_handle.write(line)
    except Exception:
        return
|
|
|
|
# ------------------------
|
|
# Cache laden
|
|
# ------------------------
|
|
# Start with an empty cache and replace it with the persisted one when the
# cache file exists and can be parsed; any failure falls back to empty.
CACHE = {}
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as cache_handle:
            CACHE = json.load(cache_handle)
except Exception:
    CACHE = {}
|
|
|
|
# ------------------------
|
|
# Text-Normalisierung & Lemma
|
|
# ------------------------
|
|
def normalize_text(s):
    """Return a canonical, comparison-friendly form of *s*.

    The value is converted to ``str``, lower-cased, stripped of the
    punctuation characters ``( ) [ ] " ' \\ ; : ? ! , .`` and has every run
    of whitespace collapsed to a single space.

    Args:
        s: Any value; falsy values (``None``, ``""``, ``0``) yield ``""``.

    Returns:
        The normalized string without leading or trailing whitespace.
    """
    if not s:
        return ""
    text = str(s).lower()
    text = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: removing punctuation can expose whitespace at the edges
    # (e.g. "Haus ." -> "haus "), which would break exact index lookups.
    return text.strip()
|
|
|
|
# Memo of already computed lemmas, keyed by the normalized term.
lemma_cache = {}


def lemmatize_term(term):
    """Return the lemma form of *term*, memoized per normalized term.

    The term is normalized with normalize_text first. When spaCy and its
    model are available, the lemma is the space-joined lemmas of all tokens;
    otherwise, or when spaCy raises, the normalized term itself is used.
    """
    key = normalize_text(term)
    try:
        return lemma_cache[key]
    except KeyError:
        pass

    result = key
    if SPACY_AVAILABLE and nlp:
        try:
            result = " ".join(tok.lemma_ for tok in nlp(key))
        except Exception:
            result = key

    lemma_cache[key] = result
    return result
|
|
|
|
# ------------------------
|
|
# NV_MASTER robust laden (pandas + odf)
|
|
# ------------------------
|
|
def build_norm_index(nv_path):
    """Read the NV_MASTER workbook and build the two lookup indexes.

    Every sheet except the one named "master" (case-insensitive) is scanned.
    The ID column is the last column whose header contains "id", the word
    column the last one whose header contains "wort" or "vokabel"; when no
    header matches, the first column is used for IDs and the last column for
    words. A row with a non-empty ID makes that ID the current parent; rows
    with an empty ID inherit the most recent parent ID of their sheet.

    Args:
        nv_path: Path of the .ods workbook to read.

    Returns:
        A tuple ``(norm_dict, lemma_index)``. ``norm_dict`` maps a normalized
        name and ``lemma_index`` maps a lemma to a list of entries of the
        form ``{"Name": ..., "ID": ..., "Sheet": ...}``. Both are empty when
        pandas is unavailable or the workbook cannot be read; the reason is
        written to the log.
    """
    norm_dict = {}    # normalized name -> list of entries
    lemma_index = {}  # lemma -> list of entries

    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar. NV_MASTER kann nicht zuverlässig gelesen werden.")
        return norm_dict, lemma_index

    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen von NV_MASTER mit pandas: {e}")
        return norm_dict, lemma_index

    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        if len(df.columns) == 0:
            # Without columns there is nothing that could hold a word.
            continue

        df = df.fillna("")  # empty cells become ""

        # Locate the ID and word columns by header; a later match overrides
        # an earlier one.
        id_col = None
        word_col = None
        for column in df.columns:
            label = str(column).strip().lower()
            if "id" in label:
                id_col = column
            if "wort" in label or "vokabel" in label:
                word_col = column
        # Fallback when no header matched: first column = ID, last = word.
        if word_col is None:
            word_col = df.columns[-1]
        if id_col is None:
            id_col = df.columns[0]

        current_parent_id = None
        # Walk the two relevant columns in lockstep instead of materializing
        # a Series per row with iterrows().
        for raw_id, raw_word in zip(df[id_col], df[word_col]):
            id_val = str(raw_id).strip()
            word_val = str(raw_word).strip()

            # A row that defines an ID becomes the parent of following rows.
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue

            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(normalize_text(word_val), []).append(entry)
            lemma_index.setdefault(lemmatize_term(word_val), []).append(entry)

    # Report the path that was actually read, not the module-level default.
    log(f"NV_MASTER geladen ({nv_path}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
|
|
|
|
# ------------------------
|
|
# Matching: exakter Treffer, Lemma-Treffer, Fuzzy-Vorschläge
|
|
# ------------------------
|
|
def fuzzy_score(a, b):
    """Return a case-insensitive similarity score between *a* and *b*.

    Uses rapidfuzz's ``token_set_ratio`` (scaled from 0..100 to 0..1) when
    rapidfuzz is available and ``difflib.SequenceMatcher.ratio`` otherwise.
    Both inputs are lower-cased first so that the two backends treat case
    the same way.

    Args:
        a: First value; converted with ``str``.
        b: Second value; converted with ``str``.

    Returns:
        A float score; ``0.0`` when either input is ``None`` or the backend
        raises.
    """
    if a is None or b is None:
        return 0.0
    try:
        left = str(a).lower()
        right = str(b).lower()
        if RAPIDFUZZ_AVAILABLE:
            return fuzz.token_set_ratio(left, right) / 100.0
        return SequenceMatcher(None, left, right).ratio()
    except Exception:
        # Scoring is best-effort: a failing comparison counts as "no match".
        return 0.0
|
|
|
|
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
    """Return fuzzy-match suggestions for *term_lemma*, best score first.

    Every key of ``lemma_index`` and of ``norm_dict`` is scored against the
    term with fuzzy_score. Keys that start with the term get a bonus of 0.1
    (capped at 1.0). All entries of keys scoring at least *threshold* become
    candidates; duplicates by (name, ID) keep their best-scored occurrence.

    Args:
        term_lemma: The (lemmatized) query term.
        norm_dict: Mapping normalized name -> list of entries.
        lemma_index: Mapping lemma -> list of entries.
        top_n: Maximum number of suggestions to return; ``None`` (default)
            returns all of them. Values below zero are treated as zero.
        threshold: Minimum score for a key to contribute suggestions.

    Returns:
        A list of strings, ``"Name (ID)"`` when the entry has an ID and
        ``"Name"`` otherwise. Empty for an empty query term.
    """
    if not term_lemma:
        # An empty query is a prefix of every key and would hand the prefix
        # bonus to all of them; there is nothing meaningful to suggest.
        return []

    candidates = []
    # lemma_index is scanned before norm_dict; the stable sort below keeps
    # that order among candidates with equal scores.
    for index in (lemma_index, norm_dict):
        for key, entries in index.items():
            score = fuzzy_score(term_lemma, key)
            if key.startswith(term_lemma):
                score = min(score + 0.1, 1.0)
            if score >= threshold:
                candidates.extend((score, e["Name"], e["ID"]) for e in entries)

    candidates.sort(key=lambda item: item[0], reverse=True)

    seen = set()
    suggestions = []
    for _score, name, id_ in candidates:
        identity = (name, id_)
        if identity in seen:
            continue
        seen.add(identity)
        suggestions.append(f"{name} ({id_})" if id_ else name)

    if top_n is not None:
        suggestions = suggestions[:max(top_n, 0)]
    return suggestions
|
|
|
|
def map_term_with_indexes(term, norm_dict, lemma_index):
    """Look up *term* in the NV_MASTER indexes.

    Matching order:
      1. exact match of the normalized term in ``norm_dict``;
      2. if that yields nothing, match of the lemma in ``lemma_index``;
      3. fuzzy suggestions, which are always computed, even when 1 or 2
         produced hits.

    Results are memoized in the module-level CACHE. The cache key is the
    normalized term: the result depends on it through the exact match, and
    distinct terms can share one lemma, so keying by lemma would hand one
    term's hits to another.

    Args:
        term: Raw term taken from a description.
        norm_dict: Mapping normalized name -> list of entries.
        lemma_index: Mapping lemma -> list of entries.

    Returns:
        A tuple ``(hits, suggestions, ids)`` of de-duplicated lists in
        first-seen order: matched names, suggestion labels, and the
        non-empty IDs of the matched entries.
    """
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    # NOTE(review): CACHE is persisted across runs and is not invalidated
    # when NV_MASTER changes; stale entries are possible. Confirm whether
    # the cache file should be tied to the workbook's modification time.
    cached = CACHE.get(term_norm)
    if isinstance(cached, dict):
        try:
            return cached["hits"], cached["suggestions"], cached["ids"]
        except KeyError:
            # Malformed persisted entry: recompute and overwrite it below.
            pass

    # Index lists are never empty once created, so "or" selects the exact
    # matches first and falls back to the lemma matches.
    matches = norm_dict.get(term_norm) or lemma_index.get(term_lemma) or []

    # dict.fromkeys de-duplicates while preserving first-seen order.
    hits = list(dict.fromkeys(e["Name"] for e in matches))
    ids = list(dict.fromkeys(e["ID"] for e in matches if e["ID"]))
    suggestions = list(dict.fromkeys(
        get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
    ))

    CACHE[term_norm] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids
|
|
|
|
# ------------------------
|
|
# Haupt-Makro
|
|
# ------------------------
|
|
# Headers of the result columns, in the order in which missing ones are
# appended to the right of the used area.
_RESULT_HEADERS = ("Norm_Treffer", "Norm_Vorschlag", "Norm_ID")

# Cell background colors as 0xRRGGBB integers. The names follow their role
# in the macro; 0xFFA500 is the CSS "orange" value.
_COLOR_GREEN = 0xADFF2F
_COLOR_YELLOW = 0xFFA500
_COLOR_RED = 0xCC0000
_COLOR_WHITE = 0xFFFFFF


def _find_description_column(sheet, data_range, max_header_rows=5):
    """Return (row, col) of the first "Objektbeschreibung" header cell, or (None, None)."""
    for row in range(min(max_header_rows, data_range.EndRow + 1)):
        for col in range(data_range.EndColumn + 1):
            try:
                label = str(sheet.getCellByPosition(col, row).String).strip().lower()
            except Exception:
                label = ""
            if label == "objektbeschreibung":
                return row, col
    return None, None


def _ensure_result_columns(sheet, data_range, header_row):
    """Return the column indexes of the three result columns, creating missing headers."""
    positions = {}
    for col in range(data_range.EndColumn + 1):
        try:
            label = str(sheet.getCellByPosition(col, header_row).String).strip()
        except Exception:
            label = ""
        if label in _RESULT_HEADERS:
            positions[label] = col  # the right-most occurrence wins

    next_col = data_range.EndColumn
    for header in _RESULT_HEADERS:
        if header in positions:
            continue
        next_col += 1
        positions[header] = next_col
        try:
            sheet.getCellByPosition(next_col, header_row).String = header
        except Exception as exc:
            log(f"Fehler beim Schreiben der Kopfzeile '{header}': {exc}")
    return tuple(positions[header] for header in _RESULT_HEADERS)


def _extract_terms(text):
    """Split a description into terms, dropping stopwords, pure numbers and punctuation-only tokens."""
    terms = []
    for clause in text.split(","):
        for token in clause.split():
            if token.lower() in STOPWORDS:
                continue
            if re.fullmatch(r"\d+", token):
                continue
            if not normalize_text(token):
                # e.g. a lone "(": nothing is left to look up, and it must
                # not mark the whole row as unmapped.
                continue
            terms.append(token)
    return terms


def run_mapper_macro():
    """Match every "Objektbeschreibung" cell of the active sheet against NV_MASTER.

    The header row is searched within the first five rows of the used area.
    For each data row below it, the description is split into terms and each
    term is looked up with map_term_with_indexes. The hits, suggestions and
    IDs of all terms are written, joined with " | ", to the columns
    Norm_Treffer, Norm_Vorschlag and Norm_ID, which are appended to the
    right of the used area when they do not exist yet.

    Coloring:
      - description cell: red when at least one term has neither a hit nor a
        suggestion; otherwise left unchanged;
      - Norm_Treffer: green when there is at least one term and every term
        has a hit, white otherwise;
      - Norm_Vorschlag: yellow when any suggestion exists, white otherwise.

    Rows with an empty description are skipped. An error in one row is
    logged and does not stop the remaining rows. The lookup cache is written
    to CACHE_FILE at the end. Returns None; problems are reported through
    the log only.
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
    except Exception as e:
        log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
        return

    header_row, objekt_col = _find_description_column(sheet, data_range)
    if objekt_col is None:
        log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
        return

    norm_tr_col, norm_sug_col, norm_id_col = _ensure_result_columns(sheet, data_range, header_row)

    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    if not norm_dict and not lemma_index:
        log("NV_MASTER leer oder nicht lesbar. Abbruch.")
        return

    rows_processed = 0
    for row in range(header_row + 1, data_range.EndRow + 1):
        try:
            desc_cell = sheet.getCellByPosition(objekt_col, row)
            text = str(desc_cell.String).strip()
            if not text:
                continue  # existing outputs of empty rows are left untouched

            terms = _extract_terms(text)

            row_hits = []
            row_sugs = []
            row_ids = []
            any_unmapped = False    # some term has neither hit nor suggestion
            every_term_hit = True   # no term without a hit seen so far
            for term in terms:
                hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
                if hits:
                    row_hits.extend(hits)
                else:
                    every_term_hit = False
                    if not sugs:
                        any_unmapped = True
                if sugs:
                    row_sugs.extend(sugs)
                if ids:
                    row_ids.extend(ids)

            # De-duplicate while preserving first-seen order.
            row_hits = list(dict.fromkeys(row_hits))
            row_sugs = list(dict.fromkeys(row_sugs))
            row_ids = list(dict.fromkeys(row_ids))

            try:
                sheet.getCellByPosition(norm_tr_col, row).String = " | ".join(row_hits)
                sheet.getCellByPosition(norm_sug_col, row).String = " | ".join(row_sugs)
                sheet.getCellByPosition(norm_id_col, row).String = " | ".join(row_ids)
            except Exception as exc:
                log(f"Fehler beim Schreiben der Ergebnisse in Zeile {row}: {exc}")

            # Green requires a hit for every term; a term that only has
            # suggestions does not count as matched.
            all_matched = bool(terms) and every_term_hit

            try:
                if any_unmapped:
                    desc_cell.CellBackColor = _COLOR_RED
                tr_cell = sheet.getCellByPosition(norm_tr_col, row)
                tr_cell.CellBackColor = _COLOR_GREEN if all_matched else _COLOR_WHITE
                sug_cell = sheet.getCellByPosition(norm_sug_col, row)
                sug_cell.CellBackColor = _COLOR_YELLOW if row_sugs else _COLOR_WHITE
            except Exception as exc:
                log(f"Fehler beim Einfärben von Zeile {row}: {exc}")

            rows_processed += 1
        except Exception as e:
            # Keep going with the remaining rows; record the stack trace so
            # the failure can be diagnosed from the log.
            log(f"Fehler in Zeile {row}: {e}\n{traceback.format_exc()}")

    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
    except Exception as exc:
        log(f"Cache konnte nicht gespeichert werden: {exc}")

    log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
|
|
|
|
# Export for LibreOffice: g_exportedScripts names the functions of this
# module that are offered as macros; here only run_mapper_macro.
g_exportedScripts = (run_mapper_macro,)
|