GND_Skript_Test/mapper_macro.py
2025-10-10 09:46:41 +02:00

238 lines
7.5 KiB
Python

import uno
import os
import re
import traceback
import json

# Optional dependency: spaCy with the German model, used for lemmatization.
# `except Exception` (not bare `except:`) so Ctrl-C / SystemExit still work;
# spacy.load() raises OSError when the model is not installed.
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# Optional dependency: rapidfuzz for fuzzy matching; stdlib difflib fallback.
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False

import odf.opendocument
import odf.table
import odf.text
# ------------------------
# Configuration: absolute paths
# ------------------------
# Install directory of this macro inside the user's LibreOffice profile.
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
# Master vocabulary spreadsheet that load_nv_master() reads.
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
# Append-only log file written by log().
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
# Persistent word->mapping cache (JSON), loaded at import and saved per run.
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
# German stopwords skipped when splitting the Objektbeschreibung column.
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
# Minimum fuzzy-match score (0..1) for a term to appear as a suggestion.
CONF_THRESHOLD = 0.75
# ------------------------
# Logging
# ------------------------
def log(msg):
    """Append *msg* as a single line to the shared macro log file."""
    with open(LOG_FILE, "a", encoding="utf-8") as log_handle:
        log_handle.write(msg + "\n")
# ------------------------
# Load the persistent term cache (lemma -> mapping result)
# ------------------------
# A corrupt or unreadable cache file must not abort the whole macro at
# import time, so JSON/IO errors fall back to an empty cache.
CACHE = {}
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    except (OSError, json.JSONDecodeError):
        log(f"Cache-Datei konnte nicht gelesen werden, starte leer: {CACHE_FILE}")
        CACHE = {}
# ------------------------
# Normalization / lemma
# ------------------------
# Characters stripped during normalization (the same set the regex removed).
_PUNCT_TABLE = str.maketrans("", "", "()[]\"'\\.,;:?!")

def normalize_text(s):
    """Lowercase *s*, strip surrounding whitespace, drop punctuation,
    and collapse inner whitespace runs to single spaces.

    Falsy input (None, "", 0) yields the empty string.
    """
    if not s:
        return ""
    cleaned = str(s).lower().strip().translate(_PUNCT_TABLE)
    return re.sub(r"\s+", " ", cleaned)
# Memoization table: normalized term -> lemma string.
lemma_cache = {}

def lemmatize_term(term):
    """Return the space-joined lemma form of *term*.

    The term is normalized first; results are memoized per normalized
    term. Without spaCy the normalized term itself is the lemma.
    """
    normalized = normalize_text(term)
    try:
        return lemma_cache[normalized]
    except KeyError:
        pass
    if SPACY_AVAILABLE and nlp:
        lemma = " ".join(token.lemma_ for token in nlp(normalized))
    else:
        lemma = normalized
    lemma_cache[normalized] = lemma
    return lemma
# ------------------------
# Read the NV_MASTER vocabulary spreadsheet
# ------------------------
def load_nv_master(path):
    """Load the controlled vocabulary from the NV_MASTER .ods file.

    Returns a dict keyed by the lemmatized term; each value holds the
    original term ("Name"), the inherited "ID", the sheet name, and the
    two category columns. Returns an empty dict when loading fails.
    """
    norm_dict = {}
    try:
        doc = odf.opendocument.load(path)
    except Exception as e:
        log(f"Fehler beim Laden von NV_MASTER: {e}")
        return norm_dict
    for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
        sheet_name = sheet.getAttribute("name")
        # Sheets named "master" are skipped — presumably an overview/index
        # sheet rather than term data. TODO(review): confirm.
        if sheet_name.lower() == "master":
            continue
        # An ID in column 1 applies to all following rows until the next
        # non-empty ID cell (merged-cell / outline style).
        current_parent_id = None
        for row in sheet.getElementsByType(odf.table.TableRow):
            cells = row.getElementsByType(odf.table.TableCell)
            cell_values = []
            for cell in cells:
                # Only the first paragraph of each cell is read; extra
                # paragraphs in a multi-line cell are ignored.
                # NOTE(review): cells with "number-columns-repeated" are not
                # expanded here — columns could shift if the sheet uses them.
                texts = cell.getElementsByType(odf.text.P)
                if texts and texts[0].firstChild:
                    cell_values.append(str(texts[0].firstChild.data).strip())
                else:
                    cell_values.append("")
            # Expect at least: ID, Unterkategorie, Unterunterkategorie, Wort.
            if not cell_values or len(cell_values)<4:
                continue
            id_val, unterk, unterunterk, word = cell_values[:4]
            if id_val:
                current_parent_id = id_val.strip()
            if not word:
                continue
            # Key by lemma so lookups in map_word() match inflected forms.
            key = lemmatize_term(word)
            norm_dict[key] = {
                "Name": word.strip(),
                "ID": current_parent_id,
                "Sheet": sheet_name,
                "Unterkategorie": unterk.strip(),
                "Unterunterkategorie": unterunterk.strip()
            }
    log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
    return norm_dict
# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    """Return up to *top_n* fuzzy matches for *term_lemma* as "Name (ID)" strings.

    Every vocabulary key is scored with rapidfuzz's token_set_ratio
    (scaled to 0..1) or, as fallback, difflib's SequenceMatcher ratio;
    a prefix match adds a +0.1 bonus (capped at 1.0). Only candidates
    scoring >= *threshold* are kept, best score first.
    """
    # Loop-invariant: lowercase the query once, not once per vocabulary key.
    term_lower = term_lemma.lower()
    candidates = []
    for key, entry in norm_dict.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key) / 100
        else:
            score = SequenceMatcher(None, term_lower, key.lower()).ratio()
        # Prefix bonus: reward keys that start with the query term.
        if key.lower().startswith(term_lower):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    # Highest score first; equal scores fall back to tuple comparison.
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
def map_word(word, norm_dict):
    """Map a single word against the vocabulary.

    Returns a (treffer, suggestion, id) triple: the matched name with its
    ID on an exact lemma hit, otherwise "KEIN TREFFER" plus a comma-joined
    suggestion list. Results are memoized in the module-level CACHE.
    """
    key = lemmatize_term(word)
    cached = CACHE.get(key)
    if cached is not None:
        return cached["Norm"], cached["Suggestion"], cached["ID"]
    entry = norm_dict.get(key)
    if entry is not None:
        result = (entry["Name"], "", entry["ID"])
    else:
        suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
        result = ("KEIN TREFFER", ", ".join(suggestions), "")
    CACHE[key] = {"Norm": result[0], "Suggestion": result[1], "ID": result[2]}
    return result
# ------------------------
# Macro main entry point
# ------------------------
def run_mapper_macro():
    """LibreOffice macro: annotate the 'Objektbeschreibung' column of the
    first sheet with vocabulary matches from NV_MASTER.

    Appends three columns (Norm_Treffer, Norm_Vorschlag, Norm_ID) after
    the used area and colours each source cell green/yellow/red for
    hit / suggestion-only / no match. All errors are logged, never raised.
    """
    try:
        # XSCRIPTCONTEXT is injected by the LibreOffice scripting framework.
        doc = XSCRIPTCONTEXT.getDocument()
        sheets = doc.getSheets()
        sheet = sheets.getByIndex(0)
        # Determine the sheet's used cell area via a cursor.
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
        header_row = 0
        objekt_col = None
        # Locate the source column by (case-insensitive) header text.
        for col in range(data_range.EndColumn+1):
            val = sheet.getCellByPosition(col, header_row).String.strip().lower()
            if val == "objektbeschreibung":
                objekt_col = col
                break
        if objekt_col is None:
            log("Spalte 'Objektbeschreibung' nicht gefunden")
            return
        # Create the three output columns at the right edge of the table.
        # NOTE(review): running the macro twice appends another three columns
        # after the previous run's output — confirm this is intended.
        max_col = data_range.EndColumn
        norm_tr_col = max_col + 1
        norm_sug_col = max_col + 2
        norm_id_col = max_col + 3
        sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
        sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
        sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"
        norm_dict = load_nv_master(NV_MASTER_PATH)
        # Cell background colours (RGB).
        GREEN = 0xC6EFCE   # at least one exact vocabulary hit
        YELLOW = 0xFFEB9C  # only fuzzy suggestions
        RED = 0xFFC7CE     # nothing found
        for row in range(1, data_range.EndRow+1):
            cell = sheet.getCellByPosition(objekt_col, row)
            val = cell.String.strip()
            if not val:
                continue
            # Split on whitespace and drop stopwords. Punctuation stuck to a
            # word is NOT stripped here; normalize_text handles it later.
            words = [w.strip() for w in re.split(r"\s+", val) if w.strip() and w.lower() not in STOPWORDS]
            tr_list, sug_list, id_list = [], [], []
            for w in words:
                tr, sug, wid = map_word(w, norm_dict)
                if tr != "KEIN TREFFER":
                    tr_list.append(tr)
                if sug:
                    sug_list.append(sug)
                if wid:
                    id_list.append(wid)
            sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
            sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
            sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
            # Colour the source cell by match quality.
            if tr_list:
                cell.CellBackColor = GREEN
            elif sug_list:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED
        # Persist the term cache for subsequent runs.
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
        log("Makro erfolgreich ausgeführt")
    except Exception as e:
        log("Fehler in run_mapper_macro:")
        log(traceback.format_exc())