import uno  # not referenced directly; kept for the LibreOffice UNO runtime
import os
import re
import traceback
import json

# Optional: spaCy lemmatizer (German model); degrade gracefully if unavailable
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# Optional: rapidfuzz for fuzzy matching; fall back to difflib if unavailable
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False

# ODF access via the odfpy package
import odf.opendocument
import odf.table
import odf.text

# ------------------------
# Configuration: absolute paths
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")

# German stopwords skipped during matching
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}
CONF_THRESHOLD = 0.75  # minimum fuzzy-match score (0..1) for a suggestion

# ------------------------
# Logging
# ------------------------
def log(msg):
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# ------------------------
# Load cache
# ------------------------
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

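# Cache shape (illustrative values; the fields are written by map_word below):
# {"teller": {"Norm": "Teller", "Suggestion": "", "ID": "K-07"}}
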
# ------------------------
# Normalization / lemma
# ------------------------
def normalize_text(s):
    """Lowercase, strip common punctuation, and collapse whitespace."""
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

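# Illustrative example: normalize_text("Teller, (blau)!") returns "teller blau"
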
lemma_cache = {}

def lemmatize_term(term):
    """Return the normalized, lemmatized form of a term (memoized)."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join(token.lemma_ for token in doc)
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

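# Illustrative example (assuming de_core_news_sm is installed): an inflected
# form such as "Tellern" is normalized to "tellern" and may be reduced by the
# lemmatizer to a base form like "teller"; without spaCy the normalized string
# is returned unchanged.
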
# ------------------------
# Read NV_MASTER
# ------------------------
def load_nv_master(path):
    """Build a lookup of lemmatized terms to master-data entries from NV_MASTER.ods."""
    norm_dict = {}
    try:
        doc = odf.opendocument.load(path)
    except Exception as e:
        log(f"Error loading NV_MASTER: {e}")
        return norm_dict

    for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
        sheet_name = sheet.getAttribute("name")
        if sheet_name.lower() == "master":
            continue

        current_parent_id = None
        for row in sheet.getElementsByType(odf.table.TableRow):
            cells = row.getElementsByType(odf.table.TableCell)
            cell_values = []
            for cell in cells:
                texts = cell.getElementsByType(odf.text.P)
                if texts and texts[0].firstChild:
                    cell_values.append(str(texts[0].firstChild.data).strip())
                else:
                    cell_values.append("")
            if len(cell_values) < 4:
                continue
            id_val, unterk, unterunterk, word = cell_values[:4]
            # An ID cell starts a new group; subsequent rows inherit it
            if id_val:
                current_parent_id = id_val.strip()
            if not word:
                continue
            key = lemmatize_term(word)
            norm_dict[key] = {
                "Name": word.strip(),
                "ID": current_parent_id,
                "Sheet": sheet_name,
                "Unterkategorie": unterk.strip(),
                "Unterunterkategorie": unterunterk.strip()
            }
    log(f"NV_MASTER loaded: {len(norm_dict)} terms")
    return norm_dict

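# Assumed NV_MASTER layout (inferred from the parsing above): every category
# sheet carries four columns (ID, Unterkategorie, Unterunterkategorie, term),
# with the ID filled only on the first row of each group. Note that odfpy does
# not expand table:number-columns-repeated cells, so heavily compressed sheets
# may parse with fewer cells per row than expected.
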
# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    """Return up to top_n fuzzy-match suggestions formatted as "Name (ID)"."""
    candidates = []
    for key, entry in norm_dict.items():
        if RAPIDFUZZ_AVAILABLE:
            # rapidfuzz scores 0..100; scale to 0..1 to match difflib
            score = fuzz.token_set_ratio(term_lemma, key) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
        # Small bonus for prefix matches, capped at 1.0
        if key.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    # Sort by score only; comparing whole tuples could raise TypeError when a
    # tie falls through to an ID that is None
    candidates.sort(key=lambda c: c[0], reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

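# Illustrative example: with "Teller" in the master data, a near miss such as
# get_suggestions("tellers", norm_dict) scores roughly 0.92 under either
# backend and would return ["Teller (K-07)"], where "K-07" stands in for
# whatever ID the master data actually assigns.
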
def map_word(word, norm_dict):
    """Map one word to (hit, suggestions, ID), consulting the cache first."""
    key = lemmatize_term(word)
    if key in CACHE:
        cached = CACHE[key]
        return cached["Norm"], cached["Suggestion"], cached["ID"]

    if key in norm_dict:
        entry = norm_dict[key]
        tr, sug, wid = entry["Name"], "", entry["ID"]
    else:
        # "KEIN TREFFER" ("no hit") stays in German because it is written into
        # the sheet and the persisted cache
        suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
        if suggestions:
            tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
        else:
            tr, sug, wid = "KEIN TREFFER", "", ""

    CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
    return tr, sug, wid

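# Illustrative example: an exact lemma hit yields ("Teller", "", "<id>"), while
# a miss with nearby candidates yields ("KEIN TREFFER", "Teller (<id>)", "");
# "<id>" is whatever ID NV_MASTER assigns to the matched term.
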
# ------------------------
# Macro entry point
# ------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()  # XSCRIPTCONTEXT is injected by LibreOffice
        sheets = doc.getSheets()
        sheet = sheets.getByIndex(0)
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()

        header_row = 0
        objekt_col = None

        # Locate the "Objektbeschreibung" column in the header row
        for col in range(data_range.EndColumn + 1):
            val = sheet.getCellByPosition(col, header_row).String.strip().lower()
            if val == "objektbeschreibung":
                objekt_col = col
                break

        if objekt_col is None:
            log("Column 'Objektbeschreibung' not found")
            return

        # Create the result columns at the right edge of the table
        max_col = data_range.EndColumn
        norm_tr_col = max_col + 1
        norm_sug_col = max_col + 2
        norm_id_col = max_col + 3

        sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
        sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
        sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"

        norm_dict = load_nv_master(NV_MASTER_PATH)

        # Highlight colors (0xRRGGBB)
        GREEN = 0xC6EFCE   # at least one exact hit
        YELLOW = 0xFFEB9C  # suggestions only
        RED = 0xFFC7CE     # no hit, no suggestion

        for row in range(1, data_range.EndRow + 1):
            cell = sheet.getCellByPosition(objekt_col, row)
            val = cell.String.strip()
            if not val:
                continue
            words = [w.strip() for w in re.split(r"\s+", val)
                     if w.strip() and w.lower() not in STOPWORDS]
            tr_list, sug_list, id_list = [], [], []
            for w in words:
                tr, sug, wid = map_word(w, norm_dict)
                if tr != "KEIN TREFFER":
                    tr_list.append(tr)
                if sug:
                    sug_list.append(sug)
                if wid:
                    id_list.append(wid)
            sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
            sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
            sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
            # Color coding of the source cell
            if tr_list:
                cell.CellBackColor = GREEN
            elif sug_list:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED

        # Persist the cache
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)

        log("Macro executed successfully")

    except Exception:
        log("Error in run_mapper_macro:")
        log(traceback.format_exc())
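
# Standard LibreOffice convention: export only the entry point, so it is the
# sole function offered in the macro selector.
g_exportedScripts = (run_mapper_macro,)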