# -*- coding: utf-8 -*-
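"""Suggestion popup for mapping terms against the NV_MASTER vocabulary.

Builds lookup indexes from the sheets of NV_MASTER.ods, then polls
tmp/active_term.json for the currently active term and displays fuzzy,
lemma-based suggestions in a small borderless tkinter window.

Assumes the German spaCy model is installed
(python -m spacy download de_core_news_sm) and that odfpy is available
for pandas.read_excel(engine="odf").
"""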

import json
import os
import re
import tkinter as tk

import pandas as pd
import spacy
from rapidfuzz import fuzz

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TMP_DIR = os.path.join(BASE_DIR, "tmp")
ACTIVE_FILE = os.path.join(TMP_DIR, "active_term.json")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")  # defined but not referenced below
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")

# German stopwords; currently unused in this script.
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im",
             "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

CONF_THRESHOLD = 0.75   # minimum fuzzy score (0..1) for a match to be suggested
MAX_SUGGESTIONS = 20    # cap on the number of suggestions shown
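
# ACTIVE_FILE is the handover point: check_loop() below expects it to hold
# JSON with a "term" key. A companion process (not part of this script) would
# write it roughly like this (sketch; the term value is illustrative):
#
#   with open(ACTIVE_FILE, "w", encoding="utf-8") as f:
#       json.dump({"term": "Aortenklappe"}, f)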


# -------------------------
# Logging
# -------------------------
def log(msg):
    print(msg)


# -------------------------
# Load NV_MASTER
# -------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


# German spaCy model used for lemmatization.
nlp = spacy.load("de_core_news_sm")

lemma_cache = {}


def lemmatize_term(term):
    # Cache lemmas so repeated terms skip the spaCy pipeline.
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    doc = nlp(term_norm)
    lemma = " ".join(token.lemma_ for token in doc)
    lemma_cache[term_norm] = lemma
    return lemma


def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}
    sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        # Fall back to the last/first column if no header matched.
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER loaded. Terms: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
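
# Both indexes map a key (normalized surface form or lemma) to a list of
# entries of the form {"Name": ..., "ID": ..., "Sheet": ...}. IDs cascade:
# rows without their own ID inherit the ID of the most recent row that had
# one (current_parent_id above).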


# -------------------------
# Matching
# -------------------------
def fuzzy_score(a, b):
    # token_set_ratio ignores word order and duplicate tokens; scale to 0..1.
    return fuzz.token_set_ratio(a, b) / 100.0


def get_suggestions(term, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    term_lemma = lemmatize_term(term)
    candidates = []
    # Score the lemma index and the raw normalized index with the same rule:
    # fuzzy score plus a small prefix bonus, capped at 1.0.
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # Best score first; deduplicate on (name, id) and cap the list.
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append(f"{name} ({id_})" if id_ else name)
        if len(results) >= MAX_SUGGESTIONS:
            break
    return results
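
# Rough usage (sketch; actual entries depend on the NV_MASTER workbook):
#   norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
#   get_suggestions("Aortenklappe", norm_dict, lemma_index)
#   # -> e.g. ["Aortenklappe (<some ID>)", ...], at most MAX_SUGGESTIONS items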


# -------------------------
# GUI
# -------------------------
class SuggestionPopup(tk.Tk):
    def __init__(self, norm_dict, lemma_index):
        super().__init__()
        self.norm_dict = norm_dict
        self.lemma_index = lemma_index
        self.geometry("+1000+700")  # fixed position, bottom right
        self.overrideredirect(True)  # no window decorations (borderless popup)
        self.configure(bg="white")
        self.label = tk.Label(self, text="", justify="left", bg="white", anchor="nw")
        self.label.pack(padx=5, pady=5)
        self.last_term = None
        self.check_loop()

    def check_loop(self):
        # Poll ACTIVE_FILE and refresh the popup when the active term changes.
        term = None
        try:
            if os.path.exists(ACTIVE_FILE):
                with open(ACTIVE_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    term = data.get("term")
        except Exception:
            term = None
        if term != self.last_term:
            self.last_term = term
            if term:
                suggestions = get_suggestions(term, self.norm_dict, self.lemma_index)
                self.label.config(text="\n".join(suggestions))
                self.deiconify()
            else:
                self.withdraw()
        self.after(300, self.check_loop)  # check again in 300 ms


def main():
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    app = SuggestionPopup(norm_dict, lemma_index)
    app.mainloop()


if __name__ == "__main__":
    main()