# -*- coding: utf-8 -*-
import json
import os
import re

import pandas as pd
import spacy
import tkinter as tk
from rapidfuzz import fuzz

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TMP_DIR = os.path.join(BASE_DIR, "tmp")
ACTIVE_FILE = os.path.join(TMP_DIR, "active_term.json")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")  # reserved, not used below
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")

# German function words; reserved for query pre-filtering, not used below.
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von",
             "im", "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den",
             "des", "eines", "einer"}

CONF_THRESHOLD = 0.75   # minimum fuzzy score (0..1) for a suggestion
MAX_SUGGESTIONS = 20    # cap on displayed suggestions

# -------------------------
# Logging
# -------------------------
def log(msg):
    print(msg)

# -------------------------
# Load NV_MASTER
# -------------------------
def normalize_text(s):
    """Lowercase, strip punctuation, and collapse whitespace."""
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

nlp = spacy.load("de_core_news_sm")
lemma_cache = {}

def lemmatize_term(term):
    """Return the space-joined lemmas of a term, memoized on its normalized form."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    doc = nlp(term_norm)
    lemma = " ".join(token.lemma_ for token in doc)
    lemma_cache[term_norm] = lemma
    return lemma

def build_norm_index(nv_path):
    """Read every sheet of NV_MASTER.ods (except "Master") into two lookup
    tables: normalized surface form -> entries and lemma -> entries."""
    norm_dict = {}
    lemma_index = {}
    # Reading .ods files via pandas requires the odfpy package (engine="odf").
    sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        # Guess the ID and word columns from the header names.
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        # Fallbacks: last column as word, first column as ID.
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        # Rows without their own ID inherit the last seen one (hierarchical sheets).
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col is not None else ""
            word_val = str(row[word_col]).strip() if word_col is not None else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(normalize_text(word_val), []).append(entry)
            lemma_index.setdefault(lemmatize_term(word_val), []).append(entry)
    log(f"NV_MASTER loaded. Terms: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index

# -------------------------
# Matching
# -------------------------
def fuzzy_score(a, b):
    """token_set_ratio rescaled from 0..100 to 0..1."""
    return fuzz.token_set_ratio(a, b) / 100.0

def get_suggestions(term, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    candidates = []
    # First pass: lemma against lemma keys; prefix matches get a small boost.
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # Second pass: normalized surface form against surface keys.
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_norm, norm_key)
        if norm_key.startswith(term_norm):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # Best score first; deduplicate on (name, id) and cap the list.
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append(f"{name} ({id_})" if id_ else name)
        if len(results) >= MAX_SUGGESTIONS:
            break
    return results

# -------------------------
# GUI
# -------------------------
class SuggestionPopup(tk.Tk):
    """Borderless popup that polls ACTIVE_FILE and shows suggestions for
    whatever term is currently written there."""

    def __init__(self, norm_dict, lemma_index):
        super().__init__()
        self.norm_dict = norm_dict
        self.lemma_index = lemma_index
        self.geometry("+1000+700")   # roughly bottom right
        self.overrideredirect(True)  # no window decorations
        self.configure(bg="white")
        self.label = tk.Label(self, text="", justify="left", bg="white", anchor="nw")
        self.label.pack(padx=5, pady=5)
        self.last_term = None
        self.withdraw()              # start hidden until a term appears
        self.check_loop()

    def check_loop(self):
        term = None
        try:
            if os.path.exists(ACTIVE_FILE):
                with open(ACTIVE_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                term = data.get("term")
        except Exception:
            term = None
        if term != self.last_term:
            self.last_term = term
            if term:
                suggestions = get_suggestions(term, self.norm_dict, self.lemma_index)
                self.label.config(text="\n".join(suggestions))
                self.deiconify()
            else:
                self.withdraw()
        self.after(300, self.check_loop)  # poll again in 300 ms

def main():
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    app = SuggestionPopup(norm_dict, lemma_index)
    app.mainloop()

if __name__ == "__main__":
    main()
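
# -------------------------
# Example producer (sketch)
# -------------------------
# This popup only reads tmp/active_term.json; some other process is expected
# to write it. A minimal sketch of such a producer, assuming only the
# {"term": ...} shape that check_loop() reads above (the sample term and the
# reuse of TMP_DIR/ACTIVE_FILE are illustrative):
#
#   import json, os
#   os.makedirs(TMP_DIR, exist_ok=True)
#   with open(ACTIVE_FILE, "w", encoding="utf-8") as f:
#       json.dump({"term": "Beispiel"}, f, ensure_ascii=False)
#
# Deleting the file, or writing {"term": null}, hides the popup again.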