# -*- coding: utf-8 -*-
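"""Suggestion popup for mapping terms against the NV_MASTER vocabulary.

Builds lookup indexes from the sheets of NV_MASTER.ods, then polls
tmp/active_term.json for the currently active term and displays fuzzy,
lemma-based suggestions in a small borderless tkinter window.

Assumes the German spaCy model is installed
(python -m spacy download de_core_news_sm) and that odfpy is available
for pandas.read_excel(engine="odf").
"""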

import json
import os
import re
import tkinter as tk

import pandas as pd
import spacy
from rapidfuzz import fuzz

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TMP_DIR = os.path.join(BASE_DIR, "tmp")
ACTIVE_FILE = os.path.join(TMP_DIR, "active_term.json")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")  # defined but not referenced below
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")

# German stopwords; currently unused in this script.
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im",
             "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

CONF_THRESHOLD = 0.75   # minimum fuzzy score (0..1) for a match to be suggested
MAX_SUGGESTIONS = 20    # cap on the number of suggestions shown
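
# ACTIVE_FILE is the handover point: check_loop() below expects it to hold
# JSON with a "term" key. A companion process (not part of this script) would
# write it roughly like this (sketch; the term value is illustrative):
#
#   with open(ACTIVE_FILE, "w", encoding="utf-8") as f:
#       json.dump({"term": "Aortenklappe"}, f)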


# -------------------------
# Logging
# -------------------------
def log(msg):
    print(msg)


# -------------------------
# Load NV_MASTER
# -------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


# German spaCy model used for lemmatization.
nlp = spacy.load("de_core_news_sm")

lemma_cache = {}


def lemmatize_term(term):
    # Cache lemmas so repeated terms skip the spaCy pipeline.
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    doc = nlp(term_norm)
    lemma = " ".join(token.lemma_ for token in doc)
    lemma_cache[term_norm] = lemma
    return lemma


def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}
    sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        # Fall back to the last/first column if no header matched.
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER loaded. Terms: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
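
# Both indexes map a key (normalized surface form or lemma) to a list of
# entries of the form {"Name": ..., "ID": ..., "Sheet": ...}. IDs cascade:
# rows without their own ID inherit the ID of the most recent row that had
# one (current_parent_id above).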


# -------------------------
# Matching
# -------------------------
def fuzzy_score(a, b):
    # token_set_ratio ignores word order and duplicate tokens; scale to 0..1.
    return fuzz.token_set_ratio(a, b) / 100.0


def get_suggestions(term, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    term_lemma = lemmatize_term(term)
    candidates = []
    # Score the lemma index and the raw normalized index with the same rule:
    # fuzzy score plus a small prefix bonus, capped at 1.0.
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # Best score first; deduplicate on (name, id) and cap the list.
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append(f"{name} ({id_})" if id_ else name)
        if len(results) >= MAX_SUGGESTIONS:
            break
    return results
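
# Rough usage (sketch; actual entries depend on the NV_MASTER workbook):
#   norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
#   get_suggestions("Aortenklappe", norm_dict, lemma_index)
#   # -> e.g. ["Aortenklappe (<some ID>)", ...], at most MAX_SUGGESTIONS items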


# -------------------------
# GUI
# -------------------------
class SuggestionPopup(tk.Tk):
    def __init__(self, norm_dict, lemma_index):
        super().__init__()
        self.norm_dict = norm_dict
        self.lemma_index = lemma_index
        self.geometry("+1000+700")  # fixed position, bottom right
        self.overrideredirect(True)  # no window decorations (borderless popup)
        self.configure(bg="white")
        self.label = tk.Label(self, text="", justify="left", bg="white", anchor="nw")
        self.label.pack(padx=5, pady=5)
        self.last_term = None
        self.check_loop()

    def check_loop(self):
        # Poll ACTIVE_FILE and refresh the popup when the active term changes.
        term = None
        try:
            if os.path.exists(ACTIVE_FILE):
                with open(ACTIVE_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    term = data.get("term")
        except Exception:
            term = None
        if term != self.last_term:
            self.last_term = term
            if term:
                suggestions = get_suggestions(term, self.norm_dict, self.lemma_index)
                self.label.config(text="\n".join(suggestions))
                self.deiconify()
            else:
                self.withdraw()
        self.after(300, self.check_loop)  # check again in 300 ms


def main():
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    app = SuggestionPopup(norm_dict, lemma_index)
    app.mainloop()


if __name__ == "__main__":
    main()