# GND_Skript_Test/mapper_gui.py
# -*- coding: utf-8 -*-
import tkinter as tk
import json
import os
import pandas as pd
import re
import spacy
from rapidfuzz import fuzz
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TMP_DIR = os.path.join(BASE_DIR, "tmp")
ACTIVE_FILE = os.path.join(TMP_DIR, "active_term.json")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
MAX_SUGGESTIONS = 20
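# Protocol note: a separate process is expected to write the currently active
# term to ACTIVE_FILE as JSON; this script only reads it. Assumed payload
# shape (only the "term" key is consumed in check_loop below):
#   {"term": "Beispielwort"}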
# -------------------------
# Logging
# -------------------------
def log(msg):
    print(msg)
# -------------------------
# Load NV_MASTER
# -------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s
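# Requires the German spaCy model; install once with:
#   python -m spacy download de_core_news_sm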
nlp = spacy.load("de_core_news_sm")
lemma_cache = {}
def lemmatize_term(term):
    """Return the lemmatized form of a term; results are memoized in lemma_cache."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    doc = nlp(term_norm)
    lemma = " ".join([token.lemma_ for token in doc])
    lemma_cache[term_norm] = lemma
    return lemma
def build_norm_index(nv_path):
    """Build two lookup indexes (exact normalized form, lemma) from the NV_MASTER spreadsheet."""
    norm_dict = {}
    lemma_index = {}
    sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        # Fallbacks: last column as the word column, first as the ID column.
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            # Rows without their own ID inherit the most recent parent ID above them.
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER loaded. Terms: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
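# Resulting shape (illustrative values): norm_dict maps a normalized surface
# form to its entries, e.g. {"haus": [{"Name": "Haus", "ID": "B12", "Sheet": "Orte"}]};
# lemma_index holds the same entry dicts keyed by lemma.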
# -------------------------
# Matching
# -------------------------
def fuzzy_score(a, b):
    # token_set_ratio is word-order-insensitive and scores token subsets at 100,
    # so fuzzy_score("haus am see", "see haus") == 1.0.
    return fuzz.token_set_ratio(a, b) / 100.0
def get_suggestions(term, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    """Return the top MAX_SUGGESTIONS unique matches, formatted as "Name (ID)"."""
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    candidates = []
    # Pass 1: fuzzy-match the lemmatized term against the lemma index.
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)  # small prefix bonus, capped at 1.0
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # Pass 2: fuzzy-match the normalized term against the exact-form index.
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_norm, norm_key)
        if norm_key.startswith(term_norm):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    candidates.sort(key=lambda t: t[0], reverse=True)
    # Deduplicate on (Name, ID) while keeping best-score-first order.
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append(f"{name} ({id_})" if id_ else name)
        if len(results) >= MAX_SUGGESTIONS:
            break
    return results
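# Usage sketch (hypothetical data): with the "Haus"/"B12" entry above indexed
# under the lemma "haus", get_suggestions("Häuser", norm_dict, lemma_index)
# would return ["Haus (B12)"], assuming the model lemmatizes "häuser" to "haus".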
# -------------------------
# GUI
# -------------------------
class SuggestionPopup(tk.Tk):
    def __init__(self, norm_dict, lemma_index):
        super().__init__()
        self.norm_dict = norm_dict
        self.lemma_index = lemma_index
        self.geometry("+1000+700")  # bottom right
        self.overrideredirect(True)  # borderless popup window
        self.configure(bg="white")
        self.label = tk.Label(self, text="", justify="left", bg="white", anchor="nw")
        self.label.pack(padx=5, pady=5)
        self.last_term = None
        self.check_loop()

    def check_loop(self):
        term = None
        try:
            if os.path.exists(ACTIVE_FILE):
                with open(ACTIVE_FILE, "r", encoding="utf-8") as f:
                    data = json.load(f)
                term = data.get("term")
        except Exception:
            term = None
        if term != self.last_term:
            self.last_term = term
            if term:
                suggestions = get_suggestions(term, self.norm_dict, self.lemma_index)
                self.label.config(text="\n".join(suggestions))
                self.deiconify()
            else:
                self.withdraw()
        self.after(300, self.check_loop)  # poll every 300 ms
def main():
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    app = SuggestionPopup(norm_dict, lemma_index)
    app.mainloop()

if __name__ == "__main__":
    main()
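
# Minimal producer sketch for manual testing (assumption: in the real setup an
# external tool writes this file). From the script's directory:
#
#   import json, os
#   os.makedirs("tmp", exist_ok=True)
#   with open(os.path.join("tmp", "active_term.json"), "w", encoding="utf-8") as f:
#       json.dump({"term": "Beispielwort"}, f)
#
# Deleting the file (or writing {"term": None}) hides the popup again.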