Jonas Arnold internship: added all files
parent 41e8b7103e
commit 723ac7b6b1
File diff suppressed because it is too large
BIN  Box Ha-Ho.ods  (binary file not shown)
BIN  Box Ha-Klinc.ods  (binary file not shown)
BIN  Box Hu-J.ods  (binary file not shown)

Deleted line (LibreOffice lock entry):
@@ -1 +0,0 @@
-,jarnold,workPC,10.10.2025 09:26,file:///home/jarnold/.config/libreoffice/4;
469  Mapper_Makro_Alte_Versionen/mapper_macro_1.4.py  Normal file
@@ -0,0 +1,469 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - LibreOffice Calc
# Features: Kompositum-Split, Cache, Live-Vorschläge nur auf 'Objektbeschreibung', Logging

import os
import re
import json
import datetime

# optional imports (Pandas, Spacy, RapidFuzz)
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False
    from difflib import SequenceMatcher

# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")

STOPWORDS = {
    "mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an",
    "als","bei","für","aus","dem","den","des","eines","einer"
}
CONF_THRESHOLD = 0.75

# ------------------------
# Logging
# ------------------------
def log(msg, level="INFO"):
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}\n"
    try:
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception:
        pass

# ------------------------
# Cache laden
# ------------------------
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception as e:
    CACHE = {}
    log(f"Fehler beim Laden des Caches: {e}", level="ERROR")

# ------------------------
# Textnormalisierung & Lemma
# ------------------------
lemma_cache = {}

def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([t.lemma_ for t in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

# ------------------------
# Kompositum-Splitting
# ------------------------
def compound_split(term):
    if not term:
        return []
    parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
    if parts:
        return parts
    parts = [p for p in re.split(r'[-\s]+', term) if p]
    return parts or [term]

# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen werden.", level="ERROR")
        return norm_dict, lemma_index
    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
        return norm_dict, lemma_index

    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]

        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)

    log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index

# ------------------------
# Fuzzy / Vorschläge
# ------------------------
def fuzzy_score(a, b):
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_set_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append({"score": score, "name": name, "id": id_})
    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]

# ------------------------
# Mapping eines Terms (mit Cache)
# ------------------------
def map_term_with_indexes(term, norm_dict, lemma_index):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    if term_lemma in CACHE:
        c = CACHE[term_lemma]
        return c.get("hits", []), c.get("suggestions", []), c.get("ids", [])

    hits = []
    suggestions = []
    ids = []

    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)

    if not hits:
        tokens = compound_split(term)
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in lemma_index:
                for e in lemma_index[t_lemma]:
                    hits.append(e["Name"])
                    if e["ID"]:
                        ids.append(e["ID"])
            else:
                suggestions.extend(get_suggestions_for_term(t_lemma, norm_dict, lemma_index))

    def uniq(seq):
        seen = set()
        out = []
        for x in seq:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out

    hits = uniq(hits)
    suggestions = uniq(suggestions)
    ids = uniq(ids)

    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids

# ------------------------
# Header + Spalten
# ------------------------
def find_header_and_cols(sheet):
    try:
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        dr = cursor.getRangeAddress()
    except Exception:
        return None, None, None, {}
    header_row = None
    objekt_col = None
    for r in range(0, min(5, dr.EndRow + 1)):
        for c in range(0, dr.EndColumn + 1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            if val == "objektbeschreibung":
                header_row = r
                objekt_col = c
                break
        if objekt_col is not None:
            break

    if header_row is None:
        return None, None, dr, {}
    existing = {}
    for c in range(0, dr.EndColumn + 1):
        try:
            h = str(sheet.getCellByPosition(c, header_row).String).strip()
        except Exception:
            h = ""
        if h == "Norm_Treffer":
            existing["Norm_Treffer"] = c
        if h == "Norm_Vorschlag":
            existing["Norm_Vorschlag"] = c
        if h == "Norm_ID":
            existing["Norm_ID"] = c
    return header_row, objekt_col, dr, existing

# ------------------------
# Optimierter Live-Handler (nur Objektbeschreibung)
# ------------------------
def on_objektbeschreibung_change(oEvent=None):
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
    except Exception as e:
        log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
        return

    cell = None
    try:
        if oEvent and hasattr(oEvent, "Range") and oEvent.Range is not None:
            cell = oEvent.Range
        elif oEvent and hasattr(oEvent, "Source") and oEvent.Source is not None:
            cell = oEvent.Source
    except Exception:
        cell = None
    if cell is None:
        try:
            sel = doc.CurrentSelection
            if hasattr(sel, "getCellByPosition"):
                cell = sel
            else:
                cell = sel.getCellByPosition(0, 0)
        except Exception as e:
            log(f"Keine Selektion: {e}", level="ERROR")
            return

    try:
        row_index = cell.CellAddress.Row
        col_index = cell.CellAddress.Column
    except Exception:
        return

    try:
        header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
        if header_row is None or col_index != objekt_col:
            return  # nur die Objektbeschreibung-Spalte bearbeiten
        last_col = dr.EndColumn
        if "Norm_Vorschlag" not in existing:
            last_col += 1
            existing["Norm_Vorschlag"] = last_col
            sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
        norm_sug_col = existing["Norm_Vorschlag"]
    except Exception as e:
        log(f"Fehler Spaltenbestimmung: {e}", level="ERROR")
        return

    try:
        txt = str(cell.String).strip()
        if not txt:
            sheet.getCellByPosition(norm_sug_col, row_index).String = ""
            return
        norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
        suggestions_acc = []
        clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
        for cl in clauses:
            parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
            for p in parts:
                if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
                    continue
                for sp in compound_split(p):
                    _, sugs, _ = map_term_with_indexes(sp, norm_dict, lemma_index)
                    suggestions_acc.extend(sugs)

        seen = set()
        ordered = []
        for s in suggestions_acc:
            if s not in seen:
                seen.add(s)
                ordered.append(s)
        sheet.getCellByPosition(norm_sug_col, row_index).String = " | ".join(ordered)

        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)

    except Exception as e:
        log(f"Fehler im Live-Handler: {e}", level="ERROR")

# ------------------------
# Batch-Durchlauf
# ------------------------
def run_mapper_macro():
    log("=== mapper_macro 1.5 gestartet ===", level="INFO")
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        dr = cursor.getRangeAddress()
    except Exception as e:
        log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
        return

    header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
    if objekt_col is None:
        log("Spalte 'Objektbeschreibung' nicht gefunden.", level="ERROR")
        return
    if "Norm_Treffer" not in existing:
        last_col = dr.EndColumn + 1
        existing["Norm_Treffer"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
    if "Norm_Vorschlag" not in existing:
        last_col = dr.EndColumn + 2
        existing["Norm_Vorschlag"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
    if "Norm_ID" not in existing:
        last_col = dr.EndColumn + 3
        existing["Norm_ID"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"

    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    GREEN, YELLOW, RED = 0xADFF2F, 0xFFA500, 0xCC0000

    for r in range(header_row + 1, dr.EndRow + 1):
        try:
            cell = sheet.getCellByPosition(objekt_col, r)
            txt = str(cell.String).strip()
            if not txt:
                continue
            clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
            terms = []
            for cl in clauses:
                for p in [p.strip() for p in re.split(r"\s+", cl) if p.strip()]:
                    if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
                        continue
                    terms.extend([sp.strip() for sp in compound_split(p) if sp.strip()])

            row_hits, row_sugs, row_ids = [], [], []
            any_unmapped = False
            for term in terms:
                hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
                row_hits.extend(hits)
                row_sugs.extend(sugs)
                row_ids.extend(ids)
                if not hits and not sugs:
                    any_unmapped = True

            def uniq(seq):
                seen = set()
                out = []
                for x in seq:
                    if x not in seen:
                        seen.add(x)
                        out.append(x)
                return out

            row_hits, row_sugs, row_ids = map(uniq, [row_hits, row_sugs, row_ids])
            sheet.getCellByPosition(existing["Norm_Treffer"], r).String = " | ".join(row_hits)
            sheet.getCellByPosition(existing["Norm_Vorschlag"], r).String = " | ".join(row_sugs)
            sheet.getCellByPosition(existing["Norm_ID"], r).String = " | ".join(row_ids)

            cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
            sheet.getCellByPosition(existing["Norm_Treffer"], r).CellBackColor = GREEN if row_hits and not any_unmapped else 0xFFFFFF
            sheet.getCellByPosition(existing["Norm_Vorschlag"], r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF

        except Exception as e:
            log(f"Fehler in Zeile {r}: {e}", level="ERROR")
            continue

    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, ensure_ascii=False, indent=2)
    log("=== mapper_macro 1.5 fertig ===", level="INFO")

# ------------------------
# Export
# ------------------------
g_exportedScripts = (
    run_mapper_macro,
    on_objektbeschreibung_change
)
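The text helpers in this version need only the Python standard library, so their tokenising behaviour can be sanity-checked outside LibreOffice. A minimal standalone sketch (the sample strings below are invented for illustration, not taken from NV_MASTER):

# check_tokenizing.py - standalone sketch of the helpers defined above
import re

def normalize_text(s):
    # same normalisation as in mapper_macro_1.4.py
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

def compound_split(term):
    # capitalised sub-words first, then hyphen/whitespace split, as above
    if not term:
        return []
    parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
    if parts:
        return parts
    parts = [p for p in re.split(r'[-\s]+', term) if p]
    return parts or [term]

if __name__ == "__main__":
    print(normalize_text("  Holzkiste, (beschädigt)  "))   # -> "holzkiste beschädigt"
    print(compound_split("Holzkiste"))                      # -> ["Holzkiste"]
    print(compound_split("Vorratsdose mit Deckel"))         # -> ["Vorratsdose", "Deckel"]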
508  Mapper_Makro_Alte_Versionen/mapper_macro_1.5.py  Normal file
@@ -0,0 +1,508 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - korrigiert: Logging im Dokumentverzeichnis, stabile Button-Erstellung,
# keine Listener, optimiertes Mapping (ohne Listener-Teil)

import os
import re
import json
import datetime

# optionale Module (Pandas, Spacy, RapidFuzz)
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

from difflib import SequenceMatcher

# UNO (für Button/Paths)
try:
    import uno
except Exception:
    uno = None

# ------------------------
# Konfiguration (Fallback-BASE_DIR)
# ------------------------
BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro")
NV_MASTER_FILENAME = "NV_MASTER.ods"
CACHE_FILENAME = "mapper_cache.json"
LOG_FILENAME = "mapper_macro.log"

STOPWORDS = {
    "mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an",
    "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"
}
CONF_THRESHOLD = 0.82
FUZZY_CUTOFF = 0.88

# Per-document paths (initialized by set_paths_from_doc)
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)

# in-memory cache
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception:
    CACHE = {}

# ------------------------
# Pfade im Dokument setzen
# ------------------------
def set_paths_from_doc(doc):
    global DOC_DIR, NV_MASTER_PATH, CACHE_FILE, LOG_FILE
    try:
        url = getattr(doc, "URL", "")
        if url and url.strip():
            # UNO liefert file:///...
            try:
                system_path = uno.fileUrlToSystemPath(url)
            except Exception:
                # fallback: try simple unquote
                from urllib.parse import unquote, urlparse
                parsed = urlparse(url)
                if parsed.scheme == "file":
                    system_path = unquote(parsed.path)
                else:
                    system_path = ""
            if system_path:
                d = os.path.dirname(system_path)
                if os.path.isdir(d):
                    DOC_DIR = d
    except Exception:
        DOC_DIR = BASE_DIR
    NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
    CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
    LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)

# ------------------------
# Logging (Dokumentdir, robust)
# ------------------------
def log(msg, level="INFO"):
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] [{level}] {msg}\n"
    try:
        # ensure directory exists
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception:
        # absolute fallback: try writing into BASE_DIR
        try:
            fallback = os.path.join(BASE_DIR, LOG_FILENAME)
            os.makedirs(os.path.dirname(fallback), exist_ok=True)
            with open(fallback, "a", encoding="utf-8") as f:
                f.write(line)
        except Exception:
            # last resort: silent
            pass

# ------------------------
# Textvorbereitung & Helpers
# ------------------------
lemma_cache = {}

def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([t.lemma_ for t in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

def compound_split(term):
    if not term:
        return []
    parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
    if parts:
        return parts
    parts = [p for p in re.split(r'[-\s]+', term) if p]
    return parts or [term]

# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen werden.", level="ERROR")
        return norm_dict, lemma_index
    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
        return norm_dict, lemma_index

    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        # find id/word columns with fallback
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]

        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)

    log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}", level="INFO")
    return norm_dict, lemma_index

# ------------------------
# Fuzzy Matching
# ------------------------
def fuzzy_score(a, b):
    a = (a or "").lower()
    b = (b or "").lower()
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_sort_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        return SequenceMatcher(None, a, b).ratio()

def get_suggestions(term_lemma, norm_dict, lemma_index, threshold=FUZZY_CUTOFF, max_sugs=6):
    candidates = []
    term_norm = term_lemma or ""
    for key_lemma, entries in lemma_index.items():
        if not key_lemma:
            continue
        score = fuzzy_score(term_norm, key_lemma)
        if key_lemma.startswith(term_norm):
            score = min(score + 0.08, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # also check normalized names
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_norm, norm_key)
        if norm_key.startswith(term_norm):
            score = min(score + 0.08, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    # sort & dedupe
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    out = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        if id_:
            out.append(f"{name} ({id_})")
        else:
            out.append(name)
        if len(out) >= max_sugs:
            break
    return out

# ------------------------
# Mapping mit Cache
# ------------------------
def map_term(term, norm_dict, lemma_index):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    if term_lemma in CACHE:
        return CACHE[term_lemma]

    hits = []
    suggestions = []
    ids = []

    # exact
    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])

    # lemma
    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])

    # suggestions only if no hit
    if not hits:
        suggestions = get_suggestions(term_lemma, norm_dict, lemma_index)

    # remove suggestions that are equal/contain hits
    suggestions = [s for s in suggestions if not any(h.lower() in s.lower() for h in hits)]

    result = {"hits": hits, "suggestions": suggestions, "ids": ids}
    CACHE[term_lemma] = result
    return result

# ------------------------
# Button erstellen (sicher)
# ------------------------
def add_macro_button(sheet):
    try:
        doc = XSCRIPTCONTEXT.getDocument()
    except Exception:
        log("add_macro_button: kein Dokument-Kontext", level="WARN")
        return
    try:
        draw_page = sheet.DrawPage
        # avoid duplicate
        for shape in draw_page:
            try:
                if getattr(shape, "Name", "") == "MapperStartButton":
                    return
            except Exception:
                continue

        # create shape and button model
        shape = doc.createInstance("com.sun.star.drawing.ControlShape")
        shape.Name = "MapperStartButton"
        shape.Position = uno.createUnoStruct("com.sun.star.awt.Point")
        shape.Position.X = 1000
        shape.Position.Y = 200
        shape.Size = uno.createUnoStruct("com.sun.star.awt.Size")
        shape.Size.Width = 3000
        shape.Size.Height = 1000

        button_model = doc.createInstance("com.sun.star.form.component.CommandButton")
        button_model.Label = "Start Mapping"
        button_model.HelpText = "Startet das Mapping (run_mapper_macro)"
        # assign macro via ActionCommand is not enough; user must link in UI; we add the control and label

        shape.Control = button_model
        draw_page.add(shape)
        log("Button 'MapperStartButton' erstellt.", level="INFO")
    except Exception as e:
        log(f"add_macro_button Fehler: {e}", level="ERROR")

# ------------------------
# Hauptlauf (ohne Listener)
# ------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        set_paths_from_doc(doc)
        log("=== mapper_macro gestartet ===", level="INFO")
        sheet = doc.CurrentController.ActiveSheet
        add_macro_button(sheet)

        # used area
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        dr = cursor.getRangeAddress()

        # find header and objekt col
        header_row = None
        objekt_col = None
        for r in range(0, min(10, dr.EndRow + 1)):
            for c in range(0, dr.EndColumn + 1):
                try:
                    val = str(sheet.getCellByPosition(c, r).String).strip().lower()
                except Exception:
                    val = ""
                if val == "objektbeschreibung":
                    header_row = r
                    objekt_col = c
                    break
            if objekt_col is not None:
                break

        if objekt_col is None:
            log("run_mapper_macro: 'Objektbeschreibung' Header nicht gefunden.", level="ERROR")
            return

        # ensure result cols
        existing = {}
        last_col = dr.EndColumn
        for c in range(0, dr.EndColumn + 1):
            try:
                h = str(sheet.getCellByPosition(c, header_row).String).strip()
            except Exception:
                h = ""
            if h == "Norm_Treffer":
                existing["Norm_Treffer"] = c
            if h == "Norm_Vorschlag":
                existing["Norm_Vorschlag"] = c
            if h == "Norm_ID":
                existing["Norm_ID"] = c

        if "Norm_Treffer" not in existing:
            last_col += 1
            existing["Norm_Treffer"] = last_col
            sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
        if "Norm_Vorschlag" not in existing:
            last_col += 1
            existing["Norm_Vorschlag"] = last_col
            sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
        if "Norm_ID" not in existing:
            last_col += 1
            existing["Norm_ID"] = last_col
            sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"

        norm_tr_col = existing["Norm_Treffer"]
        norm_sug_col = existing["Norm_Vorschlag"]
        norm_id_col = existing["Norm_ID"]

        # build index
        norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
        if not norm_dict and not lemma_index:
            log("run_mapper_macro: NV_MASTER leer oder nicht lesbar.", level="ERROR")
            return

        GREEN, YELLOW, RED = 0xADFF2F, 0xFFFF66, 0xFF9999
        rows_processed = 0

        for r in range(header_row + 1, dr.EndRow + 1):
            try:
                cell = sheet.getCellByPosition(objekt_col, r)
                txt = str(cell.String).strip()
                if not txt:
                    continue

                # phrase-first: try entire cleaned phrase (remove stopwords)
                tokens = [t.strip() for t in re.split(r'\s+', normalize_text(txt)) if t and t not in STOPWORDS]
                phrase = " ".join(tokens).strip()
                terms = []
                if phrase:
                    # first try phrase as whole
                    mapped_phrase = map_term(phrase, norm_dict, lemma_index)
                    if mapped_phrase["hits"] or mapped_phrase["suggestions"]:
                        # use phrase result (flatten hits+suggestions for output)
                        row_hits = mapped_phrase["hits"]
                        row_sugs = mapped_phrase["suggestions"]
                        row_ids = mapped_phrase["ids"]
                        any_unmapped = False if (row_hits or row_sugs) else True
                    else:
                        # fallback to token/compound processing
                        for p in [p for p in re.split(r'[,\s]+', txt) if p.strip()]:
                            if p.lower() in STOPWORDS or re.fullmatch(r'\d+', p):
                                continue
                            for sp in compound_split(p):
                                if sp and sp.strip():
                                    terms.append(sp.strip())
                        row_hits = []
                        row_sugs = []
                        row_ids = []
                        any_unmapped = False
                        for term in terms:
                            mapped = map_term(term, norm_dict, lemma_index)
                            hits, sugs, ids = mapped["hits"], mapped["suggestions"], mapped["ids"]
                            if hits:
                                row_hits.extend(hits)
                            if sugs:
                                row_sugs.extend(sugs)
                            if ids:
                                row_ids.extend(ids)
                            if not hits and not sugs:
                                any_unmapped = True
                else:
                    row_hits, row_sugs, row_ids = [], [], []
                    any_unmapped = True

                # dedupe preserving order
                def uniq(seq):
                    seen = set()
                    out = []
                    for x in seq:
                        if x not in seen:
                            seen.add(x)
                            out.append(x)
                    return out

                row_hits = uniq(row_hits)
                row_sugs = uniq(row_sugs)
                row_ids = uniq(row_ids)

                # write
                sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
                sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
                sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)

                cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
                sheet.getCellByPosition(norm_tr_col, r).CellBackColor = GREEN if row_hits else 0xFFFFFF
                sheet.getCellByPosition(norm_sug_col, r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF

                rows_processed += 1
            except Exception as e:
                log(f"Fehler in Zeile {r}: {e}", level="ERROR")
                continue

        # persist cache file to DOC_DIR
        try:
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(CACHE, f, ensure_ascii=False, indent=2)
        except Exception as e:
            log(f"Cache speichern fehlgeschlagen: {e}", level="WARN")

        log(f"=== mapper_macro fertig. Zeilen verarbeitet: {rows_processed} ===", level="INFO")
    except Exception as e:
        # top-level safety
        try:
            log(f"run_mapper_macro: Unhandled exception: {e}", level="ERROR")
        except Exception:
            pass

# ------------------------
# Export
# ------------------------
g_exportedScripts = (run_mapper_macro,)
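Compared with 1.4, this version scores suggestions with token_sort_ratio instead of token_set_ratio and raises the cutoffs (CONF_THRESHOLD 0.82, FUZZY_CUTOFF 0.88), so borderline pairs that still passed the 0.75 threshold in 1.4 can drop out here. A small sketch of the difflib fallback scorer used when RapidFuzz is not installed (the word pairs are invented examples):

# score_fallback.py - sketch of the fallback scorer (else-branch of fuzzy_score above)
from difflib import SequenceMatcher

def fallback_score(a, b):
    # case-insensitive similarity in [0, 1], as in fuzzy_score() without RapidFuzz
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

pairs = [("holzkiste", "holzkisten"), ("dose", "vorratsdose")]
for a, b in pairs:
    s = fallback_score(a, b)
    print(f"{a!r} vs {b!r}: {s:.2f}  passes FUZZY_CUTOFF 0.88: {s >= 0.88}")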
343  Mapper_Makro_Alte_Versionen/mapper_macro_2.0.py  Normal file
@@ -0,0 +1,343 @@
# -*- coding: utf-8 -*-
"""
LibreOffice Calc Makro: NV_MASTER-Abgleich (verbessertes semantisches Matching)
Speicherort: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
"""

import os
import re
import json
import traceback

# ------------------------------------------------------------
# LIBRARIES & MODELS
# ------------------------------------------------------------
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    # Verwende das mittlere Modell für semantische Ähnlichkeit
    nlp = spacy.load("de_core_news_md")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False
    from difflib import SequenceMatcher

# ------------------------------------------------------------
# KONFIGURATION
# ------------------------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")

STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.70  # etwas großzügiger für semantisches Matching

# ------------------------------------------------------------
# LOGGING
# ------------------------------------------------------------
def log(msg):
    """Schreibt technische Logs ins Makroverzeichnis."""
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(msg.strip() + "\n")
    except Exception:
        pass

log("Makro gestartet")

# ------------------------------------------------------------
# CACHE
# ------------------------------------------------------------
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception:
    CACHE = {}

# ------------------------------------------------------------
# TEXTNORMALISIERUNG & LEMMATISIERUNG
# ------------------------------------------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}
def lemmatize_term(term):
    t = normalize_text(term)
    if t in lemma_cache:
        return lemma_cache[t]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(t)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = t
    else:
        lemma = t
    lemma_cache[t] = lemma
    return lemma

# ------------------------------------------------------------
# NV_MASTER LADEN
# ------------------------------------------------------------
def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}

    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar – NV_MASTER kann nicht geladen werden.")
        return norm_dict, lemma_index

    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Laden von NV_MASTER: {e}")
        return norm_dict, lemma_index

    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = next((df.columns[i] for i, c in enumerate(cols) if "id" in c), df.columns[0])
        word_col = next((df.columns[i] for i, c in enumerate(cols) if "wort" in c or "vokabel" in c), df.columns[-1])

        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip()
            word_val = str(row[word_col]).strip()
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)

    log(f"NV_MASTER geladen: {sum(len(v) for v in norm_dict.values())} Begriffe.")
    return norm_dict, lemma_index

# ------------------------------------------------------------
# SCORING: FUZZY + SEMANTISCH
# ------------------------------------------------------------
def fuzzy_score(a, b):
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_set_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def semantic_similarity(a, b):
    if not SPACY_AVAILABLE or not hasattr(nlp.vocab, "vectors"):
        return 0.0
    try:
        doc_a, doc_b = nlp(a), nlp(b)
        if doc_a.vector_norm and doc_b.vector_norm:
            return float(doc_a.similarity(doc_b))
        return 0.0
    except Exception:
        return 0.0

def combined_score(a, b):
    sf = fuzzy_score(a, b)
    ss = semantic_similarity(a, b)
    return max(sf, ss)

# ------------------------------------------------------------
# MATCHING & VORSCHLÄGE
# ------------------------------------------------------------
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entries in lemma_index.items():
        score = combined_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.05, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = combined_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.05, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    candidates.sort(key=lambda x: x[0], reverse=True)
    seen, results = set(), []
    for score, name, id_ in candidates:
        key = (name.lower(), id_.lower() if id_ else "")
        if key in seen:
            continue
        seen.add(key)
        results.append({"score": score, "name": name, "id": id_})
        if len(results) >= top_n:
            break
    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]

def map_term_with_indexes(term, norm_dict, lemma_index):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    if term_lemma in CACHE:
        return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]

    hits, suggestions, ids = [], [], []

    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])

    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])

    suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD)
    filtered_suggs = []
    for s in suggs:
        s_clean = normalize_text(s.split(" (")[0])
        if s_clean not in [normalize_text(h) for h in hits]:
            filtered_suggs.append(s)
    suggestions = filtered_suggs

    def uniq(seq):
        seen = set()
        out = []
        for x in seq:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out

    hits, suggestions, ids = uniq(hits), uniq(suggestions), uniq(ids)
    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}

    log(f"TERM: {term} | HITS: {hits} | SUGGS: {suggestions}")
    return hits, suggestions, ids

# ------------------------------------------------------------
# HAUPTMAKRO
# ------------------------------------------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
    except Exception as e:
        log(f"Fehler beim Zugriff auf Dokument: {e}")
        return

    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    if not norm_dict:
        log("Fehler: NV_MASTER leer oder nicht gefunden.")
        return

    try:
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        used = cursor.getRangeAddress()
    except Exception as e:
        log(f"Cursor-Fehler: {e}")
        return

    header_row = 0
    objekt_col = None
    for c in range(0, used.EndColumn + 1):
        val = str(sheet.getCellByPosition(c, header_row).String).strip().lower()
        if val == "objektbeschreibung":
            objekt_col = c
            break
    if objekt_col is None:
        log("Keine Spalte 'Objektbeschreibung' gefunden.")
        return

    existing = {}
    for c in range(0, used.EndColumn + 1):
        h = str(sheet.getCellByPosition(c, header_row).String).strip()
        if h == "Norm_Treffer": existing["Norm_Treffer"] = c
        if h == "Norm_Vorschlag": existing["Norm_Vorschlag"] = c
        if h == "Norm_ID": existing["Norm_ID"] = c

    last_col = used.EndColumn
    for name in ["Norm_Treffer", "Norm_Vorschlag", "Norm_ID"]:
        if name not in existing:
            last_col += 1
            existing[name] = last_col
            sheet.getCellByPosition(last_col, header_row).String = name

    GREEN, YELLOW, RED = 0xADFF2F, 0xFFD700, 0xCC0000
    norm_tr_col, norm_sug_col, norm_id_col = existing["Norm_Treffer"], existing["Norm_Vorschlag"], existing["Norm_ID"]

    rows = 0
    for r in range(header_row + 1, used.EndRow + 1):
        txt = str(sheet.getCellByPosition(objekt_col, r).String).strip()
        if not txt:
            continue
        terms = [t.strip() for t in re.split(r",|\s+", txt) if t.strip() and t.lower() not in STOPWORDS]
        row_hits, row_sugs, row_ids, any_unmapped = [], [], [], False
        for term in terms:
            hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
            if hits: row_hits.extend(hits)
            if sugs: row_sugs.extend(sugs)
            if ids: row_ids.extend(ids)
            if not hits and not sugs: any_unmapped = True

        def uniq(seq):
            seen = set()
            out = []
            for x in seq:
                if x not in seen:
                    seen.add(x)
                    out.append(x)
            return out

        row_hits, row_sugs, row_ids = uniq(row_hits), uniq(row_sugs), uniq(row_ids)
        sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
        sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
        sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)

        obj_cell = sheet.getCellByPosition(objekt_col, r)
        sug_cell = sheet.getCellByPosition(norm_sug_col, r)
        tr_cell = sheet.getCellByPosition(norm_tr_col, r)

        if any_unmapped:
            obj_cell.CellBackColor = RED
        elif row_hits:
            tr_cell.CellBackColor = GREEN
        if row_sugs:
            sug_cell.CellBackColor = YELLOW

        rows += 1

    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, ensure_ascii=False, indent=2)
    log(f"Makro abgeschlossen, {rows} Zeilen verarbeitet.")

g_exportedScripts = (run_mapper_macro,)
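All three archived versions persist the same cache layout to mapper_cache.json: one entry per lemmatised term with its hits, suggestions and IDs. A minimal sketch of reading such a cache back for inspection (the path and the slice size are illustrative assumptions, not part of the macros):

# inspect_cache.py - sketch: read the mapper cache written by the macros above
import json

CACHE_FILE = "mapper_cache.json"  # assumed to sit next to the document or macro

with open(CACHE_FILE, "r", encoding="utf-8") as f:
    cache = json.load(f)

# each entry looks like: {"hits": [...], "suggestions": [...], "ids": [...]}
for term, entry in list(cache.items())[:10]:
    print(term, "->", entry.get("hits", []), entry.get("ids", []))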
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 # LibreOffice Calc macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
-# Pfade: BASE_DIR muss auf das Verzeichnis zeigen, in dem NV_MASTER.ods + Makro liegen.
-# Speichern: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
+# Speicherort: /home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.1.py
 
 import os
 import re
@@ -9,7 +8,6 @@ import json
 import traceback
 
 # UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
-# Third-party libs: pandas, odfpy, optional: spacy, rapidfuzz
 try:
     import pandas as pd
     PANDAS_AVAILABLE = True
@@ -34,10 +32,10 @@ except Exception:
 # ------------------------
 # Konfiguration
 # ------------------------
-BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
+BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
 NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
-LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
-CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
+LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.1.log")
+CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.1.json")
 
 STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
 CONF_THRESHOLD = 0.75  # Basis-Schwelle für Vorschläge
@@ -110,10 +108,8 @@ def build_norm_index(nv_path):
     for sheet_name, df in sheets.items():
         if str(sheet_name).strip().lower() == "master":
             continue
-        # normalize columns names to find ID and Wort columns
-        df = df.fillna("")
+        df = df.fillna("")  # leere Zellen als ""
         cols = [str(c).strip().lower() for c in df.columns]
-        # try to find columns
         id_col = None
         word_col = None
         for i, c in enumerate(cols):
@@ -121,7 +117,6 @@ def build_norm_index(nv_path):
                 id_col = df.columns[i]
             if "wort" in c or "vokabel" in c:
                 word_col = df.columns[i]
-        # fallback: if not found, try first/last
         if word_col is None and len(df.columns) >= 1:
             word_col = df.columns[-1]
         if id_col is None and len(df.columns) >= 1:
@@ -131,18 +126,14 @@ def build_norm_index(nv_path):
         for _, row in df.iterrows():
             id_val = str(row[id_col]).strip() if id_col in df.columns else ""
             word_val = str(row[word_col]).strip() if word_col in df.columns else ""
-            # if row defines an ID, set as current parent
             if id_val:
                 current_parent_id = id_val
-            # skip empty word cells
             if not word_val:
                 continue
             norm_name = normalize_text(word_val)
             lemma = lemmatize_term(word_val)
             entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
-            # add to norm_dict by normalized name (exact matching)
             norm_dict.setdefault(norm_name, []).append(entry)
-            # add to lemma_index
             lemma_index.setdefault(lemma, []).append(entry)
 
     log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
@@ -163,10 +154,8 @@ def fuzzy_score(a, b):
         except Exception:
             return 0.0
 
-def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
-    # collect candidates from lemma_index keys and norm_dict keys
+def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
     candidates = []
-    # iterate over lemma_index keys for candidate names
     for key_lemma, entries in lemma_index.items():
         score = fuzzy_score(term_lemma, key_lemma)
         if key_lemma.startswith(term_lemma):
@@ -174,7 +163,6 @@ def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
         if score >= threshold:
             for e in entries:
                 candidates.append((score, e["Name"], e["ID"]))
-    # also check norm_dict keys (exact-normalized names) as additional candidates
     for norm_key, entries in norm_dict.items():
         score = fuzzy_score(term_lemma, norm_key)
         if norm_key.startswith(term_lemma):
@@ -182,9 +170,7 @@ def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
         if score >= threshold:
             for e in entries:
                 candidates.append((score, e["Name"], e["ID"]))
-    # sort by score descending
     candidates.sort(key=lambda t: t[0], reverse=True)
-    # unique by (Name, ID) preserve score order
     seen = set()
     results = []
     for score, name, id_ in candidates:
@@ -193,40 +179,28 @@ def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
             continue
         seen.add(key)
         results.append({"score": score, "name": name, "id": id_})
-    # return all candidates (no limit) as "Name (ID)"
     return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
 
 def map_term_with_indexes(term, norm_dict, lemma_index):
     term_norm = normalize_text(term)
     term_lemma = lemmatize_term(term)
-    # cache lookup
     if term_lemma in CACHE:
-        return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
+        return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"]
 
     hits = []
     suggestions = []
-    ids = []
 
-    # 1) exact normalized name match
     if term_norm in norm_dict:
         for e in norm_dict[term_norm]:
|
||||||
hits.append(e["Name"])
|
hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
|
||||||
if e["ID"]:
|
|
||||||
ids.append(e["ID"])
|
|
||||||
|
|
||||||
# 2) lemma match (if not already hits)
|
|
||||||
if not hits and term_lemma in lemma_index:
|
if not hits and term_lemma in lemma_index:
|
||||||
for e in lemma_index[term_lemma]:
|
for e in lemma_index[term_lemma]:
|
||||||
hits.append(e["Name"])
|
hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
|
||||||
if e["ID"]:
|
|
||||||
ids.append(e["ID"])
|
|
||||||
|
|
||||||
# 3) suggestions via fuzzy (always compute even if hits exist, but suggestions empty if exact)
|
if not hits:
|
||||||
suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
|
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
|
||||||
# If there are exact hits, we still may present suggestions (user wanted unlimited), but suggestions are secondary
|
|
||||||
suggestions = suggs
|
|
||||||
|
|
||||||
# deduplicate lists preserving order
|
|
||||||
def unique_preserve(seq):
|
def unique_preserve(seq):
|
||||||
seen = set()
|
seen = set()
|
||||||
out = []
|
out = []
|
||||||
@ -238,18 +212,15 @@ def map_term_with_indexes(term, norm_dict, lemma_index):
|
|||||||
|
|
||||||
hits = unique_preserve(hits)
|
hits = unique_preserve(hits)
|
||||||
suggestions = unique_preserve(suggestions)
|
suggestions = unique_preserve(suggestions)
|
||||||
ids = unique_preserve(ids)
|
|
||||||
|
|
||||||
# cache result
|
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions}
|
||||||
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
|
return hits, suggestions
|
||||||
return hits, suggestions, ids
|
|
||||||
|
|
||||||
# ------------------------
|
# ------------------------
|
||||||
# Haupt-Makro
|
# Haupt-Makro
|
||||||
# ------------------------
|
# ------------------------
|
||||||
def run_mapper_macro():
|
def run_mapper_macro():
|
||||||
try:
|
try:
|
||||||
# UNO doc/sheet
|
|
||||||
doc = XSCRIPTCONTEXT.getDocument()
|
doc = XSCRIPTCONTEXT.getDocument()
|
||||||
sheet = doc.CurrentController.ActiveSheet
|
sheet = doc.CurrentController.ActiveSheet
|
||||||
cursor = sheet.createCursor()
|
cursor = sheet.createCursor()
|
||||||
@ -260,7 +231,6 @@ def run_mapper_macro():
|
|||||||
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
|
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
|
||||||
return
|
return
|
||||||
|
|
||||||
# find header row and Objektbeschreibung column (search first 5 rows)
|
|
||||||
header_row = None
|
header_row = None
|
||||||
objekt_col = None
|
objekt_col = None
|
||||||
max_col = data_range.EndColumn
|
max_col = data_range.EndColumn
|
||||||
@ -281,7 +251,7 @@ def run_mapper_macro():
|
|||||||
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
|
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# determine or create result columns: search if exist anywhere; otherwise append at right end
|
# Prüfen/Anlegen der Ergebnis-Spalten
|
||||||
existing = {}
|
existing = {}
|
||||||
for c in range(0, data_range.EndColumn+1):
|
for c in range(0, data_range.EndColumn+1):
|
||||||
try:
|
try:
|
||||||
@ -292,59 +262,38 @@ def run_mapper_macro():
|
|||||||
existing["Norm_Treffer"] = c
|
existing["Norm_Treffer"] = c
|
||||||
if h == "Norm_Vorschlag":
|
if h == "Norm_Vorschlag":
|
||||||
existing["Norm_Vorschlag"] = c
|
existing["Norm_Vorschlag"] = c
|
||||||
if h == "Norm_ID":
|
|
||||||
existing["Norm_ID"] = c
|
|
||||||
|
|
||||||
# append columns at right end if missing
|
|
||||||
last_col = data_range.EndColumn
|
last_col = data_range.EndColumn
|
||||||
if "Norm_Treffer" not in existing:
|
if "Norm_Treffer" not in existing:
|
||||||
last_col += 1
|
last_col += 1
|
||||||
existing["Norm_Treffer"] = last_col
|
existing["Norm_Treffer"] = last_col
|
||||||
try:
|
|
||||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
|
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if "Norm_Vorschlag" not in existing:
|
if "Norm_Vorschlag" not in existing:
|
||||||
last_col += 1
|
last_col += 1
|
||||||
existing["Norm_Vorschlag"] = last_col
|
existing["Norm_Vorschlag"] = last_col
|
||||||
try:
|
|
||||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
|
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if "Norm_ID" not in existing:
|
|
||||||
last_col += 1
|
|
||||||
existing["Norm_ID"] = last_col
|
|
||||||
try:
|
|
||||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
norm_tr_col = existing["Norm_Treffer"]
|
norm_tr_col = existing["Norm_Treffer"]
|
||||||
norm_sug_col = existing["Norm_Vorschlag"]
|
norm_sug_col = existing["Norm_Vorschlag"]
|
||||||
norm_id_col = existing["Norm_ID"]
|
|
||||||
|
|
||||||
# Build norm indexes
|
|
||||||
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
|
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
|
||||||
if not norm_dict and not lemma_index:
|
if not norm_dict and not lemma_index:
|
||||||
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
|
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# colors
|
|
||||||
GREEN = 0xADFF2F
|
GREEN = 0xADFF2F
|
||||||
YELLOW = 0xFFA500
|
YELLOW = 0xFFA500
|
||||||
RED = 0xCC0000
|
RED = 0xCC0000
|
||||||
|
WHITE = 0xFFFFFF
|
||||||
|
|
||||||
# iterate rows
|
|
||||||
rows_processed = 0
|
rows_processed = 0
|
||||||
for r in range(header_row + 1, data_range.EndRow + 1):
|
for r in range(header_row + 1, data_range.EndRow + 1):
|
||||||
try:
|
try:
|
||||||
cell = sheet.getCellByPosition(objekt_col, r)
|
cell = sheet.getCellByPosition(objekt_col, r)
|
||||||
txt = str(cell.String).strip()
|
txt = str(cell.String).strip()
|
||||||
if not txt:
|
if not txt:
|
||||||
# clear any previous outputs? keep existing per spec; skip empty
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# tokenize: split by commas first, then whitespace; filter stopwords and pure numbers
|
|
||||||
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
|
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
|
||||||
terms = []
|
terms = []
|
||||||
for cl in clauses:
|
for cl in clauses:
|
||||||
@ -356,24 +305,19 @@ def run_mapper_macro():
|
|||||||
continue
|
continue
|
||||||
terms.append(p)
|
terms.append(p)
|
||||||
|
|
||||||
# for each term, get hits/suggestions/ids
|
|
||||||
row_hits = []
|
row_hits = []
|
||||||
row_sugs = []
|
row_sugs = []
|
||||||
row_ids = []
|
unmapped_terms = []
|
||||||
any_unmapped = False # at least one term without hit and without suggestion
|
|
||||||
# We will record for each term
|
|
||||||
for term in terms:
|
for term in terms:
|
||||||
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
|
hits, sugs = map_term_with_indexes(term, norm_dict, lemma_index)
|
||||||
if hits:
|
if hits:
|
||||||
row_hits.extend(hits)
|
row_hits.extend(hits)
|
||||||
|
else:
|
||||||
|
unmapped_terms.append(term)
|
||||||
if sugs:
|
if sugs:
|
||||||
row_sugs.extend(sugs)
|
row_sugs.extend(sugs)
|
||||||
if ids:
|
|
||||||
row_ids.extend(ids)
|
|
||||||
if (not hits) and (not sugs):
|
|
||||||
any_unmapped = True
|
|
||||||
|
|
||||||
# deduplicate preserving order
|
|
||||||
def uniq(seq):
|
def uniq(seq):
|
||||||
seen = set()
|
seen = set()
|
||||||
out = []
|
out = []
|
||||||
@ -385,57 +329,30 @@ def run_mapper_macro():
|
|||||||
|
|
||||||
row_hits = uniq(row_hits)
|
row_hits = uniq(row_hits)
|
||||||
row_sugs = uniq(row_sugs)
|
row_sugs = uniq(row_sugs)
|
||||||
row_ids = uniq(row_ids)
|
|
||||||
|
|
||||||
# write outputs (unlimited lists, joined with " | ")
|
# Farb-Logik für Objektbeschreibung
|
||||||
try:
|
if terms and not unmapped_terms and row_hits:
|
||||||
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
|
cell.CellBackColor = GREEN
|
||||||
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
|
row_sugs = []
|
||||||
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
|
elif row_hits:
|
||||||
except Exception:
|
cell.CellBackColor = YELLOW
|
||||||
pass
|
else:
|
||||||
|
|
||||||
# Coloring rules per new spec:
|
|
||||||
# - Objektbeschreibung cell: RED if any_unmapped else no change (we do not color green/yellow here)
|
|
||||||
# - Norm_Treffer cell: GREEN if all terms matched (i.e., terms non-empty and no term unmapped and at least one hit per term)
|
|
||||||
# - Norm_Vorschlag cell: YELLOW if at least one suggestion exists
|
|
||||||
# Determine "all matched": terms non-empty and every term has at least one hit (we approximated by checking any_unmapped and hits length)
|
|
||||||
all_matched = False
|
|
||||||
if terms:
|
|
||||||
# all_matched if no term without hit and there is at least one hit overall
|
|
||||||
if (not any_unmapped) and row_hits:
|
|
||||||
all_matched = True
|
|
||||||
|
|
||||||
# apply colors
|
|
||||||
try:
|
|
||||||
if any_unmapped:
|
|
||||||
cell.CellBackColor = RED
|
cell.CellBackColor = RED
|
||||||
else:
|
|
||||||
# clear red if previously set? We'll leave unchanged if not set. Optionally set to default 16777215 (white)
|
# Ergebniszellen
|
||||||
pass
|
|
||||||
# Norm_Treffer coloring
|
|
||||||
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
|
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
|
||||||
if all_matched:
|
tr_cell.String = " | ".join(row_hits)
|
||||||
tr_cell.CellBackColor = GREEN
|
tr_cell.CellBackColor = GREEN if row_hits else WHITE
|
||||||
else:
|
|
||||||
# clear color if needed -> set to white
|
|
||||||
tr_cell.CellBackColor = 0xFFFFFF
|
|
||||||
# Norm_Vorschlag coloring
|
|
||||||
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
|
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
|
||||||
if row_sugs:
|
sug_cell.String = " | ".join(row_sugs)
|
||||||
sug_cell.CellBackColor = YELLOW
|
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
|
||||||
else:
|
|
||||||
sug_cell.CellBackColor = 0xFFFFFF
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
rows_processed += 1
|
rows_processed += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# continue processing other rows; log once
|
log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
|
||||||
log(f"Fehler in Zeile {r}: {e}")
|
|
||||||
|
|
||||||
# persist cache
|
|
||||||
try:
|
try:
|
||||||
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
||||||
json.dump(CACHE, f, ensure_ascii=False, indent=2)
|
json.dump(CACHE, f, ensure_ascii=False, indent=2)
|
||||||
@ -444,5 +361,5 @@ def run_mapper_macro():
|
|||||||
|
|
||||||
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
|
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
|
||||||
|
|
||||||
# Export for LO
|
# Export für LibreOffice
|
||||||
g_exportedScripts = (run_mapper_macro,)
|
g_exportedScripts = (run_mapper_macro,)
|
||||||
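For reference, the revised row colouring introduced in this hunk boils down to a small decision rule. The sketch below is illustrative only and not part of the committed file; the term lists are invented examples.

# Illustrative sketch of the revised colour logic (not part of the commit).
# GREEN: every term matched; YELLOW: partial matches; RED: no matches at all.
GREEN, YELLOW, RED = 0xADFF2F, 0xFFA500, 0xCC0000

def row_colour(terms, row_hits, unmapped_terms):
    """Mirrors the decision order used for the Objektbeschreibung cell above."""
    if terms and not unmapped_terms and row_hits:
        return GREEN
    if row_hits:
        return YELLOW
    return RED

# Hypothetical example rows:
print(hex(row_colour(["Teller", "Krug"], ["Teller (1.1)", "Krug (1.2)"], [])))  # green
print(hex(row_colour(["Teller", "Xyz"], ["Teller (1.1)"], ["Xyz"])))            # yellow
print(hex(row_colour(["Xyz"], [], ["Xyz"])))                                    # red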
455 Mapper_Makro_Alte_Versionen/mapper_macro_2.2.py Normal file
@@ -0,0 +1,455 @@
# -*- coding: utf-8 -*-
"""
LibreOffice/Excel Macro: NV_MASTER-Abgleich
Version: 2.3
Pfad: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py

Beschreibung:
-------------
Dieses Python-Makro für LibreOffice/Excel führt einen Abgleich von Begriffen
aus einem aktiven Sheet gegen ein zentral gepflegtes NV_MASTER-Vokabular durch.
Es erstellt Treffer, Vorschläge und markiert die Zellen farblich.

Hauptfunktionen:
----------------
1. Text-Normalisierung und Lemma-Bestimmung
2. Laden des NV_MASTER-Vokabulars und Aufbau von Norm-Index + Lemma-Index
3. Fuzzy Matching (RapidFuzz oder difflib) für Begriffe
4. Treffer- und Vorschlagsbestimmung
5. Mapping auf Sheet:
   - Norm_Treffer (grün)
   - Norm_Vorschlag (gelb)
   - Kein_Treffer (rot)
6. Caching zur Vermeidung mehrfacher Berechnungen
7. Logging in externe Datei

Externe Abhängigkeiten:
-----------------------
- pandas (für ODS/Excel-Leseoperationen)
- spacy (für deutsche Lemma-Bestimmung)
- rapidfuzz (optional für schnellere Fuzzy-String-Matches)

UNO-spezifische Objekte:
------------------------
- XSCRIPTCONTEXT: Bereitgestellt durch LibreOffice zur Laufzeit

Schwachstellen / Optimierungsansätze:
-------------------------------------
- Fehlerbehandlung ist robust, aber teilweise sehr still (z.B. Cache-Fehler, Pandas-Fehler).
- Schleifen über Zellen sind bei großen Sheets langsam (potenziell durch pandas vollständig ersetzen).
- Lemma-Berechnung könnte nur einmal für NV_MASTER und einmal für Sheet durchgeführt werden.
- RapidFuzz optional; fallback auf SequenceMatcher ist deutlich langsamer.
- Cache wird nur am Ende geschrieben; Absturz vor Ende verliert bisherige Ergebnisse.
- Farbwerte sind fest codiert; parametrisieren könnte Flexibilität erhöhen.
- Stopwords sind hart codiert; konfigurierbar wäre effizienter.
- Es werden keine parallelen Abfragen / Batch-Operationen verwendet.
- Logging nur in Datei; LibreOffice-eigene Meldungen oder Fortschrittsanzeige fehlen.
"""

import os
import re
import json
import traceback

# UNO-Context wird zur Laufzeit von LibreOffice bereitgestellt
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except Exception:
    PANDAS_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False
    from difflib import SequenceMatcher

# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")

STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75  # Basis-Schwelle für Vorschläge

# ------------------------
# Logging-Funktion
# ------------------------
def log(msg):
    """Schreibt Nachricht in LOG_FILE. Fehler werden ignoriert."""
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(msg + "\n")
    except Exception:
        pass

# ------------------------
# Cache laden
# ------------------------
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception:
    CACHE = {}

# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
    """Entfernt Sonderzeichen, multiple Whitespaces, wandelt in lowercase."""
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}
def lemmatize_term(term):
    """Lemmatisiert einen Begriff mit SpaCy. Falls nicht verfügbar, Rückgabe Normalized String."""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

# ------------------------
# NV_MASTER laden
# ------------------------
def build_norm_index(nv_path):
    """
    Liest NV_MASTER ein und erstellt:
    - norm_dict: Normalisierte Begriffe -> Einträge mit Name, ID, Sheet
    - lemma_index: Lemma -> Einträge
    """
    norm_dict = {}
    lemma_index = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
        return norm_dict, lemma_index
    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen NV_MASTER: {e}")
        return norm_dict, lemma_index

    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]

        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index

# ------------------------
# Matching-Funktionen
# ------------------------
def fuzzy_score(a, b):
    """Berechnet Fuzzy-Score zwischen zwei Strings. RapidFuzz oder fallback SequenceMatcher."""
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_set_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        try:
            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
        except Exception:
            return 0.0

def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    """
    Liefert Vorschläge für ein Lemma, wenn kein exakter Treffer existiert.
    Score-basierte Sortierung, Duplikate werden entfernt.
    """
    candidates = []
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append({"score": score, "name": name, "id": id_})
    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]

def map_term_with_indexes(term, norm_dict, lemma_index):
    """
    Mappt einen Term auf NV_MASTER:
    - Treffer
    - Vorschläge
    - IDs
    Nutzt Cache, um Wiederholungen zu vermeiden.
    """
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    if term_lemma in CACHE:
        cached = CACHE[term_lemma]
        return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
    hits = []
    suggestions = []
    ids = []
    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    if not hits:
        suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)

    # Duplikate entfernen
    def unique_preserve(seq):
        seen = set()
        out = []
        for x in seq:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out

    hits = unique_preserve(hits)
    suggestions = unique_preserve(suggestions)
    ids = unique_preserve(ids)
    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids

# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
    """
    Haupt-Makro für LibreOffice:
    1. Bestimmt Header + Spalten
    2. Fügt Spalten für Norm_Treffer, Norm_Vorschlag, Kein_Treffer hinzu
    3. Liest NV_MASTER und baut Indizes
    4. Iteriert über Zeilen und Terms
    5. Markiert Zellen farblich (grün/gelb/rot)
    6. Schreibt Cache am Ende
    """
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
    except Exception as e:
        log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
        return

    # Header finden
    header_row = None
    objekt_col = None
    max_col = data_range.EndColumn
    for r in range(0, min(5, data_range.EndRow+1)):
        for c in range(0, max_col+1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            if val == "objektbeschreibung":
                header_row = r
                objekt_col = c
                break
        if objekt_col is not None:
            break
    if objekt_col is None:
        log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
        return

    # Spalten anlegen, falls nicht vorhanden
    existing = {}
    for c in range(0, data_range.EndColumn+1):
        try:
            h = str(sheet.getCellByPosition(c, header_row).String).strip()
        except Exception:
            h = ""
        if h == "Norm_Treffer":
            existing["Norm_Treffer"] = c
        if h == "Norm_Vorschlag":
            existing["Norm_Vorschlag"] = c
    last_col = data_range.EndColumn
    if "Norm_Treffer" not in existing:
        last_col += 1
        existing["Norm_Treffer"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
    if "Norm_Vorschlag" not in existing:
        last_col += 1
        existing["Norm_Vorschlag"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
    if "Kein_Treffer" not in existing:
        last_col += 1
        existing["Kein_Treffer"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"

    norm_tr_col = existing["Norm_Treffer"]
    norm_sug_col = existing["Norm_Vorschlag"]
    kein_tr_col = existing["Kein_Treffer"]

    # NV_MASTER laden
    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    if not norm_dict and not lemma_index:
        log("NV_MASTER leer oder nicht lesbar. Abbruch.")
        return

    # Farben
    GREEN = 0xADFF2F
    YELLOW = 0xFFA500
    RED = 0xCC0000
    WHITE = 0xFFFFFF

    rows_processed = 0
    for r in range(header_row + 1, data_range.EndRow + 1):
        try:
            cell = sheet.getCellByPosition(objekt_col, r)
            txt = str(cell.String).strip()
            if not txt:
                continue

            # Term-Extraktion
            clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
            terms = []
            for cl in clauses:
                parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)

            row_hits = []
            row_sugs = []
            row_ids = []
            unmapped_terms = []

            for term in terms:
                hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
                if hits:
                    row_hits.extend([f"{h} ({id_})" if id_ else h for h,id_ in zip(hits, ids + [""]*len(hits))])
                else:
                    unmapped_terms.append(term)
                if sugs:
                    row_sugs.extend([f"{s}" for s in sugs])
                if ids:
                    row_ids.extend(ids)

            def uniq(seq):
                seen = set()
                out = []
                for x in seq:
                    if x not in seen:
                        seen.add(x)
                        out.append(x)
                return out

            row_hits = uniq(row_hits)
            row_sugs = uniq(row_sugs)
            unmapped_terms = uniq(unmapped_terms)

            # Farb-Logik
            if terms and not unmapped_terms and row_hits:
                cell.CellBackColor = GREEN
                row_sugs = []  # keine Vorschläge wenn alles Treffer
            elif row_hits:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED

            # Ergebnisse schreiben
            tr_cell = sheet.getCellByPosition(norm_tr_col, r)
            tr_cell.String = " | ".join(row_hits)
            tr_cell.CellBackColor = GREEN if row_hits else WHITE

            sug_cell = sheet.getCellByPosition(norm_sug_col, r)
            sug_cell.String = " | ".join(row_sugs)
            sug_cell.CellBackColor = YELLOW if row_sugs else WHITE

            kt_cell = sheet.getCellByPosition(kein_tr_col, r)
            kt_cell.String = " | ".join(unmapped_terms)
            kt_cell.CellBackColor = RED if unmapped_terms else WHITE

            rows_processed += 1

        except Exception as e:
            log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")

    # Cache speichern
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
    except Exception:
        pass

    log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")

# Export für LibreOffice
g_exportedScripts = (run_mapper_macro,)
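The matching core of the macro above (normalize/lemmatize, fuzzy score, threshold, prefix bonus) can be tried outside LibreOffice. Below is a minimal standalone sketch assuming only the difflib fallback path and an invented two-entry vocabulary; it is illustrative and not part of the committed file.

# Minimal standalone sketch of the fuzzy-suggestion idea (assumptions: difflib
# fallback, invented vocabulary entries; not the committed macro code).
from difflib import SequenceMatcher

CONF_THRESHOLD = 0.75

def fuzzy_score(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

vocab = {"kaffeemühle": ("Kaffeemühle", "1.1"), "kaffeekanne": ("Kaffeekanne", "1.2")}

def suggestions(term):
    out = []
    for key, (name, id_) in vocab.items():
        score = fuzzy_score(term, key)
        if key.startswith(term):              # same prefix bonus as in the macro
            score = min(score + 0.1, 1.0)
        if score >= CONF_THRESHOLD:
            out.append((round(score, 2), f"{name} ({id_})"))
    return sorted(out, reverse=True)

print(suggestions("kaffemühle"))   # the typo still scores above the 0.75 threshold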
BIN NV_MASTER.ods
Binary file not shown.
@@ -1,171 +0,0 @@
import os
import re
import logging
import datetime
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf

# ----------------- KONFIGURATION -----------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
MASTER_SHEET_NAME = "Masterstruktur"
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ----------------- HILFSFUNKTIONEN -----------------

def load_file(input_file):
    """
    Prüft Dateiformat und gibt für Excel: pd.ExcelFile + Engine zurück,
    für ODS: None + "odf" (da ODS direkt über ezodf gelesen wird).
    """
    ext = os.path.splitext(input_file)[1].lower()
    if ext in [".xlsx", ".xls"]:
        engine = "openpyxl"
        xls = pd.ExcelFile(input_file, engine=engine)
    elif ext == ".ods":
        engine = "odf"
        xls = None  # ODS wird direkt über ezodf gelesen
    else:
        raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")
    logging.info(f"Lade Datei {input_file} mit Engine '{engine}'")
    return xls, engine

def read_ods_sheet(filename, sheet_name):
    """Liests ODS Sheet sauber ein, inklusive Header."""
    doc = ezodf.opendoc(filename)
    sheet = doc.sheets[sheet_name]
    data = []
    headers = [str(sheet[0, col].value).strip() for col in range(sheet.ncols())]
    for row_idx in range(1, sheet.nrows()):
        row = {}
        empty_row = True
        for col_idx, col_name in enumerate(headers):
            cell_val = sheet[row_idx, col_idx].value
            val = "" if cell_val is None else str(cell_val).strip()
            row[col_name] = val
            if val:
                empty_row = False
        if not empty_row:
            data.append(row)
    df = pd.DataFrame(data, columns=headers)
    return df

def process_category_sheet(df):
    """Erstellt die treppenartige Hierarchie."""
    df = df.copy()
    for col in ["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"]:
        if col not in df.columns:
            df[col] = ""
    rows = []
    current_id = ""
    current_uuk = ""
    for _, r in df.iterrows():
        id_val = str(r.get("ID","")).strip()
        uuk_val = str(r.get("Unterunterkategorie","")).strip()
        word_val = str(r.get("Wort/Vokabel","")).strip()

        if id_val:  # Kategoriezeile
            current_id = id_val
            current_uuk = uuk_val or word_val
            rows.append({"ID": current_id, "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
            continue
        if uuk_val:  # Unterunterkategorie
            current_uuk = uuk_val
            rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
            continue
        if word_val:  # Vokabel
            rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": word_val})
            continue
    return pd.DataFrame(rows, columns=["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"])

def remove_empty_vocabulary_rows(df):
    """Entfernt Zeilen, die nur leere Wort/Vokabel-Spalte haben."""
    return df[df["Wort/Vokabel"].astype(str).str.strip() != ""].copy().reset_index(drop=True)

def sync_master_and_sheets(master_df, category_dfs):
    """Synchronisiert Kategorien nach Master, Vokabeln bleiben erhalten."""
    master_df = master_df.copy()
    master_df["ID"] = master_df["ID"].astype(str).str.strip()
    master_dict = dict(zip(master_df["ID"], master_df["Kategorie"]))
    updated_dfs = {}
    summary = {}

    for sheet_name, df in category_dfs.items():
        rows_out = []
        changes = {"removed":0}
        for _, row in df.iterrows():
            id_val = str(row.get("ID","")).strip()
            if id_val and id_val not in master_dict:
                changes["removed"] +=1
                continue
            rows_out.append(row.to_dict())
        updated_dfs[sheet_name] = pd.DataFrame(rows_out, columns=df.columns)
        summary[sheet_name] = changes

    new_master = pd.DataFrame([{"ID":k,"Kategorie":v} for k,v in sorted(master_dict.items())])
    return new_master, updated_dfs, summary

def save_excel(processed_sheets, output_file):
    from openpyxl import Workbook
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in processed_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            ws = writer.sheets[sheet_name]
            for col_idx, col in enumerate(df.columns,1):
                max_len = max(df[col].astype(str).map(len).max() if len(df)>0 else 0,len(col))+2
                ws.column_dimensions[get_column_letter(col_idx)].width = max_len
                for row_idx in range(1,len(df)+2):
                    ws.cell(row=row_idx,column=col_idx).alignment = Alignment(horizontal='left')

def save_ods(processed_sheets, output_file):
    doc = ezodf.newdoc(doctype="ods", filename=output_file)
    for name, df in processed_sheets.items():
        sheet = ezodf.Sheet(name, size=(len(df)+1,len(df.columns)))
        doc.sheets += sheet
        for col_idx, col_name in enumerate(df.columns):
            sheet[0,col_idx].set_value(col_name)
        for row_idx,row in enumerate(df.itertuples(index=False),start=1):
            for col_idx,value in enumerate(row):
                sheet[row_idx,col_idx].set_value("" if pd.isna(value) else value)
    doc.save()

# ----------------- HAUPTPROGRAMM -----------------
def main():
    xls, engine = load_file(INPUT_FILE)
    if engine == "odf":
        doc = ezodf.opendoc(INPUT_FILE)
        sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
        category_dfs = {name: process_category_sheet(read_ods_sheet(INPUT_FILE,name)) for name in sheet_names}
        master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
    else:
        sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
        category_dfs = {}
        for sheet_name in sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
            df.columns = [str(c).strip() for c in df.columns]
            category_dfs[sheet_name] = process_category_sheet(df)
        master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
        master_df.columns = [str(c).strip() for c in master_df.columns]

    new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)
    processed_sheets = {MASTER_SHEET_NAME:new_master}
    processed_sheets.update({k:remove_empty_vocabulary_rows(v) for k,v in updated_dfs.items()})

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx",".xls"]:
        save_excel(processed_sheets, OUTPUT_FILE)
    else:
        save_ods(processed_sheets, OUTPUT_FILE)

    logging.info(f"Datei gespeichert: {OUTPUT_FILE}")
    logging.info("===== SYNC SUMMARY =====")
    for sheet, info in summary.items():
        logging.info(f"{sheet}: {info}")

if __name__ == "__main__":
    main()
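The core rule of the deleted script's sync_master_and_sheets is simply: rows whose ID is missing from the Masterstruktur are dropped, while rows without an ID (plain vocabulary rows) are kept. A small sketch with invented sample data, independent of pandas:

# Sketch only (invented sample data): the filtering rule behind sync_master_and_sheets.
master = {"1.1": "Hausrat", "2.4": "Werkzeug"}           # ID -> Kategorie (Masterstruktur)
rows = [
    {"ID": "1.1", "Wort/Vokabel": ""},                   # category row, ID known -> keep
    {"ID": "",    "Wort/Vokabel": "Kaffeemühle"},        # vocabulary row, no ID  -> keep
    {"ID": "9.9", "Wort/Vokabel": ""},                   # ID not in master       -> drop
]
kept = [r for r in rows if not r["ID"] or r["ID"] in master]
removed = len(rows) - len(kept)
print(kept, {"removed": removed})                        # mirrors the per-sheet summary dict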
@ -1,3 +1,32 @@
|
|||||||
|
"""
|
||||||
|
===============================================================================
|
||||||
|
Skriptname: NV_SPOT_Export.py
|
||||||
|
Beschreibung:
|
||||||
|
Dieses Skript soll hierarchische Normvokabular-Tabellen
|
||||||
|
(ODS/XLSX-Format) in eine JSON-basierte SPOT-Struktur (Strukturierter
|
||||||
|
Positionsbaum) konvertieren. Es ermöglicht das Exportieren in Excel und ODS, sowie
|
||||||
|
das nachträgliche Ergänzen von Kategorien, Unterkategorien und Wörtern.
|
||||||
|
|
||||||
|
//NOCH NICHT GETESTET//
|
||||||
|
|
||||||
|
Hauptfunktionen:
|
||||||
|
- Node: Klasse zur Repräsentation von Baumknoten.
|
||||||
|
- load_excel_or_ods: Lädt Tabellen aus ODS/XLSX-Dateien.
|
||||||
|
- process_sheet_to_tree: Erzeugt eine Baumstruktur aus einem Sheet.
|
||||||
|
- save_spot_json: Speichert den SPOT-Baum als JSON.
|
||||||
|
- load_spot_json: Lädt SPOT-Daten aus JSON-Dateien.
|
||||||
|
- export_spot_to_excel: Exportiert den SPOT-Baum nach Excel.
|
||||||
|
- export_spot_to_ods: Exportiert den SPOT-Baum nach ODS.
|
||||||
|
- add_category/subcategory/word: Fügt Elemente im Baum hinzu.
|
||||||
|
- main: Steuert den Workflow.
|
||||||
|
|
||||||
|
Abhängigkeiten:
|
||||||
|
Python 3.x, pandas, openpyxl, ezodf, json, logging, datetime
|
||||||
|
|
||||||
|
Stand: 2025-10-01
|
||||||
|
===============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
@ -12,6 +41,20 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
|
|||||||
|
|
||||||
# ---------------- SPOT-Baumstruktur ----------------
|
# ---------------- SPOT-Baumstruktur ----------------
|
||||||
class Node:
|
class Node:
|
||||||
|
"""
|
||||||
|
Repräsentiert einen Knoten in der SPOT-Baumstruktur.
|
||||||
|
|
||||||
|
Attribute:
|
||||||
|
name (str): Anzeigename des Knotens.
|
||||||
|
id (str): Optionale ID (nur für Kategorien).
|
||||||
|
type (str): Knotentyp ("category", "subcategory", "word").
|
||||||
|
children (list[Node]): Unterknoten.
|
||||||
|
|
||||||
|
Methoden:
|
||||||
|
add_child(child): Fügt einen Unterknoten hinzu.
|
||||||
|
to_dict(): Serialisiert den Knoten in ein Dictionary/JSON-kompatibles Format.
|
||||||
|
from_dict(d): Rekonstruiert den Baum aus einem Dictionary.
|
||||||
|
"""
|
||||||
def __init__(self, name, node_type="category", id=None):
|
def __init__(self, name, node_type="category", id=None):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.id = id
|
self.id = id
|
||||||
@ -19,9 +62,11 @@ class Node:
|
|||||||
self.children = []
|
self.children = []
|
||||||
|
|
||||||
def add_child(self, child):
|
def add_child(self, child):
|
||||||
|
"""Fügt dem aktuellen Knoten einen Unterknoten hinzu."""
|
||||||
self.children.append(child)
|
self.children.append(child)
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
"""Wandelt den Knoten (rekursiv) in ein Dictionary um."""
|
||||||
if self.type == "word":
|
if self.type == "word":
|
||||||
return self.name
|
return self.name
|
||||||
return {
|
return {
|
||||||
@ -33,14 +78,26 @@ class Node:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_dict(d):
|
def from_dict(d):
|
||||||
|
"""Erzeugt aus einem Dictionary ein Node-Objekt (rekursiv)."""
|
||||||
if isinstance(d, str):
|
if isinstance(d, str):
|
||||||
return Node(d, "word")
|
return Node(d, "word")
|
||||||
node = Node(d["name"], d.get("type", "category"), d.get("id"))
|
node = Node(d["name"], d.get("type", "category"), d.get("id"))
|
||||||
node.children = [Node.from_dict(c) for c in d.get("children", [])]
|
node.children = [Node.from_dict(c) for c in d.get("children", [])]
|
||||||
return node
|
return node
|
||||||
|
|
||||||
|
|
||||||
# ---------------- Funktionen zum Laden ----------------
|
# ---------------- Funktionen zum Laden ----------------
|
||||||
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
|
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
|
||||||
|
"""
|
||||||
|
Lädt ODS oder Excel-Datei und gibt Master- sowie Kategorien-DataFrames zurück.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
input_file (str): Pfad zur Quelldatei.
|
||||||
|
master_sheet (str): Name des Masterblattes.
|
||||||
|
|
||||||
|
Rückgabe:
|
||||||
|
(master_df, dfs): Master-DataFrame und Dictionary mit anderen Sheets.
|
||||||
|
"""
|
||||||
ext = os.path.splitext(input_file)[1].lower()
|
ext = os.path.splitext(input_file)[1].lower()
|
||||||
engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
|
engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
|
||||||
xls = pd.ExcelFile(input_file, engine=engine)
|
xls = pd.ExcelFile(input_file, engine=engine)
|
||||||
@ -49,26 +106,44 @@ def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
|
|||||||
master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
|
master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
|
||||||
return master_df, dfs
|
return master_df, dfs
|
||||||
|
|
||||||
|
|
||||||
# ---------------- Baum aus Sheet erstellen ----------------
|
# ---------------- Baum aus Sheet erstellen ----------------
|
||||||
def process_sheet_to_tree(df):
|
def process_sheet_to_tree(df):
|
||||||
|
"""
|
||||||
|
Wandelt ein Kategoriensheet in eine hierarchische Baumstruktur (Liste von Nodes) um.
|
||||||
|
|
||||||
|
Struktur:
|
||||||
|
Kategorie → Unterkategorie → Wort
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
df (pd.DataFrame): Eingabedaten mit Spalten ["ID", "Unterkategorie",
|
||||||
|
"Unterunterkategorie", "Wort/Vokabel"].
|
||||||
|
|
||||||
|
Rückgabe:
|
||||||
|
list[Node]: Liste von Baumknoten der obersten Ebene.
|
||||||
|
"""
|
||||||
df = df.fillna("").astype(str)
|
df = df.fillna("").astype(str)
|
||||||
tree_nodes = []
|
tree_nodes = []
|
||||||
current_cat = None
|
current_cat = None
|
||||||
current_sub = None
|
current_sub = None
|
||||||
|
|
||||||
for idx, row in df.iterrows():
|
for idx, row in df.iterrows():
|
||||||
id_val = row.get("ID", "").strip()
|
id_val = row.get("ID", "").strip()
|
||||||
uk_val = row.get("Unterkategorie", "").strip()
|
uk_val = row.get("Unterkategorie", "").strip()
|
||||||
uuk_val = row.get("Unterunterkategorie", "").strip()
|
uuk_val = row.get("Unterunterkategorie", "").strip()
|
||||||
word_val = row.get("Wort/Vokabel", "").strip()
|
word_val = row.get("Wort/Vokabel", "").strip()
|
||||||
|
|
||||||
|
# Neue Kategorieebene
|
||||||
if id_val:
|
if id_val:
|
||||||
current_cat = Node(uk_val or word_val, "category", id=id_val)
|
current_cat = Node(uk_val or word_val, "category", id=id_val)
|
||||||
tree_nodes.append(current_cat)
|
tree_nodes.append(current_cat)
|
||||||
current_sub = None
|
current_sub = None
|
||||||
|
# Unterkategorie
|
||||||
elif uuk_val:
|
elif uuk_val:
|
||||||
current_sub = Node(uuk_val, "subcategory")
|
current_sub = Node(uuk_val, "subcategory")
|
||||||
if current_cat:
|
if current_cat:
|
||||||
current_cat.add_child(current_sub)
|
current_cat.add_child(current_sub)
|
||||||
|
# Wortebene
|
||||||
elif word_val:
|
elif word_val:
|
||||||
word_node = Node(word_val, "word")
|
word_node = Node(word_val, "word")
|
||||||
if current_sub:
|
if current_sub:
|
||||||
@ -77,28 +152,60 @@ def process_sheet_to_tree(df):
|
|||||||
current_cat.add_child(word_node)
|
current_cat.add_child(word_node)
|
||||||
return tree_nodes
|
return tree_nodes
|
||||||
|
|
||||||
|
|
||||||
# ---------------- SPOT laden/speichern ----------------
|
# ---------------- SPOT laden/speichern ----------------
|
||||||
def save_spot_json(tree_nodes, file_path):
|
def save_spot_json(tree_nodes, file_path):
|
||||||
|
"""
|
||||||
|
Speichert den SPOT-Baum als JSON-Datei.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
tree_nodes (list[Node]): Wurzelknoten der Baumstruktur.
|
||||||
|
file_path (str): Zielpfad.
|
||||||
|
"""
|
||||||
with open(file_path, "w", encoding="utf-8") as f:
|
with open(file_path, "w", encoding="utf-8") as f:
|
||||||
json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
|
json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
|
||||||
logging.info(f"SPOT gespeichert: {file_path}")
|
logging.info(f"SPOT gespeichert: {file_path}")
|
||||||
|
|
||||||
|
|
||||||
def load_spot_json(file_path):
|
def load_spot_json(file_path):
|
||||||
|
"""
|
||||||
|
Lädt SPOT-JSON-Datei und rekonstruiert den Baum.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
file_path (str): Pfad zur JSON-Datei.
|
||||||
|
|
||||||
|
Rückgabe:
|
||||||
|
list[Node]: Liste oberster Knoten.
|
||||||
|
"""
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
return [Node.from_dict(n) for n in data]
|
return [Node.from_dict(n) for n in data]
|
||||||
|
|
||||||
|
|
||||||
# ---------------- Export in Excel ----------------
|
# ---------------- Export in Excel ----------------
|
||||||
def export_spot_to_excel(tree_nodes, output_file):
|
def export_spot_to_excel(tree_nodes, output_file):
|
||||||
|
"""
|
||||||
|
Exportiert den SPOT-Baum in eine Excel-Datei.
|
||||||
|
|
||||||
|
Struktur:
|
||||||
|
Spalten A–D: ID, Kategorie, Unterkategorie, Wort.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
tree_nodes (list[Node]): Baumstruktur.
|
||||||
|
output_file (str): Zielpfad der Excel-Datei.
|
||||||
|
"""
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
wb.remove(wb.active)
|
wb.remove(wb.active)
|
||||||
|
|
||||||
for node in tree_nodes:
|
for node in tree_nodes:
|
||||||
ws = wb.create_sheet(title=node.name[:31])
|
ws = wb.create_sheet(title=node.name[:31])
|
||||||
row_idx = 1
|
row_idx = 1
|
||||||
# Kategorie
|
|
||||||
|
# Kategoriezeile
|
||||||
ws.cell(row=row_idx, column=1, value=node.id)
|
ws.cell(row=row_idx, column=1, value=node.id)
|
||||||
ws.cell(row=row_idx, column=2, value=node.name)
|
ws.cell(row=row_idx, column=2, value=node.name)
|
||||||
row_idx += 1
|
row_idx += 1
|
||||||
|
|
||||||
for sub in node.children:
|
for sub in node.children:
|
||||||
if sub.type == "subcategory":
|
if sub.type == "subcategory":
|
||||||
ws.cell(row=row_idx, column=3, value=sub.name)
|
ws.cell(row=row_idx, column=3, value=sub.name)
|
||||||
@ -109,54 +216,99 @@ def export_spot_to_excel(tree_nodes, output_file):
|
|||||||
elif sub.type == "word":
|
elif sub.type == "word":
|
||||||
ws.cell(row=row_idx, column=4, value=sub.name)
|
ws.cell(row=row_idx, column=4, value=sub.name)
|
||||||
row_idx += 1
|
row_idx += 1
|
||||||
# Spaltenbreiten anpassen
|
|
||||||
for col_idx, col_letter in enumerate(["A","B","C","D"],1):
|
# Spaltenbreiten und Ausrichtung
|
||||||
|
for col_idx, col_letter in enumerate(["A", "B", "C", "D"], 1):
|
||||||
ws.column_dimensions[col_letter].width = 20
|
ws.column_dimensions[col_letter].width = 20
|
||||||
for r in range(1,row_idx):
|
for r in range(1, row_idx):
|
||||||
ws.cell(r,col_idx).alignment = Alignment(horizontal='left')
|
ws.cell(r, col_idx).alignment = Alignment(horizontal='left')
|
||||||
|
|
||||||
wb.save(output_file)
|
wb.save(output_file)
|
||||||
logging.info(f"Excel exportiert: {output_file}")
|
logging.info(f"Excel exportiert: {output_file}")
|
||||||
|
|
||||||
|
|
||||||
# ---------------- Export in ODS ----------------
|
# ---------------- Export in ODS ----------------
|
||||||
def export_spot_to_ods(tree_nodes, output_file):
|
def export_spot_to_ods(tree_nodes, output_file):
|
||||||
|
"""
|
||||||
|
Exportiert den SPOT-Baum in eine ODS-Datei.
|
||||||
|
|
||||||
|
Struktur analog zum Excel-Export.
|
||||||
|
|
||||||
|
Parameter:
|
||||||
|
tree_nodes (list[Node]): Baumstruktur.
|
||||||
|
output_file (str): Zielpfad der ODS-Datei.
|
||||||
|
"""
|
||||||
doc = ezodf.newdoc(doctype="ods", filename=output_file)
|
doc = ezodf.newdoc(doctype="ods", filename=output_file)
|
||||||
|
|
||||||
for node in tree_nodes:
|
for node in tree_nodes:
|
||||||
sheet = ezodf.Sheet(node.name[:31], size=(len(node.children)+10,4))
|
sheet = ezodf.Sheet(node.name[:31], size=(len(node.children) + 10, 4))
|
||||||
doc.sheets += sheet
|
doc.sheets += sheet
|
||||||
sheet[0,0].set_value("ID")
|
|
||||||
sheet[0,1].set_value("Unterkategorie")
|
sheet[0, 0].set_value("ID")
|
||||||
sheet[0,2].set_value("Unterunterkategorie")
|
sheet[0, 1].set_value("Unterkategorie")
|
||||||
sheet[0,3].set_value("Wort/Vokabel")
|
sheet[0, 2].set_value("Unterunterkategorie")
|
||||||
|
sheet[0, 3].set_value("Wort/Vokabel")
|
||||||
|
|
||||||
row_idx = 1
|
row_idx = 1
|
||||||
sheet[row_idx,0].set_value(node.id)
|
sheet[row_idx, 0].set_value(node.id)
|
||||||
sheet[row_idx,1].set_value(node.name)
|
sheet[row_idx, 1].set_value(node.name)
|
||||||
row_idx +=1
|
row_idx += 1
|
||||||
|
|
||||||
for sub in node.children:
|
for sub in node.children:
|
||||||
if sub.type == "subcategory":
|
if sub.type == "subcategory":
|
||||||
sheet[row_idx,2].set_value(sub.name)
|
sheet[row_idx, 2].set_value(sub.name)
|
||||||
row_idx +=1
|
row_idx += 1
|
||||||
for word in sub.children:
|
for word in sub.children:
|
||||||
sheet[row_idx,3].set_value(word.name)
|
sheet[row_idx, 3].set_value(word.name)
|
||||||
row_idx +=1
|
row_idx += 1
|
||||||
elif sub.type == "word":
|
elif sub.type == "word":
|
||||||
sheet[row_idx,3].set_value(sub.name)
|
sheet[row_idx, 3].set_value(sub.name)
|
||||||
row_idx +=1
|
row_idx += 1
|
||||||
|
|
||||||
doc.save()
|
doc.save()
|
||||||
logging.info(f"ODS exportiert: {output_file}")
|
logging.info(f"ODS exportiert: {output_file}")
|
||||||
|
|
||||||
|
|
||||||
# ---------------- CLI-Funktionen zum Editieren ----------------
def add_category(tree_nodes, cat_id, cat_name):
    """
    Fügt eine neue Kategorie zum SPOT-Baum hinzu.

    Parameter:
        tree_nodes (list[Node]): Liste der obersten Knoten.
        cat_id (str): ID der Kategorie.
        cat_name (str): Name der Kategorie.
    """
    tree_nodes.append(Node(cat_name, "category", id=cat_id))
    logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}")


def add_subcategory(tree_nodes, cat_id, sub_name):
    """
    Fügt einer vorhandenen Kategorie eine Unterkategorie hinzu.

    Parameter:
        tree_nodes (list[Node]): Wurzelknoten.
        cat_id (str): Zielkategorie-ID.
        sub_name (str): Name der Unterkategorie.
    """
    for cat in tree_nodes:
        if cat.id == cat_id:
            cat.add_child(Node(sub_name, "subcategory"))
            logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}")
            return


def add_word(tree_nodes, cat_id, sub_name, word_name):
    """
    Fügt einem Unterknoten ein Wort hinzu.

    Parameter:
        tree_nodes (list[Node]): Wurzelknoten.
        cat_id (str): ID der Kategorie.
        sub_name (str): Name der Unterkategorie.
        word_name (str): Neues Wort.
    """
    for cat in tree_nodes:
        if cat.id == cat_id:
            for sub in cat.children:
@@ -165,9 +317,18 @@ def add_word(tree_nodes, cat_id, sub_name, word_name):
                logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}")
                return


# ---------------- HAUPTPROGRAMM ----------------
def main():
    """
    Ablauf:
    1. Liest Masterdatei (ODS oder XLSX).
    2. Wandelt Kategorienblätter in SPOT-Struktur um.
    3. Speichert SPOT als JSON.
    4. Exportiert SPOT nach Excel und ODS.
    5. Optional: Bearbeiten des Baums über CLI-Funktionen.
    """
    INPUT_FILE = "NV_MASTER.ods"
    OUTPUT_SPOT = "nv_spot.json"
    today = datetime.datetime.today().strftime("%y.%m.%d")
    OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx"
@@ -177,9 +338,10 @@ def main():
    spot_tree = []
    for sheet, df in dfs.items():
        spot_tree.extend(process_sheet_to_tree(df))

    save_spot_json(spot_tree, OUTPUT_SPOT)

    # Beispielhafte Nutzung der Editierfunktionen:
    # add_category(spot_tree, "10.1", "Neue Kategorie")
    # add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie")
    # add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort")
@@ -188,5 +350,6 @@ def main():
    export_spot_to_ods(spot_tree, OUTPUT_ODS)
    logging.info("SPOT-Workflow abgeschlossen.")


if __name__ == "__main__":
    main()

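# Minimal sketch of the Node type assumed by the tree functions above. The actual class is not
# shown in this excerpt; attribute and method names are inferred from their use and should be
# read as an illustration, not the original definition:
#
# class Node:
#     def __init__(self, name, type, id=None):
#         self.name = name
#         self.type = type          # "category", "subcategory" or "word"
#         self.id = id
#         self.children = []
#
#     def add_child(self, child):
#         self.children.append(child)
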
@@ -1,13 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper – Version 1.4.2

Dieses Skript normalisiert und mappt Begriffe aus Input-Dateien auf ein zentrales Normvokabular
und führt optional API-Abgleiche mit GND und Wikidata durch. Ergebnisse werden in Excel/ODS gespeichert.
"""

from __future__ import annotations
@@ -25,50 +22,52 @@ from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime

# Optional Libraries
try:
    from rapidfuzz import fuzz  # für schnellere String-Similarity
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    RAPIDFUZZ_AVAILABLE = False

try:
    import spacy
    nlp = spacy.load("de_core_news_sm")  # deutsche Lemmatisierung
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# =========================
# Konfiguration & Pfade
# =========================
INPUT_DIR = Path("Input CSV")                 # Eingabeverzeichnis
OUTPUT_DIR = Path("Auswertung Ergebnisse")    # Ausgabeordner
OUTPUT_DIR.mkdir(exist_ok=True)               # Verzeichnis erstellen, falls nicht vorhanden
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")  # Normvokabular-Datei
CACHE_FILE = "api_cache.json"                 # Cache für API-Antworten
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75                         # Threshold für Vorschläge
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}  # API-Verfügbarkeit
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# Logging-Parameter
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100        # Anzahl Logs vor Flush
LOG_FLUSH_INTERVAL = 5.0    # Sekunden zwischen Flushes
LOG_LEVEL = "DEBUG"         # Logging-Level

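# A rough sketch of what these retry defaults imply (assumption: request_with_retries_generic
# further below backs off exponentially, i.e. waits of BACKOFF_FACTOR_DEFAULT ** retry):
# with MAX_RETRIES_DEFAULT = 3 and BACKOFF_FACTOR_DEFAULT = 2, a failing request would wait
# roughly 2 s and then 4 s between attempts before the call finally gives up.
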
# =========================
# Batch/Buffered Logger
# =========================
class BatchLogger:
    """
    Buffered Logger: Speichert Logs in einem Queue-Buffer und schreibt sie periodisch in Datei und Konsole.
    Reduziert I/O-Aufwand bei vielen Logs.
    """
    def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
        self.logfile = logfile
        self.flush_interval = flush_interval
@@ -77,7 +76,7 @@ class BatchLogger:
        self.q = queue.Queue()
        self._stop_event = threading.Event()
        self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
        # Sicherstellen, dass die Log-Datei existiert
        try:
            logfile.parent.mkdir(parents=True, exist_ok=True)
            logfile.touch(exist_ok=True)
@@ -86,35 +85,33 @@ class BatchLogger:
        self._thread.start()

    def _format(self, level: str, msg: str) -> str:
        """Formatiert Logeinträge mit Timestamp"""
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"{ts} - {level} - {msg}"

    def log(self, level: str, msg: str):
        """Fügt Log dem Queue hinzu und löst Flush aus, falls Batchgröße erreicht"""
        if self._stop_event.is_set():
            return
        formatted = self._format(level, msg)
        self.q.put((level, formatted))
        if self.q.qsize() >= self.batch_size:
            self.q.put(("__FLUSH__", "__FLUSH__"))

    def debug(self, msg: str):
        if LOG_LEVEL in ("DEBUG",):
            self.log("DEBUG", msg)

    def info(self, msg: str):
        self.log("INFO", msg)

    def warning(self, msg: str):
        self.log("WARNING", msg)

    def error(self, msg: str):
        self.log("ERROR", msg)

    def exception(self, msg: str):
        self.log("EXCEPTION", msg)

    def _worker(self):
        """Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch"""
        buffer = []
        last_flush = time.time()
        while not self._stop_event.is_set() or not self.q.empty():
@@ -123,7 +120,6 @@ class BatchLogger:
            try:
                item = self.q.get(timeout=self.flush_interval)
            except queue.Empty:
                if buffer:
                    self._flush_buffer(buffer)
                    buffer = []
@@ -141,36 +137,30 @@ class BatchLogger:
                    continue
                buffer.append((level, formatted))

                if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
                    self._flush_buffer(buffer)
                    buffer = []
                    last_flush = time.time()
            except Exception as e:
                try:
                    sys.stderr.write(f"BatchLogger worker error: {e}\n")
                except Exception:
                    pass
                time.sleep(0.5)
        if buffer:
            self._flush_buffer(buffer)

    def _flush_buffer(self, buffer):
        """Schreibt Puffer in Datei und Konsole"""
        if not buffer:
            return
        try:
            out_lines = [f"{line}\n" for _, line in buffer]
            try:
                sys.stdout.writelines(out_lines)
                sys.stdout.flush()
            except Exception:
                pass
            try:
                with open(self.logfile, "a", encoding="utf-8") as f:
                    f.writelines(out_lines)
@@ -183,17 +173,17 @@ class BatchLogger:
                pass

    def stop(self):
        """Stoppt Logger-Thread"""
        self._stop_event.set()
        try:
            self.q.put(("__FLUSH__", "__FLUSH__"))
        except Exception:
            pass
        self._thread.join(timeout=5.0)

# Logger-Instanz erstellen
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)")

# =========================
# Cache laden/speichern
# =========================
@@ -210,6 +200,7 @@ else:
    CACHE = {}

def save_cache():
    """Speichert aktuellen Cache in JSON"""
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
@@ -221,6 +212,7 @@ def save_cache():
# Normalisierung / Lemma / Tokenization
# =========================
def normalize_text(s):
    """Text in Kleinbuchstaben, Sonderzeichen entfernen, Trim"""
    if not s:
        return ""
    s = str(s).lower().strip()
@@ -229,8 +221,8 @@ def normalize_text(s):
    return s

lemma_cache = {}

def lemmatize_term(term):
    """Lemmatize mit spaCy, Cache für Performance"""
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
@@ -246,6 +238,7 @@ def lemmatize_term(term):
    return lemma

def compound_split(term):
    """Splittet Komposita nach -, _, / oder Leerzeichen"""
    if not term:
        return []
    parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
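# Illustrative sketch of the normalisation helpers above (assumed example values, not part of the script):
#   normalize_text("Wappen-Exlibris, koloriert")  ->  "wappen-exlibris koloriert"
#   compound_split("wappen-exlibris koloriert")   ->  ["wappen", "exlibris", "koloriert"]
#   lemmatize_term("Wappen") returns the spaCy lemma of the normalised term (cached in lemma_cache), e.g. "wappen".
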
@@ -255,24 +248,29 @@ def compound_split(term):
# Normvokabular laden & Index
# =========================
def load_normvokabular(file_path):
    """Lädt Normvokabular aus Excel/ODS, erstellt Dictionarys für Mapping"""
    try:
        sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower() == ".ods" else None)
    except Exception as e:
        logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
        raise

    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue  # Übersichtsblätter ignorieren
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]

        # ID- und Wort-Spalte finden
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
        if not id_col or not word_col:
            continue

        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
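# Note on the three return values (their shapes are assumed from how map_to_norm and
# get_suggestions use them further below; the middle of this loop is not shown in the diff):
#   norm_dict      - direct lookup: normalised term -> norm entry (name / ID)
#   stem_index     - stem -> list of candidate entries, for partial and compound matches
#   lemma_norm_map - lemma -> entry, used by get_suggestions() for fuzzy suggestions
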
@@ -296,6 +294,10 @@ def load_normvokabular(file_path):
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    """
    Mappt einen Begriff auf Normvokabular.
    Prüft exakte Treffer, Lemma-Treffer, Komposita und generiert Vorschläge.
    """
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

@@ -329,6 +331,7 @@ def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    return "KEIN TREFFER", "", combined_suggestions

def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    """Ermittelt Vorschläge basierend auf Similarity"""
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
@@ -346,10 +349,14 @@ def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
# Generic request with retries & caching
# =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
    """
    Sendet GET-Requests mit Retry-Logik, Backoff und Caching
    """
    cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
    if cache_key in CACHE:
        logger.debug(f"[Cache] {api_name}: {cache_key}")
        return CACHE[cache_key]

    retries = 0
    while retries < max_retries:
        try:
@@ -378,9 +385,10 @@ def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
    return None

# =========================
# GND / Wikidata Batch Queries
# =========================
def batch_query_gnd(terms):
    """Batch-Abfrage der Begriffe bei GND"""
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms: results[t] = ""
@@ -409,6 +417,7 @@ def batch_query_gnd(terms):
    return results

def batch_query_wikidata(terms):
    """Batch-Abfrage der Begriffe bei Wikidata"""
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms: results[t] = ""
@@ -423,10 +432,13 @@ def batch_query_wikidata(terms):
        top = ""
        try:
            if data and "search" in data:
                # Ermittlung der Kandidaten mit Ähnlichkeitsbewertung
                cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
                         for e in data["search"] if e.get("label","")]
                # Filterung nach Mindestähnlichkeit (0.70)
                cands = [c for c in cands if c[1] >= 0.70]
                if cands:
                    # Bestes Ergebnis nach Ähnlichkeit auswählen
                    top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        except Exception as e:
            logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
@@ -435,93 +447,14 @@ def batch_query_wikidata(terms):
    logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
    return results

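# Illustrative usage of the two batch lookups above (assumed call pattern, mirroring process_files below):
#   terms = ["Wappen", "Exlibris"]
#   gnd_results = batch_query_gnd(terms)       # dict: term -> best GND label, "" if none or API inactive
#   wd_results = batch_query_wikidata(terms)   # dict: term -> best Wikidata label with similarity >= 0.70
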
# =========================
# Getty AAT Abfrage – robust & API-polite (requests)
# =========================
def batch_query_getty_aat(terms):
    results = {}
    if not API_ACTIVE.get("aat", False):
        for t in terms: results[t] = ""
        return results

    endpoint = "https://vocab.getty.edu/sparql"
    headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
    TIMEOUT = 8
    MAX_RETRIES = 3
    BACKOFF_FACTOR = 2
    FAIL_LIMIT = 5
    fail_counter_local = 0

    logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
    start_all = time.time()
    for idx, term in enumerate(terms, start=1):
        term_norm = lemmatize_term(normalize_text(term))
        tokens = compound_split(term_norm)
        logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")

        query_fragments = []
        for tkn in tokens:
            t_escaped = tkn.replace('"', '\\"')
            qf = f"""
            ?concept skos:prefLabel ?label .
            FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
            """
            query_fragments.append(f"{{ {qf} }}")
        query_body = " UNION ".join(query_fragments) if query_fragments else ""
        query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"

        retries = 0
        success = False
        start_term = time.time()
        while retries < MAX_RETRIES and not success:
            try:
                logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
                r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
                if r.status_code != 200:
                    raise ValueError(f"HTTP {r.status_code}")
                ret = r.json()
                candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
                if candidates:
                    scored = [
                        (c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
                        for c in candidates
                    ]
                    top = max(scored, key=lambda x: x[2])
                    results[term] = top[0]
                    logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
                else:
                    results[term] = ""
                    logger.debug(f"[AAT] Kein Treffer für '{term}'")
                success = True
            except Exception as e:
                retries += 1
                wait = BACKOFF_FACTOR ** retries
                logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} – warte {wait}s")
                time.sleep(wait)
                if retries == MAX_RETRIES:
                    results[term] = ""
                    fail_counter_local += 1
        # polite delay
        time.sleep(1.0)
        elapsed_term = time.time() - start_term
        logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")

        if fail_counter_local >= FAIL_LIMIT:
            logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
            for t_rem in terms[idx:]:
                results[t_rem] = ""
            FAIL_COUNTER["aat"] += fail_counter_local
            API_ACTIVE["aat"] = False
            break

    elapsed_all = time.time() - start_all
    logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
    return results

# =========================
# Markierung / Export (Excel/ODS)
# =========================
def mark_norm_hits(file_path):
    """
    Markiert Treffer in Excel/ODS farblich:
    Grün = Treffer, Rot = KEIN TREFFER
    """
    ext = file_path.suffix.lower()
    try:
        if ext in [".xlsx", ".xls"]:
@@ -529,12 +462,14 @@ def mark_norm_hits(file_path):
            from openpyxl.styles import PatternFill
            wb = load_workbook(file_path)
            ws = wb.active
            # Spaltenmapping anhand der Kopfzeile
            col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
            norm_col = col_map.get("Norm_Treffer", None)
            if not norm_col:
                logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
                wb.save(file_path)
                return
            # Farben definieren
            green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
            red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
            for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
@@ -544,9 +479,10 @@ def mark_norm_hits(file_path):
                else:
                    cell.fill = red_fill
            wb.save(file_path)
        elif ext == ".ods":
            # ODS: kein Zell-Fill, stattdessen Status-Spalte
            df = pd.read_excel(file_path, engine="odf")
            df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
            df.to_excel(file_path, index=False, engine="odf")
    except Exception as e:
        logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
@@ -555,6 +491,9 @@ def mark_norm_hits(file_path):
# Fehlende Begriffe -> separate Datei
# =========================
def export_missing_terms(out_df, output_file):
    """
    Speichert Begriffe ohne Treffer oder Vorschläge in separater Datei
    """
    missing_df = out_df[
        (out_df["Norm_Treffer"] == "KEIN TREFFER") &
        (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
@@ -562,7 +501,6 @@ def export_missing_terms(out_df, output_file):

    count_missing = len(missing_df)
    logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")

    if count_missing == 0:
        return

@@ -589,8 +527,10 @@ def export_missing_terms(out_df, output_file):
# Haupt-Loop: Verarbeitung Input-Dateien
# =========================
def process_files():
    """Verarbeitet alle Dateien im Input-Ordner, mappt Begriffe und speichert Ergebnisse"""
    overall_start = time.time()
    try:
        # Normvokabular laden
        norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    except Exception as e:
        logger.error("Normvokabular konnte nicht geladen werden. Beende.")
@@ -626,6 +566,7 @@ def process_files():
        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        # Spalten identifizieren
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
@@ -633,6 +574,7 @@ def process_files():
            logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
            continue

        # Begriffe extrahieren
        row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            try:
@@ -657,9 +599,11 @@ def process_files():
            if (r_idx % 200) == 0:
                logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")

        # Alle einzigartigen Terme für API-Abfragen
        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
        total_unique_terms = len(all_terms)

        # API-Abfragen
        t0 = time.time()
        gnd_results = batch_query_gnd(all_terms)
@@ -668,9 +612,6 @@ def process_files():
        wd_results = batch_query_wikidata(all_terms)
        t2 = time.time()
        logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")

        # Build output rows
        output_rows = []
@@ -690,58 +631,30 @@ def process_files():
                "Norm_ID": norm_id,
                "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                "GND_Top1": gnd_results.get(term, ""),
                "WD_Top1": wd_results.get(term, "")
            }
            output_rows.append(out_row)
            processed_count += 1
            if (processed_count % 200) == 0:
                logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")

        # Save output
        out_df = pd.DataFrame(output_rows)
        out_file = OUTPUT_DIR / f"{file_path.stem}_mapped.xlsx"
        try:
            out_df.to_excel(out_file, index=False, engine="openpyxl")
            logger.info(f"Ergebnisse gespeichert: {out_file}")
            mark_norm_hits(out_file)
            export_missing_terms(out_df, out_file)
        except Exception as e:
            logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}")

    elapsed_total = time.time() - overall_start
    logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s")
    logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "")

    save_cache()
    logger.stop()


if __name__ == "__main__":
    process_files()

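# Illustrative end-to-end sketch of the mapping pipeline above (assumed usage, not part of the script):
#   norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
#   name, norm_id, suggestions = map_to_norm("Wappen", norm_dict, stem_index, lemma_norm_map)
#   # name is the matched norm term or "KEIN TREFFER"; suggestions holds up to top_n fuzzy candidates
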
@@ -1,46 +0,0 @@
import subprocess
import json
import sys
from pathlib import Path

def run_mapper(term):
    """
    Ruft das bestehende mapper script auf und liefert Vorschläge zurück.
    Erwartet, dass das mapper script eine JSON-Ausgabe liefert:
    {
        "term": "Begriff",
        "norm_name": "Normierter Treffer oder KEIN TREFFER",
        "norm_id": "ID",
        "suggestions": ["Vorschlag1", "Vorschlag2", "Vorschlag3"]
    }
    """
    mapper_script = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_1.2.py")  # dein bestehendes Mapper-Skript
    if not mapper_script.exists():
        raise FileNotFoundError(f"{mapper_script} nicht gefunden")

    # Übergabe als JSON-String
    input_json = json.dumps({"term": term})

    # Aufruf via subprocess
    result = subprocess.run(
        [sys.executable, str(mapper_script), input_json],
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        raise RuntimeError(f"Mapper Fehler: {result.stderr}")

    try:
        output = json.loads(result.stdout)
    except Exception as e:
        raise ValueError(f"Ungültige Ausgabe vom Mapper: {e}")

    return output


if __name__ == "__main__":
    if len(sys.argv) > 1:
        term = sys.argv[1]
        output = run_mapper(term)
        print(json.dumps(output, ensure_ascii=False))

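# Illustrative call of run_mapper() above (assumed; the wrapped mapper script must emit the JSON
# structure documented in the docstring):
#   result = run_mapper("Wappen")
#   print(result["norm_name"], result["suggestions"])
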
Binary file not shown.
BIN
Test API.ods
Binary file not shown.
101
Tryout/NVTest.py
@@ -1,101 +0,0 @@
import pandas as pd
import requests
import time
import os

def match_gnd(token, delay=0.3):
    """GND-Abfrage für ein Schlagwort, gibt erstes Ergebnis zurück"""
    url = f"https://lobid.org/gnd/search?q={token}&format=json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            data = resp.json()
            if 'member' in data and data['member']:
                first = data['member'][0]
                return first.get('preferredName'), first.get('gndIdentifier')
    except Exception as e:
        print(f"Fehler bei GND-Abfrage für '{token}': {e}")
    time.sleep(delay)
    return None, None

def load_exlibris_refs(path):
    """CSV einlesen, Scan-Zuordnung, Platzhalter-Inventarnummer, GND-Abgleich"""
    df = pd.read_csv(path, dtype=str, header=0)
    # erste Spalte leer? → "Kürzel"
    if df.columns[0].strip() == '':
        df.rename(columns={df.columns[0]: 'Kürzel'}, inplace=True)
    df.fillna('', inplace=True)

    # Scan-Level-Spalten
    level_cols = [c for c in df.columns if c.strip() in ['0','1','2','3','4']]

    obj_list = []
    current_obj = None
    placeholder_counter = 1

    for _, row in df.iterrows():
        has_0 = row['0'].strip() if '0' in df.columns else ''
        row_refs = []
        for c in level_cols:
            val = row[c].strip()
            if val:
                row_refs.append({'level': c, 'scan_ref': val})

        if has_0:
            if current_obj:
                obj_list.append(current_obj)
            core_data = {col: row[col] for col in df.columns if col not in level_cols}
            # Inventarnummer prüfen
            inv = core_data.get('Inventarnummer','').strip()
            if not inv:
                core_data['Inventarnummer'] = f'PL-{placeholder_counter:04d}'
                placeholder_counter += 1
            # GND-Abgleich
            obj_descr = core_data.get('Objektbeschreibung','')
            gnd_name, gnd_id = None, None
            if obj_descr:
                tokens = [t.strip() for t in obj_descr.split(',') if t.strip()]
                for t in tokens:
                    name, gid = match_gnd(t)
                    if gid:
                        gnd_name = name
                        gnd_id = gid
                        break
            core_data['GND_Name'] = gnd_name
            core_data['GND_ID'] = gnd_id
            current_obj = core_data
            current_obj['ScanReferenzen'] = row_refs
        else:
            if current_obj:
                current_obj['ScanReferenzen'].extend(row_refs)

    if current_obj:
        obj_list.append(current_obj)

    out_df = pd.DataFrame(obj_list)
    core_fields = ['Kürzel','Inventarnummer','Standort','Jahr','Urheber','Eigner',
                   'Objektbeschreibung','Material','Maße (in cm)',
                   'Objekttyp','Inschrift','Anmerkungen','ScanReferenzen',
                   'GND_Name','GND_ID']
    available = [c for c in core_fields if c in out_df.columns]
    return out_df[available]

# ====================
# Hauptteil
# ====================
if __name__ == "__main__":
    # CSV im gleichen Ordner suchen
    csv_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
    if not csv_files:
        print("Keine CSV-Datei im aktuellen Ordner gefunden.")
        exit(1)
    # nimm die erste gefundene CSV
    input_csv = csv_files[0]
    print(f"Verwende CSV-Datei: {input_csv}")

    df = load_exlibris_refs(input_csv)

    # Ergebnis als Testergebnis.csv speichern
    output_file = "Testergebnis.csv"
    df.to_csv(output_file, index=False)
    print(f"Aufbereitete Daten gespeichert als {output_file}")

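# Illustrative call of match_gnd() above: it returns (preferredName, gndIdentifier) of the first
# lobid.org hit, or (None, None) if nothing is found or the request fails.
#   name, gnd_id = match_gnd("Exlibris")
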
190
VLG.py
@@ -1,190 +0,0 @@
#!/usr/bin/env python3
"""
VLG_AAT.py Gruppierung, Auflösung "Objektbeschreibung"
NOCH OHNE AAT-ABGLEICH

- Prüft ezodf in aktueller Umgebung
- Liest ODS aus "Input CSV/"
- Extrahiert Begriffe aus "Objektbeschreibung"
- Lemmatisierung (Spacy) + Stopwortfilter
- Subtokenisierung komplexer Phrasen
- Zählt Häufigkeiten
- Ausgabe ODS / CSV-Fallback in "Auswertung Ergebnisse"
"""

import os
import sys
import logging
from collections import Counter
import pandas as pd
import spacy

# ---------------------------
# Logging
# ---------------------------
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# ---------------------------
# ezodf prüfen
# ---------------------------
try:
    import ezodf
    EZODF_AVAILABLE = True
    logging.info("ezodf erkannt")
except ImportError:
    EZODF_AVAILABLE = False
    logging.error("ezodf konnte nicht importiert werden!")
    logging.error("Möglicherweise nutzen Sie nicht die Python-Umgebung, in der ezodf installiert ist.")
    logging.error(f"Aktuelle Python-Executable: {sys.executable}")
    logging.error("Bitte prüfen Sie Ihre venv oder installieren Sie ezodf in dieser Umgebung:")
    logging.error("    python -m pip install ezodf")
    sys.exit(1)

# ---------------------------
# Spacy laden
# ---------------------------
try:
    nlp = spacy.load("de_core_news_sm")
    logging.info("Spacy-Modell geladen.")
except Exception as e:
    logging.error(f"Spacy-Modell konnte nicht geladen werden: {e}")
    sys.exit(1)

# ---------------------------
# Konfiguration
# ---------------------------
INPUT_FOLDER = "Input CSV"
OUTPUT_FOLDER = "Auswertung Ergebnisse"
INPUT_FILENAME = None
TARGET_COLUMN = "Objektbeschreibung"
STOPWORDS = {"mit", "auf", "von", "und", "der", "die", "das"}  # erweiterbar
MAPPING = {  # Projektinterne Sonderfälle
    "exlibris": "exlibris",
    "wappen": "wappen"
}

# ---------------------------
# Funktionen
# ---------------------------
def find_input_file(folder: str, filename_hint: str = None):
    if not os.path.isdir(folder):
        raise FileNotFoundError(f"Input-Ordner '{folder}' existiert nicht.")
    files = [f for f in os.listdir(folder) if f.lower().endswith(".ods")]
    if filename_hint:
        for f in files:
            if f == filename_hint or filename_hint in f:
                return os.path.join(folder, f)
    if not files:
        raise FileNotFoundError(f"Keine .ods-Dateien in '{folder}' gefunden.")
    return os.path.join(folder, files[0])

def read_ods_first_sheet(path: str) -> pd.DataFrame:
    """Lädt ODS, erkennt automatisch Header-Zeile."""
    try:
        df = pd.read_excel(path, engine="odf", header=None)
        logging.info("ODS mit pandas + odfpy geladen.")
    except Exception as e1:
        logging.warning(f"pandas + odfpy konnte ODS nicht lesen ({e1}).")
        if not EZODF_AVAILABLE:
            raise RuntimeError("ezodf nicht installiert und pandas + odfpy fehlgeschlagen.")
        doc = ezodf.opendoc(path)
        sheet = doc.sheets[0]
        data = []
        for row in sheet.rows():
            values = [c.value if hasattr(c, "value") else "" for c in row]
            data.append(values)
        df = pd.DataFrame(data)
        logging.info("ODS mit ezodf geladen.")

    # Header-Zeile automatisch finden
    header_row_index = None
    for i, row in df.iterrows():
        row_str = row.fillna("").astype(str).str.lower()
        if any("objektbeschreibung" in str(cell) for cell in row_str):
            header_row_index = i
            break
    if header_row_index is None:
        raise KeyError("Keine Header-Zeile mit 'Objektbeschreibung' gefunden.")

    df.columns = df.iloc[header_row_index]
    df = df.iloc[header_row_index + 1:].reset_index(drop=True)
    return df

def tokenize_and_lemmatize(series: pd.Series) -> list:
    """Tokenisiert, entfernt Stopwords, wendet Mapping + Spacy-Lemmatisierung an."""
    series = series.fillna("").astype(str).str.strip().str.lower()
    all_terms = []
    for text in series:
        if not text:
            continue
        # Komma-Split
        for part in [p.strip() for p in text.split(",") if p.strip()]:
            # Subtokenisierung via Spacy
            doc = nlp(part)
            for token in doc:
                lemma = token.lemma_.lower()
                if lemma in STOPWORDS:
                    continue
                lemma = MAPPING.get(lemma, lemma)
                if lemma:
                    all_terms.append(lemma)
    return all_terms

def write_output(rows: list, outpath: str):
    if EZODF_AVAILABLE:
        if not rows:
            logging.warning("Keine Daten zum Schreiben.")
            return
        keys = list(rows[0].keys())
        doc = ezodf.newdoc(doctype="ods", filename=outpath)
        sheet = ezodf.Sheet("Auswertung", size=(len(rows)+1, len(keys)))
        doc.sheets += sheet
        for ci, k in enumerate(keys):
            sheet[0, ci].set_value(k)
        for ri, row in enumerate(rows, start=1):
            for ci, k in enumerate(keys):
                sheet[ri, ci].set_value(row.get(k, ""))
        doc.save()
        logging.info(f"ODS geschrieben: {outpath}")
    else:
        csv_path = os.path.splitext(outpath)[0] + ".csv"
        df = pd.DataFrame(rows)
        df.to_csv(csv_path, index=False, sep=";", encoding="utf-8")
        logging.info(f"CSV-Fallback geschrieben: {csv_path}")

# ---------------------------
# Hauptfunktion
# ---------------------------
def main(input_folder=INPUT_FOLDER, input_filename=INPUT_FILENAME):
    input_path = find_input_file(input_folder, filename_hint=input_filename)
    input_basename = os.path.splitext(os.path.basename(input_path))[0]
    logging.info(f"Verarbeite Datei: {input_path}")

    df = read_ods_first_sheet(input_path)
    logging.info(f"Geladene Spalten: {list(df.columns)}")

    if TARGET_COLUMN.lower() not in [str(c).lower() for c in df.columns]:
        raise KeyError(f"Spalte '{TARGET_COLUMN}' nicht gefunden.")

    terms = tokenize_and_lemmatize(df[TARGET_COLUMN])
    logging.info(f"Gefundene Begriffe: {len(terms)}")

    counts = Counter(terms)
    sorted_terms = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    rows = [{"Begriff": term, "Anzahl": freq} for term, freq in sorted_terms]

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    out_name = f"{input_basename} Auswertung.ods"
    out_path = os.path.join(OUTPUT_FOLDER, out_name)
    write_output(rows, out_path)
    logging.info("Fertig.")

if __name__ == "__main__":
    argv = sys.argv[1:]
    folder = INPUT_FOLDER
    fname = INPUT_FILENAME
    if len(argv) >= 1:
        folder = argv[0]
    if len(argv) >= 2:
        fname = argv[1]
    main(input_folder=folder, input_filename=fname)

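# Illustrative use of tokenize_and_lemmatize() above (assumed input/output; the exact lemmas depend
# on the spaCy model):
#   tokenize_and_lemmatize(pd.Series(["Wappen, Exlibris mit Inschrift"]))
#   # -> roughly ["wappen", "exlibris", "inschrift"]  ("mit" is dropped as a stopword)
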
262
VLG_API_multi.py
@@ -1,262 +0,0 @@
import os
import sys
import time
import json
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse

# =========================
# Argumente / Dry-Run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run

# =========================
# Konfiguration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)

TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
MAX_CONSECUTIVE_FAILURES = 10

CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

# =========================
# Logging
# =========================
def log(level, msg):
    print(f"[{level}] {msg}")

# =========================
# Cache speichern
# =========================
def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)

# =========================
# Request mit Retry & Backoff
# =========================
def request_with_retries(api_name, url, params=None):
    if DRY_RUN:
        return {"dummy": True}
    if not API_ACTIVE[api_name]:
        return None

    cache_key = url + (str(params) if params else "")
    if cache_key in CACHE:
        return CACHE[cache_key]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except:
                    data = r.text
                CACHE[cache_key] = data
                save_cache()
                FAIL_COUNTER[api_name] = 0
                return data
            elif r.status_code in [403, 429]:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code} – Stopschalter aktiviert")
                API_ACTIVE[api_name] = False
                return None
            else:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
        except requests.exceptions.Timeout:
            log("ERROR", f"Timeout bei {api_name.upper()}")
        except Exception as e:
            log("ERROR", f"Fehler bei {api_name.upper()}: {e}")

        retries += 1
        sleep_time = min(BACKOFF_FACTOR ** retries, 30)
        time.sleep(sleep_time)

    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
        log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} – Stopschalter aktiviert")
        API_ACTIVE[api_name] = False
    return None

# =========================
# API-Abfragen mit Confidence
# =========================
def query_gnd(term, min_conf=0.6):
    if DRY_RUN or not API_ACTIVE["gnd"]:
        return "TEST_GND", 1.0

    url = f"https://lobid.org/gnd/search?q={term}&format=json"
    data = request_with_retries("gnd", url)
    if not data:
        return "API nicht erreichbar", 0.0

    results = []
    scores = []
    for doc in data.get("member", []):
        name = doc.get("preferredName", "")
        conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
        if conf >= min_conf:
            results.append(name)
            scores.append(conf)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0

def query_wikidata(term, min_conf=0.5):
    if DRY_RUN or not API_ACTIVE["wikidata"]:
        return "TEST_WD", 1.0

    url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
    data = request_with_retries("wikidata", url, params)
    if not data:
        return "API nicht erreichbar", 0.0

    results = []
    scores = []
    for entry in data.get("search", []):
        match_info = entry.get("match", {})
        score = match_info.get("score", 0.0)
        if score >= min_conf:
            results.append(entry["label"])
            scores.append(score)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0

# =========================
# Input laden
# =========================
def load_input_file(file_path):
    try:
        if file_path.suffix.lower() == ".ods":
            df = pd.read_excel(file_path, engine="odf", header=None)
        elif file_path.suffix.lower() == ".xlsx":
            df = pd.read_excel(file_path, engine="openpyxl", header=None)
        elif file_path.suffix.lower() == ".csv":
            df = pd.read_csv(file_path, header=None)
        else:
            log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
            return None
        return df
    except Exception as e:
        log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
        return None

# =========================
# Header-Zeile suchen
# =========================
def find_header_row(df, keywords=("objektbeschreibung", "objekt/ebene")):
    for i, row in df.iterrows():
        row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
        if any(kw in cell for kw in keywords for cell in row_lower):
            return i, row_lower
    return None, None

# =========================
# Verarbeitung
# =========================
def process_files():
    all_terms = []
    output_rows = []

    for file_path in INPUT_DIR.glob("*"):
        if not file_path.suffix.lower() in [".csv", ".xlsx", ".ods"]:
            continue
        log("INFO", f"Verarbeite {file_path.name}")
        df = load_input_file(file_path)
        if df is None:
            continue

        header_idx, header_row = find_header_row(df)
        if header_idx is None:
            log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
            continue
        df.columns = header_row
        df = df.iloc[header_idx + 1:].reset_index(drop=True)

        col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
        col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
        if not col_objdesc:
            log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
            continue

        term_list = []
        obj_level_list = []
        for _, row in df.iterrows():
            terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
            if not terms:
                continue
            for term in [t.strip() for t in terms.split(",") if t.strip()]:
                term_list.append(term)
                obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")

        # API-Abfragen
        gnd_results = []
        gnd_scores = []
        wikidata_results = []
        wikidata_scores = []

        for term in term_list:
            gnd_res, gnd_conf = query_gnd(term)
            wikidata_res, wd_conf = query_wikidata(term)
            gnd_results.append(gnd_res)
            gnd_scores.append(gnd_conf)
            wikidata_results.append(wikidata_res)
            wikidata_scores.append(wd_conf)

        for idx, term in enumerate(term_list):
            output_rows.append({
                "Begriff": term,
                "Quelle": file_path.name,
                "Objekt/Ebene": obj_level_list[idx],
                "GND": gnd_results[idx],
                "GND_Confidence": gnd_scores[idx],
                "Wikidata": wikidata_results[idx],
                "Wikidata_Confidence": wikidata_scores[idx]
            })
        all_terms.extend(term_list)

    # Hauptoutput
    out_df = pd.DataFrame(output_rows)
    out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
    out_df.to_excel(out_file, index=False, engine="odf")
    log("INFO", f"Hauptauswertung gespeichert: {out_file}")

    # Rohdatei
    raw_terms = pd.Series(all_terms).value_counts().reset_index()
    raw_terms.columns = ["Begriff", "Häufigkeit"]
    raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
    raw_terms.to_excel(raw_file, index=False, engine="odf")
    log("INFO", f"Rohbegriffe gespeichert: {raw_file}")

# =========================
# Main
# =========================
if __name__ == "__main__":
    if not INPUT_DIR.exists():
        log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
        sys.exit(1)
    process_files()
2
Vorschlag_Ersetzen/Vorschlag_Ersetzen_Makro
Normal file
2
Vorschlag_Ersetzen/Vorschlag_Ersetzen_Makro
Normal file
@ -0,0 +1,2 @@
Macro for the capture table (Erfassungstabelle) that lets term suggestions be accepted and substituted with a single click.
Does not work in LibreOffice, but it should run in Excel.
@ -0,0 +1 @@
,jarnold,workPC,16.10.2025 13:04,file:///home/jarnold/.config/libreoffice/4;
Binary file not shown.
BIN
Wiki_Anleitungen/Anleitung_Normvokabular_Abgleich_Makro.odt
Normal file
BIN
Wiki_Anleitungen/Anleitung_Normvokabular_Abgleich_Makro.odt
Normal file
Binary file not shown.
@ -0,0 +1,125 @@
= User Guide - NV_MASTER Matching Macro =
'''(mapper_macro_2.x.py)'''

== 1. What the macro does ==
This macro helps you standardise the terms in the evaluation table.
It automatically compares the contents of the "Objektbeschreibung" column with a
controlled-vocabulary reference file called "NV_MASTER.ods".
That way you can see which terms are already standardised, where suitable suggestions exist,
and where nothing was recognised at all.
The macro colour-codes every row of the evaluation table under "Objektbeschreibung":

* <span style="color:green;">Green</span>: everything matches, all terms were found
* <span style="color:yellow;">Yellow</span>: some terms were recognised, others were not
* <span style="color:red;">Red</span>: not a single term was recognised

Example:

{| class="wikitable"
|+ Table 1
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| (empty) || || ||
|}

The macro creates the columns "Norm_Treffer", "Norm_Vorschlag" and "Kein_Treffer"
automatically if they are missing.
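
Conceptually, every term from "Objektbeschreibung" ends up in exactly one of these three columns. The following is only a minimal sketch of that classification idea, not the macro's actual code; the function name, the 0.8 threshold and the {term: id} dictionary are illustrative assumptions:

<pre>
# Sketch only: classify one term against a controlled vocabulary given as {term: id}.
from difflib import SequenceMatcher

def classify(term, vocab, threshold=0.8):
    key = term.strip().lower()
    if key in vocab:
        # exact hit in the controlled vocabulary
        return "Norm_Treffer", f"{term} ({vocab[key]})"
    # otherwise look for the most similar vocabulary entry
    best, best_score = None, 0.0
    for v in vocab:
        score = SequenceMatcher(None, key, v).ratio()
        if score > best_score:
            best, best_score = v, score
    if best is not None and best_score >= threshold:
        return "Norm_Vorschlag", f"{best} ({vocab[best]})"
    return "Kein_Treffer", term
</pre>

With a vocabulary like {"harfe": "2.1"}, classify("Harfe", vocab) yields ("Norm_Treffer", "Harfe (2.1)"), while a near-miss such as "Harfen" lands in Norm_Vorschlag.
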
'''Tips for use:'''
* If you change the NV_MASTER file, just run the macro again; it re-reads the file
on every run.
* Make a backup of the evaluation table before you run the macro.
* Check the log file from time to time to make sure everything is running correctly.
* If a term is marked red but in your view is sensible and accurate
for the object being described, write the term down and discuss with your
supervisors whether it should be added to the controlled vocabulary.

== 2. Where the macro files must be located ==
'''On Linux:'''
<pre>
/home/<your-username>/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/
</pre>

'''On Windows:'''
<pre>
C:\Users\<your-username>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\
</pre>

This folder must contain:
* mapper_macro_2.x.py (the macro)
* NV_MASTER.ods (the reference file)
* optional: mapper_macro_2.x.log (created automatically)

== 3. How to start the macro ==
# Open your Calc file containing the terms
# In the menu, go to Tools → Macros → Run Macro...
# Select: My Macros → mapper_macro_2.x.py → run_mapper_macro
# Click Run

The macro starts immediately. Depending on the size of the table, the comparison takes a few seconds to a few minutes.
If nothing happens, an error has occurred. In that case check the .log file that the macro writes on every run; it is located in the same folder as the macro.

== 4. How to read the result ==
After the run, the macro writes the hits and suggestions directly into your table and marks them:

{| class="wikitable"
|+ Table 2
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| Harfe, Noten, Bäume, Geldbeutel, Landschaft, Gewässer || Harfe (2.1), Noten (3.4), Landschaft (7.2), Gewässer (9.1) || Baum || Geldbeutel
|}

Colours:
* 🟩 <span style="color:green;">Green</span>: all terms were recognised directly → perfect!
* 🟨 <span style="color:yellow;">Yellow</span>: some terms were recognised, others only partially or not at all → check the suggestions in the "Norm_Vorschlag" column
* 🟥 <span style="color:red;">Red</span>: no term was found → adjust the object description and, if appropriate, add new terms to the controlled vocabulary
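
The colour markings themselves are set through LibreOffice's UNO API. A minimal sketch of how a single cell can be coloured from a Python macro follows; the colour values, status labels and helper name are illustrative assumptions, not the macro's actual code:

<pre>
# Sketch only: colour one cell of the active Calc sheet from a LibreOffice Python macro.
GREEN, YELLOW, RED = 0x00CC00, 0xFFFF66, 0xFF6666

def mark_cell(row_index, col_index, status):
    doc = XSCRIPTCONTEXT.getDocument()            # available inside the macro context
    sheet = doc.CurrentController.ActiveSheet     # currently active sheet
    cell = sheet.getCellByPosition(col_index, row_index)
    cell.CellBackColor = {"Treffer": GREEN, "Vorschlag": YELLOW, "Kein_Treffer": RED}[status]
</pre>

CellBackColor expects an RGB integer, which is why the colours are written as hex values.
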
== 5. Where the log file is located ==
The macro writes everything that happens to a log file:

'''Linux:''' /home/<your-username>/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.x.log
'''Windows:''' C:\Users\<your-username>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\mapper_macro_2.x.log

There you can see:
* when the macro was started
* how many rows were processed
* and whether any errors occurred
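
A few example lines (purely illustrative; the exact wording depends on the macro version):

<pre>
[2025-10-16 13:04:12] [INFO] Run started, NV_MASTER.ods loaded
[2025-10-16 13:04:15] [INFO] 480 rows processed
[2025-10-16 13:04:15] [ERROR] Column 'Objektbeschreibung' not found in sheet 2
</pre>
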
Typical problems and how to solve them:

{| class="wikitable"
|+ Table 3
|-
! Problem !! Cause !! Solution
|-
| The macro does not show up || Wrong location || Check that the script really is in the Scripts/python folder
|-
| Error message "Module not found" || Python libraries are missing || Install pandas, odfpy, spacy, rapidfuzz
|-
| NV_MASTER is not read || File is missing or corrupted || Check the file name and location
|-
| LibreOffice crashes || Very large file or a faulty NV_MASTER || Test with a smaller file or a fresh NV_MASTER
|}
== 6. What the macro needs in order to run properly ==
All of the following packages are required by the macro, whether it runs in LibreOffice or in Excel:

{| class="wikitable"
|+ Table 4
|-
! Package !! Purpose
|-
| pandas || Reads the reference file (NV_MASTER.ods)
|-
| odfpy || Enables reading .ods files (for pandas.read_excel(..., engine="odf"))
|-
| spacy || Lemmatisation (optional, but recommended)
|-
| rapidfuzz || Fast fuzzy matching (alternative to difflib)
|-
| openpyxl || Needed if .xlsx files are used
|-
| python-dateutil || Pulled in automatically by pandas
|}
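
If "Module not found" errors occur (see Table 3), the packages can be installed with pip, assuming the Python interpreter that LibreOffice uses has pip available; the German spaCy model used for lemmatisation is downloaded separately (de_core_news_sm is the standard small German model):

<pre>
pip install pandas odfpy spacy rapidfuzz openpyxl python-dateutil
python -m spacy download de_core_news_sm
</pre>
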
BIN
Wiki_Anleitungen/DigitalisierungWorkflow151025.drawio.pdf
Normal file
BIN
Wiki_Anleitungen/DigitalisierungWorkflow151025.drawio.pdf
Normal file
Binary file not shown.
622
Wiki_Anleitungen/Digitalisierung_Workflow_141025.drawio
Normal file
622
Wiki_Anleitungen/Digitalisierung_Workflow_141025.drawio
Normal file
@ -0,0 +1,622 @@
|
|||||||
|
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0" version="28.2.5">
|
||||||
|
<diagram name="Page-1" id="aLmyRVYCle99qeRE2JvP">
|
||||||
|
<mxGraphModel dx="1301" dy="1900" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
|
||||||
|
<root>
|
||||||
|
<mxCell id="0" />
|
||||||
|
<mxCell id="1" parent="0" />
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-3" value="" style="group" parent="1" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="85" y="932" width="310" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-2" value="Scan- und Erfassungsprozess" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="WA2_J1DCvVjPXciXSW-M-3" vertex="1">
|
||||||
|
<mxGeometry x="60" y="-900" width="210" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-4" target="WA2_J1DCvVjPXciXSW-M-6" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-14" value="Makro gibt Vorschläge aus NV zurück" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-13" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="0.2678" y="-1" relative="1" as="geometry">
|
||||||
|
<mxPoint as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-4" value="<div>Makro (mapper_macro_2.x.py)</div>" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="575" y="52" width="200" height="100" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.367;exitY=0.988;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-1" target="WA2_J1DCvVjPXciXSW-M-4" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="405" y="97" as="sourcePoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="235" y="91" />
|
||||||
|
<mxPoint x="235" y="117" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-12" value="Wird vom Makro gelesen und mit NV abgeglichen" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-5" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="0.0228" y="4" relative="1" as="geometry">
|
||||||
|
<mxPoint as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-6" value="Anpassung der Erfassungstabelle anhand der Vorschläge" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="80" y="212" width="320" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-10" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="515" y="972" as="sourcePoint" />
|
||||||
|
<mxPoint x="515" y="12" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-11" value="<h1 style="margin-top: 0px;">Workflow Digitalisierung -</h1><h1 style="margin-top: 0px;"><u><font style="font-size: 20px;">Objekterfassung und Pflege des Normvokabulars</font></u></h1><div><font style="font-size: 14px;">- Erfassung und Verschlagwortung von Bildobjekten</font></div><div><font style="font-size: 14px;">- Abgleich mit internem Normvokabular</font></div><div><font style="font-size: 14px;">- API-Abgleich mit getty und GND</font></div><div><font style="font-size: 14px;">- Pflege und Erweiterung des Normvokabulars</font></div>" style="text;html=1;whiteSpace=wrap;overflow=hidden;rounded=0;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="30" y="-1070" width="455" height="220" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-1" value="<div>Scan und Erfassen der Objekte, Erfassung in Tabellen, Spalte "Objektbeschreibung"</div>" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="32" width="310" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-16" value="Makro 2 (Übernahme von Vorschlägen aus NV per Klick)" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="575" y="292" width="190" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-20" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="460" y="312" width="90" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-23" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.071;entryY=0.25;entryDx=0;entryDy=0;entryPerimeter=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;dashed=1;" parent="1" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="592.495" y="462" as="targetPoint" />
|
||||||
|
<mxPoint x="232.5700000000001" y="432" as="sourcePoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="233" y="462" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-34" value="Gleiche Funktion wie Makro 1 + API-Abgleich" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-23" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="-0.4298" relative="1" as="geometry">
|
||||||
|
<mxPoint x="53" as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-15" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-21">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="165.20000000000005" y="510" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-21" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="362" width="320" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-37" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.452;entryY=-0.007;entryDx=0;entryDy=0;entryPerimeter=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-24" target="WA2_J1DCvVjPXciXSW-M-33" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="232" y="480" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-60" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-37" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="-0.0997" y="-1" relative="1" as="geometry">
|
||||||
|
<mxPoint as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-24" value="NormVokabular_Mapper.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="592.5" y="432" width="175" height="80" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-27" value="<u><b>WHK/Manuell</b></u>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="145" width="100" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-28" value="<b><u>Programm/automatisiert</u></b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="590" width="160" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-29" value="<div>Mögliche Optimierung, funktioniert aber nicht in LO</div>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;fontSize=8;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="570" y="362" width="200" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-32" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.484;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-6" target="WA2_J1DCvVjPXciXSW-M-21" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-17" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-33">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="247.5" y="720" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-33" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="512" width="325" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-35" value="<ul><li>Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus</li><li>Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit</li><li>Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe</li><li>Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt</li><li>API-Abgleich (aktuell GND und wikidata, Top1-Treffer)</li><li>Erstellt eine Auswertungsdatei, markiert Begriffe entsprechend ihres Status)</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="520" y="532" width="300" height="160" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-93" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-39" target="WA2_J1DCvVjPXciXSW-M-45" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-39" value="Aufnahme ins Normvokabular oder Verwerfen des Begriffs" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="722" width="330" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-43" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="515" y="1192" as="sourcePoint" />
|
||||||
|
<mxPoint x="515" y="962" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-94" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0;entryY=0.5;entryDx=0;entryDy=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-45" target="WA2_J1DCvVjPXciXSW-M-46" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="615" y="832" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="475" y="822" />
|
||||||
|
<mxPoint x="475" y="822" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-45">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="167.66666666666674" y="980" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-45" value="Manuelle Anpassung der Normvokabular-Masterfile" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="802" width="330" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-92" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-46" target="WA2_J1DCvVjPXciXSW-M-52" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="695" y="912" />
|
||||||
|
<mxPoint x="198" y="912" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-220" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-92" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="0.3024" y="-2" relative="1" as="geometry">
|
||||||
|
<mxPoint as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-46" value="Masterfile_Editor.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="635" y="782" width="120" height="80" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-49" value="liest und bereinigt Normvokabular" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="427.5" y="817" width="200" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-58" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-221" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-222" value="=" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-221" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="-0.3079" y="1" relative="1" as="geometry">
|
||||||
|
<mxPoint as="offset" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-52" value="Aktualisierte Masterfile, mit allen Änderungen und in der richtigen Struktur" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="85" y="980" width="225" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-59" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-57" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="70" y="242" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="40" y="1130" />
|
||||||
|
<mxPoint x="40" y="242" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-57" value="Masterfile Normvokabular Updated" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="117.5" y="1100" width="160" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-64" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="410" y="1107.5" width="90" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-200" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-65" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="40" y="1140" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="680" y="1180" />
|
||||||
|
<mxPoint x="40" y="1180" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-65" value="<div>Normvokabular-Masterfile muss&nbsp;</div><div><b>zentral</b> als <b>SPOT</b> vorliegen und gepflegt werden können</div>" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="575" y="1075" width="210" height="85" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-66" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="485" y="-1046" width="20" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-67" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FF6666;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="485" y="-1006" width="20" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-68" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="485" y="-966" width="20" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-69" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FFFF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="485" y="-926" width="20" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-70" value="<b>Datei</b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="518" y="-1050" width="50" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-71" value="<b>Fehlender Schritt/Optimierungsmöglichkeit</b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="510" y="-1011" width="270" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-72" value="<b>Vorgang, WHK</b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="515" y="-971" width="110" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-73" value="<b>Programm</b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="520" y="-931" width="80" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-74" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="35" y="-850" as="sourcePoint" />
|
||||||
|
<mxPoint x="805" y="-850" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-75" value="<div align="left"><font style="font-size: 13px;"><b><u><font>Probleme/Noch zu klären:</font></u></b></font><ul><li><font style="font-size: 13px;"><b>Makro 1 und NormVokabular-Mapper</b> redundant, eine Methode festlegen (Makro benutzerfreundlicher, Treffer/Vorschläge direkt in Erfassung sichtbar, Mapper genauer, API-Abgleich, Auswertungsdatei übersichtlicher)</font></li><li><font style="font-size: 13px;"><b>Makro 2</b> (Vorschläge aus Normvokabular können automatisch per Klick in die Erfassungstabelle übernommen werden)</font></li><li><font style="font-size: 13px;"><b>Normvokabular</b>: Eine zentrale .json als SPOT etablieren und zentral in alle Prozesse einbinden</font></li><li><font style="font-size: 13px;"><b>Mapper</b>&nbsp;oder <b>Makro</b> benötigt Funktion, Wörter ohne Treffer und Vorschlag in <br>eigene Liste zu übernehmen und auszugeben -&gt; manuelle Prüfung</font></li><li><font style="font-size: 13px;"><b>Normvokabular</b>: Regeln, ID-Struktur, Kategorien müssen auf Qualität und Nutzbarkeit geprüft werden; danach Anpassung aller Programme, die sich auf Normvokabular stützen</font></li></ul><font style="font-size: 13px;"><br></font></div>" style="rounded=0;whiteSpace=wrap;html=1;align=left;spacing=2;spacingRight=0;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="40" y="1232" width="770" height="190" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-80" value="<ul><li>Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus</li><li>Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit</li><li>Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe, cache und log</li><li>Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt</li><li>Markiert Treffer, Vorschläge und Keine Treffer</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="525" y="132" width="300" height="160" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-81" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="245" y="322" as="sourcePoint" />
|
||||||
|
<mxPoint x="455" y="322" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-83" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.055;entryY=0.48;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="WA2_J1DCvVjPXciXSW-M-64" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="281" y="1117" as="sourcePoint" />
|
||||||
|
<mxPoint x="365" y="1002" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-91" value="<ul><li>Automatische Spaltenerkennung (ID, Name/Wort/Vokabel)</li><li>Aufbau einer hierarchischen Struktur (Ober-, Unter-, Unterunterkategorien)</li><li>Erstellung eines Mastersheets mit eindeutigen IDs</li><li>Sortierte Ausgabe nach vordefinierter Sheet-Reihenfolge</li><li>Protokollierung im Terminal (Zeilenanzahl, Warnungen, ID-Zählung)</li><li>Speicherung einer neuen, synchronisierten Output-Datei ohne Änderung der Originaldatei</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="510" y="902" width="310" height="160" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-96" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="15" y="1460" as="sourcePoint" />
|
||||||
|
<mxPoint x="815" y="1460" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-192" value="<font style="font-size: 19px;"><b><u>3. Aktuelle Struktur des Normvokabulars (Stand 10/25)</u></b></font>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="35" y="1480" width="510" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-195" value="" style="group" parent="1" vertex="1" connectable="0">
|
||||||
|
<mxGeometry x="90" y="1740" width="580" height="380" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-97" value="Assets" style="childLayout=tableLayout;recursiveResize=0;strokeColor=#98bf21;fillColor=#A7C942;shadow=1;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
|
||||||
|
<mxGeometry x="50" y="40" width="550" height="330" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-98" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry width="550" height="43" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-99" value="ID" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
|
||||||
|
<mxGeometry width="117" height="43" as="geometry">
|
||||||
|
<mxRectangle width="117" height="43" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-100" value="Unterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="43" as="geometry">
|
||||||
|
<mxRectangle width="159" height="43" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-101" value="Unterunterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="43" as="geometry">
|
||||||
|
<mxRectangle width="137" height="43" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-165" value="Wort/Vokabel" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="43" as="geometry">
|
||||||
|
<mxRectangle width="137" height="43" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-102" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="43" width="550" height="42" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-103" value="7.1.1" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
|
||||||
|
<mxGeometry width="117" height="42" as="geometry">
|
||||||
|
<mxRectangle width="117" height="42" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-104" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="42" as="geometry">
|
||||||
|
<mxRectangle width="159" height="42" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-105" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="42" as="geometry">
|
||||||
|
<mxRectangle width="137" height="42" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-166" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="42" as="geometry">
|
||||||
|
<mxRectangle width="137" height="42" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-187" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="85" width="550" height="41" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-188" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
|
||||||
|
<mxGeometry width="117" height="41" as="geometry">
|
||||||
|
<mxRectangle width="117" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-189" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="41" as="geometry">
|
||||||
|
<mxRectangle width="159" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-190" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="41" as="geometry">
|
||||||
|
<mxRectangle width="137" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-191" value="Hof" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="41" as="geometry">
|
||||||
|
<mxRectangle width="137" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-106" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="126" width="550" height="41" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-107" value="7.1.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
|
||||||
|
<mxGeometry width="117" height="41" as="geometry">
|
||||||
|
<mxRectangle width="117" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-108" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="41" as="geometry">
|
||||||
|
<mxRectangle width="159" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-109" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="41" as="geometry">
|
||||||
|
<mxRectangle width="137" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-167" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="41" as="geometry">
|
||||||
|
<mxRectangle width="137" height="41" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-110" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="167" width="550" height="44" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-111" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
|
||||||
|
<mxGeometry width="117" height="44" as="geometry">
|
||||||
|
<mxRectangle width="117" height="44" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-112" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="44" as="geometry">
|
||||||
|
<mxRectangle width="159" height="44" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-113" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="44" as="geometry">
|
||||||
|
<mxRectangle width="137" height="44" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-168" value="Haus" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="44" as="geometry">
|
||||||
|
<mxRectangle width="137" height="44" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-114" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="211" width="550" height="39" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-115" value="7.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
|
||||||
|
<mxGeometry width="117" height="39" as="geometry">
|
||||||
|
<mxRectangle width="117" height="39" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-116" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="39" as="geometry">
|
||||||
|
<mxRectangle width="159" height="39" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-117" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="39" as="geometry">
|
||||||
|
<mxRectangle width="137" height="39" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-169" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="39" as="geometry">
|
||||||
|
<mxRectangle width="137" height="39" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-175" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#FFFFFF;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="250" width="550" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-176" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
|
||||||
|
<mxGeometry width="117" height="40" as="geometry">
|
||||||
|
<mxRectangle width="117" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-177" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="40" as="geometry">
|
||||||
|
<mxRectangle width="159" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-178" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="40" as="geometry">
|
||||||
|
<mxRectangle width="137" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-179" value="Zimmer" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="40" as="geometry">
|
||||||
|
<mxRectangle width="137" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-170" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
|
||||||
|
<mxGeometry y="290" width="550" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-171" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
|
||||||
|
<mxGeometry width="117" height="40" as="geometry">
|
||||||
|
<mxRectangle width="117" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-172" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
|
||||||
|
<mxGeometry x="117" width="159" height="40" as="geometry">
|
||||||
|
<mxRectangle width="159" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-173" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
|
||||||
|
<mxGeometry x="276" width="137" height="40" as="geometry">
|
||||||
|
<mxRectangle width="137" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-174" value="Fußboden" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
|
||||||
|
<mxGeometry x="413" width="137" height="40" as="geometry">
|
||||||
|
<mxRectangle width="137" height="40" as="alternateBounds" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-323" value="<b><u>b) Beispiel</u></b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
|
||||||
|
<mxGeometry x="-30" width="80" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-197" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="45" y="-110" as="sourcePoint" />
|
||||||
|
<mxPoint x="815" y="-110" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-198" value="<font style="font-size: 21px;"><b><u>2. Normvokabular-Abgleich</u></b></font>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="45" y="-70" width="290" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-199" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-64" target="WA2_J1DCvVjPXciXSW-M-65" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="440" y="1110" as="sourcePoint" />
|
||||||
|
<mxPoint x="534" y="1110" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-202" value="Scanvorgang" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="70" y="-670" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-203" value="Erfassen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="240" y="-400" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-207" value="Ebenenstruktur festlegen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="180" y="-490" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-209" value="Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="370" y="-200" width="247.5" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-210" value="<ul><li>Durchgehen einer Box von vorne bis hinten</li><li>Auflegen des Objekts, Ausrichtung der Farbkarte</li><li>Manuelles Festlegen des Scanbereichs</li><li>Scan der gesamten Box</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="255" y="-690" width="320" height="80" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-211" value="<ul><li>Durchgehen der exportierten Scans im Bildviewer</li><li>Festlegung der Scanebenen (Umschlag, Vorderseite, Rückseite, etc.)</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="367.5" y="-500" width="320" height="80" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-212" value="<ul><li>Durchgehen der Scans</li><li>Erfassen: Datum, Urheber, Eigner, Material</li><li>Vermessen des Objekts</li><li>Objektbeschreibung: Verschlagwortung des Bildinhalts</li><li>Erfassen etwaiger Inschriften und Anmerkungen</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="438" y="-440" width="300" height="140" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-216" value="<font style="font-size: 21px;"><b><u>1. Ablauf des Scan- und Erfassungsprozesses</u></b></font>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="35" y="-840" width="490" height="40" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-217" value="Vorbereitung" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="30" y="-760" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-218" value="<ul><li>PC hochfahren</li><li>Scanner starten/Kamera und Beleuchtung vorbereiten, Farbkarte platzieren</li><li>Software starten, Scanauftrag wählen</li><li>Erfassungstabelle öffnen</li><li>Passende Box wählen</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="212.5" y="-790" width="555" height="110" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-236" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" target="WA2_J1DCvVjPXciXSW-M-228" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-318" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="350" y="1573" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="340" y="1573" />
|
||||||
|
<mxPoint x="360" y="1573" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-226" value="Kategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="170" y="1562.5" width="150" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-237" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" target="WA2_J1DCvVjPXciXSW-M-229" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-228" value="Unterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="290" y="1605" width="150" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-238" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.232;exitY=1.005;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" target="WA2_J1DCvVjPXciXSW-M-230" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="407.5" y="1652.5" as="sourcePoint" />
|
||||||
|
<mxPoint x="440" y="1687.5" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="440" y="1700" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-320" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="590" y="1660" as="targetPoint" />
|
||||||
|
<Array as="points">
|
||||||
|
<mxPoint x="580" y="1660" />
|
||||||
|
<mxPoint x="580" y="1660" />
|
||||||
|
</Array>
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-229" value="Unterunterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="405" y="1650" width="150" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-230" value="Wort/Vokabel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="513" y="1690" width="150" height="20" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-313" value="<font style="font-size: 10px;">1</font>" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="352.5" y="1560" width="25" height="25" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-315" value="<font style="font-size: 10px;">1.1</font>" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="475" y="1602.5" width="25" height="25" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-316" value="<font style="font-size: 10px;">1.1.1</font>" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="592.5" y="1647.5" width="25" height="25" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-319" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" edge="1">
|
||||||
|
<mxGeometry relative="1" as="geometry">
|
||||||
|
<mxPoint x="470" y="1615" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-321" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="50" y="1740" as="sourcePoint" />
|
||||||
|
<mxPoint x="800" y="1740" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-322" value="<b><u>a) Hierarchie und ID-Struktur</u></b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="40" y="1530" width="190" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="WA2_J1DCvVjPXciXSW-M-193" value="Blatt 7 - Architektur" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="140" y="2110" width="165" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="B-3lv8s0GtbLfT8x5DVe-1" value="Scan exportieren" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="122.5" y="-580" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="B-3lv8s0GtbLfT8x5DVe-2" value="<ul><li>Export der gesamten Scans einer Box in einen Ordner</li><li>Reihenfolge der Scans checken</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
|
||||||
|
<mxGeometry x="307.5" y="-590" width="320" height="80" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-1" value="Erfassung prüfen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="310" y="-300" width="200" height="60" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-2" value="<ul><li>Durchgehen der Scans, Vergleich der Nummern mit dem Inhalt der Erfassung</li><li>Makro laufen lassen: Prüft Begriffe unter "Objektbschreibung" auf Treffer im Normvokabular (siehe Anleitung)</li></ul>" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="490" y="-310" width="320" height="90" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.3;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-217" target="WA2_J1DCvVjPXciXSW-M-202">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.238;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-202" target="B-3lv8s0GtbLfT8x5DVe-1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.213;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="B-3lv8s0GtbLfT8x5DVe-1" target="WA2_J1DCvVjPXciXSW-M-207">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.2;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-207" target="WA2_J1DCvVjPXciXSW-M-203">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-8" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.148;entryY=-0.056;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-203" target="ey7EfLCcf-ExpX1qzLUj-1">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-10" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.161;entryY=-0.039;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="ey7EfLCcf-ExpX1qzLUj-1" target="WA2_J1DCvVjPXciXSW-M-209">
|
||||||
|
<mxGeometry relative="1" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-11" value="<font style="font-size: 15px;"><b>Stand: 14.10.25</b></font>" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="745" y="-1090" width="105" height="50" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-12" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" edge="1" parent="1">
|
||||||
|
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||||
|
<mxPoint x="470" y="-880" as="sourcePoint" />
|
||||||
|
<mxPoint x="520" y="-880" as="targetPoint" />
|
||||||
|
</mxGeometry>
|
||||||
|
</mxCell>
|
||||||
|
<mxCell id="ey7EfLCcf-ExpX1qzLUj-13" value="<b>Optional/Optimierungsmöglichkeit</b>" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" vertex="1" parent="1">
|
||||||
|
<mxGeometry x="530" y="-896" width="220" height="30" as="geometry" />
|
||||||
|
</mxCell>
|
||||||
|
</root>
|
||||||
|
</mxGraphModel>
|
||||||
|
</diagram>
|
||||||
|
</mxfile>
|
||||||
@ -0,0 +1,97 @@
= ExLibris Scanning and Cataloguing – Guide =

The digitisation of exlibris objects (bookplates) is divided into two main steps: '''scanning''' and '''cataloguing''' the objects.

Each object is part of a box, and each box is part of an octavo (°) group.

A box is scanned completely before cataloguing begins – this avoids errors that can occur when, for example, the order of the scans changes because scans are added or deleted afterwards.

== 1. Preparation ==

=== a) Preparing the hardware ===
* Start the PC
* Switch on the lights
* Remove the camera cover
* Switch on the camera

=== b) Preparing the software ===
* Start MultiDotScan by Walter Nagel
* Select the scan job → '''WELCHEN SCANAUFTRAG'''
== 2. Scanning ==
* Place the object on the pad, square to the image frame shown in the software
* Position the colour card next to it – to the right or below, depending on the object's format
* Adjust the image frame to the object (margin of roughly 10–20 mm)
* Trigger the camera with the foot pedal → '''Scan'''
* If a scan is missing or has to be redone: drag it to the correct position in the software → on export the scans are automatically arranged in the correct order
== 3. Exporting the scans ==
* After the entire box has been scanned, export the scan job → it is written to '''DATEIPFAD'''
== 4. Cataloguing ==

* Open the exported scans under '''DATEIPFAD'''
(in the image viewer, '''not''' in the scanning software – otherwise the order of the scans can change, which leads to errors in the cataloguing table, the Erfassungstabelle)
* Each box (= each scan job) starts counting at 1; every scan is numbered consecutively
* Enter the scan numbers in the cataloguing table, taking the layer structure into account:

=== Layer structure ===
* Default: front side → layer 0, back side → layer 1
* If the exlibris has a cover, or several exlibris lie in one envelope:
* Envelope = layer 0
* Front side = layer 1
* Back side = layer 2
* Back of the envelope = layer 3
→ This makes it easy to see where an envelope begins and ends; a small worked example follows below.
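A minimal illustration of this numbering for an envelope that contains a single exlibris – the scan numbers are made up:

<pre>
Scan 17 → envelope, front      → layer 0
Scan 18 → exlibris, front side → layer 1
Scan 19 → exlibris, back side  → layer 2
Scan 20 → envelope, back       → layer 3
</pre>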
=== Capturing the metadata ===
* Jahr (year) – is there a year on the front or back?
* Urheber (creator) – the artist
* Eigner (owner) – who did the exlibris belong to?
* Objektbeschreibung (object description) – what can be seen? Keywording of the image content

==== Please note ====
* Describe from '''general → specific'''
* Example: "Baum" (tree) instead of "Schwarzeiche" (black oak)
* "Helm" (helmet) instead of "Topfhelm 15. Jahrhundert" (15th-century great helm)
* Prefer '''singular forms''' – even when several objects are shown
* e.g. "Buch" instead of "Bücher", "Figur, weiblich" instead of "Frauengruppe"
* State '''activities in the infinitive''': "sitzen", "lesen", "fahren" instead of "sitzt", "lesend", "fährt"
* '''Avoid connecting words''' ("stopwords"):
<nowiki>mit, ohne, der, die, das, ein, eine, und, zu, von, im, in, auf, an, als, bei, für, aus, dem, den, des, eines, einer</nowiki>
(the mapper macro filters these out anyway – see the sketch after this list)

* Material – usually paper
* Maße (dimensions) – height × width in cm (append ",0" to whole centimetre values, e.g. 14,3 × 7,0 cm instead of 14,3 × 7)
* Objekttyp (object type) – Exlibris, Rückseite, Umschlag, Zettel
* Inschrift (inscription) – e.g. coat of arms with a banderole
* Anmerkungen (remarks) – other notes or observations (pencil entries etc.)
* AUX – not relevant
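For orientation, a minimal sketch of the pre-processing the mapper macro applies to an "Objektbeschreibung" cell before the vocabulary lookup. The stopword list and the normalisation mirror the macro code shown further down on this page; the example term and the helper name extract_terms are only illustrative:

<pre>
import re

STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf",
             "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

def normalize_text(s):
    # lower-case, strip punctuation and collapse whitespace (as the macro does)
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    return re.sub(r"\s+", " ", s)

def extract_terms(objektbeschreibung):
    # split the cell into single words, drop stopwords and bare numbers
    words = [w for w in re.split(r"\s+", normalize_text(objektbeschreibung)) if w]
    return [w for w in words if w not in STOPWORDS and not w.isdigit()]

print(extract_terms("Wappen mit Helm und Buch"))   # -> ['wappen', 'helm', 'buch']
</pre>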
== 5. Checking the catalogue entries ==
* Does the numbering of the scans match the corresponding row in the cataloguing table?
* Run the macro over the table:
There is a macro that compares the "Objektbeschreibung" column against the internal controlled vocabulary (Normvokabular) and thereby keeps the keywording consistent.

The macro can be started directly from the LibreOffice Calc menu:

<pre>
Extras → Makros → Makros verwalten → Python →
Meine Makros → Vokabular_Abgleich_Makro → mapper_macro_2.x → run_mapper_macro → Ausführen
</pre>

'''Note:'''
Detailed instructions for using the macro are available under '''DATEIPFAD'''.
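Per term, the macro roughly decides between a hit, a suggestion and no hit. The sketch below is a simplified, stand-alone illustration of that decision (the real macro additionally lemmatises terms and can use RapidFuzz; the tiny vocabulary in the usage comment is made up):

<pre>
from difflib import SequenceMatcher

def check_term(term, norm_dict, threshold=0.75):
    """norm_dict maps normalised vocabulary words to entries with name and ID."""
    key = term.strip().lower()
    if key in norm_dict:                      # exact hit -> cell is coloured green
        return "Treffer", [norm_dict[key]]
    # otherwise collect sufficiently similar vocabulary entries as suggestions -> yellow
    suggestions = [e for k, e in norm_dict.items()
                   if SequenceMatcher(None, key, k).ratio() >= threshold]
    if suggestions:
        return "Vorschlag", suggestions
    return "KEIN TREFFER", []                 # nothing found -> cell is coloured red

# check_term("Helm", {"helm": {"Name": "Helm", "ID": "3.2"}})
# -> ("Treffer", [{"Name": "Helm", "ID": "3.2"}])
</pre>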
== 6. Wrapping up ==
* Fill in the pre-printed form:
* Name
* Date
* Which box
* How far scanning/cataloguing got
* Anything to be aware of?
== 7. Best practices ==
* Better to scan too much than too little (back sides, envelopes, etc.)
* Better to describe too much than too little (anything that can be identified with confidence may be keyworded)
* Record notes or inscriptions on exlibris or their back sides in full
* When in doubt: ask
2815369
api_cache.json
File diff suppressed because it is too large
Load Diff
@ -1,9 +0,0 @@
{
  "normvokabular_path": "/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods",
  "max_suggestions": 3,
  "color_hit": "#C6EFCE",
  "color_miss": "#FFC7CE",
  "use_rapidfuzz": false,
  "use_spacy": false,
  "autosave": false
}
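A minimal sketch, assuming a hypothetical helper named load_config, of how the configuration above could be read with fallbacks to the same default values:

import json

def load_config(path):
    # merge the JSON config over the defaults; keep the defaults if the file is missing or unreadable
    defaults = {"max_suggestions": 3, "color_hit": "#C6EFCE", "color_miss": "#FFC7CE",
                "use_rapidfuzz": False, "use_spacy": False, "autosave": False}
    try:
        with open(path, "r", encoding="utf-8") as f:
            defaults.update(json.load(f))
    except (OSError, ValueError):
        pass
    return defaults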
371
mapper.py
@ -1,371 +0,0 @@
|
|||||||
import os
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
import requests
|
|
||||||
from pathlib import Path
|
|
||||||
from collections import defaultdict
|
|
||||||
from difflib import SequenceMatcher
|
|
||||||
|
|
||||||
# RapidFuzz für Token-basierte Fuzzy-Suche
|
|
||||||
try:
|
|
||||||
from rapidfuzz import fuzz
|
|
||||||
RAPIDFUZZ_AVAILABLE = True
|
|
||||||
print("RapidFuzz verfügbar")
|
|
||||||
except ImportError:
|
|
||||||
RAPIDFUZZ_AVAILABLE = False
|
|
||||||
print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")
|
|
||||||
|
|
||||||
# Spacy Lemmatizer
|
|
||||||
try:
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("de_core_news_sm")
|
|
||||||
SPACY_AVAILABLE = True
|
|
||||||
print("Spacy Lemmatizer aktiviert")
|
|
||||||
except:
|
|
||||||
SPACY_AVAILABLE = False
|
|
||||||
nlp = None
|
|
||||||
print("Spacy nicht verfügbar – nutze naive Stemmer")
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Pfade & Config
|
|
||||||
# =========================
|
|
||||||
INPUT_DIR = Path("Input CSV")
|
|
||||||
OUTPUT_DIR = Path("Auswertung Ergebnisse")
|
|
||||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
|
||||||
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
|
|
||||||
CACHE_FILE = "api_cache.json"
|
|
||||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
|
||||||
CONF_THRESHOLD = 0.75
|
|
||||||
TIMEOUT = 5
|
|
||||||
MAX_RETRIES = 3
|
|
||||||
BACKOFF_FACTOR = 2
|
|
||||||
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
|
|
||||||
API_ACTIVE = {"gnd": True, "wikidata": True}
|
|
||||||
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
|
|
||||||
|
|
||||||
# Cache
|
|
||||||
if os.path.exists(CACHE_FILE):
|
|
||||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
|
||||||
CACHE = json.load(f)
|
|
||||||
else:
|
|
||||||
CACHE = {}
|
|
||||||
|
|
||||||
def save_cache():
|
|
||||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
|
||||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Normalisierung / Lemma
|
|
||||||
# =========================
|
|
||||||
def normalize_text(s):
|
|
||||||
if not s:
|
|
||||||
return ""
|
|
||||||
s = str(s).lower().strip()
|
|
||||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
|
||||||
s = re.sub(r"\s+"," ",s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
# Lemma-Cache
|
|
||||||
lemma_cache = {}
|
|
||||||
|
|
||||||
def lemmatize_term(term):
|
|
||||||
term_norm = normalize_text(term)
|
|
||||||
if term_norm in lemma_cache:
|
|
||||||
return lemma_cache[term_norm]
|
|
||||||
if SPACY_AVAILABLE and nlp:
|
|
||||||
doc = nlp(term_norm)
|
|
||||||
lemma = " ".join([token.lemma_ for token in doc])
|
|
||||||
else:
|
|
||||||
lemma = term_norm
|
|
||||||
lemma_cache[term_norm] = lemma
|
|
||||||
return lemma
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Kompositum-Zerlegung (einfacher Ansatz)
|
|
||||||
# =========================
|
|
||||||
def compound_split(term):
|
|
||||||
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
|
|
||||||
return parts if parts else [term]
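# Example behaviour (illustrative):
#   compound_split("WappenSchild") -> ["Wappen", "Schild"]   (splits only on internal capitals)
#   compound_split("Wappenschild") -> ["Wappenschild"]       (ordinary compounds stay whole)
#   compound_split("helm")         -> ["helm"]                (no match -> the whole term is returned)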
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Normvokabular laden & Lemma vorbereiten
|
|
||||||
# =========================
|
|
||||||
def load_normvokabular(file_path):
|
|
||||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
|
||||||
norm_dict = {}
|
|
||||||
stem_index = defaultdict(list)
|
|
||||||
lemma_norm_map = {} # für RapidFuzz preprocessed
|
|
||||||
|
|
||||||
for sheet_name, df in sheets.items():
|
|
||||||
if sheet_name.lower() in ["master", "übersicht"]:
|
|
||||||
continue
|
|
||||||
df = df.dropna(how="all", axis=1)
|
|
||||||
df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
id_col = next((c for c in df.columns if "ID" in c), None)
|
|
||||||
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
|
|
||||||
if not id_col or not word_col:
|
|
||||||
continue
|
|
||||||
|
|
||||||
current_parent_id = None
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
|
|
||||||
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
|
|
||||||
if row_id:
|
|
||||||
current_parent_id = row_id
|
|
||||||
if not row_word:
|
|
||||||
continue
|
|
||||||
assigned_parent_id = current_parent_id
|
|
||||||
entry = {
|
|
||||||
"Name": row_word,
|
|
||||||
"ID": assigned_parent_id, # Parent-ID
|
|
||||||
"Sheet": sheet_name,
|
|
||||||
"Own_ID": row_id or "" # eigene ID, falls vorhanden
|
|
||||||
}
|
|
||||||
key = normalize_text(row_word)
|
|
||||||
norm_dict[key] = entry
|
|
||||||
lemma = lemmatize_term(key)
|
|
||||||
stem_index[lemma].append(entry)
|
|
||||||
if lemma not in lemma_norm_map:
|
|
||||||
lemma_norm_map[lemma] = entry
|
|
||||||
return norm_dict, stem_index, lemma_norm_map
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Mapping & Vorschläge
|
|
||||||
# =========================
|
|
||||||
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
|
|
||||||
term_norm = normalize_text(term)
|
|
||||||
term_lemma = lemmatize_term(term)
|
|
||||||
|
|
||||||
# Exakter Treffer
|
|
||||||
if term_norm in norm_dict:
|
|
||||||
e = norm_dict[term_norm]
|
|
||||||
return e["Name"], e["ID"], []
|
|
||||||
|
|
||||||
# Lemma-Treffer
|
|
||||||
if term_lemma in stem_index:
|
|
||||||
e = stem_index[term_lemma][0]
|
|
||||||
return e["Name"], e["ID"], []
|
|
||||||
|
|
||||||
# KEIN TREFFER → Kompositum-Split
|
|
||||||
tokens = compound_split(term)
|
|
||||||
if len(tokens) == 1:
|
|
||||||
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
|
|
||||||
return "KEIN TREFFER", "", suggestions
|
|
||||||
else:
|
|
||||||
token_matches = []
|
|
||||||
for t in tokens:
|
|
||||||
t_lemma = lemmatize_term(t)
|
|
||||||
if t_lemma in stem_index:
|
|
||||||
e = stem_index[t_lemma][0]
|
|
||||||
token_matches.append((t, e["Name"], e["ID"]))
|
|
||||||
else:
|
|
||||||
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
|
|
||||||
token_matches.append((t, "KEIN TREFFER", "", sugg))
|
|
||||||
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
|
|
||||||
return "KEIN TREFFER", "", combined_suggestions
|
|
||||||
|
|
||||||
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
|
|
||||||
candidates = []
|
|
||||||
for key_lemma, entry in lemma_norm_map.items():
|
|
||||||
if RAPIDFUZZ_AVAILABLE:
|
|
||||||
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
|
|
||||||
else:
|
|
||||||
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
|
|
||||||
if key_lemma.lower().startswith(term_lemma.lower()):
|
|
||||||
score = min(score + 0.1, 1.0)
|
|
||||||
if score >= threshold:
|
|
||||||
candidates.append((score, entry["Name"], entry["ID"]))
|
|
||||||
candidates.sort(reverse=True)
|
|
||||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# API-Abfragen
|
|
||||||
# =========================
|
|
||||||
def request_with_retries(api_name,url,params=None):
|
|
||||||
cache_key = url + str(params)
|
|
||||||
if cache_key in CACHE:
|
|
||||||
return CACHE[cache_key]
|
|
||||||
retries = 0
|
|
||||||
while retries < MAX_RETRIES:
|
|
||||||
try:
|
|
||||||
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
|
|
||||||
if r.status_code == 200:
|
|
||||||
try: data = r.json()
|
|
||||||
except: data = r.text
|
|
||||||
CACHE[cache_key] = data
|
|
||||||
FAIL_COUNTER[api_name] = 0
|
|
||||||
return data
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
retries += 1
|
|
||||||
time.sleep(min(BACKOFF_FACTOR**retries,30))
|
|
||||||
FAIL_COUNTER[api_name] += 1
|
|
||||||
if FAIL_COUNTER[api_name] >= 10:
|
|
||||||
API_ACTIVE[api_name] = False
|
|
||||||
return None
|
|
||||||
|
|
||||||
def batch_query_gnd(terms):
|
|
||||||
results={}
|
|
||||||
if not API_ACTIVE.get("gnd", False):
|
|
||||||
for t in terms: results[t] = ""
|
|
||||||
return results
|
|
||||||
for t in terms:
|
|
||||||
url="https://lobid.org/gnd/search"
|
|
||||||
params={"q":t,"format":"json"}
|
|
||||||
data = request_with_retries("gnd", url, params)
|
|
||||||
top = ""
|
|
||||||
if data and "member" in data:
|
|
||||||
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
|
|
||||||
cands = [c for c in cands if c[1]>=0.75]
|
|
||||||
if cands:
|
|
||||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
|
||||||
results[t] = top
|
|
||||||
return results
|
|
||||||
|
|
||||||
def batch_query_wikidata(terms):
|
|
||||||
results={}
|
|
||||||
if not API_ACTIVE.get("wikidata", False):
|
|
||||||
for t in terms: results[t] = ""
|
|
||||||
return results
|
|
||||||
for t in terms:
|
|
||||||
url="https://www.wikidata.org/w/api.php"
|
|
||||||
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
|
|
||||||
data = request_with_retries("wikidata", url, params)
|
|
||||||
top = ""
|
|
||||||
if data and "search" in data:
|
|
||||||
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
|
|
||||||
cands = [c for c in cands if c[1]>=0.70]
|
|
||||||
if cands:
|
|
||||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
|
||||||
results[t] = top
|
|
||||||
return results
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Markierung / Export
|
|
||||||
# =========================
|
|
||||||
def mark_norm_hits(file_path):
|
|
||||||
ext = file_path.suffix.lower()
|
|
||||||
if ext in [".xlsx", ".xls"]:
|
|
||||||
from openpyxl import load_workbook
|
|
||||||
from openpyxl.styles import PatternFill
|
|
||||||
wb = load_workbook(file_path)
|
|
||||||
ws = wb.active
|
|
||||||
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
|
|
||||||
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
|
|
||||||
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
|
|
||||||
norm_col = col_map.get("Norm_Treffer", None)
|
|
||||||
if not norm_col:
|
|
||||||
print("Spalte 'Norm_Treffer' nicht gefunden")
|
|
||||||
return
|
|
||||||
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
|
|
||||||
cell = row[0]
|
|
||||||
if cell.value and cell.value != "KEIN TREFFER":
|
|
||||||
cell.fill = green_fill
|
|
||||||
else:
|
|
||||||
cell.fill = red_fill
|
|
||||||
wb.save(file_path)
|
|
||||||
elif ext==".ods":
|
|
||||||
df = pd.read_excel(file_path, engine="odf")
|
|
||||||
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
|
|
||||||
df.to_excel(file_path, index=False, engine="odf")
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Verarbeitung Input-Dateien
|
|
||||||
# =========================
|
|
||||||
def process_files():
|
|
||||||
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
|
|
||||||
total_terms = 0
|
|
||||||
total_hits = 0
|
|
||||||
|
|
||||||
if not INPUT_DIR.exists():
|
|
||||||
print(f"Eingabeordner {INPUT_DIR} fehlt")
|
|
||||||
sys.exit(1)
|
|
||||||
files = list(INPUT_DIR.glob("*"))
|
|
||||||
if not files:
|
|
||||||
print("Keine Dateien gefunden")
|
|
||||||
return
|
|
||||||
|
|
||||||
for file_path in files:
|
|
||||||
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
|
|
||||||
continue
|
|
||||||
print(f"Verarbeite Datei: {file_path.name}")
|
|
||||||
try:
|
|
||||||
if file_path.suffix.lower() == ".csv":
|
|
||||||
df = pd.read_csv(file_path)
|
|
||||||
else:
|
|
||||||
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Fehler beim Lesen von {file_path.name}: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
df = df.dropna(how="all")
|
|
||||||
df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
|
|
||||||
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
|
|
||||||
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
|
|
||||||
urh_col = next((c for c in df.columns if "Urheber" in c), None)
|
|
||||||
if not besch_col: continue
|
|
||||||
|
|
||||||
row_terms_map = []
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
|
|
||||||
if not besch: continue
|
|
||||||
obj_box = row[box_col] if box_col else ""
|
|
||||||
urheber = row[urh_col] if urh_col else ""
|
|
||||||
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
|
|
||||||
terms = []
|
|
||||||
for clause in clauses:
|
|
||||||
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
|
|
||||||
for p in parts:
|
|
||||||
if p.lower() in STOPWORDS: continue
|
|
||||||
if re.fullmatch(r"\d+", p): continue
|
|
||||||
terms.append(p)
|
|
||||||
row_terms_map.append((obj_box, urheber, terms))
|
|
||||||
|
|
||||||
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
|
|
||||||
gnd_results = batch_query_gnd(all_terms)
|
|
||||||
wd_results = batch_query_wikidata(all_terms)
|
|
||||||
|
|
||||||
output_rows = []
|
|
||||||
for obj_box, urheber, terms in row_terms_map:
|
|
||||||
for term in terms:
|
|
||||||
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
|
|
||||||
total_terms += 1
|
|
||||||
if norm_name != "KEIN TREFFER":
|
|
||||||
total_hits += 1
|
|
||||||
out_row = {
|
|
||||||
"Box": obj_box,
|
|
||||||
"Objekt/Ebene": obj_box,
|
|
||||||
"Urheber": urheber,
|
|
||||||
"Begriff": term,
|
|
||||||
"Norm_Treffer": norm_name,
|
|
||||||
"Norm_ID": norm_id,
|
|
||||||
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
|
|
||||||
"GND_Top1": gnd_results.get(term,""),
|
|
||||||
"WD_Top1": wd_results.get(term,"")
|
|
||||||
}
|
|
||||||
output_rows.append(out_row)
|
|
||||||
|
|
||||||
out_df = pd.DataFrame(output_rows)
|
|
||||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
|
|
||||||
version = 1
|
|
||||||
while output_file.exists():
|
|
||||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
|
|
||||||
version += 1
|
|
||||||
engine = "odf" if output_file.suffix.lower()==".ods" else None
|
|
||||||
out_df.to_excel(output_file, index=False, engine=engine)
|
|
||||||
mark_norm_hits(output_file)
|
|
||||||
print(f"Auswertung gespeichert: {output_file}")
|
|
||||||
|
|
||||||
save_cache()
|
|
||||||
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Main
|
|
||||||
# =========================
|
|
||||||
if __name__ == "__main__":
|
|
||||||
process_files()
|
|
||||||
print("Fertig")
|
|
||||||
237
mapper_macro.py
@ -1,237 +0,0 @@
|
|||||||
import uno
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import traceback
|
|
||||||
import json
|
|
||||||
|
|
||||||
# Optional für Lemmatizer
|
|
||||||
try:
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("de_core_news_sm")
|
|
||||||
SPACY_AVAILABLE = True
|
|
||||||
except:
|
|
||||||
SPACY_AVAILABLE = False
|
|
||||||
nlp = None
|
|
||||||
|
|
||||||
# Optional für Fuzzy Matching
|
|
||||||
try:
|
|
||||||
from rapidfuzz import fuzz
|
|
||||||
RAPIDFUZZ_AVAILABLE = True
|
|
||||||
except:
|
|
||||||
from difflib import SequenceMatcher
|
|
||||||
RAPIDFUZZ_AVAILABLE = False
|
|
||||||
|
|
||||||
import odf.opendocument
|
|
||||||
import odf.table
|
|
||||||
import odf.text
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Konfiguration absolute Pfade
|
|
||||||
# ------------------------
|
|
||||||
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
|
|
||||||
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
|
|
||||||
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
|
|
||||||
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
|
|
||||||
|
|
||||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
|
||||||
CONF_THRESHOLD = 0.75
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Logging
|
|
||||||
# ------------------------
|
|
||||||
def log(msg):
|
|
||||||
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
|
||||||
f.write(msg + "\n")
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Cache laden
|
|
||||||
# ------------------------
|
|
||||||
if os.path.exists(CACHE_FILE):
|
|
||||||
with open(CACHE_FILE, "r", encoding="utf-8") as f:
|
|
||||||
CACHE = json.load(f)
|
|
||||||
else:
|
|
||||||
CACHE = {}
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Normalisierung / Lemma
|
|
||||||
# ------------------------
|
|
||||||
def normalize_text(s):
|
|
||||||
if not s:
|
|
||||||
return ""
|
|
||||||
s = str(s).lower().strip()
|
|
||||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
|
||||||
s = re.sub(r"\s+"," ",s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
lemma_cache = {}
|
|
||||||
def lemmatize_term(term):
|
|
||||||
term_norm = normalize_text(term)
|
|
||||||
if term_norm in lemma_cache:
|
|
||||||
return lemma_cache[term_norm]
|
|
||||||
if SPACY_AVAILABLE and nlp:
|
|
||||||
doc = nlp(term_norm)
|
|
||||||
lemma = " ".join([token.lemma_ for token in doc])
|
|
||||||
else:
|
|
||||||
lemma = term_norm
|
|
||||||
lemma_cache[term_norm] = lemma
|
|
||||||
return lemma
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# NV_MASTER einlesen
|
|
||||||
# ------------------------
|
|
||||||
def load_nv_master(path):
|
|
||||||
norm_dict = {}
|
|
||||||
try:
|
|
||||||
doc = odf.opendocument.load(path)
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Fehler beim Laden von NV_MASTER: {e}")
|
|
||||||
return norm_dict
|
|
||||||
|
|
||||||
for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
|
|
||||||
sheet_name = sheet.getAttribute("name")
|
|
||||||
if sheet_name.lower() == "master":
|
|
||||||
continue
|
|
||||||
|
|
||||||
current_parent_id = None
|
|
||||||
for row in sheet.getElementsByType(odf.table.TableRow):
|
|
||||||
cells = row.getElementsByType(odf.table.TableCell)
|
|
||||||
cell_values = []
|
|
||||||
for cell in cells:
|
|
||||||
texts = cell.getElementsByType(odf.text.P)
|
|
||||||
if texts and texts[0].firstChild:
|
|
||||||
cell_values.append(str(texts[0].firstChild.data).strip())
|
|
||||||
else:
|
|
||||||
cell_values.append("")
|
|
||||||
if not cell_values or len(cell_values)<4:
|
|
||||||
continue
|
|
||||||
id_val, unterk, unterunterk, word = cell_values[:4]
|
|
||||||
if id_val:
|
|
||||||
current_parent_id = id_val.strip()
|
|
||||||
if not word:
|
|
||||||
continue
|
|
||||||
key = lemmatize_term(word)
|
|
||||||
norm_dict[key] = {
|
|
||||||
"Name": word.strip(),
|
|
||||||
"ID": current_parent_id,
|
|
||||||
"Sheet": sheet_name,
|
|
||||||
"Unterkategorie": unterk.strip(),
|
|
||||||
"Unterunterkategorie": unterunterk.strip()
|
|
||||||
}
|
|
||||||
log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
|
|
||||||
return norm_dict
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Matching
|
|
||||||
# ------------------------
|
|
||||||
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
|
|
||||||
candidates = []
|
|
||||||
for key, entry in norm_dict.items():
|
|
||||||
if RAPIDFUZZ_AVAILABLE:
|
|
||||||
score = fuzz.token_set_ratio(term_lemma, key)/100
|
|
||||||
else:
|
|
||||||
score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
|
|
||||||
if key.lower().startswith(term_lemma.lower()):
|
|
||||||
score = min(score + 0.1, 1.0)
|
|
||||||
if score >= threshold:
|
|
||||||
candidates.append((score, entry["Name"], entry["ID"]))
|
|
||||||
candidates.sort(reverse=True)
|
|
||||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
|
||||||
|
|
||||||
def map_word(word, norm_dict):
|
|
||||||
key = lemmatize_term(word)
|
|
||||||
if key in CACHE:
|
|
||||||
cached = CACHE[key]
|
|
||||||
return cached["Norm"], cached["Suggestion"], cached["ID"]
|
|
||||||
|
|
||||||
if key in norm_dict:
|
|
||||||
entry = norm_dict[key]
|
|
||||||
tr, sug, wid = entry["Name"], "", entry["ID"]
|
|
||||||
else:
|
|
||||||
suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
|
|
||||||
if suggestions:
|
|
||||||
tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
|
|
||||||
else:
|
|
||||||
tr, sug, wid = "KEIN TREFFER", "", ""
|
|
||||||
|
|
||||||
CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
|
|
||||||
return tr, sug, wid
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
# Makro-Hauptfunktion
|
|
||||||
# ------------------------
|
|
||||||
def run_mapper_macro():
|
|
||||||
try:
|
|
||||||
doc = XSCRIPTCONTEXT.getDocument()
|
|
||||||
sheets = doc.getSheets()
|
|
||||||
sheet = sheets.getByIndex(0)
|
|
||||||
cursor = sheet.createCursor()
|
|
||||||
cursor.gotoStartOfUsedArea(False)
|
|
||||||
cursor.gotoEndOfUsedArea(True)
|
|
||||||
data_range = cursor.getRangeAddress()
|
|
||||||
|
|
||||||
header_row = 0
|
|
||||||
objekt_col = None
|
|
||||||
|
|
||||||
# Header prüfen
|
|
||||||
for col in range(data_range.EndColumn+1):
|
|
||||||
val = sheet.getCellByPosition(col, header_row).String.strip().lower()
|
|
||||||
if val == "objektbeschreibung":
|
|
||||||
objekt_col = col
|
|
||||||
break
|
|
||||||
|
|
||||||
if objekt_col is None:
|
|
||||||
log("Spalte 'Objektbeschreibung' nicht gefunden")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Neue Spalten am rechten Tabellenende erstellen
|
|
||||||
max_col = data_range.EndColumn
|
|
||||||
norm_tr_col = max_col + 1
|
|
||||||
norm_sug_col = max_col + 2
|
|
||||||
norm_id_col = max_col + 3
|
|
||||||
|
|
||||||
sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
|
|
||||||
sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
|
|
||||||
sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"
|
|
||||||
|
|
||||||
norm_dict = load_nv_master(NV_MASTER_PATH)
|
|
||||||
|
|
||||||
# Farben
|
|
||||||
GREEN = 0xC6EFCE
|
|
||||||
YELLOW = 0xFFEB9C
|
|
||||||
RED = 0xFFC7CE
|
|
||||||
|
|
||||||
for row in range(1, data_range.EndRow+1):
|
|
||||||
cell = sheet.getCellByPosition(objekt_col, row)
|
|
||||||
val = cell.String.strip()
|
|
||||||
if not val:
|
|
||||||
continue
|
|
||||||
words = [w.strip() for w in re.split(r"\s+", val) if w.strip() and w.lower() not in STOPWORDS]
|
|
||||||
tr_list, sug_list, id_list = [], [], []
|
|
||||||
for w in words:
|
|
||||||
tr, sug, wid = map_word(w, norm_dict)
|
|
||||||
if tr != "KEIN TREFFER":
|
|
||||||
tr_list.append(tr)
|
|
||||||
if sug:
|
|
||||||
sug_list.append(sug)
|
|
||||||
if wid:
|
|
||||||
id_list.append(wid)
|
|
||||||
sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
|
|
||||||
sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
|
|
||||||
sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
|
|
||||||
# Farbmarkierung
|
|
||||||
if tr_list:
|
|
||||||
cell.CellBackColor = GREEN
|
|
||||||
elif sug_list:
|
|
||||||
cell.CellBackColor = YELLOW
|
|
||||||
else:
|
|
||||||
cell.CellBackColor = RED
|
|
||||||
|
|
||||||
# Cache speichern
|
|
||||||
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(CACHE, f, ensure_ascii=False, indent=2)
|
|
||||||
|
|
||||||
log("Makro erfolgreich ausgeführt")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
log("Fehler in run_mapper_macro:")
|
|
||||||
log(traceback.format_exc())
|
|
||||||
@ -1,297 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
import os
|
|
||||||
import uno
|
|
||||||
import unohelper
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
from pathlib import Path
|
|
||||||
from difflib import SequenceMatcher
|
|
||||||
|
|
||||||
# RapidFuzz für Fuzzy-Suche
|
|
||||||
try:
|
|
||||||
from rapidfuzz import fuzz
|
|
||||||
RAPIDFUZZ_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
RAPIDFUZZ_AVAILABLE = False
|
|
||||||
|
|
||||||
# Spacy Lemmatizer
|
|
||||||
try:
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load("de_core_news_sm")
|
|
||||||
SPACY_AVAILABLE = True
|
|
||||||
except:
|
|
||||||
SPACY_AVAILABLE = False
|
|
||||||
nlp = None
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Pfade & Config
|
|
||||||
# =========================
|
|
||||||
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
|
|
||||||
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
|
|
||||||
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
|
|
||||||
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
|
|
||||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Cache & Logging
|
|
||||||
# =========================
|
|
||||||
if CACHE_FILE.exists():
|
|
||||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
|
||||||
CACHE = json.load(f)
|
|
||||||
else:
|
|
||||||
CACHE = {}
|
|
||||||
|
|
||||||
def save_cache():
|
|
||||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
|
||||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
|
||||||
|
|
||||||
def log(msg):
|
|
||||||
with open(LOG_FILE,"a",encoding="utf-8") as f:
|
|
||||||
f.write(msg + "\n")
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Textverarbeitung
|
|
||||||
# =========================
|
|
||||||
def normalize_text(s):
|
|
||||||
if not s: return ""
|
|
||||||
s = str(s).lower().strip()
|
|
||||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
|
||||||
s = re.sub(r"\s+"," ",s)
|
|
||||||
return s
|
|
||||||
|
|
||||||
lemma_cache = {}
|
|
||||||
def lemmatize_term(term):
|
|
||||||
term_norm = normalize_text(term)
|
|
||||||
if term_norm in lemma_cache:
|
|
||||||
return lemma_cache[term_norm]
|
|
||||||
if SPACY_AVAILABLE and nlp:
|
|
||||||
doc = nlp(term_norm)
|
|
||||||
lemma = " ".join([token.lemma_ for token in doc])
|
|
||||||
else:
|
|
||||||
lemma = term_norm
|
|
||||||
lemma_cache[term_norm] = lemma
|
|
||||||
return lemma
|
|
||||||
|
|
||||||
def compound_split(term):
|
|
||||||
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
|
|
||||||
return parts if parts else [term]
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# NV_MASTER laden
|
|
||||||
# =========================
|
|
||||||
def load_normvokabular(file_path):
|
|
||||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
|
|
||||||
norm_dict = {}
|
|
||||||
for sheet_name, df in sheets.items():
|
|
||||||
df = df.dropna(how="all", axis=1)
|
|
||||||
df.columns = [str(c).strip() for c in df.columns]
|
|
||||||
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
|
|
||||||
continue
|
|
||||||
current_parent_id = None
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
|
|
||||||
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
|
|
||||||
if row_id: current_parent_id = row_id
|
|
||||||
if not row_word: continue
|
|
||||||
norm_dict[normalize_text(row_word)] = {
|
|
||||||
"ID": current_parent_id,
|
|
||||||
"Wort/Vokabel": row_word
|
|
||||||
}
|
|
||||||
return norm_dict
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Mapping
|
|
||||||
# =========================
|
|
||||||
def map_term_with_indexes(term, norm_dict):
|
|
||||||
term_norm = normalize_text(term)
|
|
||||||
term_lemma = lemmatize_term(term_norm)
|
|
||||||
|
|
||||||
# Cache prüfen
|
|
||||||
if term_lemma in CACHE:
|
|
||||||
cached = CACHE[term_lemma]
|
|
||||||
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
|
|
||||||
return cached["hits"], cached["suggestions"], cached["ids"]
|
|
||||||
else:
|
|
||||||
CACHE.pop(term_lemma, None)
|
|
||||||
|
|
||||||
hits = []
|
|
||||||
suggestions = []
|
|
||||||
ids = []
|
|
||||||
|
|
||||||
# Exakte Treffer
|
|
||||||
if term_norm in norm_dict:
|
|
||||||
e = norm_dict[term_norm]
|
|
||||||
hits.append(e["Wort/Vokabel"])
|
|
||||||
ids.append(e["ID"])
|
|
||||||
elif term_lemma in norm_dict:
|
|
||||||
e = norm_dict[term_lemma]
|
|
||||||
hits.append(e["Wort/Vokabel"])
|
|
||||||
ids.append(e["ID"])
|
|
||||||
else:
|
|
||||||
# Fuzzy Matching
|
|
||||||
for key, e in norm_dict.items():
|
|
||||||
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
|
|
||||||
if score >= 0.75:
|
|
||||||
suggestions.append(e["Wort/Vokabel"])
|
|
||||||
ids.append(e["ID"])
|
|
||||||
|
|
||||||
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
|
|
||||||
return hits, suggestions, ids
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# LibreOffice Dialog (ListBox + Checkbox)
|
|
||||||
# =========================
|
|
||||||
def apply_proposals_dialog():
|
|
||||||
ctx = uno.getComponentContext()
|
|
||||||
smgr = ctx.ServiceManager
|
|
||||||
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
|
||||||
doc = desktop.getCurrentComponent()
|
|
||||||
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
|
|
||||||
log("Kein Calc-Dokument aktiv")
|
|
||||||
return
|
|
||||||
selection = doc.CurrentSelection
|
|
||||||
sheet = doc.CurrentController.ActiveSheet
|
|
||||||
|
|
||||||
# Prüfen ob eine Zelle ausgewählt ist
|
|
||||||
if selection is None or not hasattr(selection, "getCellAddress"):
|
|
||||||
log("Keine Zelle ausgewählt")
|
|
||||||
return
|
|
||||||
cell = selection
|
|
||||||
|
|
||||||
# Spalte überprüfen
|
|
||||||
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
|
|
||||||
objekt_col = None
|
|
||||||
norm_vorschlag_col = None
|
|
||||||
for col_idx in range(sheet.Columns.Count):
|
|
||||||
val = sheet.getCellByPosition(col_idx,0).String
|
|
||||||
if val.strip().lower() == "objektbeschreibung":
|
|
||||||
objekt_col = col_idx
|
|
||||||
elif val.strip().lower() == "norm_vorschlag":
|
|
||||||
norm_vorschlag_col = col_idx
|
|
||||||
if norm_vorschlag_col is None or objekt_col is None:
|
|
||||||
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Vorschläge auslesen
|
|
||||||
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
|
|
||||||
if not proposals_str.strip():
|
|
||||||
log("Keine Vorschläge in der ausgewählten Zelle")
|
|
||||||
return
|
|
||||||
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
|
|
||||||
|
|
||||||
# Dialog erstellen
|
|
||||||
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
|
|
||||||
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
|
|
||||||
dialog_model.Width = 180
|
|
||||||
dialog_model.Height = 150
|
|
||||||
dialog_model.Title = "Vorschläge übernehmen"
|
|
||||||
|
|
||||||
# ListBox
|
|
||||||
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
|
|
||||||
lb_model.Name = "ProposalList"
|
|
||||||
lb_model.PositionX = 10
|
|
||||||
lb_model.PositionY = 10
|
|
||||||
lb_model.Width = 160
|
|
||||||
lb_model.Height = 80
|
|
||||||
lb_model.StringItemList = tuple(proposals)
|
|
||||||
dialog_model.insertByName("ProposalList", lb_model)
|
|
||||||
|
|
||||||
# Checkbox
|
|
||||||
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
|
|
||||||
cb_model.Name = "AllCheck"
|
|
||||||
cb_model.PositionX = 10
|
|
||||||
cb_model.PositionY = 95
|
|
||||||
cb_model.Width = 160
|
|
||||||
cb_model.Height = 15
|
|
||||||
cb_model.Label = "Alle Vorschläge übernehmen"
|
|
||||||
dialog_model.insertByName("AllCheck", cb_model)
|
|
||||||
|
|
||||||
# OK-Button
|
|
||||||
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
|
|
||||||
btn_model.Name = "OKButton"
|
|
||||||
btn_model.PositionX = 10
|
|
||||||
btn_model.PositionY = 115
|
|
||||||
btn_model.Width = 80
|
|
||||||
btn_model.Height = 20
|
|
||||||
btn_model.Label = "OK"
|
|
||||||
dialog_model.insertByName("OKButton", btn_model)
|
|
||||||
|
|
||||||
# Abbrechen-Button
|
|
||||||
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
|
|
||||||
cancel_model.Name = "CancelButton"
|
|
||||||
cancel_model.PositionX = 100
|
|
||||||
cancel_model.PositionY = 115
|
|
||||||
cancel_model.Width = 80
|
|
||||||
cancel_model.Height = 20
|
|
||||||
cancel_model.Label = "Abbrechen"
|
|
||||||
dialog_model.insertByName("CancelButton", cancel_model)
|
|
||||||
|
|
||||||
# Control Dialog
|
|
||||||
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
|
|
||||||
dialog.setModel(dialog_model)
|
|
||||||
dialog.setVisible(True)
|
|
||||||
toolkit.createPeer(dialog, None)
|
|
||||||
|
|
||||||
# Warten auf OK
|
|
||||||
while True:
|
|
||||||
import time
|
|
||||||
time.sleep(0.1)
|
|
||||||
# Prüfen auf Klick
|
|
||||||
if dialog.getControl("OKButton").Pressed:
|
|
||||||
all_flag = dialog.getControl("AllCheck").State == 1
|
|
||||||
selected_idx = dialog.getControl("ProposalList").SelectedItems
|
|
||||||
if selected_idx:
|
|
||||||
selected_proposal = proposals[selected_idx[0]]
|
|
||||||
else:
|
|
||||||
selected_proposal = None
|
|
||||||
break
|
|
||||||
elif dialog.getControl("CancelButton").Pressed:
|
|
||||||
dialog.endExecute()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Anwenden
|
|
||||||
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
|
|
||||||
obj_text = obj_cell.String
|
|
||||||
if all_flag:
|
|
||||||
for prop in proposals:
|
|
||||||
idx = obj_text.lower().find(prop.lower())
|
|
||||||
if idx != -1:
|
|
||||||
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
|
|
||||||
else:
|
|
||||||
if selected_proposal:
|
|
||||||
idx = obj_text.lower().find(selected_proposal.lower())
|
|
||||||
if idx != -1:
|
|
||||||
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
|
|
||||||
|
|
||||||
obj_cell.String = obj_text
|
|
||||||
obj_cell.CellBackColor = 0x00FF00 # grün
|
|
||||||
dialog.endExecute()
|
|
||||||
save_cache()
|
|
||||||
log(f"Vorschlag übernommen: {obj_text}")
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Automatische Button-Registrierung
|
|
||||||
# =========================
|
|
||||||
def register_toolbar_button():
|
|
||||||
ctx = uno.getComponentContext()
|
|
||||||
smgr = ctx.ServiceManager
|
|
||||||
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
|
||||||
doc = desktop.getCurrentComponent()
|
|
||||||
frame = doc.CurrentController.Frame
|
|
||||||
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
|
|
||||||
# Hier wird nur das Makro selbst registriert
|
|
||||||
# Symbolleiste muss in LO einmalig erstellt werden
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Hauptmakro
|
|
||||||
# =========================
|
|
||||||
def run_mapper_macro():
|
|
||||||
try:
|
|
||||||
norm_dict = load_normvokabular(NV_MASTER_FILE)
|
|
||||||
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
|
|
||||||
|
|
||||||
apply_proposals_dialog()
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Fehler in run_mapper_macro: {e}")
|
|
||||||
379
mapper_macro_2.3.py
Normal file
@ -0,0 +1,379 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# LibreOffice/Excel Macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
|
||||||
|
# Version 2.3 – mit "Kein_Treffer" Spalte
|
||||||
|
# Speicherort: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
|
||||||
|
try:
|
||||||
|
import pandas as pd
|
||||||
|
PANDAS_AVAILABLE = True
|
||||||
|
except Exception:
|
||||||
|
PANDAS_AVAILABLE = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
import spacy
|
||||||
|
nlp = spacy.load("de_core_news_sm")
|
||||||
|
SPACY_AVAILABLE = True
|
||||||
|
except Exception:
|
||||||
|
SPACY_AVAILABLE = False
|
||||||
|
nlp = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from rapidfuzz import fuzz
|
||||||
|
RAPIDFUZZ_AVAILABLE = True
|
||||||
|
except Exception:
|
||||||
|
RAPIDFUZZ_AVAILABLE = False
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
# ------------------------
|
||||||
|
# Konfiguration
|
||||||
|
# ------------------------
|
||||||
|
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
|
||||||
|
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
|
||||||
|
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
|
||||||
|
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
|
||||||
|
|
||||||
|
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||||
|
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
|
||||||
|
|
||||||
|
# ------------------------
# Logging
# ------------------------
def log(msg):
    try:
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(msg + "\n")
    except Exception:
        pass
# ------------------------
# Load cache
# ------------------------
try:
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            CACHE = json.load(f)
    else:
        CACHE = {}
except Exception:
    CACHE = {}
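# For orientation, the cache file mirrors what map_term_with_indexes (below)
# stores per lemmatized term; a hypothetical mapper_cache_2.3.json could read:
#   {
#     "schale":     {"hits": ["Schale"], "suggestions": [], "ids": ["4123"]},
#     "holzdeckel": {"hits": [], "suggestions": ["Deckel (4711)"], "ids": []}
#   }
# The IDs here are invented; real values come out of NV_MASTER.ods.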
# ------------------------
# Text normalization & lemmatization
# ------------------------
def normalize_text(s):
    if not s:
        return ""
    s = str(s).strip().lower()
    s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        try:
            doc = nlp(term_norm)
            lemma = " ".join([token.lemma_ for token in doc])
        except Exception:
            lemma = term_norm
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma
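# Example (illustrative): normalize_text("Schale, (Holz)") -> "schale holz".
# lemmatize_term additionally reduces inflected forms when spaCy and the
# de_core_news_sm model are loaded, so "schalen" would typically come back as
# "schale"; without spaCy the normalized string is returned unchanged.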
# ------------------------
# Load NV_MASTER
# ------------------------
def build_norm_index(nv_path):
    norm_dict = {}
    lemma_index = {}
    if not PANDAS_AVAILABLE:
        log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
        return norm_dict, lemma_index
    try:
        sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
    except Exception as e:
        log(f"Fehler beim Einlesen NV_MASTER: {e}")
        return norm_dict, lemma_index
    for sheet_name, df in sheets.items():
        if str(sheet_name).strip().lower() == "master":
            continue
        df = df.fillna("")
        cols = [str(c).strip().lower() for c in df.columns]
        id_col = None
        word_col = None
        for i, c in enumerate(cols):
            if "id" in c:
                id_col = df.columns[i]
            if "wort" in c or "vokabel" in c:
                word_col = df.columns[i]
        if word_col is None and len(df.columns) >= 1:
            word_col = df.columns[-1]
        if id_col is None and len(df.columns) >= 1:
            id_col = df.columns[0]
        current_parent_id = None
        for _, row in df.iterrows():
            id_val = str(row[id_col]).strip() if id_col in df.columns else ""
            word_val = str(row[word_col]).strip() if word_col in df.columns else ""
            if id_val:
                current_parent_id = id_val
            if not word_val:
                continue
            norm_name = normalize_text(word_val)
            lemma = lemmatize_term(word_val)
            entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
            norm_dict.setdefault(norm_name, []).append(entry)
            lemma_index.setdefault(lemma, []).append(entry)
    log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
    return norm_dict, lemma_index
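# Index shape (illustrative, with invented values): a row "4123 | Schale" on a
# sheet named "Gefaesse" produces
#   norm_dict["schale"]   == [{"Name": "Schale", "ID": "4123", "Sheet": "Gefaesse"}]
#   lemma_index["schale"] == [the same entry]
# Rows whose ID cell is empty inherit the last non-empty ID above them
# (current_parent_id), so sub-terms share their parent's ID.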
# ------------------------
# Matching
# ------------------------
def fuzzy_score(a, b):
    if RAPIDFUZZ_AVAILABLE:
        try:
            return fuzz.token_set_ratio(a, b) / 100.0
        except Exception:
            return 0.0
    else:
        try:
            return SequenceMatcher(None, a.lower(), b.lower()).ratio()
        except Exception:
            return 0.0
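# Rough behavior (exact values depend on the active backend): identical
# strings score 1.0, e.g. fuzzy_score("schale", "schale") == 1.0, while
# fuzzy_score("holzschale", "schale") lands around 0.75 with either backend,
# right at CONF_THRESHOLD. get_suggestions_for_term below adds a +0.1 bonus
# only when a vocabulary key starts with the searched term.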
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entries in lemma_index.items():
        score = fuzzy_score(term_lemma, key_lemma)
        if key_lemma.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    for norm_key, entries in norm_dict.items():
        score = fuzzy_score(term_lemma, norm_key)
        if norm_key.startswith(term_lemma):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            for e in entries:
                candidates.append((score, e["Name"], e["ID"]))
    candidates.sort(key=lambda t: t[0], reverse=True)
    seen = set()
    results = []
    for score, name, id_ in candidates:
        key = (name, id_)
        if key in seen:
            continue
        seen.add(key)
        results.append({"score": score, "name": name, "id": id_})
    return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
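# Return format (illustrative): suggestions are display strings, best score
# first and de-duplicated, e.g. ["Schale (4123)", "Schalenfragment"]; the
# "(ID)" suffix appears only for entries that carry an ID in NV_MASTER.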
def map_term_with_indexes(term, norm_dict, lemma_index):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    if term_lemma in CACHE:
        cached = CACHE[term_lemma]
        return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
    hits = []
    suggestions = []
    ids = []
    if term_norm in norm_dict:
        for e in norm_dict[term_norm]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    if not hits and term_lemma in lemma_index:
        for e in lemma_index[term_lemma]:
            hits.append(e["Name"])
            if e["ID"]:
                ids.append(e["ID"])
    if not hits:
        suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)

    def unique_preserve(seq):
        seen = set()
        out = []
        for x in seq:
            if x not in seen:
                seen.add(x)
                out.append(x)
        return out

    hits = unique_preserve(hits)
    suggestions = unique_preserve(suggestions)
    ids = unique_preserve(ids)
    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids
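# Lookup order, summarized: exact match on the normalized term first, then on
# the lemma, and only if both fail a fuzzy suggestion list is built. With
# invented data a call might return
#   map_term_with_indexes("Schale", ...)     -> (["Schale"], [], ["4123"])
#   map_term_with_indexes("Holzdeckel", ...) -> ([], ["Deckel (4711)"], [])
# Results are memoized in CACHE, keyed by the lemma.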
# ------------------------
# Main macro
# ------------------------
def run_mapper_macro():
    try:
        doc = XSCRIPTCONTEXT.getDocument()
        sheet = doc.CurrentController.ActiveSheet
        cursor = sheet.createCursor()
        cursor.gotoStartOfUsedArea(False)
        cursor.gotoEndOfUsedArea(True)
        data_range = cursor.getRangeAddress()
    except Exception as e:
        log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
        return

    # Find the header
    header_row = None
    objekt_col = None
    max_col = data_range.EndColumn
    for r in range(0, min(5, data_range.EndRow+1)):
        for c in range(0, max_col+1):
            try:
                val = str(sheet.getCellByPosition(c, r).String).strip().lower()
            except Exception:
                val = ""
            if val == "objektbeschreibung":
                header_row = r
                objekt_col = c
                break
        if objekt_col is not None:
            break
    if objekt_col is None:
        log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
        return
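    # The header search above scans only the first five rows of the sheet, so a
    # minimal working layout (column names invented) looks like:
    #   row 1: Inventarnummer | Objektbeschreibung | ...
    #   row 2: 1901/23        | Holzschale mit Deckel, gedrechselt
    # A sheet whose "Objektbeschreibung" header sits lower than row 5 is
    # reported as "nicht gefunden" and the macro aborts.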
    # Create / find the result columns
    existing = {}
    for c in range(0, data_range.EndColumn+1):
        try:
            h = str(sheet.getCellByPosition(c, header_row).String).strip()
        except Exception:
            h = ""
        if h == "Norm_Treffer":
            existing["Norm_Treffer"] = c
        if h == "Norm_Vorschlag":
            existing["Norm_Vorschlag"] = c
        if h == "Kein_Treffer":
            # also recognize an existing Kein_Treffer column so a rerun does
            # not append a duplicate
            existing["Kein_Treffer"] = c

    last_col = data_range.EndColumn
    if "Norm_Treffer" not in existing:
        last_col += 1
        existing["Norm_Treffer"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
    if "Norm_Vorschlag" not in existing:
        last_col += 1
        existing["Norm_Vorschlag"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"

    # New column "Kein_Treffer"
    if "Kein_Treffer" not in existing:
        last_col += 1
        existing["Kein_Treffer"] = last_col
        sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"

    norm_tr_col = existing["Norm_Treffer"]
    norm_sug_col = existing["Norm_Vorschlag"]
    kein_tr_col = existing["Kein_Treffer"]

    norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
    if not norm_dict and not lemma_index:
        log("NV_MASTER leer oder nicht lesbar. Abbruch.")
        return
    GREEN = 0xADFF2F
    YELLOW = 0xFFA500
    RED = 0xCC0000
    WHITE = 0xFFFFFF

    rows_processed = 0
    for r in range(header_row + 1, data_range.EndRow + 1):
        try:
            cell = sheet.getCellByPosition(objekt_col, r)
            txt = str(cell.String).strip()
            if not txt:
                continue

            clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
            terms = []
            for cl in clauses:
                parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
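            # Term extraction, illustrated on an invented cell value:
            #   "Holzschale mit Deckel, 19. Jh."
            # is split on commas, then on whitespace; stopwords ("mit") and
            # purely numeric tokens are dropped, which leaves
            #   ["Holzschale", "Deckel", "19.", "Jh."]
            # ("19." survives because of its trailing dot).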
            row_hits = []
            row_sugs = []
            row_ids = []  # collected but not written anywhere in this version
            unmapped_terms = []

            for term in terms:
                hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
                if hits:
                    row_hits.extend([f"{h} ({id_})" if id_ else h for h, id_ in zip(hits, ids + [""]*len(hits))])
                else:
                    unmapped_terms.append(term)
                if sugs:
                    row_sugs.extend([f"{s}" for s in sugs])
                if ids:
                    row_ids.extend(ids)

            def uniq(seq):
                seen = set()
                out = []
                for x in seq:
                    if x not in seen:
                        seen.add(x)
                        out.append(x)
                return out

            row_hits = uniq(row_hits)
            row_sugs = uniq(row_sugs)
            unmapped_terms = uniq(unmapped_terms)

            # Color logic for the Objektbeschreibung cell
            if terms and not unmapped_terms and row_hits:
                cell.CellBackColor = GREEN
                row_sugs = []  # no suggestions when everything matched
            elif row_hits:
                cell.CellBackColor = YELLOW
            else:
                cell.CellBackColor = RED
            # Norm_Treffer
            tr_cell = sheet.getCellByPosition(norm_tr_col, r)
            tr_cell.String = " | ".join(row_hits)
            tr_cell.CellBackColor = GREEN if row_hits else WHITE

            # Norm_Vorschlag
            sug_cell = sheet.getCellByPosition(norm_sug_col, r)
            sug_cell.String = " | ".join(row_sugs)
            sug_cell.CellBackColor = YELLOW if row_sugs else WHITE

            # Kein_Treffer
            kt_cell = sheet.getCellByPosition(kein_tr_col, r)
            kt_cell.String = " | ".join(unmapped_terms)
            kt_cell.CellBackColor = RED if unmapped_terms else WHITE

            rows_processed += 1

        except Exception as e:
            log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")

    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(CACHE, f, ensure_ascii=False, indent=2)
    except Exception:
        pass

    log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")


# Export for LibreOffice
g_exportedScripts = (run_mapper_macro,)
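# Running it (typical setup; menu labels may differ slightly between
# LibreOffice versions): place this file in the Scripts/python folder named in
# BASE_DIR, restart LibreOffice Calc, then Tools > Macros > Run Macro... >
# My Macros > mapper_macro_2.3 > run_mapper_macro. g_exportedScripts is what
# makes run_mapper_macro (and only it) appear in that dialog.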
@@ -1,121 +0,0 @@
import uno
import json
import subprocess
from pathlib import Path
from com.sun.star.awt import XActionListener

# Color values (RGB)
GREEN = 0xC6EFCE
RED = 0xFFC7CE
YELLOW = 0xFFEB9C
def get_objektbeschreibung_column(sheet):
    """Finds the 'Objektbeschreibung' column."""
    for row in range(sheet.Rows.Count):
        for col in range(sheet.Columns.Count):
            cell = sheet.getCellByPosition(col, row)
            if cell.String.strip().lower() == "objektbeschreibung":
                return col
    return None


def update_cell_color(cell, status):
    """Colors the cell according to the match status."""
    if status == "grün":
        cell.CellBackColor = GREEN
    elif status == "gelb":
        cell.CellBackColor = YELLOW
    else:
        cell.CellBackColor = RED
def call_mapper(term):
    """Calls the local wrapper script."""
    wrapper = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_Wrapper.py")
    if not wrapper.exists():
        return {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}

    result = subprocess.run(
        ["python3", str(wrapper), term],
        capture_output=True,
        text=True
    )
    try:
        output = json.loads(result.stdout)
    except Exception:
        output = {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
    return output
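# The wrapper is expected to print one JSON object per call on stdout; with
# invented values, a reply could look like:
#   {"term": "Schale", "norm_name": "Schale", "norm_id": "4123",
#    "suggestions": ["Schale (4123)", "Holzschale"]}
# "KEIN TREFFER" in norm_name is the sentinel for "no match".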
class SuggestionListener(XActionListener):
    """Listener for clicks on a suggestion button."""
    def __init__(self, cell, suggestion, dialog):
        self.cell = cell
        self.suggestion = suggestion
        self.dialog = dialog

    def actionPerformed(self, event):
        self.cell.String = self.suggestion
        update_cell_color(self.cell, "grün")
        self.dialog.endExecute()  # closes the dialog window

    def disposing(self, event):
        pass
def show_suggestion_dialog(cell, term, suggestions):
    """Shows a dialog window with clickable suggestions."""
    ctx = XSCRIPTCONTEXT.getComponentContext()
    smgr = ctx.getServiceManager()
    toolkit = smgr.createInstance("com.sun.star.awt.Toolkit")
    dialog_model = smgr.createInstance("com.sun.star.awt.UnoControlDialogModel")
    dialog_model.PositionX = 100
    dialog_model.PositionY = 100
    dialog_model.Width = 200
    dialog_model.Height = 30 + 25*len(suggestions)
    dialog_model.Title = f"Vorschläge für '{term}'"

    for i, sugg in enumerate(suggestions[:3]):
        btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
        btn_model.Name = f"btn_{i}"
        btn_model.Label = sugg
        btn_model.PositionX = 10
        btn_model.PositionY = 10 + i*25
        btn_model.Width = 180
        btn_model.Height = 20
        dialog_model.insertByName(btn_model.Name, btn_model)

    dialog = smgr.createInstance("com.sun.star.awt.UnoControlDialog")
    dialog.setModel(dialog_model)
    dialog.setVisible(True)

    for i, sugg in enumerate(suggestions[:3]):
        btn = dialog.getControl(f"btn_{i}")
        listener = SuggestionListener(cell, sugg, dialog)
        btn.addActionListener(listener)

    toolkit.createDialog(dialog).execute()
def mapper_process_column():
    """Processes every cell below 'Objektbeschreibung' in the active sheet."""
    doc = XSCRIPTCONTEXT.getDocument()
    sheet = doc.CurrentController.ActiveSheet
    col_index = get_objektbeschreibung_column(sheet)
    if col_index is None:
        return

    for row in range(sheet.Rows.Count):
        cell = sheet.getCellByPosition(col_index, row)
        if not cell.String.strip():
            continue  # skip empty cells
        term = cell.String.strip()
        result = call_mapper(term)

        if result["norm_name"] != "KEIN TREFFER":
            cell.String = result["norm_name"]
            update_cell_color(cell, "grün")
        elif result["suggestions"]:
            update_cell_color(cell, "gelb")
            show_suggestion_dialog(cell, term, result["suggestions"])
        else:
            update_cell_color(cell, "rot")
            show_suggestion_dialog(cell, term, [])


# Export
g_exportedScripts = mapper_process_column,