Jonas Arnold internship: added all files

gumuArnold 2025-10-16 15:39:43 +02:00
parent 41e8b7103e
commit 723ac7b6b1
45 changed files with 3316 additions and 2819476 deletions

File diff suppressed because it is too large.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1 +0,0 @@
,jarnold,workPC,10.10.2025 09:26,file:///home/jarnold/.config/libreoffice/4;

View File

@@ -0,0 +1,469 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - LibreOffice Calc
# Features: Kompositum-Split, Cache, Live-Vorschläge nur auf 'Objektbeschreibung', Logging
import os
import re
import json
import datetime
# optional imports (Pandas, Spacy, RapidFuzz)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
STOPWORDS = {
"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an",
"als","bei","für","aus","dem","den","des","eines","einer"
}
CONF_THRESHOLD = 0.75
# ------------------------
# Logging
# ------------------------
def log(msg, level="INFO"):
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
line = f"[{ts}] [{level}] {msg}\n"
try:
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception as e:
CACHE = {}
log(f"Fehler beim Laden des Caches: {e}", level="ERROR")
# ------------------------
# Textnormalisierung & Lemma
# ------------------------
lemma_cache = {}
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([t.lemma_ for t in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# Kompositum-Splitting
# ------------------------
def compound_split(term):
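# Note: splits only at uppercase letters (CamelCase-style input such as "HolzBalkenDecke");
# a compound written with a single capital or all lowercase stays whole. Falls back to
# hyphen/whitespace splitting.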
if not term:
return []
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
if parts:
return parts
parts = [p for p in re.split(r'[-\s]+', term) if p]
return parts or [term]
# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
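# An ID cell applies to all following rows until the next non-empty ID (parent-ID inheritance).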
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Fuzzy / Vorschläge
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
# ------------------------
# Mapping eines Terms (mit Cache)
# ------------------------
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
c = CACHE[term_lemma]
return c.get("hits", []), c.get("suggestions", []), c.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
if not hits:
tokens = compound_split(term)
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in lemma_index:
for e in lemma_index[t_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
else:
suggestions.extend(get_suggestions_for_term(t_lemma, norm_dict, lemma_index))
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = uniq(hits)
suggestions = uniq(suggestions)
ids = uniq(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# ------------------------
# Header + Spalten
# ------------------------
def find_header_and_cols(sheet):
try:
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
except Exception:
return None, None, None, {}
header_row = None
objekt_col = None
for r in range(0, min(5, dr.EndRow + 1)):
for c in range(0, dr.EndColumn + 1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if header_row is None:
return None, None, dr, {}
existing = {}
for c in range(0, dr.EndColumn + 1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Norm_ID":
existing["Norm_ID"] = c
return header_row, objekt_col, dr, existing
# ------------------------
# Optimierter Live-Handler (nur Objektbeschreibung)
# ------------------------
def on_objektbeschreibung_change(oEvent=None):
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
except Exception as e:
log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
return
cell = None
try:
if oEvent and hasattr(oEvent, "Range") and oEvent.Range is not None:
cell = oEvent.Range
elif oEvent and hasattr(oEvent, "Source") and oEvent.Source is not None:
cell = oEvent.Source
except Exception:
cell = None
if cell is None:
try:
sel = doc.CurrentSelection
if hasattr(sel, "getCellByPosition"):
cell = sel
else:
cell = sel.getCellByPosition(0, 0)
except Exception as e:
log(f"Keine Selektion: {e}", level="ERROR")
return
try:
row_index = cell.CellAddress.Row
col_index = cell.CellAddress.Column
except Exception:
return
try:
header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
if header_row is None or col_index != objekt_col:
return # nur die Objektbeschreibung-Spalte bearbeiten
last_col = dr.EndColumn
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
norm_sug_col = existing["Norm_Vorschlag"]
except Exception as e:
log(f"Fehler Spaltenbestimmung: {e}", level="ERROR")
return
try:
txt = str(cell.String).strip()
if not txt:
sheet.getCellByPosition(norm_sug_col, row_index).String = ""
return
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
suggestions_acc = []
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
continue
for sp in compound_split(p):
_, sugs, _ = map_term_with_indexes(sp, norm_dict, lemma_index)
suggestions_acc.extend(sugs)
seen = set()
ordered = []
for s in suggestions_acc:
if s not in seen:
seen.add(s)
ordered.append(s)
sheet.getCellByPosition(norm_sug_col, row_index).String = " | ".join(ordered)
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception as e:
log(f"Fehler im Live-Handler: {e}", level="ERROR")
# ------------------------
# Batch-Durchlauf
# ------------------------
def run_mapper_macro():
log("=== mapper_macro 1.5 gestartet ===", level="INFO")
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
except Exception as e:
log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
return
header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden.", level="ERROR")
return
if "Norm_Treffer" not in existing:
last_col = dr.EndColumn + 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col = dr.EndColumn + 2
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Norm_ID" not in existing:
last_col = dr.EndColumn + 3
existing["Norm_ID"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
GREEN, YELLOW, RED = 0xADFF2F, 0xFFA500, 0xCC0000
for r in range(header_row + 1, dr.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
for p in [p.strip() for p in re.split(r"\s+", cl) if p.strip()]:
if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
continue
terms.extend([sp.strip() for sp in compound_split(p) if sp.strip()])
row_hits, row_sugs, row_ids = [], [], []
any_unmapped = False
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
row_hits.extend(hits)
row_sugs.extend(sugs)
row_ids.extend(ids)
if not hits and not sugs:
any_unmapped = True
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits, row_sugs, row_ids = map(uniq, [row_hits, row_sugs, row_ids])
sheet.getCellByPosition(existing["Norm_Treffer"], r).String = " | ".join(row_hits)
sheet.getCellByPosition(existing["Norm_Vorschlag"], r).String = " | ".join(row_sugs)
sheet.getCellByPosition(existing["Norm_ID"], r).String = " | ".join(row_ids)
cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
sheet.getCellByPosition(existing["Norm_Treffer"], r).CellBackColor = GREEN if row_hits and not any_unmapped else 0xFFFFFF
sheet.getCellByPosition(existing["Norm_Vorschlag"], r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF
except Exception as e:
log(f"Fehler in Zeile {r}: {e}", level="ERROR")
continue
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log("=== mapper_macro 1.5 fertig ===", level="INFO")
# ------------------------
# Export
# ------------------------
g_exportedScripts = (
run_mapper_macro,
on_objektbeschreibung_change
)
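
A minimal standalone sketch (outside LibreOffice) of how compound_split plus the fuzzy scoring above behaves, assuming rapidfuzz is installed; the two-entry vocabulary and the input term are invented for illustration:

# Sketch only: mirrors compound_split() and fuzzy_score() from the macro above.
import re
from rapidfuzz import fuzz  # assumption: rapidfuzz is available

def compound_split(term):
    # split at capitalized components, else at hyphens/whitespace
    parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
    return parts or [p for p in re.split(r'[-\s]+', term) if p] or [term]

vocab = {"holzbalken": "NV-001", "decke": "NV-002"}  # invented mini vocabulary

for token in compound_split("HolzBalkenDecke"):  # invented example term
    best = max(vocab, key=lambda k: fuzz.token_set_ratio(token.lower(), k))
    score = fuzz.token_set_ratio(token.lower(), best) / 100.0
    print(token, "->", f"{best} ({vocab[best]})" if score >= 0.75 else "kein Treffer", round(score, 2))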

View File

@@ -0,0 +1,508 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - korrigiert: Logging im Dokumentverzeichnis, stabile Button-Erstellung,
# keine Listener, optimiertes Mapping
import os
import re
import json
import datetime
# optionale Module (Pandas, Spacy, RapidFuzz)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# UNO (für Button/Paths)
try:
import uno
except Exception:
uno = None
# ------------------------
# Konfiguration (Fallback-BASE_DIR)
# ------------------------
BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro")
NV_MASTER_FILENAME = "NV_MASTER.ods"
CACHE_FILENAME = "mapper_cache.json"
LOG_FILENAME = "mapper_macro.log"
STOPWORDS = {
"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an",
"als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"
}
CONF_THRESHOLD = 0.82
FUZZY_CUTOFF = 0.88
# Per-document paths (initialized by set_paths_from_doc)
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)
# in-memory cache
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Pfade im Dokument setzen
# ------------------------
def set_paths_from_doc(doc):
global DOC_DIR, NV_MASTER_PATH, CACHE_FILE, LOG_FILE
try:
url = getattr(doc, "URL", "")
if url and url.strip():
# UNO liefert file:///...
try:
system_path = uno.fileUrlToSystemPath(url)
except Exception:
# fallback: try simple unquote
from urllib.parse import unquote, urlparse
parsed = urlparse(url)
if parsed.scheme == "file":
system_path = unquote(parsed.path)
else:
system_path = ""
if system_path:
d = os.path.dirname(system_path)
if os.path.isdir(d):
DOC_DIR = d
except Exception:
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)
# ------------------------
# Logging (Dokumentdir, robust)
# ------------------------
def log(msg, level="INFO"):
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
line = f"[{ts}] [{level}] {msg}\n"
try:
# ensure directory exists
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
# absolute fallback: try writing into BASE_DIR
try:
fallback = os.path.join(BASE_DIR, LOG_FILENAME)
os.makedirs(os.path.dirname(fallback), exist_ok=True)
with open(fallback, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
# last resort: silent
pass
# ------------------------
# Textvorbereitung & Helpers
# ------------------------
lemma_cache = {}
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([t.lemma_ for t in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
if not term:
return []
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
if parts:
return parts
parts = [p for p in re.split(r'[-\s]+', term) if p]
return parts or [term]
# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
# find id/word columns with fallback
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}", level="INFO")
return norm_dict, lemma_index
# ------------------------
# Fuzzy Matching
# ------------------------
def fuzzy_score(a, b):
a = (a or "").lower()
b = (b or "").lower()
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_sort_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a, b).ratio()
def get_suggestions(term_lemma, norm_dict, lemma_index, threshold=FUZZY_CUTOFF, max_sugs=6):
candidates = []
term_norm = term_lemma or ""
for key_lemma, entries in lemma_index.items():
if not key_lemma:
continue
score = fuzzy_score(term_norm, key_lemma)
if key_lemma.startswith(term_norm):
score = min(score + 0.08, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# also check normalized names
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_norm, norm_key)
if norm_key.startswith(term_norm):
score = min(score + 0.08, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# sort & dedupe
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
out = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
if id_:
out.append(f"{name} ({id_})")
else:
out.append(name)
if len(out) >= max_sugs:
break
return out
# ------------------------
# Mapping mit Cache
# ------------------------
def map_term(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
return CACHE[term_lemma]
hits = []
suggestions = []
ids = []
# exact
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# lemma
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# suggestions only if no hit
if not hits:
suggestions = get_suggestions(term_lemma, norm_dict, lemma_index)
# remove suggestions that are equal/contain hits
suggestions = [s for s in suggestions if not any(h.lower() in s.lower() for h in hits)]
result = {"hits": hits, "suggestions": suggestions, "ids": ids}
CACHE[term_lemma] = result
return result
# ------------------------
# Button erstellen (sicher)
# ------------------------
def add_macro_button(sheet):
try:
doc = XSCRIPTCONTEXT.getDocument()
except Exception:
log("add_macro_button: kein Dokument-Kontext", level="WARN")
return
try:
draw_page = sheet.DrawPage
# avoid duplicate
for shape in draw_page:
try:
if getattr(shape, "Name", "") == "MapperStartButton":
return
except Exception:
continue
# create shape and button model
shape = doc.createInstance("com.sun.star.drawing.ControlShape")
shape.Name = "MapperStartButton"
shape.Position = uno.createUnoStruct("com.sun.star.awt.Point")
shape.Position.X = 1000
shape.Position.Y = 200
shape.Size = uno.createUnoStruct("com.sun.star.awt.Size")
shape.Size.Width = 3000
shape.Size.Height = 1000
button_model = doc.createInstance("com.sun.star.form.component.CommandButton")
button_model.Label = "Start Mapping"
button_model.HelpText = "Startet das Mapping (run_mapper_macro)"
# Assigning the macro via ActionCommand alone is not enough; the user must still link it in the UI. We only add the control and its label here.
shape.Control = button_model
draw_page.add(shape)
log("Button 'MapperStartButton' erstellt.", level="INFO")
except Exception as e:
log(f"add_macro_button Fehler: {e}", level="ERROR")
# ------------------------
# Hauptlauf (ohne Listener)
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
set_paths_from_doc(doc)
log("=== mapper_macro gestartet ===", level="INFO")
sheet = doc.CurrentController.ActiveSheet
add_macro_button(sheet)
# used area
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
# find header and objekt col
header_row = None
objekt_col = None
for r in range(0, min(10, dr.EndRow + 1)):
for c in range(0, dr.EndColumn + 1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "Objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("run_mapper_macro: 'Objektbeschreibung' Header nicht gefunden.", level="ERROR")
return
# ensure result cols
existing = {}
last_col = dr.EndColumn
for c in range(0, dr.EndColumn + 1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Norm_ID":
existing["Norm_ID"] = c
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Norm_ID" not in existing:
last_col += 1
existing["Norm_ID"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
norm_id_col = existing["Norm_ID"]
# build index
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("run_mapper_macro: NV_MASTER leer oder nicht lesbar.", level="ERROR")
return
GREEN, YELLOW, RED = 0xADFF2F, 0xFFFF66, 0xFF9999
rows_processed = 0
for r in range(header_row + 1, dr.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
# phrase-first: try entire cleaned phrase (remove stopwords)
tokens = [t.strip() for t in re.split(r'\s+', normalize_text(txt)) if t and t not in STOPWORDS]
phrase = " ".join(tokens).strip()
terms = []
if phrase:
# first try phrase as whole
mapped_phrase = map_term(phrase, norm_dict, lemma_index)
if mapped_phrase["hits"] or mapped_phrase["suggestions"]:
# use phrase result (flatten hits+suggestions for output)
row_hits = mapped_phrase["hits"]
row_sugs = mapped_phrase["suggestions"]
row_ids = mapped_phrase["ids"]
any_unmapped = False if (row_hits or row_sugs) else True
else:
# fallback to token/compound processing
for p in [p for p in re.split(r'[,\s]+', txt) if p.strip()]:
if p.lower() in STOPWORDS or re.fullmatch(r'\d+', p):
continue
for sp in compound_split(p):
if sp and sp.strip():
terms.append(sp.strip())
row_hits = []
row_sugs = []
row_ids = []
any_unmapped = False
for term in terms:
mapped = map_term(term, norm_dict, lemma_index)
hits, sugs, ids = mapped["hits"], mapped["suggestions"], mapped["ids"]
if hits:
row_hits.extend(hits)
if sugs:
row_sugs.extend(sugs)
if ids:
row_ids.extend(ids)
if not hits and not sugs:
any_unmapped = True
else:
row_hits, row_sugs, row_ids = [], [], []
any_unmapped = True
# dedupe preserving order
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
row_ids = uniq(row_ids)
# write
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
sheet.getCellByPosition(norm_tr_col, r).CellBackColor = GREEN if row_hits else 0xFFFFFF
sheet.getCellByPosition(norm_sug_col, r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}", level="ERROR")
continue
# persist cache file to DOC_DIR
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception as e:
log(f"Cache speichern fehlgeschlagen: {e}", level="WARN")
log(f"=== mapper_macro fertig. Zeilen verarbeitet: {rows_processed} ===", level="INFO")
except Exception as e:
# top-level safety
try:
log(f"run_mapper_macro: Unhandled exception: {e}", level="ERROR")
except Exception:
pass
# ------------------------
# Export
# ------------------------
g_exportedScripts = (run_mapper_macro,)
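
The main change in this variant is that NV_MASTER.ods, cache and log are resolved relative to the open document. A minimal sketch of the same file-URL-to-directory fallback outside LibreOffice (where the uno module may be missing); the example URL is invented:

# Sketch of the urllib fallback used in set_paths_from_doc() when uno.fileUrlToSystemPath is unavailable.
import os
from urllib.parse import unquote, urlparse

def doc_dir_from_url(url, fallback="/tmp"):
    parsed = urlparse(url)
    if parsed.scheme != "file":
        return fallback
    d = os.path.dirname(unquote(parsed.path))
    return d if os.path.isdir(d) else fallback

print(doc_dir_from_url("file:///home/jarnold/Dokumente/Objektliste.ods"))  # invented example URL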

View File

@@ -0,0 +1,343 @@
# -*- coding: utf-8 -*-
"""
LibreOffice Calc Makro: NV_MASTER-Abgleich (verbessertes semantisches Matching)
Speicherort: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
"""
import os
import re
import json
import traceback
# ------------------------------------------------------------
# LIBRARIES & MODELS
# ------------------------------------------------------------
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
# Verwende das mittlere Modell für semantische Ähnlichkeit
nlp = spacy.load("de_core_news_md")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------------------------------------------
# KONFIGURATION
# ------------------------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.70 # etwas großzügiger für semantisches Matching
# ------------------------------------------------------------
# LOGGING
# ------------------------------------------------------------
def log(msg):
"""Schreibt technische Logs ins Makroverzeichnis."""
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg.strip() + "\n")
except Exception:
pass
log("Makro gestartet")
# ------------------------------------------------------------
# CACHE
# ------------------------------------------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------------------------------------------
# TEXTNORMALISIERUNG & LEMMATISIERUNG
# ------------------------------------------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
t = normalize_text(term)
if t in lemma_cache:
return lemma_cache[t]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(t)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = t
else:
lemma = t
lemma_cache[t] = lemma
return lemma
# ------------------------------------------------------------
# NV_MASTER LADEN
# ------------------------------------------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar NV_MASTER kann nicht geladen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Laden von NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = next((df.columns[i] for i, c in enumerate(cols) if "id" in c), df.columns[0])
word_col = next((df.columns[i] for i, c in enumerate(cols) if "wort" in c or "vokabel" in c), df.columns[-1])
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip()
word_val = str(row[word_col]).strip()
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen: {sum(len(v) for v in norm_dict.values())} Begriffe.")
return norm_dict, lemma_index
# ------------------------------------------------------------
# SCORING: FUZZY + SEMANTISCH
# ------------------------------------------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def semantic_similarity(a, b):
if not SPACY_AVAILABLE or not hasattr(nlp.vocab, "vectors"):
return 0.0
try:
doc_a, doc_b = nlp(a), nlp(b)
if doc_a.vector_norm and doc_b.vector_norm:
return float(doc_a.similarity(doc_b))
return 0.0
except Exception:
return 0.0
def combined_score(a, b):
sf = fuzzy_score(a, b)
ss = semantic_similarity(a, b)
return max(sf, ss)
# ------------------------------------------------------------
# MATCHING & VORSCHLÄGE
# ------------------------------------------------------------
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = combined_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.05, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = combined_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.05, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda x: x[0], reverse=True)
seen, results = set(), []
for score, name, id_ in candidates:
key = (name.lower(), id_.lower() if id_ else "")
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
if len(results) >= top_n:
break
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
hits, suggestions, ids = [], [], []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD)
filtered_suggs = []
for s in suggs:
s_clean = normalize_text(s.split(" (")[0])
if s_clean not in [normalize_text(h) for h in hits]:
filtered_suggs.append(s)
suggestions = filtered_suggs
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits, suggestions, ids = uniq(hits), uniq(suggestions), uniq(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
log(f"TERM: {term} | HITS: {hits} | SUGGS: {suggestions}")
return hits, suggestions, ids
# ------------------------------------------------------------
# HAUPTMAKRO
# ------------------------------------------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
except Exception as e:
log(f"Fehler beim Zugriff auf Dokument: {e}")
return
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict:
log("Fehler: NV_MASTER leer oder nicht gefunden.")
return
try:
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
used = cursor.getRangeAddress()
except Exception as e:
log(f"Cursor-Fehler: {e}")
return
header_row = 0
objekt_col = None
for c in range(0, used.EndColumn + 1):
val = str(sheet.getCellByPosition(c, header_row).String).strip().lower()
if val == "objektbeschreibung":
objekt_col = c
break
if objekt_col is None:
log("Keine Spalte 'Objektbeschreibung' gefunden.")
return
existing = {}
for c in range(0, used.EndColumn + 1):
h = str(sheet.getCellByPosition(c, header_row).String).strip()
if h == "Norm_Treffer": existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag": existing["Norm_Vorschlag"] = c
if h == "Norm_ID": existing["Norm_ID"] = c
last_col = used.EndColumn
for name in ["Norm_Treffer", "Norm_Vorschlag", "Norm_ID"]:
if name not in existing:
last_col += 1
existing[name] = last_col
sheet.getCellByPosition(last_col, header_row).String = name
GREEN, YELLOW, RED = 0xADFF2F, 0xFFD700, 0xCC0000
norm_tr_col, norm_sug_col, norm_id_col = existing["Norm_Treffer"], existing["Norm_Vorschlag"], existing["Norm_ID"]
rows = 0
for r in range(header_row + 1, used.EndRow + 1):
txt = str(sheet.getCellByPosition(objekt_col, r).String).strip()
if not txt:
continue
terms = [t.strip() for t in re.split(r",|\s+", txt) if t.strip() and t.lower() not in STOPWORDS]
row_hits, row_sugs, row_ids, any_unmapped = [], [], [], False
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits: row_hits.extend(hits)
if sugs: row_sugs.extend(sugs)
if ids: row_ids.extend(ids)
if not hits and not sugs: any_unmapped = True
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits, row_sugs, row_ids = uniq(row_hits), uniq(row_sugs), uniq(row_ids)
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
obj_cell = sheet.getCellByPosition(objekt_col, r)
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
if any_unmapped:
obj_cell.CellBackColor = RED
elif row_hits:
tr_cell.CellBackColor = GREEN
if row_sugs:
sug_cell.CellBackColor = YELLOW
rows += 1
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log(f"Makro abgeschlossen, {rows} Zeilen verarbeitet.")
g_exportedScripts = (run_mapper_macro,)
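
This version switches to de_core_news_md and combines fuzzy and vector similarity via combined_score (the maximum of both). A small sketch of that scoring, assuming the md model and rapidfuzz are installed; the word pair is an arbitrary example:

# Sketch of combined_score(): max of RapidFuzz token_set_ratio and spaCy vector similarity.
import spacy
from rapidfuzz import fuzz

nlp = spacy.load("de_core_news_md")  # assumption: model is installed

def combined_score(a, b):
    sf = fuzz.token_set_ratio(a, b) / 100.0
    da, db = nlp(a), nlp(b)
    ss = float(da.similarity(db)) if da.vector_norm and db.vector_norm else 0.0
    return max(sf, ss)

print(round(combined_score("fenster", "fensterrahmen"), 2))  # arbitrary example pair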

View File

@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# LibreOffice Calc macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
-# Pfade: BASE_DIR muss auf das Verzeichnis zeigen, in dem NV_MASTER.ods + Makro liegen.
-# Speichern: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
+# Speicherort: /home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.1.py
import os
import re
@@ -9,7 +8,6 @@ import json
import traceback
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
-# Third-party libs: pandas, odfpy, optional: spacy, rapidfuzz
try:
import pandas as pd
PANDAS_AVAILABLE = True
@@ -34,10 +32,10 @@
# ------------------------
# Konfiguration
# ------------------------
-BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
+BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
-LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
-CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
+LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.1.log")
+CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.1.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
@@ -110,10 +108,8 @@ def build_norm_index(nv_path):
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
-# normalize columns names to find ID and Wort columns
-df = df.fillna("") # leere Zellen als ""
+df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
-# try to find columns
id_col = None
word_col = None
for i, c in enumerate(cols):
@@ -121,7 +117,6 @@
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
-# fallback: if not found, try first/last
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
@@ -131,18 +126,14 @@
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
-# if row defines an ID, set as current parent
if id_val:
current_parent_id = id_val
-# skip empty word cells
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
-# add to norm_dict by normalized name (exact matching)
norm_dict.setdefault(norm_name, []).append(entry)
-# add to lemma_index
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
@@ -163,10 +154,8 @@ def fuzzy_score(a, b):
except Exception:
return 0.0
-def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
+def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
-# collect candidates from lemma_index keys and norm_dict keys
candidates = []
-# iterate over lemma_index keys for candidate names
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
@@ -174,7 +163,6 @@ def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
-# also check norm_dict keys (exact-normalized names) as additional candidates
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
@@ -182,9 +170,7 @@
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
-# sort by score descending
candidates.sort(key=lambda t: t[0], reverse=True)
-# unique by (Name, ID) preserve score order
seen = set()
results = []
for score, name, id_ in candidates:
@@ -193,40 +179,28 @@
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
-# return all candidates (no limit) as "Name (ID)"
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
-# cache lookup
if term_lemma in CACHE:
-return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
+return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"]
hits = []
suggestions = []
-ids = []
-# 1) exact normalized name match
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
-hits.append(e["Name"])
-if e["ID"]:
-ids.append(e["ID"])
-# 2) lemma match (if not already hits)
+hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
-hits.append(e["Name"])
-if e["ID"]:
-ids.append(e["ID"])
-# 3) suggestions via fuzzy (always compute even if hits exist, but suggestions empty if exact)
-suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
-# If there are exact hits, we still may present suggestions (user wanted unlimited), but suggestions are secondary
-suggestions = suggs
-# deduplicate lists preserving order
+hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
+if not hits:
+suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
def unique_preserve(seq):
seen = set()
out = []
@@ -238,18 +212,15 @@ def map_term_with_indexes(term, norm_dict, lemma_index):
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
-ids = unique_preserve(ids)
-# cache result
-CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
-return hits, suggestions, ids
+CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions}
+return hits, suggestions
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
try:
-# UNO doc/sheet
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
@@ -260,7 +231,6 @@ def run_mapper_macro():
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
-# find header row and Objektbeschreibung column (search first 5 rows)
header_row = None
objekt_col = None
max_col = data_range.EndColumn
@@ -281,7 +251,7 @@
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
-# determine or create result columns: search if exist anywhere; otherwise append at right end
+# Prüfen/Anlegen der Ergebnis-Spalten
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
@@ -292,59 +262,38 @@
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
-if h == "Norm_ID":
-existing["Norm_ID"] = c
-# append columns at right end if missing
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
-try:
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
-except Exception:
-pass
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
-try:
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
-except Exception:
-pass
-if "Norm_ID" not in existing:
-last_col += 1
-existing["Norm_ID"] = last_col
-try:
-sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
-except Exception:
-pass
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
-norm_id_col = existing["Norm_ID"]
-# Build norm indexes
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
-# colors
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
+WHITE = 0xFFFFFF
-# iterate rows
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
-# clear any previous outputs? keep existing per spec; skip empty
continue
-# tokenize: split by commas first, then whitespace; filter stopwords and pure numbers
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
@@ -356,24 +305,19 @@
continue
terms.append(p)
-# for each term, get hits/suggestions/ids
row_hits = []
row_sugs = []
-row_ids = []
-any_unmapped = False # at least one term without hit and without suggestion
-# We will record for each term
+unmapped_terms = []
for term in terms:
-hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
+hits, sugs = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
row_hits.extend(hits)
+else:
+unmapped_terms.append(term)
if sugs:
row_sugs.extend(sugs)
-if ids:
-row_ids.extend(ids)
-if (not hits) and (not sugs):
-any_unmapped = True
-# deduplicate preserving order
def uniq(seq):
seen = set()
out = []
@@ -385,57 +329,30 @@
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
-row_ids = uniq(row_ids)
-# write outputs (unlimited lists, joined with " | ")
-try:
-sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
-sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
-sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
-except Exception:
-pass
-# Coloring rules per new spec:
-# - Objektbeschreibung cell: RED if any_unmapped else no change (we do not color green/yellow here)
-# - Norm_Treffer cell: GREEN if all terms matched (i.e., terms non-empty and no term unmapped and at least one hit per term)
-# - Norm_Vorschlag cell: YELLOW if at least one suggestion exists
-# Determine "all matched": terms non-empty and every term has at least one hit (we approximated by checking any_unmapped and hits length)
-all_matched = False
-if terms:
-# all_matched if no term without hit and there is at least one hit overall
-if (not any_unmapped) and row_hits:
-all_matched = True
-# apply colors
-try:
-if any_unmapped:
+# Farb-Logik für Objektbeschreibung
+if terms and not unmapped_terms and row_hits:
+cell.CellBackColor = GREEN
+row_sugs = []
+elif row_hits:
+cell.CellBackColor = YELLOW
+else:
cell.CellBackColor = RED
-else:
-# clear red if previously set? We'll leave unchanged if not set. Optionally set to default 16777215 (white)
-pass
-# Norm_Treffer coloring
+# Ergebniszellen
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
-if all_matched:
-tr_cell.CellBackColor = GREEN
-else:
-# clear color if needed -> set to white
-tr_cell.CellBackColor = 0xFFFFFF
-# Norm_Vorschlag coloring
+tr_cell.String = " | ".join(row_hits)
+tr_cell.CellBackColor = GREEN if row_hits else WHITE
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
-if row_sugs:
-sug_cell.CellBackColor = YELLOW
-else:
-sug_cell.CellBackColor = 0xFFFFFF
-except Exception:
-pass
+sug_cell.String = " | ".join(row_sugs)
+sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
rows_processed += 1
except Exception as e:
-# continue processing other rows; log once
-log(f"Fehler in Zeile {r}: {e}")
+log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
-# persist cache
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
@@ -444,5 +361,5 @@ def run_mapper_macro():
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
-# Export for LO
+# Export für LibreOffice
g_exportedScripts = (run_mapper_macro,)
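
The diff above folds the ID into the hit string ("Name (ID)"), drops the separate Norm_ID column, and reduces the row colouring to one three-way rule. A compact sketch of that rule, using the colour values from the diff; the sample row data is invented:

# Colour rule introduced in the 2.1 diff: green = every term matched, yellow = partial, red = nothing matched.
GREEN, YELLOW, RED = 0xADFF2F, 0xFFA500, 0xCC0000

def row_color(terms, unmapped_terms, row_hits):
    if terms and not unmapped_terms and row_hits:
        return GREEN
    if row_hits:
        return YELLOW
    return RED

print(hex(row_color(["Fenster", "Rahmen"], ["Rahmen"], ["Fenster (NV-001)"])))  # invented sample -> yellow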

View File

@@ -0,0 +1,455 @@
# -*- coding: utf-8 -*-
"""
LibreOffice/Excel Macro: NV_MASTER-Abgleich
Version: 2.3
Pfad: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py
Beschreibung:
-------------
Dieses Python-Makro für LibreOffice/Excel führt einen Abgleich von Begriffen
aus einem aktiven Sheet gegen ein zentral gepflegtes NV_MASTER-Vokabular durch.
Es erstellt Treffer, Vorschläge und markiert die Zellen farblich.
Hauptfunktionen:
----------------
1. Text-Normalisierung und Lemma-Bestimmung
2. Laden des NV_MASTER-Vokabulars und Aufbau von Norm-Index + Lemma-Index
3. Fuzzy Matching (RapidFuzz oder difflib) für Begriffe
4. Treffer- und Vorschlagsbestimmung
5. Mapping auf Sheet:
- Norm_Treffer (grün)
- Norm_Vorschlag (gelb)
- Kein_Treffer (rot)
6. Caching zur Vermeidung mehrfacher Berechnungen
7. Logging in externe Datei
Externe Abhängigkeiten:
-----------------------
- pandas (für ODS/Excel-Leseoperationen)
- spacy (für deutsche Lemma-Bestimmung)
- rapidfuzz (optional für schnellere Fuzzy-String-Matches)
UNO-spezifische Objekte:
------------------------
- XSCRIPTCONTEXT: Bereitgestellt durch LibreOffice zur Laufzeit
Schwachstellen / Optimierungsansätze:
-------------------------------------
- Fehlerbehandlung ist robust, aber teilweise sehr still (z.B. Cache-Fehler, Pandas-Fehler).
- Schleifen über Zellen sind bei großen Sheets langsam (potenziell durch pandas vollständig ersetzen).
- Lemma-Berechnung könnte nur einmal für NV_MASTER und einmal für Sheet durchgeführt werden.
- RapidFuzz optional; fallback auf SequenceMatcher ist deutlich langsamer.
- Cache wird nur am Ende geschrieben; Absturz vor Ende verliert bisherige Ergebnisse.
- Farbwerte sind fest codiert; parametrisieren könnte Flexibilität erhöhen.
- Stopwords sind hart codiert; konfigurierbar wäre effizienter.
- Es werden keine parallelen Abfragen / Batch-Operationen verwendet.
- Logging nur in Datei; LibreOffice-eigene Meldungen oder Fortschrittsanzeige fehlen.
"""
import os
import re
import json
import traceback
# UNO-Context wird zur Laufzeit von LibreOffice bereitgestellt
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
# ------------------------
# Logging-Funktion
# ------------------------
def log(msg):
"""Schreibt Nachricht in LOG_FILE. Fehler werden ignoriert."""
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
"""Entfernt Sonderzeichen, multiple Whitespaces, wandelt in lowercase."""
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
"""Lemmatisiert einen Begriff mit SpaCy. Falls nicht verfügbar, Rückgabe Normalized String."""
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
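# Beispiel (Skizze): erwartetes Verhalten von Normalisierung und Lemmatisierung.
# Das konkrete Lemma hängt vom geladenen spaCy-Modell ab (hier de_core_news_sm);
# ohne spaCy wird nur der normalisierte String zurückgegeben.
#
#   normalize_text("  Häuser, (alt)! ")   -> "häuser alt"
#   lemmatize_term("Häuser")              -> z.B. "haus" (mit spaCy), sonst "häuser"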
# ------------------------
# NV_MASTER laden
# ------------------------
def build_norm_index(nv_path):
"""
Liest NV_MASTER ein und erstellt:
- norm_dict: Normalisierte Begriffe -> Einträge mit Name, ID, Sheet
- lemma_index: Lemma -> Einträge
"""
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Matching-Funktionen
# ------------------------
def fuzzy_score(a, b):
"""Berechnet Fuzzy-Score zwischen zwei Strings. RapidFuzz oder fallback SequenceMatcher."""
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
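# Beispiel (Skizze): fuzzy_score liefert Werte zwischen 0 und 1.
# Mit RapidFuzz (token_set_ratio) spielt die Wortreihenfolge keine Rolle,
# beim difflib-Fallback fällt der Wert für vertauschte Wörter niedriger aus.
#
#   fuzzy_score("holzkiste", "holzkiste")     -> 1.0
#   fuzzy_score("kiste holz", "holz kiste")   -> 1.0 (RapidFuzz), < 1.0 (difflib)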
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
"""
Liefert Vorschläge für ein Lemma, wenn kein exakter Treffer existiert.
Score-basierte Sortierung, Duplikate werden entfernt.
"""
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
"""
Mappt einen Term auf NV_MASTER:
- Treffer
- Vorschläge
- IDs
Nutzt Cache, um Wiederholungen zu vermeiden.
"""
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
cached = CACHE[term_lemma]
return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits:
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)
# Duplikate entfernen
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
ids = unique_preserve(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
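# Beispiel (Skizze): typische Verwendung im Makro. Die Rückgabe besteht aus drei Listen;
# eine leere Treffer-Liste bedeutet "nur Vorschläge oder gar nichts gefunden".
#
#   hits, sugs, ids = map_term_with_indexes("Holzkiste", norm_dict, lemma_index)
#   if hits:
#       ...  # exakter oder Lemma-Treffer (grün)
#   elif sugs:
#       ...  # Fuzzy-Vorschläge oberhalb CONF_THRESHOLD (gelb)
#   else:
#       ...  # kein Treffer (rot)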
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
"""
Haupt-Makro für LibreOffice:
1. Bestimmt Header + Spalten
2. Fügt Spalten für Norm_Treffer, Norm_Vorschlag, Kein_Treffer hinzu
3. Liest NV_MASTER und baut Indizes
4. Iteriert über Zeilen und Terms
5. Markiert Zellen farblich (grün/gelb/rot)
6. Schreibt Cache am Ende
"""
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
# Header finden
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
# Spalten anlegen, falls nicht vorhanden
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Kein_Treffer" not in existing:
last_col += 1
existing["Kein_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
kein_tr_col = existing["Kein_Treffer"]
# NV_MASTER laden
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
# Farben
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
WHITE = 0xFFFFFF
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
# Term-Extraktion
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_hits = []
row_sugs = []
row_ids = []
unmapped_terms = []
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
# IDs werden positionsweise zugeordnet; fehlen IDs, wird mit "" aufgefüllt
padded_ids = ids + [""] * max(0, len(hits) - len(ids))
row_hits.extend([f"{h} ({id_})" if id_ else h for h, id_ in zip(hits, padded_ids)])
else:
unmapped_terms.append(term)
if sugs:
row_sugs.extend(sugs)
if ids:
row_ids.extend(ids)
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
unmapped_terms = uniq(unmapped_terms)
# Farb-Logik
if terms and not unmapped_terms and row_hits:
cell.CellBackColor = GREEN
row_sugs = [] # keine Vorschläge wenn alles Treffer
elif row_hits:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Ergebnisse schreiben
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
tr_cell.String = " | ".join(row_hits)
tr_cell.CellBackColor = GREEN if row_hits else WHITE
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
sug_cell.String = " | ".join(row_sugs)
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
kt_cell = sheet.getCellByPosition(kein_tr_col, r)
kt_cell.String = " | ".join(unmapped_terms)
kt_cell.CellBackColor = RED if unmapped_terms else WHITE
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
# Cache speichern
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export für LibreOffice
g_exportedScripts = (run_mapper_macro,)
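# Hinweis: Nur die in g_exportedScripts aufgeführten Funktionen erscheinen in LibreOffice
# unter Extras -> Makros -> Makros verwalten -> Python. Das Skript muss dazu im
# Benutzerprofil unter Scripts/python/ liegen (siehe Pfad im Modul-Docstring).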

Binary file not shown.

Binary file not shown.

View File

@ -1,171 +0,0 @@
import os
import re
import logging
import datetime
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf
# ----------------- KONFIGURATION -----------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
MASTER_SHEET_NAME = "Masterstruktur"
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# ----------------- HILFSFUNKTIONEN -----------------
def load_file(input_file):
"""
Prüft Dateiformat und gibt für Excel: pd.ExcelFile + Engine zurück,
für ODS: None + "odf" (da ODS direkt über ezodf gelesen wird).
"""
ext = os.path.splitext(input_file)[1].lower()
if ext in [".xlsx", ".xls"]:
engine = "openpyxl"
xls = pd.ExcelFile(input_file, engine=engine)
elif ext == ".ods":
engine = "odf"
xls = None # ODS wird direkt über ezodf gelesen
else:
raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")
logging.info(f"Lade Datei {input_file} mit Engine '{engine}'")
return xls, engine
def read_ods_sheet(filename, sheet_name):
"""Liest ein ODS-Sheet sauber ein, inklusive Header."""
doc = ezodf.opendoc(filename)
sheet = doc.sheets[sheet_name]
data = []
# Header-Zeile lesen; leere Zellen ergeben "" statt "None"
headers = [str(sheet[0, col].value).strip() if sheet[0, col].value is not None else "" for col in range(sheet.ncols())]
for row_idx in range(1, sheet.nrows()):
row = {}
empty_row = True
for col_idx, col_name in enumerate(headers):
cell_val = sheet[row_idx, col_idx].value
val = "" if cell_val is None else str(cell_val).strip()
row[col_name] = val
if val:
empty_row = False
if not empty_row:
data.append(row)
df = pd.DataFrame(data, columns=headers)
return df
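# Beispiel (Skizze): Einlesen eines Kategorien-Sheets aus der Master-Datei.
# Annahme: ein Sheet namens "Möbel" existiert in NV_MASTER.ods.
#
#   df = read_ods_sheet(INPUT_FILE, "Möbel")
#   print(df.columns.tolist())  # z.B. ['ID', 'Unterkategorie', 'Unterunterkategorie', 'Wort/Vokabel']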
def process_category_sheet(df):
"""Erstellt die treppenartige Hierarchie."""
df = df.copy()
for col in ["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"]:
if col not in df.columns:
df[col] = ""
rows = []
current_id = ""
current_uuk = ""
for _, r in df.iterrows():
id_val = str(r.get("ID","")).strip()
uuk_val = str(r.get("Unterunterkategorie","")).strip()
word_val = str(r.get("Wort/Vokabel","")).strip()
if id_val: # Kategoriezeile
current_id = id_val
current_uuk = uuk_val or word_val
rows.append({"ID": current_id, "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
continue
if uuk_val: # Unterunterkategorie
current_uuk = uuk_val
rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
continue
if word_val: # Vokabel
rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": word_val})
continue
return pd.DataFrame(rows, columns=["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"])
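# Beispiel (Skizze): aus einer flachen Tabelle entsteht die "treppenartige" Struktur.
# Vereinfachte Eingabezeilen und ihre Zuordnung:
#   ID="3", Wort/Vokabel="Behälter"   -> Kategoriezeile (ID + Unterunterkategorie gefüllt)
#   Unterunterkategorie="Kisten"      -> Ebenenzeile (nur Unterunterkategorie gefüllt)
#   Wort/Vokabel="Holzkiste"          -> Vokabelzeile (nur Wort/Vokabel gefüllt)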
def remove_empty_vocabulary_rows(df):
"""Entfernt Zeilen, die nur leere Wort/Vokabel-Spalte haben."""
return df[df["Wort/Vokabel"].astype(str).str.strip() != ""].copy().reset_index(drop=True)
def sync_master_and_sheets(master_df, category_dfs):
"""Synchronisiert Kategorien nach Master, Vokabeln bleiben erhalten."""
master_df = master_df.copy()
master_df["ID"] = master_df["ID"].astype(str).str.strip()
master_dict = dict(zip(master_df["ID"], master_df["Kategorie"]))
updated_dfs = {}
summary = {}
for sheet_name, df in category_dfs.items():
rows_out = []
changes = {"removed":0}
for _, row in df.iterrows():
id_val = str(row.get("ID","")).strip()
if id_val and id_val not in master_dict:
changes["removed"] +=1
continue
rows_out.append(row.to_dict())
updated_dfs[sheet_name] = pd.DataFrame(rows_out, columns=df.columns)
summary[sheet_name] = changes
new_master = pd.DataFrame([{"ID":k,"Kategorie":v} for k,v in sorted(master_dict.items())])
return new_master, updated_dfs, summary
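# Beispiel (Skizze): Die Synchronisation entfernt Kategoriezeilen, deren ID nicht mehr
# im Masterblatt vorkommt; Vokabelzeilen (ohne ID) bleiben unverändert erhalten.
#
#   new_master, updated, summary = sync_master_and_sheets(master_df, category_dfs)
#   # summary z.B.: {"Möbel": {"removed": 2}, "Textilien": {"removed": 0}}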
def save_excel(processed_sheets, output_file):
"""Speichert alle Sheets in eine Excel-Datei; Spaltenbreiten und Ausrichtung werden angepasst."""
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
for sheet_name, df in processed_sheets.items():
df.to_excel(writer, sheet_name=sheet_name, index=False)
ws = writer.sheets[sheet_name]
for col_idx, col in enumerate(df.columns, 1):
# Spaltenbreite am längsten Eintrag ausrichten (plus kleiner Puffer)
max_len = max(df[col].astype(str).map(len).max() if len(df) > 0 else 0, len(col)) + 2
ws.column_dimensions[get_column_letter(col_idx)].width = max_len
for row_idx in range(1, len(df) + 2):
ws.cell(row=row_idx, column=col_idx).alignment = Alignment(horizontal='left')
def save_ods(processed_sheets, output_file):
doc = ezodf.newdoc(doctype="ods", filename=output_file)
for name, df in processed_sheets.items():
sheet = ezodf.Sheet(name, size=(len(df)+1,len(df.columns)))
doc.sheets += sheet
for col_idx, col_name in enumerate(df.columns):
sheet[0,col_idx].set_value(col_name)
for row_idx,row in enumerate(df.itertuples(index=False),start=1):
for col_idx,value in enumerate(row):
sheet[row_idx,col_idx].set_value("" if pd.isna(value) else value)
doc.save()
# ----------------- HAUPTPROGRAMM -----------------
def main():
xls, engine = load_file(INPUT_FILE)
if engine == "odf":
doc = ezodf.opendoc(INPUT_FILE)
sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
category_dfs = {name: process_category_sheet(read_ods_sheet(INPUT_FILE,name)) for name in sheet_names}
master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
else:
sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
category_dfs = {}
for sheet_name in sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
df.columns = [str(c).strip() for c in df.columns]
category_dfs[sheet_name] = process_category_sheet(df)
master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
master_df.columns = [str(c).strip() for c in master_df.columns]
new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)
processed_sheets = {MASTER_SHEET_NAME:new_master}
processed_sheets.update({k:remove_empty_vocabulary_rows(v) for k,v in updated_dfs.items()})
ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
if ext_out in [".xlsx",".xls"]:
save_excel(processed_sheets, OUTPUT_FILE)
else:
save_ods(processed_sheets, OUTPUT_FILE)
logging.info(f"Datei gespeichert: {OUTPUT_FILE}")
logging.info("===== SYNC SUMMARY =====")
for sheet, info in summary.items():
logging.info(f"{sheet}: {info}")
if __name__ == "__main__":
main()

View File

@ -1,3 +1,32 @@
"""
===============================================================================
Skriptname: NV_SPOT_Export.py
Beschreibung:
Dieses Skript soll hierarchische Normvokabular-Tabellen
(ODS/XLSX-Format) in eine JSON-basierte SPOT-Struktur (Strukturierter
Positionsbaum) konvertieren. Es ermöglicht das Exportieren in Excel und ODS, sowie
das nachträgliche Ergänzen von Kategorien, Unterkategorien und Wörtern.
//NOCH NICHT GETESTET//
Hauptfunktionen:
- Node: Klasse zur Repräsentation von Baumknoten.
- load_excel_or_ods: Lädt Tabellen aus ODS/XLSX-Dateien.
- process_sheet_to_tree: Erzeugt eine Baumstruktur aus einem Sheet.
- save_spot_json: Speichert den SPOT-Baum als JSON.
- load_spot_json: Lädt SPOT-Daten aus JSON-Dateien.
- export_spot_to_excel: Exportiert den SPOT-Baum nach Excel.
- export_spot_to_ods: Exportiert den SPOT-Baum nach ODS.
- add_category/subcategory/word: Fügt Elemente im Baum hinzu.
- main: Steuert den Workflow.
Abhängigkeiten:
Python 3.x, pandas, openpyxl, ezodf, json, logging, datetime
Stand: 2025-10-01
===============================================================================
"""
import os import os
import json import json
import datetime import datetime
@ -12,6 +41,20 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(
# ---------------- SPOT-Baumstruktur ---------------- # ---------------- SPOT-Baumstruktur ----------------
class Node: class Node:
"""
Repräsentiert einen Knoten in der SPOT-Baumstruktur.
Attribute:
name (str): Anzeigename des Knotens.
id (str): Optionale ID (nur für Kategorien).
type (str): Knotentyp ("category", "subcategory", "word").
children (list[Node]): Unterknoten.
Methoden:
add_child(child): Fügt einen Unterknoten hinzu.
to_dict(): Serialisiert den Knoten in ein Dictionary/JSON-kompatibles Format.
from_dict(d): Rekonstruiert den Baum aus einem Dictionary.
"""
def __init__(self, name, node_type="category", id=None): def __init__(self, name, node_type="category", id=None):
self.name = name self.name = name
self.id = id self.id = id
@ -19,9 +62,11 @@ class Node:
self.children = [] self.children = []
def add_child(self, child): def add_child(self, child):
"""Fügt dem aktuellen Knoten einen Unterknoten hinzu."""
self.children.append(child) self.children.append(child)
def to_dict(self): def to_dict(self):
"""Wandelt den Knoten (rekursiv) in ein Dictionary um."""
if self.type == "word": if self.type == "word":
return self.name return self.name
return { return {
@ -33,14 +78,26 @@ class Node:
@staticmethod @staticmethod
def from_dict(d): def from_dict(d):
"""Erzeugt aus einem Dictionary ein Node-Objekt (rekursiv)."""
if isinstance(d, str): if isinstance(d, str):
return Node(d, "word") return Node(d, "word")
node = Node(d["name"], d.get("type", "category"), d.get("id")) node = Node(d["name"], d.get("type", "category"), d.get("id"))
node.children = [Node.from_dict(c) for c in d.get("children", [])] node.children = [Node.from_dict(c) for c in d.get("children", [])]
return node return node
# ---------------- Funktionen zum Laden ---------------- # ---------------- Funktionen zum Laden ----------------
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"): def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
"""
Lädt ODS oder Excel-Datei und gibt Master- sowie Kategorien-DataFrames zurück.
Parameter:
input_file (str): Pfad zur Quelldatei.
master_sheet (str): Name des Masterblattes.
Rückgabe:
(master_df, dfs): Master-DataFrame und Dictionary mit anderen Sheets.
"""
ext = os.path.splitext(input_file)[1].lower() ext = os.path.splitext(input_file)[1].lower()
engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf" engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
xls = pd.ExcelFile(input_file, engine=engine) xls = pd.ExcelFile(input_file, engine=engine)
@ -49,26 +106,44 @@ def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine) master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
return master_df, dfs return master_df, dfs
# ---------------- Baum aus Sheet erstellen ---------------- # ---------------- Baum aus Sheet erstellen ----------------
def process_sheet_to_tree(df): def process_sheet_to_tree(df):
"""
Wandelt ein Kategoriensheet in eine hierarchische Baumstruktur (Liste von Nodes) um.
Struktur:
Kategorie Unterkategorie Wort
Parameter:
df (pd.DataFrame): Eingabedaten mit Spalten ["ID", "Unterkategorie",
"Unterunterkategorie", "Wort/Vokabel"].
Rückgabe:
list[Node]: Liste von Baumknoten der obersten Ebene.
"""
df = df.fillna("").astype(str) df = df.fillna("").astype(str)
tree_nodes = [] tree_nodes = []
current_cat = None current_cat = None
current_sub = None current_sub = None
for idx, row in df.iterrows(): for idx, row in df.iterrows():
id_val = row.get("ID", "").strip() id_val = row.get("ID", "").strip()
uk_val = row.get("Unterkategorie", "").strip() uk_val = row.get("Unterkategorie", "").strip()
uuk_val = row.get("Unterunterkategorie", "").strip() uuk_val = row.get("Unterunterkategorie", "").strip()
word_val = row.get("Wort/Vokabel", "").strip() word_val = row.get("Wort/Vokabel", "").strip()
# Neue Kategorieebene
if id_val: if id_val:
current_cat = Node(uk_val or word_val, "category", id=id_val) current_cat = Node(uk_val or word_val, "category", id=id_val)
tree_nodes.append(current_cat) tree_nodes.append(current_cat)
current_sub = None current_sub = None
# Unterkategorie
elif uuk_val: elif uuk_val:
current_sub = Node(uuk_val, "subcategory") current_sub = Node(uuk_val, "subcategory")
if current_cat: if current_cat:
current_cat.add_child(current_sub) current_cat.add_child(current_sub)
# Wortebene
elif word_val: elif word_val:
word_node = Node(word_val, "word") word_node = Node(word_val, "word")
if current_sub: if current_sub:
@ -77,28 +152,60 @@ def process_sheet_to_tree(df):
current_cat.add_child(word_node) current_cat.add_child(word_node)
return tree_nodes return tree_nodes
# ---------------- SPOT laden/speichern ---------------- # ---------------- SPOT laden/speichern ----------------
def save_spot_json(tree_nodes, file_path): def save_spot_json(tree_nodes, file_path):
"""
Speichert den SPOT-Baum als JSON-Datei.
Parameter:
tree_nodes (list[Node]): Wurzelknoten der Baumstruktur.
file_path (str): Zielpfad.
"""
with open(file_path, "w", encoding="utf-8") as f: with open(file_path, "w", encoding="utf-8") as f:
json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False) json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
logging.info(f"SPOT gespeichert: {file_path}") logging.info(f"SPOT gespeichert: {file_path}")
def load_spot_json(file_path): def load_spot_json(file_path):
"""
Lädt SPOT-JSON-Datei und rekonstruiert den Baum.
Parameter:
file_path (str): Pfad zur JSON-Datei.
Rückgabe:
list[Node]: Liste oberster Knoten.
"""
with open(file_path, "r", encoding="utf-8") as f: with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f) data = json.load(f)
return [Node.from_dict(n) for n in data] return [Node.from_dict(n) for n in data]
# ---------------- Export in Excel ---------------- # ---------------- Export in Excel ----------------
def export_spot_to_excel(tree_nodes, output_file): def export_spot_to_excel(tree_nodes, output_file):
"""
Exportiert den SPOT-Baum in eine Excel-Datei.
Struktur:
Spalten AD: ID, Kategorie, Unterkategorie, Wort.
Parameter:
tree_nodes (list[Node]): Baumstruktur.
output_file (str): Zielpfad der Excel-Datei.
"""
wb = Workbook() wb = Workbook()
wb.remove(wb.active) wb.remove(wb.active)
for node in tree_nodes: for node in tree_nodes:
ws = wb.create_sheet(title=node.name[:31]) ws = wb.create_sheet(title=node.name[:31])
row_idx = 1 row_idx = 1
# Kategorie
# Kategoriezeile
ws.cell(row=row_idx, column=1, value=node.id) ws.cell(row=row_idx, column=1, value=node.id)
ws.cell(row=row_idx, column=2, value=node.name) ws.cell(row=row_idx, column=2, value=node.name)
row_idx += 1 row_idx += 1
for sub in node.children: for sub in node.children:
if sub.type == "subcategory": if sub.type == "subcategory":
ws.cell(row=row_idx, column=3, value=sub.name) ws.cell(row=row_idx, column=3, value=sub.name)
@ -109,28 +216,44 @@ def export_spot_to_excel(tree_nodes, output_file):
elif sub.type == "word": elif sub.type == "word":
ws.cell(row=row_idx, column=4, value=sub.name) ws.cell(row=row_idx, column=4, value=sub.name)
row_idx += 1 row_idx += 1
# Spaltenbreiten anpassen
# Spaltenbreiten und Ausrichtung
for col_idx, col_letter in enumerate(["A", "B", "C", "D"], 1): for col_idx, col_letter in enumerate(["A", "B", "C", "D"], 1):
ws.column_dimensions[col_letter].width = 20 ws.column_dimensions[col_letter].width = 20
for r in range(1, row_idx): for r in range(1, row_idx):
ws.cell(r, col_idx).alignment = Alignment(horizontal='left') ws.cell(r, col_idx).alignment = Alignment(horizontal='left')
wb.save(output_file) wb.save(output_file)
logging.info(f"Excel exportiert: {output_file}") logging.info(f"Excel exportiert: {output_file}")
# ---------------- Export in ODS ---------------- # ---------------- Export in ODS ----------------
def export_spot_to_ods(tree_nodes, output_file): def export_spot_to_ods(tree_nodes, output_file):
"""
Exportiert den SPOT-Baum in eine ODS-Datei.
Struktur analog zum Excel-Export.
Parameter:
tree_nodes (list[Node]): Baumstruktur.
output_file (str): Zielpfad der ODS-Datei.
"""
doc = ezodf.newdoc(doctype="ods", filename=output_file) doc = ezodf.newdoc(doctype="ods", filename=output_file)
for node in tree_nodes: for node in tree_nodes:
sheet = ezodf.Sheet(node.name[:31], size=(len(node.children) + 10, 4)) sheet = ezodf.Sheet(node.name[:31], size=(len(node.children) + 10, 4))
doc.sheets += sheet doc.sheets += sheet
sheet[0, 0].set_value("ID") sheet[0, 0].set_value("ID")
sheet[0, 1].set_value("Unterkategorie") sheet[0, 1].set_value("Unterkategorie")
sheet[0, 2].set_value("Unterunterkategorie") sheet[0, 2].set_value("Unterunterkategorie")
sheet[0, 3].set_value("Wort/Vokabel") sheet[0, 3].set_value("Wort/Vokabel")
row_idx = 1 row_idx = 1
sheet[row_idx, 0].set_value(node.id) sheet[row_idx, 0].set_value(node.id)
sheet[row_idx, 1].set_value(node.name) sheet[row_idx, 1].set_value(node.name)
row_idx += 1 row_idx += 1
for sub in node.children: for sub in node.children:
if sub.type == "subcategory": if sub.type == "subcategory":
sheet[row_idx, 2].set_value(sub.name) sheet[row_idx, 2].set_value(sub.name)
@ -141,22 +264,51 @@ def export_spot_to_ods(tree_nodes, output_file):
elif sub.type == "word": elif sub.type == "word":
sheet[row_idx, 3].set_value(sub.name) sheet[row_idx, 3].set_value(sub.name)
row_idx += 1 row_idx += 1
doc.save() doc.save()
logging.info(f"ODS exportiert: {output_file}") logging.info(f"ODS exportiert: {output_file}")
# ---------------- CLI-Funktionen zum Editieren ---------------- # ---------------- CLI-Funktionen zum Editieren ----------------
def add_category(tree_nodes, cat_id, cat_name): def add_category(tree_nodes, cat_id, cat_name):
"""
Fügt eine neue Kategorie zum SPOT-Baum hinzu.
Parameter:
tree_nodes (list[Node]): Liste der obersten Knoten.
cat_id (str): ID der Kategorie.
cat_name (str): Name der Kategorie.
"""
tree_nodes.append(Node(cat_name, "category", id=cat_id)) tree_nodes.append(Node(cat_name, "category", id=cat_id))
logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}") logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}")
def add_subcategory(tree_nodes, cat_id, sub_name): def add_subcategory(tree_nodes, cat_id, sub_name):
"""
Fügt einer vorhandenen Kategorie eine Unterkategorie hinzu.
Parameter:
tree_nodes (list[Node]): Wurzelknoten.
cat_id (str): Zielkategorie-ID.
sub_name (str): Name der Unterkategorie.
"""
for cat in tree_nodes: for cat in tree_nodes:
if cat.id == cat_id: if cat.id == cat_id:
cat.add_child(Node(sub_name, "subcategory")) cat.add_child(Node(sub_name, "subcategory"))
logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}") logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}")
return return
def add_word(tree_nodes, cat_id, sub_name, word_name): def add_word(tree_nodes, cat_id, sub_name, word_name):
"""
Fügt einem Unterknoten ein Wort hinzu.
Parameter:
tree_nodes (list[Node]): Wurzelknoten.
cat_id (str): ID der Kategorie.
sub_name (str): Name der Unterkategorie.
word_name (str): Neues Wort.
"""
for cat in tree_nodes: for cat in tree_nodes:
if cat.id == cat_id: if cat.id == cat_id:
for sub in cat.children: for sub in cat.children:
@ -165,9 +317,18 @@ def add_word(tree_nodes, cat_id, sub_name, word_name):
logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}") logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}")
return return
# ---------------- HAUPTPROGRAMM ---------------- # ---------------- HAUPTPROGRAMM ----------------
def main(): def main():
INPUT_FILE = "NV_MASTER.ods" # Beispielpfad """
Ablauf:
1. Liest Masterdatei (ODS oder XLSX).
2. Wandelt Kategorienblätter in SPOT-Struktur um.
3. Speichert SPOT als JSON.
4. Exportiert SPOT nach Excel und ODS.
5. Optional: Bearbeiten des Baums über CLI-Funktionen.
"""
INPUT_FILE = "NV_MASTER.ods"
OUTPUT_SPOT = "nv_spot.json" OUTPUT_SPOT = "nv_spot.json"
today = datetime.datetime.today().strftime("%y.%m.%d") today = datetime.datetime.today().strftime("%y.%m.%d")
OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx" OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx"
@ -177,9 +338,10 @@ def main():
spot_tree = [] spot_tree = []
for sheet, df in dfs.items(): for sheet, df in dfs.items():
spot_tree.extend(process_sheet_to_tree(df)) spot_tree.extend(process_sheet_to_tree(df))
save_spot_json(spot_tree, OUTPUT_SPOT) save_spot_json(spot_tree, OUTPUT_SPOT)
# Beispiel: Editieren # Beispielhafte Nutzung der Editierfunktionen:
# add_category(spot_tree, "10.1", "Neue Kategorie") # add_category(spot_tree, "10.1", "Neue Kategorie")
# add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie") # add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie")
# add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort") # add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort")
@ -188,5 +350,6 @@ def main():
export_spot_to_ods(spot_tree, OUTPUT_ODS) export_spot_to_ods(spot_tree, OUTPUT_ODS)
logging.info("SPOT-Workflow abgeschlossen.") logging.info("SPOT-Workflow abgeschlossen.")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -1,13 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
NormVokabular Mapper Version 1.4.1 NormVokabular Mapper Version 1.4.2
- Detailliertes (DEBUG) Batch-Logging: gepufferte Logs werden periodisch in Konsole + Datei geschrieben
- Getty AAT (SPARQL via requests) API-polite, timeout/retries/backoff Dieses Skript normalisiert und mappt Begriffe aus Input-Dateien auf ein zentrales Normvokabular
- Fehlertoleranz: API-Ausfälle führen nicht zum Totalabsturz und führt optional API-Abgleiche mit GND und Wikidata durch. Ergebnisse werden in Excel/ODS gespeichert.
- Fehlende Begriffe -> separate Datei (gleiches Format wie Output)
- Bestehende Normalisierung/Lemmatisierung/Stemming wird weiterverwendet
- Batch-Logging-Modus (konfigurierbar)
""" """
from __future__ import annotations from __future__ import annotations
@ -25,50 +22,52 @@ from collections import defaultdict
from difflib import SequenceMatcher from difflib import SequenceMatcher
from datetime import datetime from datetime import datetime
# Optional libs # Optional Libraries
try: try:
from rapidfuzz import fuzz from rapidfuzz import fuzz # für schnellere String-Similarity
RAPIDFUZZ_AVAILABLE = True RAPIDFUZZ_AVAILABLE = True
except Exception: except Exception:
RAPIDFUZZ_AVAILABLE = False RAPIDFUZZ_AVAILABLE = False
try: try:
import spacy import spacy
nlp = spacy.load("de_core_news_sm") nlp = spacy.load("de_core_news_sm") # deutsche Lemmatization
SPACY_AVAILABLE = True SPACY_AVAILABLE = True
except Exception: except Exception:
SPACY_AVAILABLE = False SPACY_AVAILABLE = False
nlp = None nlp = None
# ========================= # =========================
# Config & Pfade # Konfiguration & Pfade
# ========================= # =========================
INPUT_DIR = Path("Input CSV") INPUT_DIR = Path("Input CSV") # Eingabeverzeichnis
OUTPUT_DIR = Path("Auswertung Ergebnisse") OUTPUT_DIR = Path("Auswertung Ergebnisse") # Ausgabeordner
OUTPUT_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True) # Verzeichnis erstellen, falls nicht vorhanden
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") # Normvokabular-Datei
CACHE_FILE = "api_cache.json" CACHE_FILE = "api_cache.json" # Cache für API-Antworten
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"} STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 CONF_THRESHOLD = 0.75 # Threshold für Vorschläge
TIMEOUT_DEFAULT = 5 TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3 MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2 BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"} HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True} API_ACTIVE = {"gnd": True, "wikidata": True} # API-Verfügbarkeit
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0} FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Logging file # Logging-Parameter
LOG_FILE = OUTPUT_DIR / "mapper_log.txt" LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100 # Anzahl Logs vor Flush
# Batch logging parameters LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes
LOG_BATCH_SIZE = 100 # flush wenn >= Einträge LOG_LEVEL = "DEBUG" # Logging-Level
LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes (Batch-Logging)
LOG_LEVEL = "DEBUG" # ausführlich gewünscht
# ========================= # =========================
# Buffered/Batched Logger # Batch/Buffered Logger
# ========================= # =========================
class BatchLogger: class BatchLogger:
"""
Buffered Logger: Speichert Logs in einem Queue-Buffer und schreibt sie periodisch in Datei und Konsole.
Reduziert I/O-Aufwand bei vielen Logs.
"""
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"): def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
self.logfile = logfile self.logfile = logfile
self.flush_interval = flush_interval self.flush_interval = flush_interval
@ -77,7 +76,7 @@ class BatchLogger:
self.q = queue.Queue() self.q = queue.Queue()
self._stop_event = threading.Event() self._stop_event = threading.Event()
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread") self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
# Ensure logfile exists # Sicherstellen, dass die Log-Datei existiert
try: try:
logfile.parent.mkdir(parents=True, exist_ok=True) logfile.parent.mkdir(parents=True, exist_ok=True)
logfile.touch(exist_ok=True) logfile.touch(exist_ok=True)
@ -86,35 +85,33 @@ class BatchLogger:
self._thread.start() self._thread.start()
def _format(self, level: str, msg: str) -> str: def _format(self, level: str, msg: str) -> str:
"""Formatiert Logeinträge mit Timestamp"""
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"{ts} - {level} - {msg}" return f"{ts} - {level} - {msg}"
def log(self, level: str, msg: str): def log(self, level: str, msg: str):
"""Fügt Log dem Queue hinzu und löst Flush aus, falls Batchgröße erreicht"""
if self._stop_event.is_set(): if self._stop_event.is_set():
return return
formatted = self._format(level, msg) formatted = self._format(level, msg)
self.q.put((level, formatted)) self.q.put((level, formatted))
# If queue too big, trigger immediate flush by putting a special token
if self.q.qsize() >= self.batch_size: if self.q.qsize() >= self.batch_size:
self.q.put(("__FLUSH__", "__FLUSH__")) self.q.put(("__FLUSH__", "__FLUSH__"))
def debug(self, msg: str): def debug(self, msg: str):
if LOG_LEVEL in ("DEBUG",): if LOG_LEVEL in ("DEBUG",):
self.log("DEBUG", msg) self.log("DEBUG", msg)
def info(self, msg: str): def info(self, msg: str):
self.log("INFO", msg) self.log("INFO", msg)
def warning(self, msg: str): def warning(self, msg: str):
self.log("WARNING", msg) self.log("WARNING", msg)
def error(self, msg: str): def error(self, msg: str):
self.log("ERROR", msg) self.log("ERROR", msg)
def exception(self, msg: str): def exception(self, msg: str):
self.log("EXCEPTION", msg) self.log("EXCEPTION", msg)
def _worker(self): def _worker(self):
"""Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch"""
buffer = [] buffer = []
last_flush = time.time() last_flush = time.time()
while not self._stop_event.is_set() or not self.q.empty(): while not self._stop_event.is_set() or not self.q.empty():
@ -123,7 +120,6 @@ class BatchLogger:
try: try:
item = self.q.get(timeout=self.flush_interval) item = self.q.get(timeout=self.flush_interval)
except queue.Empty: except queue.Empty:
# time-based flush
if buffer: if buffer:
self._flush_buffer(buffer) self._flush_buffer(buffer)
buffer = [] buffer = []
@ -141,36 +137,30 @@ class BatchLogger:
continue continue
buffer.append((level, formatted)) buffer.append((level, formatted))
# flush conditions
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval: if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
self._flush_buffer(buffer) self._flush_buffer(buffer)
buffer = [] buffer = []
last_flush = time.time() last_flush = time.time()
except Exception as e: except Exception as e:
# As a last resort, write error immediately to stderr
try: try:
sys.stderr.write(f"BatchLogger worker error: {e}\n") sys.stderr.write(f"BatchLogger worker error: {e}\n")
except Exception: except Exception:
pass pass
time.sleep(0.5) time.sleep(0.5)
# final flush
if buffer: if buffer:
self._flush_buffer(buffer) self._flush_buffer(buffer)
def _flush_buffer(self, buffer): def _flush_buffer(self, buffer):
"""Schreibt Puffer in Datei und Konsole"""
if not buffer: if not buffer:
return return
# write to console and file
try: try:
# console
out_lines = [f"{line}\n" for _, line in buffer] out_lines = [f"{line}\n" for _, line in buffer]
# write to stdout
try: try:
sys.stdout.writelines(out_lines) sys.stdout.writelines(out_lines)
sys.stdout.flush() sys.stdout.flush()
except Exception: except Exception:
pass pass
# append to file
try: try:
with open(self.logfile, "a", encoding="utf-8") as f: with open(self.logfile, "a", encoding="utf-8") as f:
f.writelines(out_lines) f.writelines(out_lines)
@ -183,17 +173,17 @@ class BatchLogger:
pass pass
def stop(self): def stop(self):
"""Stoppt Logger-Thread"""
self._stop_event.set() self._stop_event.set()
# put sentinel to wake worker
try: try:
self.q.put(("__FLUSH__", "__FLUSH__")) self.q.put(("__FLUSH__", "__FLUSH__"))
except Exception: except Exception:
pass pass
self._thread.join(timeout=5.0) self._thread.join(timeout=5.0)
# Instantiate logger # Logger-Instanz erstellen
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL) logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)") logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)")
# ========================= # =========================
# Cache laden/speichern # Cache laden/speichern
@ -210,6 +200,7 @@ else:
CACHE = {} CACHE = {}
def save_cache(): def save_cache():
"""Speichert aktuellen Cache in JSON"""
try: try:
with open(CACHE_FILE,"w",encoding="utf-8") as f: with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False) json.dump(CACHE, f, indent=2, ensure_ascii=False)
@ -221,6 +212,7 @@ def save_cache():
# Normalisierung / Lemma / Tokenization # Normalisierung / Lemma / Tokenization
# ========================= # =========================
def normalize_text(s): def normalize_text(s):
"""Text in Kleinbuchstaben, Sonderzeichen entfernen, Trim"""
if not s: if not s:
return "" return ""
s = str(s).lower().strip() s = str(s).lower().strip()
@ -229,8 +221,8 @@ def normalize_text(s):
return s return s
lemma_cache = {} lemma_cache = {}
def lemmatize_term(term): def lemmatize_term(term):
"""Lemmatize mit spaCy, Cache für Performance"""
term_norm = normalize_text(term) term_norm = normalize_text(term)
if term_norm in lemma_cache: if term_norm in lemma_cache:
return lemma_cache[term_norm] return lemma_cache[term_norm]
@ -246,6 +238,7 @@ def lemmatize_term(term):
return lemma return lemma
def compound_split(term): def compound_split(term):
"""Splittet Komposita nach -, _, / oder Leerzeichen"""
if not term: if not term:
return [] return []
parts = [p for p in re.split(r"[\s\-_/]+", term) if p] parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
@ -255,24 +248,29 @@ def compound_split(term):
# Normvokabular laden & Index # Normvokabular laden & Index
# ========================= # =========================
def load_normvokabular(file_path): def load_normvokabular(file_path):
"""Lädt Normvokabular aus Excel/ODS, erstellt Dictionarys für Mapping"""
try: try:
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None) sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e: except Exception as e:
logger.error(f"Normvokabular konnte nicht geladen werden: {e}") logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
raise raise
norm_dict = {} norm_dict = {}
stem_index = defaultdict(list) stem_index = defaultdict(list)
lemma_norm_map = {} lemma_norm_map = {}
for sheet_name, df in sheets.items(): for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]: if sheet_name.lower() in ["master", "übersicht"]:
continue continue # Übersichtsblätter ignorieren
df = df.dropna(how="all", axis=1) df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns] df.columns = [str(c).strip() for c in df.columns]
# ID- und Wort-Spalte finden
id_col = next((c for c in df.columns if "ID" in c), None) id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None) word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
if not id_col or not word_col: if not id_col or not word_col:
continue continue
current_parent_id = None current_parent_id = None
for _, row in df.iterrows(): for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
@ -296,6 +294,10 @@ def load_normvokabular(file_path):
# Mapping & Vorschläge # Mapping & Vorschläge
# ========================= # =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3): def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
"""
Mappt einen Begriff auf Normvokabular.
Prüft exakte Treffer, Lemma-Treffer, Komposita und generiert Vorschläge.
"""
term_norm = normalize_text(term) term_norm = normalize_text(term)
term_lemma = lemmatize_term(term) term_lemma = lemmatize_term(term)
@ -329,6 +331,7 @@ def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
return "KEIN TREFFER", "", combined_suggestions return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD): def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
"""Ermittelt Vorschläge basierend auf Similarity"""
candidates = [] candidates = []
for key_lemma, entry in lemma_norm_map.items(): for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE: if RAPIDFUZZ_AVAILABLE:
@ -346,10 +349,14 @@ def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOL
# Generic request with retries & caching # Generic request with retries & caching
# ========================= # =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT): def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
"""
Sendet GET-Requests mit Retry-Logik, Backoff und Caching
"""
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "") cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
if cache_key in CACHE: if cache_key in CACHE:
logger.debug(f"[Cache] {api_name}: {cache_key}") logger.debug(f"[Cache] {api_name}: {cache_key}")
return CACHE[cache_key] return CACHE[cache_key]
retries = 0 retries = 0
while retries < max_retries: while retries < max_retries:
try: try:
@ -378,9 +385,10 @@ def request_with_retries_generic(api_name, url, params=None, headers=None, timeo
return None return None
# ========================= # =========================
# GND / Wikidata (bestehend) # GND / Wikidata Batch Queries
# ========================= # =========================
def batch_query_gnd(terms): def batch_query_gnd(terms):
"""Batch-Abfrage der Begriffe bei GND"""
results = {} results = {}
if not API_ACTIVE.get("gnd", False): if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = "" for t in terms: results[t] = ""
@ -409,6 +417,7 @@ def batch_query_gnd(terms):
return results return results
def batch_query_wikidata(terms): def batch_query_wikidata(terms):
"""Batch-Abfrage der Begriffe bei Wikidata"""
results = {} results = {}
if not API_ACTIVE.get("wikidata", False): if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = "" for t in terms: results[t] = ""
@ -423,10 +432,13 @@ def batch_query_wikidata(terms):
top = "" top = ""
try: try:
if data and "search" in data: if data and "search" in data:
# Ermittlung der Kandidaten mit Ähnlichkeitsbewertung
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio()) cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
for e in data["search"] if e.get("label","")] for e in data["search"] if e.get("label","")]
# Filterung nach Mindestähnlichkeit (0.70)
cands = [c for c in cands if c[1] >= 0.70] cands = [c for c in cands if c[1] >= 0.70]
if cands: if cands:
# Bestes Ergebnis nach Ähnlichkeit auswählen
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0] top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e: except Exception as e:
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}") logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
@ -435,93 +447,14 @@ def batch_query_wikidata(terms):
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s") logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
return results return results
# =========================
# Getty AAT Abfrage robust & API-polite (requests)
# =========================
def batch_query_getty_aat(terms):
results = {}
if not API_ACTIVE.get("aat", False):
for t in terms: results[t] = ""
return results
endpoint = "https://vocab.getty.edu/sparql"
headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
TIMEOUT = 8
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
FAIL_LIMIT = 5
fail_counter_local = 0
logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
start_all = time.time()
for idx, term in enumerate(terms, start=1):
term_norm = lemmatize_term(normalize_text(term))
tokens = compound_split(term_norm)
logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")
query_fragments = []
for tkn in tokens:
t_escaped = tkn.replace('"', '\\"')
qf = f"""
?concept skos:prefLabel ?label .
FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
"""
query_fragments.append(f"{{ {qf} }}")
query_body = " UNION ".join(query_fragments) if query_fragments else ""
query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"
retries = 0
success = False
start_term = time.time()
while retries < MAX_RETRIES and not success:
try:
logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
if r.status_code != 200:
raise ValueError(f"HTTP {r.status_code}")
ret = r.json()
candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
if candidates:
scored = [
(c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
for c in candidates
]
top = max(scored, key=lambda x: x[2])
results[term] = top[0]
logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
else:
results[term] = ""
logger.debug(f"[AAT] Kein Treffer für '{term}'")
success = True
except Exception as e:
retries += 1
wait = BACKOFF_FACTOR ** retries
logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} warte {wait}s")
time.sleep(wait)
if retries == MAX_RETRIES:
results[term] = ""
fail_counter_local += 1
# polite delay
time.sleep(1.0)
elapsed_term = time.time() - start_term
logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")
if fail_counter_local >= FAIL_LIMIT:
logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
for t_rem in terms[idx:]:
results[t_rem] = ""
FAIL_COUNTER["aat"] += fail_counter_local
API_ACTIVE["aat"] = False
break
elapsed_all = time.time() - start_all
logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
return results
# ========================= # =========================
# Markierung / Export (Excel/ODS) # Markierung / Export (Excel/ODS)
# ========================= # =========================
def mark_norm_hits(file_path): def mark_norm_hits(file_path):
"""
Markiert Treffer in Excel/ODS farblich:
Grün = Treffer, Rot = KEIN TREFFER
"""
ext = file_path.suffix.lower() ext = file_path.suffix.lower()
try: try:
if ext in [".xlsx", ".xls"]: if ext in [".xlsx", ".xls"]:
@ -529,12 +462,14 @@ def mark_norm_hits(file_path):
from openpyxl.styles import PatternFill from openpyxl.styles import PatternFill
wb = load_workbook(file_path) wb = load_workbook(file_path)
ws = wb.active ws = wb.active
# Spaltenmapping anhand der Kopfzeile
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])} col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None) norm_col = col_map.get("Norm_Treffer", None)
if not norm_col: if not norm_col:
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).") logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
wb.save(file_path) wb.save(file_path)
return return
# Farben definieren
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col): for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
@ -545,6 +480,7 @@ def mark_norm_hits(file_path):
cell.fill = red_fill cell.fill = red_fill
wb.save(file_path) wb.save(file_path)
elif ext == ".ods": elif ext == ".ods":
# ODS: kein Zell-Fill, stattdessen Status-Spalte
df = pd.read_excel(file_path, engine="odf") df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer") df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf") df.to_excel(file_path, index=False, engine="odf")
@ -555,6 +491,9 @@ def mark_norm_hits(file_path):
# Fehlende Begriffe -> separate Datei # Fehlende Begriffe -> separate Datei
# ========================= # =========================
def export_missing_terms(out_df, output_file): def export_missing_terms(out_df, output_file):
"""
Speichert Begriffe ohne Treffer oder Vorschläge in separater Datei
"""
missing_df = out_df[ missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") & (out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == "")) (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
@ -562,7 +501,6 @@ def export_missing_terms(out_df, output_file):
count_missing = len(missing_df) count_missing = len(missing_df)
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}") logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0: if count_missing == 0:
return return
@ -589,8 +527,10 @@ def export_missing_terms(out_df, output_file):
# Haupt-Loop: Verarbeitung Input-Dateien # Haupt-Loop: Verarbeitung Input-Dateien
# =========================
def process_files():
+    """Verarbeitet alle Dateien im Input-Ordner, mappt Begriffe und speichert Ergebnisse"""
    overall_start = time.time()
    try:
+        # Normvokabular laden
        norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    except Exception as e:
        logger.error("Normvokabular konnte nicht geladen werden. Beende.")
@ -626,6 +566,7 @@ def process_files():
        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]
+        # Spalten identifizieren
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
@ -633,6 +574,7 @@ def process_files():
            logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
            continue
+        # Begriffe extrahieren
        row_terms_map = []
        for r_idx, row in enumerate(df.itertuples(index=False), start=1):
            try:
@ -657,9 +599,11 @@ def process_files():
            if (r_idx % 200) == 0:
                logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
+        # Alle einzigartigen Terme für API-Abfragen
        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
        total_unique_terms = len(all_terms)
        # API-Abfragen
        t0 = time.time()
        gnd_results = batch_query_gnd(all_terms)
@ -668,9 +612,6 @@ def process_files():
        wd_results = batch_query_wikidata(all_terms)
        t2 = time.time()
        logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
-        aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t: "" for t in all_terms}
-        t3 = time.time()
-        logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")
        # Build output rows
        output_rows = []
@ -690,58 +631,30 @@ def process_files():
                "Norm_ID": norm_id,
                "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                "GND_Top1": gnd_results.get(term, ""),
-                "WD_Top1": wd_results.get(term, ""),
-                "AAT_Top1": aat_results.get(term, "")
+                "WD_Top1": wd_results.get(term, "")
            }
            output_rows.append(out_row)
            processed_count += 1
            if (processed_count % 200) == 0:
                logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")
        # Save output
        out_df = pd.DataFrame(output_rows)
-        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
-        version = 1
-        while output_file.exists():
-            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
-            version += 1
-        engine = "odf" if output_file.suffix.lower() == ".ods" else None
+        out_file = OUTPUT_DIR / f"{file_path.stem}_mapped.xlsx"
        try:
-            out_df.to_excel(output_file, index=False, engine=engine)
-            logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
+            out_df.to_excel(out_file, index=False, engine="openpyxl")
+            logger.info(f"Ergebnisse gespeichert: {out_file}")
+            mark_norm_hits(out_file)
+            export_missing_terms(out_df, out_file)
        except Exception as e:
-            logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
-            continue
-        export_missing_terms(out_df, output_file)
-        mark_norm_hits(output_file)
-        logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "")
-        file_elapsed = time.time() - file_start
-        logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")
-    overall_elapsed = time.time() - overall_start
-    logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")
+            logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}")
+    elapsed_total = time.time() - overall_start
+    logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s")
-# =========================
-# Main
-# =========================
-if __name__ == "__main__":
-    try:
-        process_files()
-    except KeyboardInterrupt:
-        logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
-    except SystemExit:
-        logger.warning("SystemExit aufgetreten.")
-    except Exception as e:
-        logger.exception(f"Ungefangener Fehler: {e}")
-    finally:
-        # Stop logger (flush remaining logs)
-        try:
-            save_cache()
-        except Exception:
-            pass
-        try:
-            logger.info("Beende.")
-            logger.stop()
-        except Exception:
-            pass
+    save_cache()
+    logger.stop()

+if __name__ == "__main__":
+    process_files()
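
For orientation, a hypothetical example of one entry in output_rows as the loop above now builds it, limited to the columns visible in this hunk (values are invented; the full dictionary may carry further columns assembled earlier in the function):

# Hypothetical out_row after the change: AAT_Top1 is gone, WD_Top1 stays.
out_row = {
    "Norm_ID": "7.1.2",
    "Norm_Vorschlag": "Haus, Gebäude",
    "GND_Top1": "Haus",
    "WD_Top1": "Haus",
}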

View File

@ -1,46 +0,0 @@
import subprocess
import json
import sys
from pathlib import Path
def run_mapper(term):
"""
Ruft das bestehende mapper script auf und liefert Vorschläge zurück.
Erwartet, dass das mapper script eine JSON-Ausgabe liefert:
{
"term": "Begriff",
"norm_name": "Normierter Treffer oder KEIN TREFFER",
"norm_id": "ID",
"suggestions": ["Vorschlag1", "Vorschlag2", "Vorschlag3"]
}
"""
mapper_script = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_1.2.py") # dein bestehendes Mapper-Skript
if not mapper_script.exists():
raise FileNotFoundError(f"{mapper_script} nicht gefunden")
# Übergabe als JSON-String
input_json = json.dumps({"term": term})
# Aufruf via subprocess
result = subprocess.run(
[sys.executable, str(mapper_script), input_json],
capture_output=True,
text=True
)
if result.returncode != 0:
raise RuntimeError(f"Mapper Fehler: {result.stderr}")
try:
output = json.loads(result.stdout)
except Exception as e:
raise ValueError(f"Ungültige Ausgabe vom Mapper: {e}")
return output
if __name__ == "__main__":
if len(sys.argv) > 1:
term = sys.argv[1]
output = run_mapper(term)
print(json.dumps(output, ensure_ascii=False))
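
The docstring above defines a small JSON contract between this wrapper and the mapper script. A minimal sketch of one round trip, assuming the mapper honours that contract (the term and all returned values are invented for illustration):

# Hypothetical call: run_mapper() passes {"term": ...} as a JSON argument and
# parses the mapper's stdout back into a dict with the four documented keys.
result = run_mapper("Wappen")
print(result["norm_name"])     # e.g. "Wappen" or "KEIN TREFFER"
print(result["suggestions"])   # e.g. ["Wappenschild", "Heraldik"]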

Binary file not shown.

Binary file not shown.

View File

@ -1,101 +0,0 @@
import pandas as pd
import requests
import time
import os
def match_gnd(token, delay=0.3):
"""GND-Abfrage für ein Schlagwort, gibt erstes Ergebnis zurück"""
url = f"https://lobid.org/gnd/search?q={token}&format=json"
try:
resp = requests.get(url, timeout=5)
if resp.status_code == 200:
data = resp.json()
if 'member' in data and data['member']:
first = data['member'][0]
return first.get('preferredName'), first.get('gndIdentifier')
except Exception as e:
print(f"Fehler bei GND-Abfrage für '{token}': {e}")
time.sleep(delay)
return None, None
def load_exlibris_refs(path):
"""CSV einlesen, Scan-Zuordnung, Platzhalter-Inventarnummer, GND-Abgleich"""
df = pd.read_csv(path, dtype=str, header=0)
# erste Spalte leer? → "Kürzel"
if df.columns[0].strip() == '':
df.rename(columns={df.columns[0]: 'Kürzel'}, inplace=True)
df.fillna('', inplace=True)
# Scan-Level-Spalten
level_cols = [c for c in df.columns if c.strip() in ['0','1','2','3','4']]
obj_list = []
current_obj = None
placeholder_counter = 1
for _, row in df.iterrows():
has_0 = row['0'].strip() if '0' in df.columns else ''
row_refs = []
for c in level_cols:
val = row[c].strip()
if val:
row_refs.append({'level': c, 'scan_ref': val})
if has_0:
if current_obj:
obj_list.append(current_obj)
core_data = {col: row[col] for col in df.columns if col not in level_cols}
# Inventarnummer prüfen
inv = core_data.get('Inventarnummer','').strip()
if not inv:
core_data['Inventarnummer'] = f'PL-{placeholder_counter:04d}'
placeholder_counter += 1
# GND-Abgleich
obj_descr = core_data.get('Objektbeschreibung','')
gnd_name, gnd_id = None, None
if obj_descr:
tokens = [t.strip() for t in obj_descr.split(',') if t.strip()]
for t in tokens:
name, gid = match_gnd(t)
if gid:
gnd_name = name
gnd_id = gid
break
core_data['GND_Name'] = gnd_name
core_data['GND_ID'] = gnd_id
current_obj = core_data
current_obj['ScanReferenzen'] = row_refs
else:
if current_obj:
current_obj['ScanReferenzen'].extend(row_refs)
if current_obj:
obj_list.append(current_obj)
out_df = pd.DataFrame(obj_list)
core_fields = ['Kürzel','Inventarnummer','Standort','Jahr','Urheber','Eigner',
'Objektbeschreibung','Material','Maße (in cm)',
'Objekttyp','Inschrift','Anmerkungen','ScanReferenzen',
'GND_Name','GND_ID']
available = [c for c in core_fields if c in out_df.columns]
return out_df[available]
# ====================
# Hauptteil
# ====================
if __name__ == "__main__":
# CSV im gleichen Ordner suchen
csv_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
if not csv_files:
print("Keine CSV-Datei im aktuellen Ordner gefunden.")
exit(1)
# nimm die erste gefundene CSV
input_csv = csv_files[0]
print(f"Verwende CSV-Datei: {input_csv}")
df = load_exlibris_refs(input_csv)
# Ergebnis als Testergebnis.csv speichern
output_file = "Testergebnis.csv"
df.to_csv(output_file, index=False)
print(f"Aufbereitete Daten gespeichert als {output_file}")

190
VLG.py
View File

@ -1,190 +0,0 @@
#!/usr/bin/env python3
"""
VLG_AAT.py - Gruppierung und Auflösung der Spalte "Objektbeschreibung"
NOCH OHNE AAT-ABGLEICH
- Prüft ezodf in aktueller Umgebung
- Liest ODS aus "Input CSV/"
- Extrahiert Begriffe aus "Objektbeschreibung"
- Lemmatisierung (Spacy) + Stopwortfilter
- Subtokenisierung komplexer Phrasen
- Zählt Häufigkeiten
- Ausgabe ODS / CSV-Fallback in "Auswertung Ergebnisse"
"""
import os
import sys
import logging
from collections import Counter
import pandas as pd
import spacy
# ---------------------------
# Logging
# ---------------------------
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# ---------------------------
# ezodf prüfen
# ---------------------------
try:
import ezodf
EZODF_AVAILABLE = True
logging.info(f"ezodf erkannt")
except ImportError:
EZODF_AVAILABLE = False
logging.error("ezodf konnte nicht importiert werden!")
logging.error("Möglicherweise nutzen Sie nicht die Python-Umgebung, in der ezodf installiert ist.")
logging.error(f"Aktuelle Python-Executable: {sys.executable}")
logging.error("Bitte prüfen Sie Ihre venv oder installieren Sie ezodf in dieser Umgebung:")
logging.error(" python -m pip install ezodf")
sys.exit(1)
# ---------------------------
# Spacy laden
# ---------------------------
try:
nlp = spacy.load("de_core_news_sm")
logging.info("Spacy-Modell geladen.")
except Exception as e:
logging.error(f"Spacy-Modell konnte nicht geladen werden: {e}")
sys.exit(1)
# ---------------------------
# Konfiguration
# ---------------------------
INPUT_FOLDER = "Input CSV"
OUTPUT_FOLDER = "Auswertung Ergebnisse"
INPUT_FILENAME = None
TARGET_COLUMN = "Objektbeschreibung"
STOPWORDS = {"mit", "auf", "von", "und", "der", "die", "das"} # erweiterbar
MAPPING = { # Projektinterne Sonderfälle
"exlibris": "exlibris",
"wappen": "wappen"
}
# ---------------------------
# Funktionen
# ---------------------------
def find_input_file(folder: str, filename_hint: str = None):
if not os.path.isdir(folder):
raise FileNotFoundError(f"Input-Ordner '{folder}' existiert nicht.")
files = [f for f in os.listdir(folder) if f.lower().endswith(".ods")]
if filename_hint:
for f in files:
if f == filename_hint or filename_hint in f:
return os.path.join(folder, f)
if not files:
raise FileNotFoundError(f"Keine .ods-Dateien in '{folder}' gefunden.")
return os.path.join(folder, files[0])
def read_ods_first_sheet(path: str) -> pd.DataFrame:
"""Lädt ODS, erkennt automatisch Header-Zeile."""
try:
df = pd.read_excel(path, engine="odf", header=None)
logging.info("ODS mit pandas + odfpy geladen.")
except Exception as e1:
logging.warning(f"pandas + odfpy konnte ODS nicht lesen ({e1}).")
if not EZODF_AVAILABLE:
raise RuntimeError("ezodf nicht installiert und pandas + odfpy fehlgeschlagen.")
doc = ezodf.opendoc(path)
sheet = doc.sheets[0]
data = []
for row in sheet.rows():
values = [c.value if hasattr(c, "value") else "" for c in row]
data.append(values)
df = pd.DataFrame(data)
logging.info("ODS mit ezodf geladen.")
# Header-Zeile automatisch finden
header_row_index = None
for i, row in df.iterrows():
row_str = row.fillna("").astype(str).str.lower()
if any("objektbeschreibung" in str(cell) for cell in row_str):
header_row_index = i
break
if header_row_index is None:
raise KeyError("Keine Header-Zeile mit 'Objektbeschreibung' gefunden.")
df.columns = df.iloc[header_row_index]
df = df.iloc[header_row_index + 1:].reset_index(drop=True)
return df
def tokenize_and_lemmatize(series: pd.Series) -> list:
"""Tokenisiert, entfernt Stopwords, wendet Mapping + Spacy-Lemmatisierung an."""
series = series.fillna("").astype(str).str.strip().str.lower()
all_terms = []
for text in series:
if not text:
continue
# Komma-Split
for part in [p.strip() for p in text.split(",") if p.strip()]:
# Subtokenisierung via Spacy
doc = nlp(part)
for token in doc:
lemma = token.lemma_.lower()
if lemma in STOPWORDS:
continue
lemma = MAPPING.get(lemma, lemma)
if lemma:
all_terms.append(lemma)
return all_terms
def write_output(rows: list, outpath: str):
if EZODF_AVAILABLE:
if not rows:
logging.warning("Keine Daten zum Schreiben.")
return
keys = list(rows[0].keys())
doc = ezodf.newdoc(doctype="ods", filename=outpath)
sheet = ezodf.Sheet("Auswertung", size=(len(rows)+1, len(keys)))
doc.sheets += sheet
for ci, k in enumerate(keys):
sheet[0, ci].set_value(k)
for ri, row in enumerate(rows, start=1):
for ci, k in enumerate(keys):
sheet[ri, ci].set_value(row.get(k, ""))
doc.save()
logging.info(f"ODS geschrieben: {outpath}")
else:
csv_path = os.path.splitext(outpath)[0] + ".csv"
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False, sep=";", encoding="utf-8")
logging.info(f"CSV-Fallback geschrieben: {csv_path}")
# ---------------------------
# Hauptfunktion
# ---------------------------
def main(input_folder=INPUT_FOLDER, input_filename=INPUT_FILENAME):
input_path = find_input_file(input_folder, filename_hint=input_filename)
input_basename = os.path.splitext(os.path.basename(input_path))[0]
logging.info(f"Verarbeite Datei: {input_path}")
df = read_ods_first_sheet(input_path)
logging.info(f"Geladene Spalten: {list(df.columns)}")
if TARGET_COLUMN.lower() not in [str(c).lower() for c in df.columns]:
raise KeyError(f"Spalte '{TARGET_COLUMN}' nicht gefunden.")
terms = tokenize_and_lemmatize(df[TARGET_COLUMN])
logging.info(f"Gefundene Begriffe: {len(terms)}")
counts = Counter(terms)
sorted_terms = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
rows = [{"Begriff": term, "Anzahl": freq} for term, freq in sorted_terms]
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
out_name = f"{input_basename} Auswertung.ods"
out_path = os.path.join(OUTPUT_FOLDER, out_name)
write_output(rows, out_path)
logging.info("Fertig.")
if __name__ == "__main__":
argv = sys.argv[1:]
folder = INPUT_FOLDER
fname = INPUT_FILENAME
if len(argv) >= 1:
folder = argv[0]
if len(argv) >= 2:
fname = argv[1]
main(input_folder=folder, input_filename=fname)
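
A possible invocation, assuming the script is saved as VLG.py (the name shown above) and the default folder layout exists; the .ods file name is only an example:

python3 VLG.py "Input CSV" Erfassung_2025.ods

The result is written to "Auswertung Ergebnisse" as "<Eingabename> Auswertung.ods".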

View File

@ -1,262 +0,0 @@
import os
import sys
import time
import json
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
# =========================
# Argumente / Dry-Run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
# =========================
# Konfiguration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
MAX_CONSECUTIVE_FAILURES = 10
CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
# =========================
# Logging
# =========================
def log(level, msg):
print(f"[{level}] {msg}")
# =========================
# Cache speichern
# =========================
def save_cache():
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Request mit Retry & Backoff
# =========================
def request_with_retries(api_name, url, params=None):
if DRY_RUN:
return {"dummy": True}
if not API_ACTIVE[api_name]:
return None
cache_key = url + (str(params) if params else "")
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try:
data = r.json()
except:
data = r.text
CACHE[cache_key] = data
save_cache()
FAIL_COUNTER[api_name] = 0
return data
elif r.status_code in [403, 429]:
log("ERROR", f"{api_name.upper()} HTTP {r.status_code} Stopschalter aktiviert")
API_ACTIVE[api_name] = False
return None
else:
log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
except requests.exceptions.Timeout:
log("ERROR", f"Timeout bei {api_name.upper()}")
except Exception as e:
log("ERROR", f"Fehler bei {api_name.upper()}: {e}")
retries += 1
sleep_time = min(BACKOFF_FACTOR ** retries, 30)
time.sleep(sleep_time)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} Stopschalter aktiviert")
API_ACTIVE[api_name] = False
return None
# =========================
# API-Abfragen mit Confidence
# =========================
def query_gnd(term, min_conf=0.6):
if DRY_RUN or not API_ACTIVE["gnd"]:
return "TEST_GND", 1.0
url = f"https://lobid.org/gnd/search?q={term}&format=json"
data = request_with_retries("gnd", url)
if not data:
return "API nicht erreichbar", 0.0
results = []
scores = []
for doc in data.get("member", []):
name = doc.get("preferredName", "")
conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
if conf >= min_conf:
results.append(name)
scores.append(conf)
if results:
return ", ".join(results), max(scores)
return "ohne Ergebnis", 0.0
def query_wikidata(term, min_conf=0.5):
if DRY_RUN or not API_ACTIVE["wikidata"]:
return "TEST_WD", 1.0
url = "https://www.wikidata.org/w/api.php"
params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
data = request_with_retries("wikidata", url, params)
if not data:
return "API nicht erreichbar", 0.0
results = []
scores = []
for entry in data.get("search", []):
match_info = entry.get("match", {})
score = match_info.get("score", 0.0)
if score >= min_conf:
results.append(entry["label"])
scores.append(score)
if results:
return ", ".join(results), max(scores)
return "ohne Ergebnis", 0.0
# =========================
# Input laden
# =========================
def load_input_file(file_path):
try:
if file_path.suffix.lower() == ".ods":
df = pd.read_excel(file_path, engine="odf", header=None)
elif file_path.suffix.lower() == ".xlsx":
df = pd.read_excel(file_path, engine="openpyxl", header=None)
elif file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path, header=None)
else:
log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
return None
return df
except Exception as e:
log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
return None
# =========================
# Header-Zeile suchen
# =========================
def find_header_row(df, keywords=["objektbeschreibung", "objekt/ebene"]):
for i, row in df.iterrows():
row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
if any(kw in cell for kw in keywords for cell in row_lower):
return i, row_lower
return None, None
# =========================
# Verarbeitung
# =========================
def process_files():
all_terms = []
output_rows = []
for file_path in INPUT_DIR.glob("*"):
if not file_path.suffix.lower() in [".csv", ".xlsx", ".ods"]:
continue
log("INFO", f"Verarbeite {file_path.name}")
df = load_input_file(file_path)
if df is None:
continue
header_idx, header_row = find_header_row(df)
if header_idx is None:
log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
continue
df.columns = header_row
df = df.iloc[header_idx+1:].reset_index(drop=True)
col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
if not col_objdesc:
log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
continue
term_list = []
obj_level_list = []
for _, row in df.iterrows():
terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
if not terms:
continue
for term in [t.strip() for t in terms.split(",") if t.strip()]:
term_list.append(term)
obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")
# API-Abfragen
gnd_results = []
gnd_scores = []
wikidata_results = []
wikidata_scores = []
for term in term_list:
gnd_res, gnd_conf = query_gnd(term)
wikidata_res, wd_conf = query_wikidata(term)
gnd_results.append(gnd_res)
gnd_scores.append(gnd_conf)
wikidata_results.append(wikidata_res)
wikidata_scores.append(wd_conf)
for idx, term in enumerate(term_list):
output_rows.append({
"Begriff": term,
"Quelle": file_path.name,
"Objekt/Ebene": obj_level_list[idx],
"GND": gnd_results[idx],
"GND_Confidence": gnd_scores[idx],
"Wikidata": wikidata_results[idx],
"Wikidata_Confidence": wikidata_scores[idx]
})
all_terms.extend(term_list)
# Hauptoutput
out_df = pd.DataFrame(output_rows)
out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
out_df.to_excel(out_file, index=False, engine="odf")
log("INFO", f"Hauptauswertung gespeichert: {out_file}")
# Rohdatei
raw_terms = pd.Series(all_terms).value_counts().reset_index()
raw_terms.columns = ["Begriff", "Häufigkeit"]
raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
raw_terms.to_excel(raw_file, index=False, engine="odf")
log("INFO", f"Rohbegriffe gespeichert: {raw_file}")
# =========================
# Main
# =========================
if __name__ == "__main__":
if not INPUT_DIR.exists():
log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
sys.exit(1)
process_files()
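
Two quick, self-contained checks against this script's matching logic (purely illustrative, not part of the repository): the GND confidence in query_gnd is plain difflib similarity, so it can be reproduced in isolation, and --dry-run exercises the whole pipeline without network traffic.

# Reproduces the score query_gnd() computes: 2*matches / (len(a)+len(b)).
from difflib import SequenceMatcher
print(SequenceMatcher(None, "wappen", "wappenbuch").ratio())  # 0.75 -> above the 0.6 default threshold

# Simulated run without API calls (script name is a placeholder):
#   python3 <skriptname>.py --dry-run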

View File

@ -0,0 +1,2 @@
Makro für die Erfassungstabelle, mit dem Vorschläge für Begriffe per Klick angenommen und ersetzt werden sollen.
Funktioniert nicht in LibreOffice, sollte in Excel aber laufen.

View File

@ -0,0 +1 @@
,jarnold,workPC,16.10.2025 13:04,file:///home/jarnold/.config/libreoffice/4;

View File

@ -0,0 +1,125 @@
= Benutzungsanleitung - NV_MASTER Abgleich Makro =
'''(mapper_macro_2.x.py)'''
== 1. Was das Makro macht ==
Dieses Makro hilft dir dabei, Begriffe in der Auswertungstabelle zu vereinheitlichen.
Es vergleicht automatisch die Inhalte aus der Spalte „Objektbeschreibung“ mit einer
Normvokabular-Referenzdatei namens „NV_MASTER.ods“.
So findest du heraus, welche Begriffe schon genormt sind, wo es passende Vorschläge gibt
und wo etwas gar nicht erkannt wurde.
Das Makro markiert in der Auswertungstabelle jede Zeile unter „Objektbeschreibung“
farbig:
* <span style="color:green;">Grün</span>: Alles passt, alle Begriffe gefunden
* <span style="color:yellow;">Gelb</span>: Einige Begriffe wurden erkannt, andere nicht
* <span style="color:red;">Rot</span>: Kein einziger Begriff erkannt
Beispiel:
{| class="wikitable"
|+ Tabelle 1
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| (leer) || || ||
|}
Die Spalten „Norm_Treffer“, „Norm_Vorschlag“ und „Kein_Treffer“ legt das Makro
automatisch an, wenn sie fehlen.
'''Tipps zur Nutzung'''
* Wenn du die NV_MASTER-Datei änderst, starte das Makro neu; es liest sie bei jedem Lauf neu ein.
* Erstelle ein Backup der Auswertungstabelle, bevor du das Makro ausführst.
* Schaue ab und zu in die Logdatei, um zu prüfen, ob alles korrekt läuft.
* Wenn ein Begriff rot markiert wird, aber deiner Meinung nach sinnvoll und zutreffend
für das beschriebene Objekt ist, schreibe den Begriff auf und sprich mit deinen
Vorgesetzten ab, ob er in das Normvokabular aufgenommen werden sollte.
== 2. Wo die Dateien des Makros liegen müssen ==
'''Unter Linux:'''
<pre>
/home/&lt;dein-benutzername&gt;/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/
</pre>
'''Unter Windows:'''
<pre>
C:\Users\<dein-benutzername>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\
</pre>
In diesem Ordner müssen liegen:
* mapper_macro_2.x.py (das Makro)
* NV_MASTER.ods (die Referenzdatei)
* optional: mapper_macro_2.x.log (wird automatisch erstellt)
== 3. Wie du das Makro startest ==
# Öffne deine Calc-Datei mit den Begriffen
# Gehe im Menü auf Extras → Makros → Makros ausführen...
# Wähle: Meine Makros → mapper_macro_2.x.py → run_mapper_macro
# Klicke auf Ausführen
Das Makro startet sofort. Je nach Tabellenumfang dauert der Abgleich ein paar Sekunden bis Minuten.
Wenn nichts passiert, liegt ein Fehler vor. In diesem Fall schaue in die .log-Datei, die das Makro bei jedem Durchlauf erstellt. Sie liegt im selben Ordner wie das Makro.
== 4. Wie du das Ergebnis liest ==
Nach dem Lauf schreibt das Makro die Treffer und Vorschläge direkt in deine Tabelle und markiert sie:
{| class="wikitable"
|+ Tabelle 2
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| Harfe, Noten, Bäume, Geldbeutel, Landschaft, Gewässer || Harfe (2.1), Noten (3.4), Landschaft (7.2), Gewässer (9.1) || Baum || Geldbeutel
|}
Farben:
* 🟩 <span style="color:green;">Grün</span>: Alle Begriffe wurden direkt erkannt → Perfekt!
* 🟨 <span style="color:yellow;">Gelb</span>: Einige Begriffe wurden erkannt, aber andere nur teilweise oder gar nicht → Vorschläge unter der Spalte „Norm_Vorschlag“ prüfen
* 🟥 <span style="color:red;">Rot</span>: Kein Begriff wurde gefunden → Objektbeschreibung anpassen, ggf. neue Begriffe in das Normvokabular aufnehmen
== 5. Wo das Protokoll liegt (Logdatei) ==
Das Makro schreibt alles, was passiert, in eine Logdatei:
'''Linux:''' /home/<dein-benutzername>/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.x.log
'''Windows:''' C:\Users\<dein-benutzername>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\mapper_macro_2.x.log
Dort siehst du:
* wann das Makro gestartet wurde
* wie viele Zeilen verarbeitet wurden
* und ob Fehler aufgetreten sind
{| class="wikitable"
|+ Tabelle 3
|-
! Problem !! Ursache !! Lösung
|-
| Das Makro taucht nicht auf || Falscher Speicherort || Prüfe, ob das Skript wirklich im Ordner Scripts/python liegt
|-
| Fehlermeldung „Module not found“ || Python-Bibliotheken fehlen || Installiere pandas, odfpy, spacy, rapidfuzz
|-
| NV_MASTER wird nicht gelesen || Datei fehlt oder ist kaputt || Prüfe Name und Speicherort
|-
| LibreOffice stürzt ab || Sehr große Datei oder fehlerhafte NV_MASTER || Teste mit kleinerer Datei oder neuem NV_MASTER
|}
== 6. Was das Makro benötigt, um einwandfrei zu laufen ==
Die folgenden Pakete benötigt das Makro, egal ob LibreOffice oder Excel (ein Installationsbeispiel steht unter der Tabelle):
{| class="wikitable"
|+ Tabelle 4
|-
! Paket !! Zweck
|-
| pandas || Einlesen der Referenzdatei (NV_MASTER.ods)
|-
| odfpy || Ermöglicht Lesen von .ods-Dateien (für pandas.read_excel(..., engine="odf"))
|-
| spacy || Lemmatisierung (optional, aber empfohlen)
|-
| rapidfuzz || Schnelles Fuzzy-Matching (Alternativ zu difflib)
|-
| openpyxl || Wird benötigt, falls .xlsx genutzt wird
|-
| python-dateutil || Wird automatisch von pandas gebraucht
|}
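
One way to install all of the above into the Python environment LibreOffice uses (package names as in Tabelle 4; the spaCy model is the German model the project's scripts load):

<pre>
python -m pip install pandas odfpy spacy rapidfuzz openpyxl python-dateutil
python -m spacy download de_core_news_sm
</pre>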

View File

@ -0,0 +1,622 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0" version="28.2.5">
<diagram name="Page-1" id="aLmyRVYCle99qeRE2JvP">
<mxGraphModel dx="1301" dy="1900" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="WA2_J1DCvVjPXciXSW-M-3" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="85" y="932" width="310" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-2" value="Scan- und Erfassungsprozess" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="WA2_J1DCvVjPXciXSW-M-3" vertex="1">
<mxGeometry x="60" y="-900" width="210" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-4" target="WA2_J1DCvVjPXciXSW-M-6" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-14" value="Makro gibt Vorschläge aus NV zurück" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-13" vertex="1" connectable="0">
<mxGeometry x="0.2678" y="-1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-4" value="&lt;div&gt;Makro (mapper_macro_2.x.py)&lt;/div&gt;" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="575" y="52" width="200" height="100" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.367;exitY=0.988;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-1" target="WA2_J1DCvVjPXciXSW-M-4" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="405" y="97" as="sourcePoint" />
<Array as="points">
<mxPoint x="235" y="91" />
<mxPoint x="235" y="117" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-12" value="Wird vom Makro gelesen und mit NV abgeglichen" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-5" vertex="1" connectable="0">
<mxGeometry x="0.0228" y="4" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-6" value="Anpassung der Erfassungstabelle anhand der Vorschläge" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="80" y="212" width="320" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-10" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="515" y="972" as="sourcePoint" />
<mxPoint x="515" y="12" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-11" value="&lt;h1 style=&quot;margin-top: 0px;&quot;&gt;Workflow Digitalisierung -&lt;/h1&gt;&lt;h1 style=&quot;margin-top: 0px;&quot;&gt;&lt;u&gt;&lt;font style=&quot;font-size: 20px;&quot;&gt;Objekterfassung und Pflege des Normvokabulars&lt;/font&gt;&lt;/u&gt;&lt;/h1&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Erfassung und Verschlagwortung von Bildobjekten&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Abgleich mit internem Normvokabular&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- API-Abgleich mit getty und GND&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Pflege und Erweiterung des Normvokabulars&lt;/font&gt;&lt;/div&gt;" style="text;html=1;whiteSpace=wrap;overflow=hidden;rounded=0;" parent="1" vertex="1">
<mxGeometry x="30" y="-1070" width="455" height="220" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-1" value="&lt;div&gt;Scan und Erfassen der Objekte, Erfassung in Tabellen, Spalte &quot;Objektbeschreibung&quot;&lt;/div&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="85" y="32" width="310" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-16" value="Makro 2 (Übernahme von Vorschlägen aus NV per Klick)" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="575" y="292" width="190" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-20" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="460" y="312" width="90" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-23" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.071;entryY=0.25;entryDx=0;entryDy=0;entryPerimeter=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;dashed=1;" parent="1" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="592.495" y="462" as="targetPoint" />
<mxPoint x="232.5700000000001" y="432" as="sourcePoint" />
<Array as="points">
<mxPoint x="233" y="462" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-34" value="Gleiche Funktion wie Makro 1 + API-Abgleich" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-23" vertex="1" connectable="0">
<mxGeometry x="-0.4298" relative="1" as="geometry">
<mxPoint x="53" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-15" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-21">
<mxGeometry relative="1" as="geometry">
<mxPoint x="165.20000000000005" y="510" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-21" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="362" width="320" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-37" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.452;entryY=-0.007;entryDx=0;entryDy=0;entryPerimeter=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-24" target="WA2_J1DCvVjPXciXSW-M-33" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="232" y="480" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-60" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-37" vertex="1" connectable="0">
<mxGeometry x="-0.0997" y="-1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-24" value="NormVokabular_Mapper.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="592.5" y="432" width="175" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-27" value="&lt;u&gt;&lt;b&gt;WHK/Manuell&lt;/b&gt;&lt;/u&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="145" width="100" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-28" value="&lt;b&gt;&lt;u&gt;Programm/automatisiert&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="590" width="160" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-29" value="&lt;div&gt;Mögliche Optimierung, funktioniert aber nicht in LO&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;fontSize=8;" parent="1" vertex="1">
<mxGeometry x="570" y="362" width="200" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-32" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.484;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-6" target="WA2_J1DCvVjPXciXSW-M-21" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-17" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-33">
<mxGeometry relative="1" as="geometry">
<mxPoint x="247.5" y="720" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-33" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="512" width="325" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-35" value="&lt;ul&gt;&lt;li&gt;Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus&lt;/li&gt;&lt;li&gt;Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit&lt;/li&gt;&lt;li&gt;Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe&lt;/li&gt;&lt;li&gt;Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt&lt;/li&gt;&lt;li&gt;API-Abgleich (aktuell GND und wikidata, Top1-Treffer)&lt;/li&gt;&lt;li&gt;Erstellt eine Auswertungsdatei, markiert Begriffe entsprechend ihres Status)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
<mxGeometry x="520" y="532" width="300" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-93" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-39" target="WA2_J1DCvVjPXciXSW-M-45" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-39" value="Aufnahme ins Normvokabular oder Verwerfen des Begriffs" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="85" y="722" width="330" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-43" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="515" y="1192" as="sourcePoint" />
<mxPoint x="515" y="962" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-94" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0;entryY=0.5;entryDx=0;entryDy=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-45" target="WA2_J1DCvVjPXciXSW-M-46" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="615" y="832" as="targetPoint" />
<Array as="points">
<mxPoint x="475" y="822" />
<mxPoint x="475" y="822" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-45">
<mxGeometry relative="1" as="geometry">
<mxPoint x="167.66666666666674" y="980" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-45" value="Manuelle Anpassung der Normvokabular-Masterfile" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="802" width="330" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-92" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-46" target="WA2_J1DCvVjPXciXSW-M-52" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="695" y="912" />
<mxPoint x="198" y="912" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-220" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-92" vertex="1" connectable="0">
<mxGeometry x="0.3024" y="-2" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-46" value="Masterfile_Editor.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="635" y="782" width="120" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-49" value="liest und bereinigt Normvokabular" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="427.5" y="817" width="200" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-58" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-221" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-222" value="=" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-221" vertex="1" connectable="0">
<mxGeometry x="-0.3079" y="1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-52" value="Aktualisierte Masterfile, mit allen Änderungen und in der richtigen Struktur" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="980" width="225" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-59" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="70" y="242" as="targetPoint" />
<Array as="points">
<mxPoint x="40" y="1130" />
<mxPoint x="40" y="242" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-57" value="Masterfile Normvokabular Updated" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="117.5" y="1100" width="160" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-64" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="410" y="1107.5" width="90" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-200" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-65" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="40" y="1140" as="targetPoint" />
<Array as="points">
<mxPoint x="680" y="1180" />
<mxPoint x="40" y="1180" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-65" value="&lt;div&gt;Normvokabular-Masterfile muss&amp;nbsp;&lt;/div&gt;&lt;div&gt;&lt;b&gt;zentral&lt;/b&gt; als &lt;b&gt;SPOT&lt;/b&gt; vorliegen und gepflegt werden können&lt;/div&gt;" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="575" y="1075" width="210" height="85" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-66" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="485" y="-1046" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-67" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="485" y="-1006" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-68" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="485" y="-966" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-69" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="485" y="-926" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-70" value="&lt;b&gt;Datei&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="518" y="-1050" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-71" value="&lt;b&gt;Fehlender Schritt/Optimierungsmöglichkeit&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="510" y="-1011" width="270" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-72" value="&lt;b&gt;Vorgang, WHK&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="515" y="-971" width="110" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-73" value="&lt;b&gt;Programm&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="520" y="-931" width="80" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-74" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="35" y="-850" as="sourcePoint" />
<mxPoint x="805" y="-850" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-75" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;&lt;u&gt;&lt;font&gt;Probleme/Noch zu klären:&lt;/font&gt;&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;&lt;ul&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Makro 1 und NormVokabular-Mapper&lt;/b&gt; redundant, eine Methode festlegen (Makro benutzerfreundlicher, Treffer/Vorschläge direkt in Erfassung sichtbar, Mapper genauer, API-Abgleich, Auswertungsdatei übersichtlicher)&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Makro 2&lt;/b&gt; (Vorschläge aus Normvokabular können automatisch per Klick in die Erfassungstabelle übernommen werden)&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Normvokabular&lt;/b&gt;: Eine zentrale .json als SPOT etablieren und zentral in alle Prozesse einbinden&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Mapper&lt;/b&gt;&amp;nbsp;oder &lt;b&gt;Makro&lt;/b&gt; benötigt Funktion, Wörter ohne Treffer und Vorschlag in &lt;br&gt;eigene Liste zu übernehmen und auszugeben -&amp;gt; manuelle Prüfung&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Normvokabular&lt;/b&gt;: Regeln, ID-Struktur, Kategorien müssen auf Qualität und Nutzbarkeit geprüft werden; danach Anpassung aller Programme, die sich auf Normvokabular stützen&lt;/font&gt;&lt;/li&gt;&lt;/ul&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="rounded=0;whiteSpace=wrap;html=1;align=left;spacing=2;spacingRight=0;" parent="1" vertex="1">
<mxGeometry x="40" y="1232" width="770" height="190" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-80" value="&lt;ul&gt;&lt;li&gt;Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus&lt;/li&gt;&lt;li&gt;Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit&lt;/li&gt;&lt;li&gt;Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe, cache und log&lt;/li&gt;&lt;li&gt;Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt&lt;/li&gt;&lt;li&gt;Markiert Treffer, Vorschläge und Keine Treffer&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
<mxGeometry x="525" y="132" width="300" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-81" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="245" y="322" as="sourcePoint" />
<mxPoint x="455" y="322" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-83" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.055;entryY=0.48;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="WA2_J1DCvVjPXciXSW-M-64" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="281" y="1117" as="sourcePoint" />
<mxPoint x="365" y="1002" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-91" value="&lt;ul&gt;&lt;li&gt;Automatische Spaltenerkennung (ID, Name/Wort/Vokabel)&lt;/li&gt;&lt;li&gt;Aufbau einer hierarchischen Struktur (Ober-, Unter-, Unterunterkategorien)&lt;/li&gt;&lt;li&gt;Erstellung eines Mastersheets mit eindeutigen IDs&lt;/li&gt;&lt;li&gt;Sortierte Ausgabe nach vordefinierter Sheet-Reihenfolge&lt;/li&gt;&lt;li&gt;Protokollierung im Terminal (Zeilenanzahl, Warnungen, ID-Zählung)&lt;/li&gt;&lt;li&gt;Speicherung einer neuen, synchronisierten Output-Datei ohne Änderung der Originaldatei&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;" parent="1" vertex="1">
<mxGeometry x="510" y="902" width="310" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-96" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="15" y="1460" as="sourcePoint" />
<mxPoint x="815" y="1460" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-192" value="&lt;font style=&quot;font-size: 19px;&quot;&gt;&lt;b&gt;&lt;u&gt;3. Aktuelle Struktur des Normvokabulars (Stand 10/25)&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="35" y="1480" width="510" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-195" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="90" y="1740" width="580" height="380" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-97" value="Assets" style="childLayout=tableLayout;recursiveResize=0;strokeColor=#98bf21;fillColor=#A7C942;shadow=1;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
<mxGeometry x="50" y="40" width="550" height="330" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-98" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry width="550" height="43" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-99" value="ID" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry width="117" height="43" as="geometry">
<mxRectangle width="117" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-100" value="Unterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="117" width="159" height="43" as="geometry">
<mxRectangle width="159" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-101" value="Unterunterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="276" width="137" height="43" as="geometry">
<mxRectangle width="137" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-165" value="Wort/Vokabel" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="413" width="137" height="43" as="geometry">
<mxRectangle width="137" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-102" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="43" width="550" height="42" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-103" value="7.1.1" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry width="117" height="42" as="geometry">
<mxRectangle width="117" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-104" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="117" width="159" height="42" as="geometry">
<mxRectangle width="159" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-105" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="276" width="137" height="42" as="geometry">
<mxRectangle width="137" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-166" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="413" width="137" height="42" as="geometry">
<mxRectangle width="137" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-187" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="85" width="550" height="41" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-188" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry width="117" height="41" as="geometry">
<mxRectangle width="117" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-189" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="117" width="159" height="41" as="geometry">
<mxRectangle width="159" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-190" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="276" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-191" value="Hof" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="413" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-106" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="126" width="550" height="41" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-107" value="7.1.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry width="117" height="41" as="geometry">
<mxRectangle width="117" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-108" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="117" width="159" height="41" as="geometry">
<mxRectangle width="159" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-109" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="276" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-167" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="413" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-110" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="167" width="550" height="44" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-111" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry width="117" height="44" as="geometry">
<mxRectangle width="117" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-112" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="117" width="159" height="44" as="geometry">
<mxRectangle width="159" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-113" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="276" width="137" height="44" as="geometry">
<mxRectangle width="137" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-168" value="Haus" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="413" width="137" height="44" as="geometry">
<mxRectangle width="137" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-114" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="211" width="550" height="39" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-115" value="7.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry width="117" height="39" as="geometry">
<mxRectangle width="117" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-116" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="117" width="159" height="39" as="geometry">
<mxRectangle width="159" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-117" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="276" width="137" height="39" as="geometry">
<mxRectangle width="137" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-169" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="413" width="137" height="39" as="geometry">
<mxRectangle width="137" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-175" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#FFFFFF;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="250" width="550" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-176" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry width="117" height="40" as="geometry">
<mxRectangle width="117" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-177" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="117" width="159" height="40" as="geometry">
<mxRectangle width="159" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-178" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="276" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-179" value="Zimmer" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="413" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-170" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="290" width="550" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-171" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry width="117" height="40" as="geometry">
<mxRectangle width="117" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-172" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="117" width="159" height="40" as="geometry">
<mxRectangle width="159" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-173" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="276" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-174" value="Fußboden" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="413" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-323" value="&lt;b&gt;&lt;u&gt;b) Beispiel&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
<mxGeometry x="-30" width="80" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-197" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="45" y="-110" as="sourcePoint" />
<mxPoint x="815" y="-110" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-198" value="&lt;font style=&quot;font-size: 21px;&quot;&gt;&lt;b&gt;&lt;u&gt;2. Normvokabular-Abgleich&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="45" y="-70" width="290" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-199" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-64" target="WA2_J1DCvVjPXciXSW-M-65" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="440" y="1110" as="sourcePoint" />
<mxPoint x="534" y="1110" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-202" value="Scanvorgang" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="70" y="-670" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-203" value="Erfassen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="240" y="-400" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-207" value="Ebenenstruktur festlegen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="180" y="-490" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-209" value="Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="370" y="-200" width="247.5" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-210" value="&lt;ul&gt;&lt;li&gt;Durchgehen einer Box von vorne bis hinten&lt;/li&gt;&lt;li&gt;Auflegen des Objekts, Ausrichtung der Farbkarte&lt;/li&gt;&lt;li&gt;Manuelles Festlegen des Scanbereichs&lt;/li&gt;&lt;li&gt;Scan der gesamten Box&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="255" y="-690" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-211" value="&lt;ul&gt;&lt;li&gt;Durchgehen der exportierten Scans im Bildviewer&lt;/li&gt;&lt;li&gt;Festlegung der Scanebenen (Umschlag, Vorderseite, Rückseite, etc.)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="367.5" y="-500" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-212" value="&lt;ul&gt;&lt;li&gt;Durchgehen der Scans&lt;/li&gt;&lt;li&gt;Erfassen: Datum, Urheber, Eigner, Material&lt;/li&gt;&lt;li&gt;Vermessen des Objekts&lt;/li&gt;&lt;li&gt;Objektbeschreibung: Verschlagwortung des Bildinhalts&lt;/li&gt;&lt;li&gt;Erfassen etwaiger Inschriften und Anmerkungen&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="438" y="-440" width="300" height="140" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-216" value="&lt;font style=&quot;font-size: 21px;&quot;&gt;&lt;b&gt;&lt;u&gt;1. Ablauf des Scan- und Erfassungsprozesses&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="35" y="-840" width="490" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-217" value="Vorbereitung" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="30" y="-760" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-218" value="&lt;ul&gt;&lt;li&gt;PC hochfahren&lt;/li&gt;&lt;li&gt;Scanner starten/Kamera und Beleuchtung vorbereiten, Farbkarte platzieren&lt;/li&gt;&lt;li&gt;Software starten, Scanauftrag wählen&lt;/li&gt;&lt;li&gt;Erfassungstabelle öffnen&lt;/li&gt;&lt;li&gt;Passende Box wählen&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="212.5" y="-790" width="555" height="110" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-236" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" target="WA2_J1DCvVjPXciXSW-M-228" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-318" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="350" y="1573" as="targetPoint" />
<Array as="points">
<mxPoint x="340" y="1573" />
<mxPoint x="360" y="1573" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-226" value="Kategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="170" y="1562.5" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-237" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" target="WA2_J1DCvVjPXciXSW-M-229" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-228" value="Unterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="290" y="1605" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-238" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.232;exitY=1.005;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" target="WA2_J1DCvVjPXciXSW-M-230" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="407.5" y="1652.5" as="sourcePoint" />
<mxPoint x="440" y="1687.5" as="targetPoint" />
<Array as="points">
<mxPoint x="440" y="1700" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-320" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="590" y="1660" as="targetPoint" />
<Array as="points">
<mxPoint x="580" y="1660" />
<mxPoint x="580" y="1660" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-229" value="Unterunterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="405" y="1650" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-230" value="Wort/Vokabel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="513" y="1690" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-313" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="352.5" y="1560" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-315" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1.1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="475" y="1602.5" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-316" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1.1.1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="592.5" y="1647.5" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-319" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="470" y="1615" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-321" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="50" y="1740" as="sourcePoint" />
<mxPoint x="800" y="1740" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-322" value="&lt;b&gt;&lt;u&gt;a) Hierarchie und ID-Struktur&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="40" y="1530" width="190" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-193" value="Blatt 7 - Architektur" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="140" y="2110" width="165" height="30" as="geometry" />
</mxCell>
<mxCell id="B-3lv8s0GtbLfT8x5DVe-1" value="Scan exportieren" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="122.5" y="-580" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="B-3lv8s0GtbLfT8x5DVe-2" value="&lt;ul&gt;&lt;li&gt;Export der gesamten Scans einer Box in einen Ordner&lt;/li&gt;&lt;li&gt;Reihenfolge der Scans checken&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="307.5" y="-590" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-1" value="Erfassung prüfen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" vertex="1" parent="1">
<mxGeometry x="310" y="-300" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-2" value="&lt;ul&gt;&lt;li&gt;Durchgehen der Scans, Vergleich der Nummern mit dem Inhalt der Erfassung&lt;/li&gt;&lt;li&gt;Makro laufen lassen: Prüft Begriffe unter &quot;Objektbschreibung&quot; auf Treffer im Normvokabular (siehe Anleitung)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" vertex="1" parent="1">
<mxGeometry x="490" y="-310" width="320" height="90" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.3;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-217" target="WA2_J1DCvVjPXciXSW-M-202">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.238;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-202" target="B-3lv8s0GtbLfT8x5DVe-1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.213;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="B-3lv8s0GtbLfT8x5DVe-1" target="WA2_J1DCvVjPXciXSW-M-207">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.2;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-207" target="WA2_J1DCvVjPXciXSW-M-203">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-8" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.148;entryY=-0.056;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-203" target="ey7EfLCcf-ExpX1qzLUj-1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-10" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.161;entryY=-0.039;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="ey7EfLCcf-ExpX1qzLUj-1" target="WA2_J1DCvVjPXciXSW-M-209">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-11" value="&lt;font style=&quot;font-size: 15px;&quot;&gt;&lt;b&gt;Stand: 14.10.25&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
<mxGeometry x="745" y="-1090" width="105" height="50" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-12" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="470" y="-880" as="sourcePoint" />
<mxPoint x="520" y="-880" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-13" value="&lt;b&gt;Optional/Optimierungsmöglichkeit&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" vertex="1" parent="1">
<mxGeometry x="530" y="-896" width="220" height="30" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

View File

@ -0,0 +1,97 @@
= ExLibris Scanning and Capture Guide =
The digitization of Exlibris objects is divided into two main steps: **scanning** and **capturing** the objects.
Each object belongs to a box, and each box belongs to an octavo (°).
A box is scanned in full before capture begins; this avoids errors when, for example, the order of the scans changes because scans are added or deleted afterwards.
== 1. Preparation ==
=== a) Preparing the hardware ===
* Start the PC
* Switch on the lights
* Remove the camera cover
* Switch on the camera
=== b) Preparing the software ===
* Start MultiDotScan by Walter Nagel
* Select the scan job → '''WHICH SCAN JOB'''
== 2. Scanning ==
* Place the object on the base, at right angles to the image frame in the software
* Position the colour chart to the right of or below the object, depending on its format
* Adjust the image frame to the object (margin of roughly 10 to 20 mm)
* Trigger the camera with the pedal → '''scan'''
* If a scan is missing or has to be redone: drag it to the correct position in the software → on export the scans are automatically arranged in the correct order
== 3. Exporting the scans ==
* After the whole box has been scanned, export the scan job → it is written to '''FILE PATH'''
== 4. Capturing ==
* Open the exported scans under '''FILE PATH'''
(in the image viewer, '''not''' in the scanning software; otherwise the order of the scans can change, which leads to errors in the capture table)
* Each box (= each scan job) starts numbering at 1; every scan is numbered consecutively
* Enter the scan numbers in the capture table, taking the level structure into account:
=== Level structure (Ebene) ===
* Default: front → Ebene 0, back → Ebene 1
* If the Exlibris has a wrapper (Umschlag), or several Exlibris lie together in one envelope (see the example below):
* Umschlag = Ebene 0
* Front = Ebene 1
* Back = Ebene 2
* Back of the Umschlag = Ebene 3
→ This makes it immediately clear where an envelope begins and ends.
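A hypothetical example for a single Exlibris lying in an envelope (the scan numbers are per box and purely illustrative):
<pre>
Scan 1 → Ebene 0 (Umschlag, front)
Scan 2 → Ebene 1 (Exlibris, front)
Scan 3 → Ebene 2 (Exlibris, back)
Scan 4 → Ebene 3 (Umschlag, back)
</pre>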
=== Capturing the metadata ===
* Jahr: is there a year on the front or back?
* Urheber: the artist
* Eigner: who owns the Exlibris?
* Objektbeschreibung: what can be seen? Keywording of the image content
==== Note ====
* Describe from '''general → specific'''
* Example: "Baum" rather than "Schwarzeiche"
* "Helm" rather than "Topfhelm 15. Jahrhundert"
* Prefer '''singular forms''', even when several objects are shown
* e.g. "Buch" rather than "Bücher", "Figur, weiblich" rather than "Frauengruppe"
* Give '''activities in the infinitive''': "sitzen", "lesen", "fahren" rather than "sitzt", "lesend", "fährt"
* '''Avoid connecting words''' ("stopwords"):
<nowiki>mit, ohne, der, die, das, ein, eine, und, zu, von, im, in, auf, an, als, bei, für, aus, dem, den, des, eines, einer</nowiki>
(the mapper macro filters these out anyway; see the sketch after this list)
* Material: usually paper
* Maße: height × width in cm (for whole centimetre values append ",0", e.g. 14,3 × 7,0 cm rather than 14,3 × 7)
* Objekttyp: Exlibris, Rückseite, Umschlag, Zettel
* Inschrift: e.g. coat of arms with a banderole
* Anmerkungen: other notes or remarks (pencil entries etc.)
* AUX: not relevant
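For reference, a minimal sketch of how the mapper macro splits an Objektbeschreibung cell into terms and drops stopwords and bare numbers (simplified from the macro source in this commit; the function name and signature are illustrative only):
<pre>
import re

STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf",
             "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

def extract_terms(objektbeschreibung):
    """Split a cell on commas and whitespace, then drop stopwords and pure numbers."""
    terms = []
    for clause in (c.strip() for c in objektbeschreibung.split(",") if c.strip()):
        for part in re.split(r"\s+", clause):
            if not part or part.lower() in STOPWORDS or re.fullmatch(r"\d+", part):
                continue
            terms.append(part)
    return terms

# extract_terms("Buch, Figur weiblich, mit Wappen") -> ["Buch", "Figur", "weiblich", "Wappen"]
</pre>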
== 5. Checking the capture ==
* Does the numbering of the scans match the corresponding rows in the capture table?
* Run the macro over the table:
There is a macro that compares the "Objektbeschreibung" column against the internal controlled vocabulary (Normvokabular) and thereby standardizes the keywording.
It can be started directly from the LibreOffice Calc menu:
<pre>
Extras → Makros → Makros verwalten → Python →
Meine Makros → Vokabular_Abgleich_Makro → mapper_macro_2.x → run_mapper_macro → Ausführen
</pre>
'''Note:'''
A detailed manual for using the macro is available under '''FILE PATH'''.
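For anyone maintaining the macro: a simplified sketch of the lookup order it applies to each term (cache handling and cell colouring omitted); the helper names mirror the macro source in this commit but are illustrative:
<pre>
from difflib import SequenceMatcher

def lookup(term, norm_dict, lemma_index, lemmatize, threshold=0.75):
    """Return (hits, suggestions): exact match first, then lemma match, then fuzzy suggestions."""
    key = term.strip().lower()
    lemma = lemmatize(key)
    if key in norm_dict:                 # 1. exact match on the normalized form
        return [e["Name"] for e in norm_dict[key]], []
    if lemma in lemma_index:             # 2. match on the lemma
        return [e["Name"] for e in lemma_index[lemma]], []
    suggestions = []                     # 3. no hit: collect fuzzy suggestions only
    for cand, entries in lemma_index.items():
        score = SequenceMatcher(None, lemma, cand).ratio()
        if cand.startswith(lemma):       # small bonus for prefix matches, as in the macro
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            suggestions.extend(e["Name"] for e in entries)
    return [], suggestions
</pre>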
== 6. Wrap-up ==
* Fill in the pre-printed form:
* Name
* Date
* Which box
* How far scanning/capturing has progressed
* Anything to watch out for?
== 7. Best practices ==
* Rather scan too much than too little (backs, envelopes, etc.)
* Rather describe too much than too little (anything that is recognizably identifiable can be keyworded)
* Capture notes or inscriptions on Exlibris or their backs in full
* When in doubt: ask

2815369
api_cache.json

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +0,0 @@
{
"normvokabular_path": "/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods",
"max_suggestions": 3,
"color_hit": "#C6EFCE",
"color_miss": "#FFC7CE",
"use_rapidfuzz": false,
"use_spacy": false,
"autosave": false
}

371
mapper.py
View File

@ -1,371 +0,0 @@
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except:
SPACY_AVAILABLE = False
nlp = None
print("Spacy nicht verfügbar nutze naive Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
# Lemma-Cache
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# =========================
# Kompositum-Zerlegung (einfacher Ansatz)
# =========================
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
return parts if parts else [term]
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {} # für RapidFuzz preprocessed
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id, # Parent-ID
"Sheet": sheet_name,
"Own_ID": row_id or "" # eigene ID, falls vorhanden
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split
tokens = compound_split(term)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try: data = r.json()
except: data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
return data
except:
pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file, index=False, engine=engine)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

View File

@ -1,237 +0,0 @@
import uno
import os
import re
import traceback
import json
# Optional für Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except:
SPACY_AVAILABLE = False
nlp = None
# Optional für Fuzzy Matching
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except:
from difflib import SequenceMatcher
RAPIDFUZZ_AVAILABLE = False
import odf.opendocument
import odf.table
import odf.text
# ------------------------
# Konfiguration absolute Pfade
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
# ------------------------
# Logging
# ------------------------
def log(msg):
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
# ------------------------
# Cache laden
# ------------------------
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
# ------------------------
# Normalisierung / Lemma
# ------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER einlesen
# ------------------------
def load_nv_master(path):
norm_dict = {}
try:
doc = odf.opendocument.load(path)
except Exception as e:
log(f"Fehler beim Laden von NV_MASTER: {e}")
return norm_dict
for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
sheet_name = sheet.getAttribute("name")
if sheet_name.lower() == "master":
continue
current_parent_id = None
for row in sheet.getElementsByType(odf.table.TableRow):
cells = row.getElementsByType(odf.table.TableCell)
cell_values = []
for cell in cells:
texts = cell.getElementsByType(odf.text.P)
if texts and texts[0].firstChild:
cell_values.append(str(texts[0].firstChild.data).strip())
else:
cell_values.append("")
if not cell_values or len(cell_values)<4:
continue
id_val, unterk, unterunterk, word = cell_values[:4]
if id_val:
current_parent_id = id_val.strip()
if not word:
continue
key = lemmatize_term(word)
norm_dict[key] = {
"Name": word.strip(),
"ID": current_parent_id,
"Sheet": sheet_name,
"Unterkategorie": unterk.strip(),
"Unterunterkategorie": unterunterk.strip()
}
log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
return norm_dict
# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key, entry in norm_dict.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
if key.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
def map_word(word, norm_dict):
key = lemmatize_term(word)
if key in CACHE:
cached = CACHE[key]
return cached["Norm"], cached["Suggestion"], cached["ID"]
if key in norm_dict:
entry = norm_dict[key]
tr, sug, wid = entry["Name"], "", entry["ID"]
else:
suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
if suggestions:
tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
else:
tr, sug, wid = "KEIN TREFFER", "", ""
CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
return tr, sug, wid
# ------------------------
# Makro-Hauptfunktion
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheets = doc.getSheets()
sheet = sheets.getByIndex(0)
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
header_row = 0
objekt_col = None
# Header prüfen
for col in range(data_range.EndColumn+1):
val = sheet.getCellByPosition(col, header_row).String.strip().lower()
if val == "objektbeschreibung":
objekt_col = col
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden")
return
# Neue Spalten am rechten Tabellenende erstellen
max_col = data_range.EndColumn
norm_tr_col = max_col + 1
norm_sug_col = max_col + 2
norm_id_col = max_col + 3
sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"
norm_dict = load_nv_master(NV_MASTER_PATH)
# Farben
GREEN = 0xC6EFCE
YELLOW = 0xFFEB9C
RED = 0xFFC7CE
for row in range(1, data_range.EndRow+1):
cell = sheet.getCellByPosition(objekt_col, row)
val = cell.String.strip()
if not val:
continue
words = [w.strip() for w in re.split(r"\s+", val) if w.strip() and w.lower() not in STOPWORDS]
tr_list, sug_list, id_list = [], [], []
for w in words:
tr, sug, wid = map_word(w, norm_dict)
if tr != "KEIN TREFFER":
tr_list.append(tr)
if sug:
sug_list.append(sug)
if wid:
id_list.append(wid)
sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
# Farbmarkierung
if tr_list:
cell.CellBackColor = GREEN
elif sug_list:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Cache speichern
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log("Makro erfolgreich ausgeführt")
except Exception as e:
log("Fehler in run_mapper_macro:")
log(traceback.format_exc())

View File

@ -1,297 +0,0 @@
# -*- coding: utf-8 -*-
import os
import uno
import unohelper
import re
import json
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
# RapidFuzz für Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except:
SPACY_AVAILABLE = False
nlp = None
# =========================
# Pfade & Config
# =========================
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
# =========================
# Cache & Logging
# =========================
if CACHE_FILE.exists():
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
def log(msg):
with open(LOG_FILE,"a",encoding="utf-8") as f:
f.write(msg + "\n")
# =========================
# Textverarbeitung
# =========================
def normalize_text(s):
if not s: return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
return parts if parts else [term]
# =========================
# NV_MASTER laden
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
norm_dict = {}
for sheet_name, df in sheets.items():
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
if row_id: current_parent_id = row_id
if not row_word: continue
norm_dict[normalize_text(row_word)] = {
"ID": current_parent_id,
"Wort/Vokabel": row_word
}
return norm_dict
# =========================
# Mapping
# =========================
def map_term_with_indexes(term, norm_dict):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term_norm)
# Cache prüfen
if term_lemma in CACHE:
cached = CACHE[term_lemma]
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
return cached["hits"], cached["suggestions"], cached["ids"]
else:
CACHE.pop(term_lemma, None)
hits = []
suggestions = []
ids = []
# Exakte Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
elif term_lemma in norm_dict:
e = norm_dict[term_lemma]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
else:
# Fuzzy Matching
for key, e in norm_dict.items():
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
if score >= 0.75:
suggestions.append(e["Wort/Vokabel"])
ids.append(e["ID"])
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# =========================
# LibreOffice Dialog (ListBox + Checkbox)
# =========================
def apply_proposals_dialog():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
log("Kein Calc-Dokument aktiv")
return
selection = doc.CurrentSelection
sheet = doc.CurrentController.ActiveSheet
# Prüfen ob eine Zelle ausgewählt ist
if selection is None or not hasattr(selection, "getCellAddress"):
log("Keine Zelle ausgewählt")
return
cell = selection
# Spalte überprüfen
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
objekt_col = None
norm_vorschlag_col = None
for col_idx in range(sheet.Columns.Count):
val = sheet.getCellByPosition(col_idx,0).String
if val.strip().lower() == "objektbeschreibung":
objekt_col = col_idx
elif val.strip().lower() == "norm_vorschlag":
norm_vorschlag_col = col_idx
if norm_vorschlag_col is None or objekt_col is None:
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
return
# Vorschläge auslesen
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
if not proposals_str.strip():
log("Keine Vorschläge in der ausgewählten Zelle")
return
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
# Dialog erstellen
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
dialog_model.Width = 180
dialog_model.Height = 150
dialog_model.Title = "Vorschläge übernehmen"
# ListBox
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
lb_model.Name = "ProposalList"
lb_model.PositionX = 10
lb_model.PositionY = 10
lb_model.Width = 160
lb_model.Height = 80
lb_model.StringItemList = tuple(proposals)
dialog_model.insertByName("ProposalList", lb_model)
# Checkbox
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
cb_model.Name = "AllCheck"
cb_model.PositionX = 10
cb_model.PositionY = 95
cb_model.Width = 160
cb_model.Height = 15
cb_model.Label = "Alle Vorschläge übernehmen"
dialog_model.insertByName("AllCheck", cb_model)
# OK-Button
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = "OKButton"
btn_model.PositionX = 10
btn_model.PositionY = 115
btn_model.Width = 80
btn_model.Height = 20
btn_model.Label = "OK"
dialog_model.insertByName("OKButton", btn_model)
# Abbrechen-Button
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
cancel_model.Name = "CancelButton"
cancel_model.PositionX = 100
cancel_model.PositionY = 115
cancel_model.Width = 80
cancel_model.Height = 20
cancel_model.Label = "Abbrechen"
dialog_model.insertByName("CancelButton", cancel_model)
# Control Dialog
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
dialog.setModel(dialog_model)
dialog.setVisible(True)
toolkit.createPeer(dialog, None)
# Warten auf OK
while True:
import time
time.sleep(0.1)
# Prüfen auf Klick
if dialog.getControl("OKButton").Pressed:
all_flag = dialog.getControl("AllCheck").State == 1
selected_idx = dialog.getControl("ProposalList").SelectedItems
if selected_idx:
selected_proposal = proposals[selected_idx[0]]
else:
selected_proposal = None
break
elif dialog.getControl("CancelButton").Pressed:
dialog.endExecute()
return
# Anwenden
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
obj_text = obj_cell.String
if all_flag:
for prop in proposals:
idx = obj_text.lower().find(prop.lower())
if idx != -1:
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
else:
if selected_proposal:
idx = obj_text.lower().find(selected_proposal.lower())
if idx != -1:
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
obj_cell.String = obj_text
obj_cell.CellBackColor = 0x00FF00 # grün
dialog.endExecute()
save_cache()
log(f"Vorschlag übernommen: {obj_text}")
# =========================
# Automatische Button-Registrierung
# =========================
def register_toolbar_button():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
frame = doc.CurrentController.Frame
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
# Hier wird nur das Makro selbst registriert
# Symbolleiste muss in LO einmalig erstellt werden
# =========================
# Hauptmakro
# =========================
def run_mapper_macro():
try:
norm_dict = load_normvokabular(NV_MASTER_FILE)
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
apply_proposals_dialog()
except Exception as e:
log(f"Fehler in run_mapper_macro: {e}")

379
mapper_macro_2.3.py Normal file
View File

@ -0,0 +1,379 @@
# -*- coding: utf-8 -*-
# LibreOffice/Excel Macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
# Version 2.3 mit "Kein_Treffer" Spalte
# Speicherort: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py
import os
import re
import json
import traceback
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
# ------------------------
# Logging
# ------------------------
def log(msg):
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER laden
# ------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Matching
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
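# Vorschläge: Fuzzy-Score gegen Lemma- und Normalform-Index; Präfix-Übereinstimmungen erhalten einen Bonus von 0.1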
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
cached = CACHE[term_lemma]
return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits:
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
ids = unique_preserve(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
    # Locate the header row containing 'Objektbeschreibung'
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
    # Create the result columns if they do not exist yet
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
# Neue Spalte "Kein_Treffer"
if "Kein_Treffer" not in existing:
last_col += 1
existing["Kein_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
kein_tr_col = existing["Kein_Treffer"]
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
WHITE = 0xFFFFFF
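    # CellBackColor expects 0xRRGGBB integers; note that 0xFFA500 (named YELLOW here) is orange.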
rows_processed = 0
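    # Per data row: split the description at commas/whitespace, drop stopwords and pure
    # numbers, map each remaining term, then write hits, suggestions and unmatched terms.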
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_hits = []
row_sugs = []
row_ids = []
unmapped_terms = []
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
row_hits.extend([f"{h} ({id_})" if id_ else h for h,id_ in zip(hits, ids + [""]*len(hits))])
else:
unmapped_terms.append(term)
if sugs:
row_sugs.extend([f"{s}" for s in sugs])
if ids:
row_ids.extend(ids)
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
unmapped_terms = uniq(unmapped_terms)
            # Colour logic for the Objektbeschreibung cell
if terms and not unmapped_terms and row_hits:
cell.CellBackColor = GREEN
                row_sugs = []  # no suggestions when every term has an exact hit
elif row_hits:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Norm_Treffer
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
tr_cell.String = " | ".join(row_hits)
tr_cell.CellBackColor = GREEN if row_hits else WHITE
# Norm_Vorschlag
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
sug_cell.String = " | ".join(row_sugs)
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
# Kein_Treffer
kt_cell = sheet.getCellByPosition(kein_tr_col, r)
kt_cell.String = " | ".join(unmapped_terms)
kt_cell.CellBackColor = RED if unmapped_terms else WHITE
rows_processed += 1
        except Exception as e:
            # traceback is not imported at module level; import it locally for the log output
            import traceback
            log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}", level="ERROR")
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export for LibreOffice
g_exportedScripts = (run_mapper_macro,)
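# Usage note (sketch, assuming the standard user-profile script location): with this file
# under ~/.config/libreoffice/4/user/Scripts/python/, run_mapper_macro appears under
# Tools > Macros > Run Macro... > My Macros; XSCRIPTCONTEXT is injected by the script
# provider at run time, so the function cannot be run outside LibreOffice.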


@ -1,121 +0,0 @@
import uno
import unohelper  # needed so SuggestionListener can derive from unohelper.Base
import json
import subprocess
from pathlib import Path
from com.sun.star.awt import XActionListener
# Colour values (RGB, as expected by CellBackColor)
GREEN = 0xC6EFCE
RED = 0xFFC7CE
YELLOW = 0xFFEB9C
def get_objektbeschreibung_column(sheet):
    """Find the 'Objektbeschreibung' column; the search is limited to the used area."""
    cursor = sheet.createCursor()
    cursor.gotoEndOfUsedArea(False)
    used = cursor.getRangeAddress()
    for row in range(used.EndRow + 1):
        for col in range(used.EndColumn + 1):
            cell = sheet.getCellByPosition(col, row)
            if cell.String.strip().lower() == "objektbeschreibung":
                return col
    return None
def update_cell_color(cell, status):
"""Färbt die Zelle."""
if status == "grün":
cell.CellBackColor = GREEN
elif status == "gelb":
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
def call_mapper(term):
"""Ruft den lokalen Wrapper auf."""
wrapper = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_Wrapper.py")
if not wrapper.exists():
return {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
result = subprocess.run(
["python3", str(wrapper), term],
capture_output=True,
text=True
)
try:
output = json.loads(result.stdout)
    except Exception:
output = {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
return output
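# Expected wrapper output (assumed shape, mirroring the fallback dict above):
#   {"term": "...", "norm_name": "...", "norm_id": "...", "suggestions": ["...", ...]}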
class SuggestionListener(unohelper.Base, XActionListener):
    """Listener for clicks on a suggestion button; writes the chosen suggestion into the cell."""
def __init__(self, cell, suggestion, dialog):
self.cell = cell
self.suggestion = suggestion
self.dialog = dialog
def actionPerformed(self, event):
self.cell.String = self.suggestion
update_cell_color(self.cell, "grün")
        self.dialog.endExecute()  # closes the dialog window
def disposing(self, event):
pass
def show_suggestion_dialog(cell, term, suggestions):
"""Zeigt ein Dialog-Fenster mit klickbaren Vorschlägen."""
ctx = XSCRIPTCONTEXT.getComponentContext()
smgr = ctx.getServiceManager()
toolkit = smgr.createInstance("com.sun.star.awt.Toolkit")
dialog_model = smgr.createInstance("com.sun.star.awt.UnoControlDialogModel")
dialog_model.PositionX = 100
dialog_model.PositionY = 100
dialog_model.Width = 200
dialog_model.Height = 30 + 25*len(suggestions)
dialog_model.Title = f"Vorschläge für '{term}'"
for i, sugg in enumerate(suggestions[:3]):
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = f"btn_{i}"
btn_model.Label = sugg
btn_model.PositionX = 10
btn_model.PositionY = 10 + i*25
btn_model.Width = 180
btn_model.Height = 20
dialog_model.insertByName(btn_model.Name, btn_model)
dialog = smgr.createInstance("com.sun.star.awt.UnoControlDialog")
dialog.setModel(dialog_model)
dialog.setVisible(True)
for i, sugg in enumerate(suggestions[:3]):
btn = dialog.getControl(f"btn_{i}")
listener = SuggestionListener(cell, sugg, dialog)
btn.addActionListener(listener)
toolkit.createDialog(dialog).execute()
def mapper_process_column():
"""Verarbeitet alle Zellen unter 'Objektbeschreibung' in der aktiven Tabelle."""
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
    col_index = get_objektbeschreibung_column(sheet)
    if col_index is None:
        return
    # limit the scan to the used area instead of iterating over all sheet rows
    cursor = sheet.createCursor()
    cursor.gotoEndOfUsedArea(False)
    last_row = cursor.getRangeAddress().EndRow
    for row in range(last_row + 1):
cell = sheet.getCellByPosition(col_index, row)
if not cell.String.strip():
            continue  # ignore empty cells
term = cell.String.strip()
result = call_mapper(term)
if result["norm_name"] != "KEIN TREFFER":
cell.String = result["norm_name"]
update_cell_color(cell, "grün")
elif result["suggestions"]:
update_cell_color(cell, "gelb")
show_suggestion_dialog(cell, term, result["suggestions"])
else:
update_cell_color(cell, "rot")
show_suggestion_dialog(cell, term, [])
# Export
g_exportedScripts = (mapper_process_column,)
