Compare commits


No commits in common. "5836423a650e11b087e7c6a0449d57bc914c68d2" and "b2244b48168be9f895a7ea353bd3983b865597de" have entirely different histories.

15300 changed files with 0 additions and 2882246 deletions


@@ -1,297 +0,0 @@
# -*- coding: utf-8 -*-
import os
import uno
import unohelper
import re
import json
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
# RapidFuzz für Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
# =========================
# Pfade & Config
# =========================
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
# =========================
# Cache & Logging
# =========================
if CACHE_FILE.exists():
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
def log(msg):
with open(LOG_FILE,"a",encoding="utf-8") as f:
f.write(msg + "\n")
# =========================
# Textverarbeitung
# =========================
def normalize_text(s):
if not s: return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
return parts if parts else [term]
# =========================
# NV_MASTER laden
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
norm_dict = {}
for sheet_name, df in sheets.items():
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
if row_id: current_parent_id = row_id
if not row_word: continue
norm_dict[normalize_text(row_word)] = {
"ID": current_parent_id,
"Wort/Vokabel": row_word
}
return norm_dict
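# Illustrative input layout (an assumption about the real NV_MASTER content,
# derived from the header checks above): every sheet needs the exact headers
# "ID" and "Wort/Vokabel", and rows with an empty ID inherit the last
# non-empty ID above them, e.g.
#   norm_dict = load_normvokabular(NV_MASTER_FILE)
#   norm_dict["stuhl"]  ->  {"ID": "MOE-001", "Wort/Vokabel": "Stuhl"}
# where "Stuhl"/"MOE-001" are made-up example values.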
# =========================
# Mapping
# =========================
def map_term_with_indexes(term, norm_dict):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term_norm)
# Cache prüfen
if term_lemma in CACHE:
cached = CACHE[term_lemma]
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
return cached["hits"], cached["suggestions"], cached["ids"]
else:
CACHE.pop(term_lemma, None)
hits = []
suggestions = []
ids = []
# Exakte Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
elif term_lemma in norm_dict:
e = norm_dict[term_lemma]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
else:
# Fuzzy Matching
for key, e in norm_dict.items():
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
if score >= 0.75:
suggestions.append(e["Wort/Vokabel"])
ids.append(e["ID"])
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
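# Illustrative call and resulting cache entry (values are made up; the shape
# matches what this function writes into CACHE / mapper_cache.json):
#   hits, suggestions, ids = map_term_with_indexes("Stuhl", norm_dict)
#   CACHE["stuhl"] == {"hits": ["Stuhl"], "suggestions": [], "ids": ["MOE-001"]}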
# =========================
# LibreOffice Dialog (ListBox + Checkbox)
# =========================
def apply_proposals_dialog():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
log("Kein Calc-Dokument aktiv")
return
selection = doc.CurrentSelection
sheet = doc.CurrentController.ActiveSheet
# Prüfen ob eine Zelle ausgewählt ist
if selection is None or not hasattr(selection, "getCellAddress"):
log("Keine Zelle ausgewählt")
return
cell = selection
# Spalte überprüfen
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
objekt_col = None
norm_vorschlag_col = None
for col_idx in range(sheet.Columns.Count):
val = sheet.getCellByPosition(col_idx,0).String
if val.strip().lower() == "objektbeschreibung":
objekt_col = col_idx
elif val.strip().lower() == "norm_vorschlag":
norm_vorschlag_col = col_idx
if norm_vorschlag_col is None or objekt_col is None:
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
return
# Vorschläge auslesen
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
if not proposals_str.strip():
log("Keine Vorschläge in der ausgewählten Zelle")
return
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
# Dialog erstellen
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
dialog_model.Width = 180
dialog_model.Height = 150
dialog_model.Title = "Vorschläge übernehmen"
# ListBox
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
lb_model.Name = "ProposalList"
lb_model.PositionX = 10
lb_model.PositionY = 10
lb_model.Width = 160
lb_model.Height = 80
lb_model.StringItemList = tuple(proposals)
dialog_model.insertByName("ProposalList", lb_model)
# Checkbox
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
cb_model.Name = "AllCheck"
cb_model.PositionX = 10
cb_model.PositionY = 95
cb_model.Width = 160
cb_model.Height = 15
cb_model.Label = "Alle Vorschläge übernehmen"
dialog_model.insertByName("AllCheck", cb_model)
# OK-Button
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = "OKButton"
btn_model.PositionX = 10
btn_model.PositionY = 115
btn_model.Width = 80
btn_model.Height = 20
btn_model.Label = "OK"
dialog_model.insertByName("OKButton", btn_model)
# Abbrechen-Button
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
cancel_model.Name = "CancelButton"
cancel_model.PositionX = 100
cancel_model.PositionY = 115
cancel_model.Width = 80
cancel_model.Height = 20
cancel_model.Label = "Abbrechen"
dialog_model.insertByName("CancelButton", cancel_model)
# Mark the buttons as standard OK/Cancel so that execute() reports the result
btn_model.PushButtonType = uno.Enum("com.sun.star.awt.PushButtonType", "OK")
cancel_model.PushButtonType = uno.Enum("com.sun.star.awt.PushButtonType", "CANCEL")
# Create the dialog control and run it modally (instead of busy-waiting on a "Pressed" flag)
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
dialog.setModel(dialog_model)
dialog.createPeer(toolkit, None)
if dialog.execute() == 0:  # Cancel or window closed
    dialog.dispose()
    return
all_flag = dialog.getControl("AllCheck").getState() == 1
selected_idx = dialog.getControl("ProposalList").getSelectedItemsPos()
selected_proposal = proposals[selected_idx[0]] if selected_idx else None
# Anwenden
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
obj_text = obj_cell.String
if all_flag:
for prop in proposals:
idx = obj_text.lower().find(prop.lower())
if idx != -1:
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
else:
if selected_proposal:
idx = obj_text.lower().find(selected_proposal.lower())
if idx != -1:
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
obj_cell.String = obj_text
obj_cell.CellBackColor = 0x00FF00 # grün
dialog.dispose()
save_cache()
log(f"Vorschlag übernommen: {obj_text}")
# =========================
# Automatische Button-Registrierung
# =========================
def register_toolbar_button():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
frame = doc.CurrentController.Frame
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
# Hier wird nur das Makro selbst registriert
# Symbolleiste muss in LO einmalig erstellt werden
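# Sketch (assumption, not part of the original macro): the comments above leave
# the toolbar binding to the user. For reference, a user Python macro is
# addressed through a script URL and can also be invoked programmatically via
# the scripting framework; the module/file name "mapper_macro.py" is assumed.
def invoke_mapper_via_script_provider(ctx):
    """Resolve and run run_mapper_macro through the master script provider."""
    smgr = ctx.ServiceManager
    factory = smgr.createInstanceWithContext(
        "com.sun.star.script.provider.MasterScriptProviderFactory", ctx)
    provider = factory.createScriptProvider("")
    script = provider.getScript(
        "vnd.sun.star.script:mapper_macro.py$run_mapper_macro"
        "?language=Python&location=user")
    script.invoke((), (), ())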
# =========================
# Hauptmakro
# =========================
def run_mapper_macro():
try:
norm_dict = load_normvokabular(NV_MASTER_FILE)
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
apply_proposals_dialog()
except Exception as e:
log(f"Fehler in run_mapper_macro: {e}")


@@ -1,469 +0,0 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - LibreOffice Calc
# Features: Kompositum-Split, Cache, Live-Vorschläge nur auf 'Objektbeschreibung', Logging
import os
import re
import json
import datetime
# optional imports (Pandas, Spacy, RapidFuzz)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
STOPWORDS = {
"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an",
"als","bei","für","aus","dem","den","des","eines","einer"
}
CONF_THRESHOLD = 0.75
# ------------------------
# Logging
# ------------------------
def log(msg, level="INFO"):
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
line = f"[{ts}] [{level}] {msg}\n"
try:
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception as e:
CACHE = {}
log(f"Fehler beim Laden des Caches: {e}", level="ERROR")
# ------------------------
# Textnormalisierung & Lemma
# ------------------------
lemma_cache = {}
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([t.lemma_ for t in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# Kompositum-Splitting
# ------------------------
def compound_split(term):
if not term:
return []
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
if parts:
return parts
parts = [p for p in re.split(r'[-\s]+', term) if p]
return parts or [term]
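# Illustrative behaviour of this splitter (it is not dictionary-based decompounding):
#   compound_split("Tisch-Stuhl Kombination")  -> ["Tisch", "Stuhl", "Kombination"]
#   compound_split("Holzstuhl")                -> ["Holzstuhl"]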
# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Fuzzy / Vorschläge
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
# ------------------------
# Mapping eines Terms (mit Cache)
# ------------------------
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
c = CACHE[term_lemma]
return c.get("hits", []), c.get("suggestions", []), c.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
if not hits:
tokens = compound_split(term)
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in lemma_index:
for e in lemma_index[t_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
else:
suggestions.extend(get_suggestions_for_term(t_lemma, norm_dict, lemma_index))
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = uniq(hits)
suggestions = uniq(suggestions)
ids = uniq(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# ------------------------
# Header + Spalten
# ------------------------
def find_header_and_cols(sheet):
try:
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
except Exception:
return None, None, None, {}
header_row = None
objekt_col = None
for r in range(0, min(5, dr.EndRow + 1)):
for c in range(0, dr.EndColumn + 1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if header_row is None:
return None, None, dr, {}
existing = {}
for c in range(0, dr.EndColumn + 1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Norm_ID":
existing["Norm_ID"] = c
return header_row, objekt_col, dr, existing
# ------------------------
# Optimierter Live-Handler (nur Objektbeschreibung)
# ------------------------
def on_objektbeschreibung_change(oEvent=None):
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
except Exception as e:
log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
return
cell = None
try:
if oEvent and hasattr(oEvent, "Range") and oEvent.Range is not None:
cell = oEvent.Range
elif oEvent and hasattr(oEvent, "Source") and oEvent.Source is not None:
cell = oEvent.Source
except Exception:
cell = None
if cell is None:
try:
sel = doc.CurrentSelection
if hasattr(sel, "getCellByPosition"):
cell = sel
else:
cell = sel.getCellByPosition(0, 0)
except Exception as e:
log(f"Keine Selektion: {e}", level="ERROR")
return
try:
row_index = cell.CellAddress.Row
col_index = cell.CellAddress.Column
except Exception:
return
try:
header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
if header_row is None or col_index != objekt_col:
return # nur die Objektbeschreibung-Spalte bearbeiten
last_col = dr.EndColumn
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
norm_sug_col = existing["Norm_Vorschlag"]
except Exception as e:
log(f"Fehler Spaltenbestimmung: {e}", level="ERROR")
return
try:
txt = str(cell.String).strip()
if not txt:
sheet.getCellByPosition(norm_sug_col, row_index).String = ""
return
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
suggestions_acc = []
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
continue
for sp in compound_split(p):
_, sugs, _ = map_term_with_indexes(sp, norm_dict, lemma_index)
suggestions_acc.extend(sugs)
seen = set()
ordered = []
for s in suggestions_acc:
if s not in seen:
seen.add(s)
ordered.append(s)
sheet.getCellByPosition(norm_sug_col, row_index).String = " | ".join(ordered)
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception as e:
log(f"Fehler im Live-Handler: {e}", level="ERROR")
# ------------------------
# Batch-Durchlauf
# ------------------------
def run_mapper_macro():
log("=== mapper_macro 1.5 gestartet ===", level="INFO")
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
except Exception as e:
log(f"Dokumentzugriff fehlgeschlagen: {e}", level="ERROR")
return
header_row, objekt_col, dr, existing = find_header_and_cols(sheet)
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden.", level="ERROR")
return
if "Norm_Treffer" not in existing:
last_col = dr.EndColumn + 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col = dr.EndColumn + 2
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Norm_ID" not in existing:
last_col = dr.EndColumn + 3
existing["Norm_ID"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
GREEN, YELLOW, RED = 0xADFF2F, 0xFFA500, 0xCC0000
for r in range(header_row + 1, dr.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
for p in [p.strip() for p in re.split(r"\s+", cl) if p.strip()]:
if p.lower() in STOPWORDS or re.fullmatch(r"\d+", p):
continue
terms.extend([sp.strip() for sp in compound_split(p) if sp.strip()])
row_hits, row_sugs, row_ids = [], [], []
any_unmapped = False
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
row_hits.extend(hits)
row_sugs.extend(sugs)
row_ids.extend(ids)
if not hits and not sugs:
any_unmapped = True
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits, row_sugs, row_ids = map(uniq, [row_hits, row_sugs, row_ids])
sheet.getCellByPosition(existing["Norm_Treffer"], r).String = " | ".join(row_hits)
sheet.getCellByPosition(existing["Norm_Vorschlag"], r).String = " | ".join(row_sugs)
sheet.getCellByPosition(existing["Norm_ID"], r).String = " | ".join(row_ids)
cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
sheet.getCellByPosition(existing["Norm_Treffer"], r).CellBackColor = GREEN if row_hits and not any_unmapped else 0xFFFFFF
sheet.getCellByPosition(existing["Norm_Vorschlag"], r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF
except Exception as e:
log(f"Fehler in Zeile {r}: {e}", level="ERROR")
continue
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log("=== mapper_macro 1.5 fertig ===", level="INFO")
# ------------------------
# Export
# ------------------------
g_exportedScripts = (
run_mapper_macro,
on_objektbeschreibung_change
)


@@ -1,508 +0,0 @@
# -*- coding: utf-8 -*-
# mapper_macro 1.5 - korrigiert: Logging im Dokumentverzeichnis, stabile Button-Erstellung,
# keine Listener, optimiertes Mapping (ohne Listener-Teil)
import os
import re
import json
import datetime
# optionale Module (Pandas, Spacy, RapidFuzz)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# UNO (für Button/Paths)
try:
import uno
except Exception:
uno = None
# ------------------------
# Konfiguration (Fallback-BASE_DIR)
# ------------------------
BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro")
NV_MASTER_FILENAME = "NV_MASTER.ods"
CACHE_FILENAME = "mapper_cache.json"
LOG_FILENAME = "mapper_macro.log"
STOPWORDS = {
"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in", "auf", "an",
"als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"
}
CONF_THRESHOLD = 0.82
FUZZY_CUTOFF = 0.88
# Per-document paths (initialized by set_paths_from_doc)
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)
# in-memory cache
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Pfade im Dokument setzen
# ------------------------
def set_paths_from_doc(doc):
global DOC_DIR, NV_MASTER_PATH, CACHE_FILE, LOG_FILE
try:
url = getattr(doc, "URL", "")
if url and url.strip():
# UNO liefert file:///...
try:
system_path = uno.fileUrlToSystemPath(url)
except Exception:
# fallback: try simple unquote
from urllib.parse import unquote, urlparse
parsed = urlparse(url)
if parsed.scheme == "file":
system_path = unquote(parsed.path)
else:
system_path = ""
if system_path:
d = os.path.dirname(system_path)
if os.path.isdir(d):
DOC_DIR = d
except Exception:
DOC_DIR = BASE_DIR
NV_MASTER_PATH = os.path.join(DOC_DIR, NV_MASTER_FILENAME)
CACHE_FILE = os.path.join(DOC_DIR, CACHE_FILENAME)
LOG_FILE = os.path.join(DOC_DIR, LOG_FILENAME)
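# Sketch (assumption, not called by the original code): CACHE above is loaded
# from the fallback BASE_DIR at import time, i.e. before set_paths_from_doc()
# redirects CACHE_FILE to the document directory. A helper like this could be
# called right after set_paths_from_doc(doc) to pick up the per-document cache.
def reload_cache_from_doc_dir():
    global CACHE
    try:
        if os.path.exists(CACHE_FILE):
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                CACHE = json.load(f)
    except Exception as e:
        log(f"Cache reload failed: {e}", level="WARN")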
# ------------------------
# Logging (Dokumentdir, robust)
# ------------------------
def log(msg, level="INFO"):
ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
line = f"[{ts}] [{level}] {msg}\n"
try:
# ensure directory exists
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
# absolute fallback: try writing into BASE_DIR
try:
fallback = os.path.join(BASE_DIR, LOG_FILENAME)
os.makedirs(os.path.dirname(fallback), exist_ok=True)
with open(fallback, "a", encoding="utf-8") as f:
f.write(line)
except Exception:
# last resort: silent
pass
# ------------------------
# Textvorbereitung & Helpers
# ------------------------
lemma_cache = {}
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([t.lemma_ for t in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
if not term:
return []
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
if parts:
return parts
parts = [p for p in re.split(r'[-\s]+', term) if p]
return parts or [term]
# ------------------------
# NV_MASTER indexieren
# ------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar, NV_MASTER kann nicht gelesen.", level="ERROR")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER: {e}", level="ERROR")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
# find id/word columns with fallback
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen. Begriffe: {sum(len(v) for v in norm_dict.values())}", level="INFO")
return norm_dict, lemma_index
# ------------------------
# Fuzzy Matching
# ------------------------
def fuzzy_score(a, b):
a = (a or "").lower()
b = (b or "").lower()
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_sort_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a, b).ratio()
def get_suggestions(term_lemma, norm_dict, lemma_index, threshold=FUZZY_CUTOFF, max_sugs=6):
candidates = []
term_norm = term_lemma or ""
for key_lemma, entries in lemma_index.items():
if not key_lemma:
continue
score = fuzzy_score(term_norm, key_lemma)
if key_lemma.startswith(term_norm):
score = min(score + 0.08, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# also check normalized names
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_norm, norm_key)
if norm_key.startswith(term_norm):
score = min(score + 0.08, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# sort & dedupe
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
out = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
if id_:
out.append(f"{name} ({id_})")
else:
out.append(name)
if len(out) >= max_sugs:
break
return out
# ------------------------
# Mapping mit Cache
# ------------------------
def map_term(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
return CACHE[term_lemma]
hits = []
suggestions = []
ids = []
# exact
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# lemma
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# suggestions only if no hit
if not hits:
suggestions = get_suggestions(term_lemma, norm_dict, lemma_index)
# remove suggestions that are equal/contain hits
suggestions = [s for s in suggestions if not any(h.lower() in s.lower() for h in hits)]
result = {"hits": hits, "suggestions": suggestions, "ids": ids}
CACHE[term_lemma] = result
return result
# ------------------------
# Button erstellen (sicher)
# ------------------------
def add_macro_button(sheet):
try:
doc = XSCRIPTCONTEXT.getDocument()
except Exception:
log("add_macro_button: kein Dokument-Kontext", level="WARN")
return
try:
draw_page = sheet.DrawPage
# avoid duplicate
for shape in draw_page:
try:
if getattr(shape, "Name", "") == "MapperStartButton":
return
except Exception:
continue
# create shape and button model
shape = doc.createInstance("com.sun.star.drawing.ControlShape")
shape.Name = "MapperStartButton"
pos = uno.createUnoStruct("com.sun.star.awt.Point")
pos.X, pos.Y = 1000, 200
shape.Position = pos  # assign the whole struct; mutating shape.Position in place has no effect
size = uno.createUnoStruct("com.sun.star.awt.Size")
size.Width, size.Height = 3000, 1000
shape.Size = size
button_model = doc.createInstance("com.sun.star.form.component.CommandButton")
button_model.Label = "Start Mapping"
button_model.HelpText = "Startet das Mapping (run_mapper_macro)"
# assign macro via ActionCommand is not enough; user must link in UI; we add the control and label
shape.Control = button_model
draw_page.add(shape)
log("Button 'MapperStartButton' erstellt.", level="INFO")
except Exception as e:
log(f"add_macro_button Fehler: {e}", level="ERROR")
# ------------------------
# Hauptlauf (ohne Listener)
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
set_paths_from_doc(doc)
log("=== mapper_macro gestartet ===", level="INFO")
sheet = doc.CurrentController.ActiveSheet
add_macro_button(sheet)
# used area
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
dr = cursor.getRangeAddress()
# find header and objekt col
header_row = None
objekt_col = None
for r in range(0, min(10, dr.EndRow + 1)):
for c in range(0, dr.EndColumn + 1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "Objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("run_mapper_macro: 'Objektbeschreibung' Header nicht gefunden.", level="ERROR")
return
# ensure result cols
existing = {}
last_col = dr.EndColumn
for c in range(0, dr.EndColumn + 1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Norm_ID":
existing["Norm_ID"] = c
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Norm_ID" not in existing:
last_col += 1
existing["Norm_ID"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
norm_id_col = existing["Norm_ID"]
# build index
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("run_mapper_macro: NV_MASTER leer oder nicht lesbar.", level="ERROR")
return
GREEN, YELLOW, RED = 0xADFF2F, 0xFFFF66, 0xFF9999
rows_processed = 0
for r in range(header_row + 1, dr.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
# phrase-first: try entire cleaned phrase (remove stopwords)
tokens = [t.strip() for t in re.split(r'\s+', normalize_text(txt)) if t and t not in STOPWORDS]
phrase = " ".join(tokens).strip()
terms = []
if phrase:
# first try phrase as whole
mapped_phrase = map_term(phrase, norm_dict, lemma_index)
if mapped_phrase["hits"] or mapped_phrase["suggestions"]:
# use phrase result (flatten hits+suggestions for output)
row_hits = mapped_phrase["hits"]
row_sugs = mapped_phrase["suggestions"]
row_ids = mapped_phrase["ids"]
any_unmapped = False if (row_hits or row_sugs) else True
else:
# fallback to token/compound processing
for p in [p for p in re.split(r'[,\s]+', txt) if p.strip()]:
if p.lower() in STOPWORDS or re.fullmatch(r'\d+', p):
continue
for sp in compound_split(p):
if sp and sp.strip():
terms.append(sp.strip())
row_hits = []
row_sugs = []
row_ids = []
any_unmapped = False
for term in terms:
mapped = map_term(term, norm_dict, lemma_index)
hits, sugs, ids = mapped["hits"], mapped["suggestions"], mapped["ids"]
if hits:
row_hits.extend(hits)
if sugs:
row_sugs.extend(sugs)
if ids:
row_ids.extend(ids)
if not hits and not sugs:
any_unmapped = True
else:
row_hits, row_sugs, row_ids = [], [], []
any_unmapped = True
# dedupe preserving order
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
row_ids = uniq(row_ids)
# write
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
cell.CellBackColor = RED if any_unmapped else 0xFFFFFF
sheet.getCellByPosition(norm_tr_col, r).CellBackColor = GREEN if row_hits else 0xFFFFFF
sheet.getCellByPosition(norm_sug_col, r).CellBackColor = YELLOW if row_sugs else 0xFFFFFF
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}", level="ERROR")
continue
# persist cache file to DOC_DIR
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception as e:
log(f"Cache speichern fehlgeschlagen: {e}", level="WARN")
log(f"=== mapper_macro fertig. Zeilen verarbeitet: {rows_processed} ===", level="INFO")
except Exception as e:
# top-level safety
try:
log(f"run_mapper_macro: Unhandled exception: {e}", level="ERROR")
except Exception:
pass
# ------------------------
# Export
# ------------------------
g_exportedScripts = (run_mapper_macro,)


@@ -1,343 +0,0 @@
# -*- coding: utf-8 -*-
"""
LibreOffice Calc macro: NV_MASTER matching (improved semantic matching)
Location: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
"""
import os
import re
import json
import traceback
# ------------------------------------------------------------
# LIBRARIES & MODELS
# ------------------------------------------------------------
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
# Verwende das mittlere Modell für semantische Ähnlichkeit
nlp = spacy.load("de_core_news_md")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------------------------------------------
# KONFIGURATION
# ------------------------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.70 # etwas großzügiger für semantisches Matching
# ------------------------------------------------------------
# LOGGING
# ------------------------------------------------------------
def log(msg):
"""Schreibt technische Logs ins Makroverzeichnis."""
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg.strip() + "\n")
except Exception:
pass
log("Makro gestartet")
# ------------------------------------------------------------
# CACHE
# ------------------------------------------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------------------------------------------
# TEXTNORMALISIERUNG & LEMMATISIERUNG
# ------------------------------------------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
t = normalize_text(term)
if t in lemma_cache:
return lemma_cache[t]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(t)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = t
else:
lemma = t
lemma_cache[t] = lemma
return lemma
# ------------------------------------------------------------
# NV_MASTER LADEN
# ------------------------------------------------------------
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar NV_MASTER kann nicht geladen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Laden von NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = next((df.columns[i] for i, c in enumerate(cols) if "id" in c), df.columns[0])
word_col = next((df.columns[i] for i, c in enumerate(cols) if "wort" in c or "vokabel" in c), df.columns[-1])
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip()
word_val = str(row[word_col]).strip()
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val, "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen: {sum(len(v) for v in norm_dict.values())} Begriffe.")
return norm_dict, lemma_index
# ------------------------------------------------------------
# SCORING: FUZZY + SEMANTISCH
# ------------------------------------------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def semantic_similarity(a, b):
if not SPACY_AVAILABLE or not hasattr(nlp.vocab, "vectors"):
return 0.0
try:
doc_a, doc_b = nlp(a), nlp(b)
if doc_a.vector_norm and doc_b.vector_norm:
return float(doc_a.similarity(doc_b))
return 0.0
except Exception:
return 0.0
def combined_score(a, b):
sf = fuzzy_score(a, b)
ss = semantic_similarity(a, b)
return max(sf, ss)
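# Note: the small de_core_news_sm model ships without static word vectors, so
# semantic_similarity() would effectively return 0.0 with it and combined_score()
# would degrade to pure fuzzy matching; that is why de_core_news_md is loaded above.
# Sketch (assumption, not used by the original code): parsing the index keys
# once with nlp.pipe() avoids re-running the pipeline for every comparison.
def precompute_key_docs(keys):
    if not SPACY_AVAILABLE or nlp is None:
        return {}
    keys = list(keys)
    return dict(zip(keys, nlp.pipe(keys)))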
# ------------------------------------------------------------
# MATCHING & VORSCHLÄGE
# ------------------------------------------------------------
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = combined_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.05, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = combined_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.05, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda x: x[0], reverse=True)
seen, results = set(), []
for score, name, id_ in candidates:
key = (name.lower(), id_.lower() if id_ else "")
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
if len(results) >= top_n:
break
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
hits, suggestions, ids = [], [], []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=3, threshold=CONF_THRESHOLD)
filtered_suggs = []
for s in suggs:
s_clean = normalize_text(s.split(" (")[0])
if s_clean not in [normalize_text(h) for h in hits]:
filtered_suggs.append(s)
suggestions = filtered_suggs
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits, suggestions, ids = uniq(hits), uniq(suggestions), uniq(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
log(f"TERM: {term} | HITS: {hits} | SUGGS: {suggestions}")
return hits, suggestions, ids
# ------------------------------------------------------------
# HAUPTMAKRO
# ------------------------------------------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
except Exception as e:
log(f"Fehler beim Zugriff auf Dokument: {e}")
return
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict:
log("Fehler: NV_MASTER leer oder nicht gefunden.")
return
try:
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
used = cursor.getRangeAddress()
except Exception as e:
log(f"Cursor-Fehler: {e}")
return
header_row = 0
objekt_col = None
for c in range(0, used.EndColumn + 1):
val = str(sheet.getCellByPosition(c, header_row).String).strip().lower()
if val == "objektbeschreibung":
objekt_col = c
break
if objekt_col is None:
log("Keine Spalte 'Objektbeschreibung' gefunden.")
return
existing = {}
for c in range(0, used.EndColumn + 1):
h = str(sheet.getCellByPosition(c, header_row).String).strip()
if h == "Norm_Treffer": existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag": existing["Norm_Vorschlag"] = c
if h == "Norm_ID": existing["Norm_ID"] = c
last_col = used.EndColumn
for name in ["Norm_Treffer", "Norm_Vorschlag", "Norm_ID"]:
if name not in existing:
last_col += 1
existing[name] = last_col
sheet.getCellByPosition(last_col, header_row).String = name
GREEN, YELLOW, RED = 0xADFF2F, 0xFFD700, 0xCC0000
norm_tr_col, norm_sug_col, norm_id_col = existing["Norm_Treffer"], existing["Norm_Vorschlag"], existing["Norm_ID"]
rows = 0
for r in range(header_row + 1, used.EndRow + 1):
txt = str(sheet.getCellByPosition(objekt_col, r).String).strip()
if not txt:
continue
terms = [t.strip() for t in re.split(r",|\s+", txt) if t.strip() and t.lower() not in STOPWORDS]
row_hits, row_sugs, row_ids, any_unmapped = [], [], [], False
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits: row_hits.extend(hits)
if sugs: row_sugs.extend(sugs)
if ids: row_ids.extend(ids)
if not hits and not sugs: any_unmapped = True
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits, row_sugs, row_ids = uniq(row_hits), uniq(row_sugs), uniq(row_ids)
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
obj_cell = sheet.getCellByPosition(objekt_col, r)
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
if any_unmapped:
obj_cell.CellBackColor = RED
elif row_hits:
tr_cell.CellBackColor = GREEN
if row_sugs:
sug_cell.CellBackColor = YELLOW
rows += 1
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log(f"Makro abgeschlossen, {rows} Zeilen verarbeitet.")
g_exportedScripts = (run_mapper_macro,)


@@ -1,365 +0,0 @@
# -*- coding: utf-8 -*-
# LibreOffice Calc macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
# Speicherort: /home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.1.py
import os
import re
import json
import traceback
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.1.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.1.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
# ------------------------
# Utilities: Logging & safe I/O
# ------------------------
def log(msg):
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER robust laden (pandas + odf)
# ------------------------
def build_norm_index(nv_path):
norm_dict = {} # normalized_name -> list of entries (Name, ID, Sheet)
lemma_index = {} # lemma -> list of entries
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht zuverlässig gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER mit pandas: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
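# Illustrative sheet layout (an assumption about the real NV_MASTER content;
# the ID forward-fill matches the current_parent_id logic above):
#
#   ID      | Wort/Vokabel
#   --------+-------------
#   MOE-001 | Möbel          -> ID "MOE-001"
#           | Stuhl          -> inherits "MOE-001"
#           | Tisch          -> inherits "MOE-001"
#   MOE-002 | Leuchte        -> ID "MOE-002"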
# ------------------------
# Matching: exakter Treffer, Lemma-Treffer, Fuzzy-Vorschläge
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"]
hits = []
suggestions = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(f'{e["Name"]} ({e["ID"]})' if e["ID"] else e["Name"])
if not hits:
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index)
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions}
return hits, suggestions
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
# Prüfen/Anlegen der Ergebnis-Spalten
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
WHITE = 0xFFFFFF
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_hits = []
row_sugs = []
unmapped_terms = []
for term in terms:
hits, sugs = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
row_hits.extend(hits)
else:
unmapped_terms.append(term)
if sugs:
row_sugs.extend(sugs)
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
# Farb-Logik für Objektbeschreibung
if terms and not unmapped_terms and row_hits:
cell.CellBackColor = GREEN
row_sugs = []
elif row_hits:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Ergebniszellen
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
tr_cell.String = " | ".join(row_hits)
tr_cell.CellBackColor = GREEN if row_hits else WHITE
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
sug_cell.String = " | ".join(row_sugs)
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export für LibreOffice
g_exportedScripts = (run_mapper_macro,)


@@ -1,455 +0,0 @@
# -*- coding: utf-8 -*-
"""
LibreOffice/Excel macro: NV_MASTER matching
Version: 2.3
Path: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py
Description:
-------------
This Python macro for LibreOffice/Excel matches terms from the active sheet
against a centrally maintained NV_MASTER vocabulary. It produces hits and
suggestions and colour-codes the cells.
Main features:
----------------
1. Text normalization and lemmatization
2. Loading the NV_MASTER vocabulary and building a norm index + lemma index
3. Fuzzy matching (RapidFuzz or difflib) for terms
4. Determination of hits and suggestions
5. Mapping onto the sheet:
- Norm_Treffer (green)
- Norm_Vorschlag (yellow)
- Kein_Treffer (red)
6. Caching to avoid repeated computation
7. Logging to an external file
External dependencies:
-----------------------
- pandas (for ODS/Excel read operations)
- spacy (for German lemmatization)
- rapidfuzz (optional, for faster fuzzy string matching)
UNO-specific objects:
------------------------
- XSCRIPTCONTEXT: provided by LibreOffice at runtime
Weaknesses / optimization ideas:
-------------------------------------
- Error handling is robust but partly very silent (e.g. cache errors, pandas errors).
- Looping over cells is slow on large sheets (could potentially be replaced entirely with pandas).
- Lemmatization could be performed only once for NV_MASTER and once for the sheet.
- RapidFuzz is optional; the SequenceMatcher fallback is considerably slower.
- The cache is only written at the end; a crash before that loses the results so far.
- Colour values are hard-coded; parameterizing them would add flexibility.
- Stopwords are hard-coded; making them configurable would be more efficient.
- No parallel queries / batch operations are used.
- Logging only goes to a file; LibreOffice messages or a progress indicator are missing.
"""
import os
import re
import json
import traceback
# UNO-Context wird zur Laufzeit von LibreOffice bereitgestellt
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
# ------------------------
# Logging-Funktion
# ------------------------
def log(msg):
"""Schreibt Nachricht in LOG_FILE. Fehler werden ignoriert."""
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
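# --- Hedged sketch, not part of the original macro --------------------------
# The header above notes that the cache is only written once at the very end,
# so a crash loses all intermediate results. A minimal mitigation, reusing the
# existing CACHE dict and CACHE_FILE path (the names below are illustrative):
_CACHE_DIRTY = 0

def save_cache_incremental(every=200):
    """Flush CACHE to disk after roughly `every` new entries (sketch only)."""
    global _CACHE_DIRTY
    _CACHE_DIRTY += 1
    if _CACHE_DIRTY >= every:
        try:
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(CACHE, f, ensure_ascii=False, indent=2)
            _CACHE_DIRTY = 0
        except Exception:
            pass
# -----------------------------------------------------------------------------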
# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
"""Entfernt Sonderzeichen, multiple Whitespaces, wandelt in lowercase."""
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
"""Lemmatisiert einen Begriff mit SpaCy. Falls nicht verfügbar, Rückgabe Normalized String."""
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER laden
# ------------------------
def build_norm_index(nv_path):
"""
Liest NV_MASTER ein und erstellt:
- norm_dict: Normalisierte Begriffe -> Einträge mit Name, ID, Sheet
- lemma_index: Lemma -> Einträge
"""
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Matching-Funktionen
# ------------------------
def fuzzy_score(a, b):
"""Berechnet Fuzzy-Score zwischen zwei Strings. RapidFuzz oder fallback SequenceMatcher."""
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
"""
Liefert Vorschläge für ein Lemma, wenn kein exakter Treffer existiert.
Score-basierte Sortierung, Duplikate werden entfernt.
"""
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
"""
Mappt einen Term auf NV_MASTER:
- Treffer
- Vorschläge
- IDs
Nutzt Cache, um Wiederholungen zu vermeiden.
"""
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
cached = CACHE[term_lemma]
return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits:
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)
# Duplikate entfernen
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
ids = unique_preserve(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
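# --- Hedged usage sketch (invented mini vocabulary, not the real NV_MASTER) --
def _demo_map_term_with_indexes():
    """Illustrates the return contract of map_term_with_indexes (sketch only)."""
    demo_entry = {"Name": "Baum", "ID": "3.1", "Sheet": "3 Flora"}
    demo_norm = {"baum": [demo_entry]}    # normalize_text(word) -> [entries]
    demo_lemma = {"baum": [demo_entry]}   # lemmatize_term(word) -> [entries]
    hits, suggestions, ids = map_term_with_indexes("Baum", demo_norm, demo_lemma)
    # exact normalized hit -> hits == ["Baum"], ids == ["3.1"], suggestions == []
    # unknown spellings fall back to get_suggestions_for_term() above
    return hits, suggestions, ids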
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
"""
Haupt-Makro für LibreOffice:
1. Bestimmt Header + Spalten
2. Fügt Spalten für Norm_Treffer, Norm_Vorschlag, Kein_Treffer hinzu
3. Liest NV_MASTER und baut Indizes
4. Iteriert über Zeilen und Terms
5. Markiert Zellen farblich (grün/gelb/rot)
6. Schreibt Cache am Ende
"""
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
# Header finden
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
# Spalten anlegen, falls nicht vorhanden
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
if "Kein_Treffer" not in existing:
last_col += 1
existing["Kein_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
kein_tr_col = existing["Kein_Treffer"]
# NV_MASTER laden
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
# Farben
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
WHITE = 0xFFFFFF
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
# Term-Extraktion
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_hits = []
row_sugs = []
row_ids = []
unmapped_terms = []
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
    # pair hits with IDs only when the lists line up; otherwise show hits without IDs
    paired = list(zip(hits, ids)) if len(ids) == len(hits) else [(h, "") for h in hits]
    row_hits.extend([f"{h} ({id_})" if id_ else h for h, id_ in paired])
else:
unmapped_terms.append(term)
if sugs:
row_sugs.extend([f"{s}" for s in sugs])
if ids:
row_ids.extend(ids)
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
unmapped_terms = uniq(unmapped_terms)
# Farb-Logik
if terms and not unmapped_terms and row_hits:
cell.CellBackColor = GREEN
row_sugs = [] # keine Vorschläge wenn alles Treffer
elif row_hits:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Ergebnisse schreiben
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
tr_cell.String = " | ".join(row_hits)
tr_cell.CellBackColor = GREEN if row_hits else WHITE
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
sug_cell.String = " | ".join(row_sugs)
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
kt_cell = sheet.getCellByPosition(kein_tr_col, r)
kt_cell.String = " | ".join(unmapped_terms)
kt_cell.CellBackColor = RED if unmapped_terms else WHITE
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
# Cache speichern
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export für LibreOffice
g_exportedScripts = (run_mapper_macro,)

View File

@ -1,212 +0,0 @@
import os
import re
import logging
import pandas as pd
import ezodf
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
# -------------------------------------------------
# KONFIGURATION
# -------------------------------------------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"
MASTER_SHEET_NAME = "Masterstruktur"
SHEET_ORDER = [
"Masterstruktur",
"1 Figur",
"2 Objekt",
"3 Flora",
"4 Fauna",
"5 Landschaft",
"6 Phänomene, Erscheinungen",
"7 Architektur",
"8 Verzierungen, Ornamentik",
"9 Aktivität, Handlung, Pose"
]
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# -------------------------------------------------
# HELFERFUNKTIONEN
# -------------------------------------------------
def detect_id_and_name(df):
df_cols = [str(c).strip().lower() for c in df.columns]
id_col, name_col = None, None
for idx, col in enumerate(df_cols):
if col == "id":
id_col = df.columns[idx]
elif col in ["name", "wort", "wort/vokabel"]:
name_col = df.columns[idx]
if id_col is None or name_col is None:
logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
return id_col, name_col
def parse_id_level(id_val):
if pd.isna(id_val):
return None
id_str = str(id_val).strip()
if re.match(r'^\d+(\.\d+){0,2}$', id_str):
return len(id_str.split("."))
return None
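# --- Hedged self-check of the ID convention used above (dot-separated IDs,
# --- at most three levels); invented values, not called anywhere:
def _selftest_parse_id_level():
    assert parse_id_level("7") == 1           # category
    assert parse_id_level("7.2") == 2         # subcategory
    assert parse_id_level("7.2.4") == 3       # sub-subcategory
    assert parse_id_level("7.2.4.1") is None  # deeper than three levels is rejected
    assert parse_id_level("abc") is None      # non-numeric IDs are rejected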
def process_category_df(df, sheet_name):
id_col, name_col = detect_id_and_name(df)
if id_col is None or name_col is None:
return None
current_level = {2: None, 3: None}
new_rows = []
for _, row in df.iterrows():
id_val = row[id_col] if pd.notna(row[id_col]) else ""
name_val = row[name_col] if pd.notna(row[name_col]) else ""
if not id_val and not name_val:
continue
level = parse_id_level(id_val)
if level:
if level >= 2:
current_level[level] = name_val
for deeper in range(level+1, 4):
current_level[deeper] = None
new_rows.append({
"ID": id_val,
"Unterkategorie": current_level[2] if level >= 2 else "",
"Unterunterkategorie": current_level[3] if level >= 3 else "",
"Wort/Vokabel": name_val
})
else:
new_rows.append({
"ID": "",
"Unterkategorie": "",
"Unterunterkategorie": "",
"Wort/Vokabel": name_val
})
df_new = pd.DataFrame(new_rows, columns=["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"])
logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
return df_new
def merge_new_terms(original_df, processed_df):
"""Fügt neue Wörter aus original_df (ohne ID) in processed_df ein, wenn sie noch nicht vorhanden sind."""
_, orig_name_col = detect_id_and_name(original_df)
if orig_name_col is None or orig_name_col not in original_df.columns:
return processed_df
existing_words = set(str(x).strip().lower() for x in processed_df["Wort/Vokabel"].dropna())
new_rows = []
for _, row in original_df.iterrows():
name = str(row.get(orig_name_col, "")).strip()
id_val = str(row.get("ID", "")).strip() if "ID" in row else ""
if not name:
continue
if not id_val and name.lower() not in existing_words:
new_rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": name})
if new_rows:
df_new = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
logging.info(f"{len(new_rows)} neue Wörter übernommen.")
return df_new
return processed_df
def build_master_df(category_dfs):
seen_ids = set()
master_rows = []
for df in category_dfs:
for _, row in df.iterrows():
id_val = row["ID"]
name_val = row["Wort/Vokabel"]
if id_val and id_val not in seen_ids:
seen_ids.add(id_val)
master_rows.append({"ID": id_val, "Name": name_val})
master_df = pd.DataFrame(master_rows)
logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
return master_df
# -------------------------------------------------
# FORMATIERUNG UND SPEICHERN
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
df.to_excel(writer, sheet_name=sheet_name, index=False)
worksheet = writer.sheets[sheet_name]
for col_idx, col in enumerate(df.columns, 1):
max_len = max((len(str(cell)) if cell is not None else 0 for cell in df[col]), default=0)
max_len = max(max_len, len(col)) + 2
worksheet.column_dimensions[get_column_letter(col_idx)].width = max_len
for row_idx in range(1, len(df) + 2):
worksheet.cell(row=row_idx, column=col_idx).alignment = Alignment(horizontal='left')
def save_ods(processed_sheets, output_file):
doc = ezodf.newdoc(doctype="ods")
for name, df in processed_sheets.items():
df = df.fillna("")
sheet = ezodf.Sheet(name, size=(len(df) + 1, len(df.columns)))
doc.sheets += sheet
for col_idx, col_name in enumerate(df.columns):
sheet[0, col_idx].set_value(str(col_name))
for row_idx, row in enumerate(df.itertuples(index=False), start=1):
for col_idx, value in enumerate(row):
if value is None or str(value).lower() == "nan":
value = ""
sheet[row_idx, col_idx].set_value(str(value))
doc.saveas(output_file)
logging.info(f"ODS-Datei gespeichert: {output_file}")
# -------------------------------------------------
# HAUPTPROGRAMM
# -------------------------------------------------
def main():
if not os.path.exists(INPUT_FILE):
logging.error(f"Datei {INPUT_FILE} existiert nicht.")
return
ext = os.path.splitext(INPUT_FILE)[1].lower()
engine = None
if ext in [".xlsx", ".xls"]:
engine = "openpyxl"
elif ext == ".ods":
engine = "odf"
else:
logging.error("Nicht unterstütztes Dateiformat")
return
logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
xls = pd.ExcelFile(INPUT_FILE, engine=engine)
processed_sheets = {}
category_dfs = []
for sheet_name in xls.sheet_names:
if sheet_name == MASTER_SHEET_NAME:
continue
df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
df_new = process_category_df(df, sheet_name)
if df_new is not None:
df_merged = merge_new_terms(df, df_new)
processed_sheets[sheet_name] = df_merged
category_dfs.append(df_merged)
else:
processed_sheets[sheet_name] = df
master_df = build_master_df(category_dfs)
processed_sheets[MASTER_SHEET_NAME] = master_df
ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}
ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
if ext_out in [".xlsx", ".xls"]:
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
for name, df in ordered_sheets.items():
format_excel_sheet(df, name, writer)
logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
elif ext_out == ".ods":
save_ods(ordered_sheets, OUTPUT_FILE)
if __name__ == "__main__":
main()

Binary file not shown.

View File

@ -1,355 +0,0 @@
"""
===============================================================================
Skriptname: NV_SPOT_Export.py
Beschreibung:
Dieses Skript soll hierarchische Normvokabular-Tabellen
(ODS/XLSX-Format) in eine JSON-basierte SPOT-Struktur (Strukturierter
Positionsbaum) konvertieren. Es ermöglicht das Exportieren in Excel und ODS, sowie
das nachträgliche Ergänzen von Kategorien, Unterkategorien und Wörtern.
//NOCH NICHT GETESTET//
Hauptfunktionen:
- Node: Klasse zur Repräsentation von Baumknoten.
- load_excel_or_ods: Lädt Tabellen aus ODS/XLSX-Dateien.
- process_sheet_to_tree: Erzeugt eine Baumstruktur aus einem Sheet.
- save_spot_json: Speichert den SPOT-Baum als JSON.
- load_spot_json: Lädt SPOT-Daten aus JSON-Dateien.
- export_spot_to_excel: Exportiert den SPOT-Baum nach Excel.
- export_spot_to_ods: Exportiert den SPOT-Baum nach ODS.
- add_category/subcategory/word: Fügt Elemente im Baum hinzu.
- main: Steuert den Workflow.
Abhängigkeiten:
Python 3.x, pandas, openpyxl, ezodf, json, logging, datetime
Stand: 2025-10-01
===============================================================================
"""
import os
import json
import datetime
import pandas as pd
import ezodf
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# ---------------- SPOT-Baumstruktur ----------------
class Node:
"""
Repräsentiert einen Knoten in der SPOT-Baumstruktur.
Attribute:
name (str): Anzeigename des Knotens.
id (str): Optionale ID (nur für Kategorien).
type (str): Knotentyp ("category", "subcategory", "word").
children (list[Node]): Unterknoten.
Methoden:
add_child(child): Fügt einen Unterknoten hinzu.
to_dict(): Serialisiert den Knoten in ein Dictionary/JSON-kompatibles Format.
from_dict(d): Rekonstruiert den Baum aus einem Dictionary.
"""
def __init__(self, name, node_type="category", id=None):
self.name = name
self.id = id
self.type = node_type # "category", "subcategory", "word"
self.children = []
def add_child(self, child):
"""Fügt dem aktuellen Knoten einen Unterknoten hinzu."""
self.children.append(child)
def to_dict(self):
"""Wandelt den Knoten (rekursiv) in ein Dictionary um."""
if self.type == "word":
return self.name
return {
"id": self.id,
"name": self.name,
"type": self.type,
"children": [c.to_dict() for c in self.children]
}
@staticmethod
def from_dict(d):
"""Erzeugt aus einem Dictionary ein Node-Objekt (rekursiv)."""
if isinstance(d, str):
return Node(d, "word")
node = Node(d["name"], d.get("type", "category"), d.get("id"))
node.children = [Node.from_dict(c) for c in d.get("children", [])]
return node
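# --- Hedged usage sketch: tiny tree round-tripped through to_dict()/from_dict();
# --- names and IDs are invented for illustration only.
def _demo_node_roundtrip():
    cat = Node("Flora", "category", id="3")
    sub = Node("Bäume", "subcategory")
    sub.add_child(Node("Eiche", "word"))
    cat.add_child(sub)
    rebuilt = Node.from_dict(cat.to_dict())
    assert rebuilt.name == "Flora"
    assert rebuilt.children[0].children[0].name == "Eiche"
    return rebuilt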
# ---------------- Funktionen zum Laden ----------------
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
"""
Lädt ODS oder Excel-Datei und gibt Master- sowie Kategorien-DataFrames zurück.
Parameter:
input_file (str): Pfad zur Quelldatei.
master_sheet (str): Name des Masterblattes.
Rückgabe:
(master_df, dfs): Master-DataFrame und Dictionary mit anderen Sheets.
"""
ext = os.path.splitext(input_file)[1].lower()
engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
xls = pd.ExcelFile(input_file, engine=engine)
sheet_names = [s for s in xls.sheet_names if s != master_sheet]
dfs = {s: pd.read_excel(xls, sheet_name=s, engine=engine) for s in sheet_names}
master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
return master_df, dfs
# ---------------- Baum aus Sheet erstellen ----------------
def process_sheet_to_tree(df):
"""
Wandelt ein Kategoriensheet in eine hierarchische Baumstruktur (Liste von Nodes) um.
Struktur:
Kategorie → Unterkategorie → Wort
Parameter:
df (pd.DataFrame): Eingabedaten mit Spalten ["ID", "Unterkategorie",
"Unterunterkategorie", "Wort/Vokabel"].
Rückgabe:
list[Node]: Liste von Baumknoten der obersten Ebene.
"""
df = df.fillna("").astype(str)
tree_nodes = []
current_cat = None
current_sub = None
for idx, row in df.iterrows():
id_val = row.get("ID", "").strip()
uk_val = row.get("Unterkategorie", "").strip()
uuk_val = row.get("Unterunterkategorie", "").strip()
word_val = row.get("Wort/Vokabel", "").strip()
# Neue Kategorieebene
if id_val:
current_cat = Node(uk_val or word_val, "category", id=id_val)
tree_nodes.append(current_cat)
current_sub = None
# Unterkategorie
elif uuk_val:
current_sub = Node(uuk_val, "subcategory")
if current_cat:
current_cat.add_child(current_sub)
# Wortebene
elif word_val:
word_node = Node(word_val, "word")
if current_sub:
current_sub.add_child(word_node)
elif current_cat:
current_cat.add_child(word_node)
return tree_nodes
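# --- Hedged usage sketch: minimal DataFrame with the expected columns
# --- (values invented), showing how the hierarchy is reconstructed.
def _demo_process_sheet_to_tree():
    df = pd.DataFrame([
        {"ID": "3", "Unterkategorie": "Flora", "Unterunterkategorie": "", "Wort/Vokabel": ""},
        {"ID": "",  "Unterkategorie": "",      "Unterunterkategorie": "Bäume", "Wort/Vokabel": ""},
        {"ID": "",  "Unterkategorie": "",      "Unterunterkategorie": "", "Wort/Vokabel": "Eiche"},
    ])
    roots = process_sheet_to_tree(df)
    # -> one category "Flora" (ID "3") with subcategory "Bäume" containing the word "Eiche"
    return roots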
# ---------------- SPOT laden/speichern ----------------
def save_spot_json(tree_nodes, file_path):
"""
Speichert den SPOT-Baum als JSON-Datei.
Parameter:
tree_nodes (list[Node]): Wurzelknoten der Baumstruktur.
file_path (str): Zielpfad.
"""
with open(file_path, "w", encoding="utf-8") as f:
json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
logging.info(f"SPOT gespeichert: {file_path}")
def load_spot_json(file_path):
"""
Lädt SPOT-JSON-Datei und rekonstruiert den Baum.
Parameter:
file_path (str): Pfad zur JSON-Datei.
Rückgabe:
list[Node]: Liste oberster Knoten.
"""
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
return [Node.from_dict(n) for n in data]
# ---------------- Export in Excel ----------------
def export_spot_to_excel(tree_nodes, output_file):
"""
Exportiert den SPOT-Baum in eine Excel-Datei.
Struktur:
Spalten A–D: ID, Kategorie, Unterkategorie, Wort.
Parameter:
tree_nodes (list[Node]): Baumstruktur.
output_file (str): Zielpfad der Excel-Datei.
"""
wb = Workbook()
wb.remove(wb.active)
for node in tree_nodes:
ws = wb.create_sheet(title=node.name[:31])
row_idx = 1
# Kategoriezeile
ws.cell(row=row_idx, column=1, value=node.id)
ws.cell(row=row_idx, column=2, value=node.name)
row_idx += 1
for sub in node.children:
if sub.type == "subcategory":
ws.cell(row=row_idx, column=3, value=sub.name)
row_idx += 1
for word in sub.children:
ws.cell(row=row_idx, column=4, value=word.name)
row_idx += 1
elif sub.type == "word":
ws.cell(row=row_idx, column=4, value=sub.name)
row_idx += 1
# Spaltenbreiten und Ausrichtung
for col_idx, col_letter in enumerate(["A", "B", "C", "D"], 1):
ws.column_dimensions[col_letter].width = 20
for r in range(1, row_idx):
ws.cell(r, col_idx).alignment = Alignment(horizontal='left')
wb.save(output_file)
logging.info(f"Excel exportiert: {output_file}")
# ---------------- Export in ODS ----------------
def export_spot_to_ods(tree_nodes, output_file):
"""
Exportiert den SPOT-Baum in eine ODS-Datei.
Struktur analog zum Excel-Export.
Parameter:
tree_nodes (list[Node]): Baumstruktur.
output_file (str): Zielpfad der ODS-Datei.
"""
doc = ezodf.newdoc(doctype="ods", filename=output_file)
for node in tree_nodes:
# Zeilenbedarf vorab bestimmen, damit das Sheet nicht zu klein angelegt wird
n_rows = 2 + sum(1 + len(sub.children) for sub in node.children)
sheet = ezodf.Sheet(node.name[:31], size=(n_rows + 1, 4))
doc.sheets += sheet
sheet[0, 0].set_value("ID")
sheet[0, 1].set_value("Unterkategorie")
sheet[0, 2].set_value("Unterunterkategorie")
sheet[0, 3].set_value("Wort/Vokabel")
row_idx = 1
sheet[row_idx, 0].set_value(node.id)
sheet[row_idx, 1].set_value(node.name)
row_idx += 1
for sub in node.children:
if sub.type == "subcategory":
sheet[row_idx, 2].set_value(sub.name)
row_idx += 1
for word in sub.children:
sheet[row_idx, 3].set_value(word.name)
row_idx += 1
elif sub.type == "word":
sheet[row_idx, 3].set_value(sub.name)
row_idx += 1
doc.save()
logging.info(f"ODS exportiert: {output_file}")
# ---------------- CLI-Funktionen zum Editieren ----------------
def add_category(tree_nodes, cat_id, cat_name):
"""
Fügt eine neue Kategorie zum SPOT-Baum hinzu.
Parameter:
tree_nodes (list[Node]): Liste der obersten Knoten.
cat_id (str): ID der Kategorie.
cat_name (str): Name der Kategorie.
"""
tree_nodes.append(Node(cat_name, "category", id=cat_id))
logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}")
def add_subcategory(tree_nodes, cat_id, sub_name):
"""
Fügt einer vorhandenen Kategorie eine Unterkategorie hinzu.
Parameter:
tree_nodes (list[Node]): Wurzelknoten.
cat_id (str): Zielkategorie-ID.
sub_name (str): Name der Unterkategorie.
"""
for cat in tree_nodes:
if cat.id == cat_id:
cat.add_child(Node(sub_name, "subcategory"))
logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}")
return
def add_word(tree_nodes, cat_id, sub_name, word_name):
"""
Fügt einem Unterknoten ein Wort hinzu.
Parameter:
tree_nodes (list[Node]): Wurzelknoten.
cat_id (str): ID der Kategorie.
sub_name (str): Name der Unterkategorie.
word_name (str): Neues Wort.
"""
for cat in tree_nodes:
if cat.id == cat_id:
for sub in cat.children:
if sub.name == sub_name:
sub.add_child(Node(word_name, "word"))
logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}")
return
# ---------------- HAUPTPROGRAMM ----------------
def main():
"""
Ablauf:
1. Liest Masterdatei (ODS oder XLSX).
2. Wandelt Kategorienblätter in SPOT-Struktur um.
3. Speichert SPOT als JSON.
4. Exportiert SPOT nach Excel und ODS.
5. Optional: Bearbeiten des Baums über CLI-Funktionen.
"""
INPUT_FILE = "NV_MASTER.ods"
OUTPUT_SPOT = "nv_spot.json"
today = datetime.datetime.today().strftime("%y.%m.%d")
OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx"
OUTPUT_ODS = f"NV_MASTER_SPOT_{today}.ods"
master_df, dfs = load_excel_or_ods(INPUT_FILE)
spot_tree = []
for sheet, df in dfs.items():
spot_tree.extend(process_sheet_to_tree(df))
save_spot_json(spot_tree, OUTPUT_SPOT)
# Beispielhafte Nutzung der Editierfunktionen:
# add_category(spot_tree, "10.1", "Neue Kategorie")
# add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie")
# add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort")
export_spot_to_excel(spot_tree, OUTPUT_EXCEL)
export_spot_to_ods(spot_tree, OUTPUT_ODS)
logging.info("SPOT-Workflow abgeschlossen.")
if __name__ == "__main__":
main()

View File

@ -1,660 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper Version 1.4.2
Dieses Skript normalisiert und mappt Begriffe aus Input-Dateien auf ein zentrales Normvokabular
und führt optional API-Abgleiche mit GND und Wikidata durch. Ergebnisse werden in Excel/ODS gespeichert.
"""
from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime
# Optional Libraries
try:
from rapidfuzz import fuzz # für schnellere String-Similarity
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm") # deutsche Lemmatization
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
# =========================
# Konfiguration & Pfade
# =========================
INPUT_DIR = Path("Input CSV") # Eingabeverzeichnis
OUTPUT_DIR = Path("Auswertung Ergebnisse") # Ausgabeordner
OUTPUT_DIR.mkdir(exist_ok=True) # Verzeichnis erstellen, falls nicht vorhanden
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") # Normvokabular-Datei
CACHE_FILE = "api_cache.json" # Cache für API-Antworten
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Threshold für Vorschläge
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True} # API-Verfügbarkeit
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Logging-Parameter
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
LOG_BATCH_SIZE = 100 # Anzahl Logs vor Flush
LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes
LOG_LEVEL = "DEBUG" # Logging-Level
# =========================
# Batch/Buffered Logger
# =========================
class BatchLogger:
"""
Buffered Logger: Speichert Logs in einem Queue-Buffer und schreibt sie periodisch in Datei und Konsole.
Reduziert I/O-Aufwand bei vielen Logs.
"""
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
self.logfile = logfile
self.flush_interval = flush_interval
self.batch_size = batch_size
self.level = level
self.q = queue.Queue()
self._stop_event = threading.Event()
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
# Sicherstellen, dass die Log-Datei existiert
try:
logfile.parent.mkdir(parents=True, exist_ok=True)
logfile.touch(exist_ok=True)
except Exception:
pass
self._thread.start()
def _format(self, level: str, msg: str) -> str:
"""Formatiert Logeinträge mit Timestamp"""
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"{ts} - {level} - {msg}"
def log(self, level: str, msg: str):
"""Fügt Log dem Queue hinzu und löst Flush aus, falls Batchgröße erreicht"""
if self._stop_event.is_set():
return
formatted = self._format(level, msg)
self.q.put((level, formatted))
if self.q.qsize() >= self.batch_size:
self.q.put(("__FLUSH__", "__FLUSH__"))
def debug(self, msg: str):
if LOG_LEVEL in ("DEBUG",):
self.log("DEBUG", msg)
def info(self, msg: str):
self.log("INFO", msg)
def warning(self, msg: str):
self.log("WARNING", msg)
def error(self, msg: str):
self.log("ERROR", msg)
def exception(self, msg: str):
self.log("EXCEPTION", msg)
def _worker(self):
"""Hintergrund-Thread: verarbeitet Queue, schreibt Logs periodisch"""
buffer = []
last_flush = time.time()
while not self._stop_event.is_set() or not self.q.empty():
try:
item = None
try:
item = self.q.get(timeout=self.flush_interval)
except queue.Empty:
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
if item is None:
continue
level, formatted = item
if level == "__FLUSH__":
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
buffer.append((level, formatted))
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
except Exception as e:
try:
sys.stderr.write(f"BatchLogger worker error: {e}\n")
except Exception:
pass
time.sleep(0.5)
if buffer:
self._flush_buffer(buffer)
def _flush_buffer(self, buffer):
"""Schreibt Puffer in Datei und Konsole"""
if not buffer:
return
try:
out_lines = [f"{line}\n" for _, line in buffer]
try:
sys.stdout.writelines(out_lines)
sys.stdout.flush()
except Exception:
pass
try:
with open(self.logfile, "a", encoding="utf-8") as f:
f.writelines(out_lines)
except Exception as e:
try:
sys.stderr.write(f"BatchLogger file write error: {e}\n")
except Exception:
pass
except Exception:
pass
def stop(self):
"""Stoppt Logger-Thread"""
self._stop_event.set()
try:
self.q.put(("__FLUSH__", "__FLUSH__"))
except Exception:
pass
self._thread.join(timeout=5.0)
# Logger-Instanz erstellen
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.2 (Batch-Logging aktiv)")
# =========================
# Cache laden/speichern
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
except Exception as e:
logger.warning(f"Cache konnte nicht geladen werden: {e}")
CACHE = {}
else:
CACHE = {}
def save_cache():
"""Speichert aktuellen Cache in JSON"""
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
logger.debug("Cache gespeichert.")
except Exception as e:
logger.error(f"Cache konnte nicht gespeichert werden: {e}")
# =========================
# Normalisierung / Lemma / Tokenization
# =========================
def normalize_text(s):
"""Text in Kleinbuchstaben, Sonderzeichen entfernen, Trim"""
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
"""Lemmatize mit spaCy, Cache für Performance"""
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
"""Splittet Komposita nach -, _, / oder Leerzeichen"""
if not term:
return []
parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
return parts if parts else [term]
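# --- Hedged self-check: splitting is purely separator-based (whitespace, -, _, /);
# --- real German compounds are NOT decomposed. Not called anywhere.
def _selftest_compound_split():
    assert compound_split("Blatt-Ornament") == ["Blatt", "Ornament"]
    assert compound_split("Tier/Fabelwesen") == ["Tier", "Fabelwesen"]
    assert compound_split("Eichenlaub") == ["Eichenlaub"]
    assert compound_split("") == []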
# =========================
# Normvokabular laden & Index
# =========================
def load_normvokabular(file_path):
"""Lädt Normvokabular aus Excel/ODS, erstellt Dictionarys für Mapping"""
try:
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
raise
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue # Übersichtsblätter ignorieren
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
# ID- und Wort-Spalte finden
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
"""
Mappt einen Begriff auf Normvokabular.
Prüft exakte Treffer, Lemma-Treffer, Komposita und generiert Vorschläge.
"""
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_norm in norm_dict:
e = norm_dict[term_norm]
logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
tokens = compound_split(term_norm)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
"""Ermittelt Vorschläge basierend auf Similarity"""
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Generic request with retries & caching
# =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
"""
Sendet GET-Requests mit Retry-Logik, Backoff und Caching
"""
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
if cache_key in CACHE:
logger.debug(f"[Cache] {api_name}: {cache_key}")
return CACHE[cache_key]
retries = 0
while retries < max_retries:
try:
r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
if r.status_code == 200:
try:
data = r.json()
except Exception:
data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
return data
else:
logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
raise ValueError(f"HTTP {r.status_code}")
except Exception as e:
retries += 1
wait = backoff ** retries
logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
time.sleep(wait)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
return None
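# --- Hedged usage sketch: a single lobid/GND lookup routed through the retry
# --- helper above (URL and params mirror batch_query_gnd below). Not called.
def _demo_retry_request():
    data = request_with_retries_generic(
        "gnd",
        "https://lobid.org/gnd/search",
        params={"q": "Eiche", "format": "json"},
        timeout=TIMEOUT_DEFAULT,
        max_retries=1,   # keep the demo cheap
    )
    return data          # parsed JSON on success, None after exhausted retries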
# =========================
# GND / Wikidata Batch Queries
# =========================
def batch_query_gnd(terms):
"""Batch-Abfrage der Begriffe bei GND"""
results = {}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://lobid.org/gnd/search"
params = {"q": t, "format": "json"}
data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""),
SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1] >= 0.75]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
return results
def batch_query_wikidata(terms):
"""Batch-Abfrage der Begriffe bei Wikidata"""
results = {}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://www.wikidata.org/w/api.php"
params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "search" in data:
# Ermittlung der Kandidaten mit Ähnlichkeitsbewertung
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
for e in data["search"] if e.get("label","")]
# Filterung nach Mindestähnlichkeit (0.70)
cands = [c for c in cands if c[1] >= 0.70]
if cands:
# Bestes Ergebnis nach Ähnlichkeit auswählen
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
return results
# =========================
# Markierung / Export (Excel/ODS)
# =========================
def mark_norm_hits(file_path):
"""
Markiert Treffer in Excel/ODS farblich:
Grün = Treffer, Rot = KEIN TREFFER
"""
ext = file_path.suffix.lower()
try:
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
# Spaltenmapping anhand der Kopfzeile
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
wb.save(file_path)
return
# Farben definieren
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext == ".ods":
# ODS: kein Zell-Fill, stattdessen Status-Spalte
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
except Exception as e:
logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
# =========================
# Fehlende Begriffe -> separate Datei
# =========================
def export_missing_terms(out_df, output_file):
"""
Speichert Begriffe ohne Treffer oder Vorschläge in separater Datei
"""
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
try:
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
missing_df.to_csv(missing_file, index=False, sep=";")
logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
except Exception as e:
logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")
# =========================
# Haupt-Loop: Verarbeitung Input-Dateien
# =========================
def process_files():
"""Verarbeitet alle Dateien im Input-Ordner, mappt Begriffe und speichert Ergebnisse"""
overall_start = time.time()
try:
# Normvokabular laden
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
except Exception as e:
logger.error("Normvokabular konnte nicht geladen werden. Beende.")
raise
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
raise SystemExit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
logger.info("Keine Dateien gefunden")
return
logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
for file_idx, file_path in enumerate(files, start=1):
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
continue
logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
file_start = time.time()
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
# Spalten identifizieren
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col:
logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
continue
# Begriffe extrahieren
row_terms_map = []
for r_idx, row in enumerate(df.itertuples(index=False), start=1):
try:
besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
except Exception:
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch:
continue
obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
if (r_idx % 200) == 0:
logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
# Alle einzigartigen Terme für API-Abfragen
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
total_unique_terms = len(all_terms)
# API-Abfragen
t0 = time.time()
gnd_results = batch_query_gnd(all_terms)
t1 = time.time()
logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
wd_results = batch_query_wikidata(all_terms)
t2 = time.time()
logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
# Build output rows
output_rows = []
processed_count = 0
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
processed_count += 1
if (processed_count % 200) == 0:
logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")
# Save output
out_df = pd.DataFrame(output_rows)
out_file = OUTPUT_DIR / f"{file_path.stem}_mapped.xlsx"
try:
out_df.to_excel(out_file, index=False, engine="openpyxl")
logger.info(f"Ergebnisse gespeichert: {out_file}")
mark_norm_hits(out_file)
export_missing_terms(out_df, out_file)
except Exception as e:
logger.error(f"Fehler beim Speichern der Ergebnisse für {file_path.name}: {e}")
elapsed_total = time.time() - overall_start
logger.info(f"Verarbeitung abgeschlossen. Gesamtzeit: {elapsed_total:.1f}s")
logger.info(f"Gesamtterme: {total_terms}, Treffer: {total_hits}, Trefferquote: {total_hits/total_terms:.2%}" if total_terms else "")
save_cache()
logger.stop()
if __name__ == "__main__":
process_files()

View File

@ -1,449 +0,0 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt einen Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
**Nutzung:**
```bash
python normvokabular_mapper.py
python normvokabular_mapper.py --dry-run # nur Simulation der API-Abfragen
"""
import os
import sys
import time
import json
import re
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
from collections import defaultdict
# =========================
# Argumente / Dry-Run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
# =========================
# Konfiguration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd":0, "wikidata":0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
CONF_THRESHOLD = 0.75 # für Vorschläge
# =========================
# Logging
# =========================
def log(level, msg):
ts = time.strftime("%Y-%m-%d %H:%M:%S")
print(f"[{ts}] [{level}] {msg}")
# =========================
# Cache laden / speichern
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
log("INFO", f"Cache geladen: {CACHE_FILE}")
except:
CACHE = {}
else:
CACHE = {}
def save_cache():
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
log("DEBUG","Cache gespeichert")
except Exception as e:
log("ERROR", f"Cache speichern fehlgeschlagen: {e}")
# =========================
# Normalisierung / Stemming
# =========================
try:
from nltk.stem.snowball import GermanStemmer
STEMMER = GermanStemmer()
log("INFO","NLTK GermanStemmer verfügbar")
except:
STEMMER = None
log("WARNING","NLTK nicht verfügbar, naive Pluralreduktion wird genutzt")
def normalize_text(s):
if s is None:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
def naive_stem(w):
for ending in ("ern","nen","en","er","e","n","s"):
if w.endswith(ending) and len(w)-len(ending)>=3:
return w[:-len(ending)]
return w
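# --- Hedged self-check of the naive fallback stemmer (suffixes are checked in
# --- the order above; stems shorter than three characters are never produced):
def _selftest_naive_stem():
    assert naive_stem("blumen") == "blum"
    assert naive_stem("bäume") == "bäum"
    assert naive_stem("ei") == "ei"   # too short to strip anything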
def stem_word(word):
w = normalize_text(word)
try:
return STEMMER.stem(w) if STEMMER else naive_stem(w)
except:
return naive_stem(w)
# =========================
# Normvokabular laden (NV_MASTER) mit Parent-ID & Stem-Index
# =========================
def load_normvokabular(file_path):
import pandas as pd
import re
log("INFO", f"Normvokabular laden: {file_path}")
engine = "odf" if file_path.suffix.lower() == ".ods" else None
sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)
norm_dict = {}
stem_index = defaultdict(list)
count = 0
for sheet_name, df in sheets.items():
df.columns = [str(c).strip() for c in df.columns]
current_parent_id = None
for _, row in df.iterrows():
# Spaltennamen flexibel anpassen
id_val = str(row.get("ID","")).strip() if "ID" in df.columns else ""
wort = str(row.get("Wort/Vokabel","")).strip() if "Wort/Vokabel" in df.columns else ""
# Zeilen mit ID, aber ohne Vokabel → Update Parent-ID
if id_val:
current_parent_id = id_val
# Skip leere Vokabeln
if not wort:
continue
assigned_id = current_parent_id # Parent-ID übernehmen
key = normalize_text(wort)
entry = {
"Name": wort,
"ID": assigned_id,
"Sheet": sheet_name
}
norm_dict[key] = entry
stem_index[stem_word(key)].append(entry)
count += 1
log("INFO", f"{count} Begriffe aus Normvokabular geladen")
return norm_dict, stem_index
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index):
tnorm = normalize_text(term)
tstem = stem_word(tnorm)
# Exakter Treffer
if tnorm in norm_dict:
e = norm_dict[tnorm]
return e["Name"], e["ID"], []
# Gestemmter Treffer
if tstem in stem_index:
e = stem_index[tstem][0]
return e["Name"], e["ID"], []
# Kein Treffer → Vorschläge
suggestions = get_suggestions(tnorm, norm_dict)
return "KEIN TREFFER", "", suggestions
def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
t = term.lower()
scores = []
for key, val in norm_dict.items():
score = SequenceMatcher(None, t, key).ratio()
if score >= threshold:
scores.append((score, val["Name"], val["ID"]))
scores.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]
# =========================
# API-Abgleich (Top1) unverändert
# =========================
def request_with_retries(api_name,url,params=None):
if DRY_RUN:
return None
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries<MAX_RETRIES:
try:
r = requests.get(url,params=params,timeout=TIMEOUT,headers=HEADERS)
if r.status_code==200:
try: data=r.json()
except: data=r.text
CACHE[cache_key]=data
FAIL_COUNTER[api_name]=0
return data
except:
pass
retries+=1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name]+=1
if FAIL_COUNTER[api_name]>=10:
API_ACTIVE[api_name]=False
return None
def compute_min_conf(term,api_name):
l=len(term.strip())
if l<=3: return 0.90
if l<=6: return 0.85 if api_name=='gnd' else 0.80
return 0.75 if api_name=='gnd' else 0.70
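# --- Hedged self-check of the length-dependent confidence thresholds above:
def _selftest_compute_min_conf():
    assert compute_min_conf("Axt", "gnd") == 0.90          # <= 3 characters: strict
    assert compute_min_conf("Blume", "wikidata") == 0.80   # 4-6 characters
    assert compute_min_conf("Ornamentik", "gnd") == 0.75   # longer terms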
def batch_query_gnd(terms):
results={}
if DRY_RUN or not API_ACTIVE.get("gnd",False):
for t in terms: results[t]="TEST_GND"
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data=request_with_retries("gnd",url,params)
top=""
if data and "member" in data:
min_conf=compute_min_conf(t,'gnd')
cands=[]
for doc in data["member"]:
name=doc.get("preferredName","") or doc.get("name","")
if not name: continue
conf=SequenceMatcher(None,t.lower(),name.lower()).ratio()
if conf>=min_conf: cands.append((name,conf))
if cands:
top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t]=top
return results
def batch_query_wikidata(terms):
results={}
if DRY_RUN or not API_ACTIVE.get("wikidata",False):
for t in terms: results[t]="TEST_WD"
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data=request_with_retries("wikidata",url,params)
top=""
if data and "search" in data:
min_conf=compute_min_conf(t,'wikidata')
cands=[]
for e in data["search"]:
label=e.get("label","")
if not label: continue
conf=SequenceMatcher(None,t.lower(),label.lower()).ratio()
if conf>=min_conf: cands.append((label,conf))
if cands:
top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t]=top
return results
# =========================
# Format-dependent highlighting / status
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
log("WARNING","Spalte 'Norm_Treffer' nicht gefunden, keine Markierung möglich")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value!="KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
log("INFO","Excel: Treffer farblich markiert (grün=Treffer, rot=kein Treffer)")
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
log("INFO","ODS: Spalte 'Norm_Status' eingefügt (Treffer / Kein Treffer)")
else:
log("WARNING","Unbekanntes Dateiformat, keine Markierung durchgeführt")
# =========================
# Processing input files
# =========================
def process_files():
norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
total_terms=0
total_norm_hits=0
if not INPUT_DIR.exists():
log("CRITICAL",f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files=list(INPUT_DIR.glob("*"))
if not files:
log("WARNING","Keine Dateien gefunden")
for file_path in files:
if not file_path.suffix.lower() in [".ods",".xlsx",".csv",".xls"]:
continue
log("INFO",f"Verarbeite Datei: {file_path.name}")
# Output-Datei für diese Input-Datei erzeugen
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
try:
if file_path.suffix.lower()==".csv":
df=pd.read_csv(file_path)
elif file_path.suffix.lower()==".ods":
df=pd.read_excel(file_path, engine="odf")
else:
df=pd.read_excel(file_path)
except Exception as e:
log("ERROR",f"Datei {file_path.name} konnte nicht gelesen werden: {e}")
continue
df.columns=[str(c).strip() for c in df.columns]
row_terms_map=[]
for _,row in df.iterrows():
besch=row.get("Objektbeschreibung","")
if pd.isna(besch) or not str(besch).strip(): continue
besch=str(besch).strip()
clauses=[c.strip() for c in re.split(r",",besch) if c.strip()]
terms=[]
for clause in clauses:
parts=[p.strip() for p in re.split(r"\s+",clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+",p): continue
terms.append(p)
obj_box=row.get("Objekt/Ebene","")
urheber=row.get("Urheber","")
row_terms_map.append((obj_box,urheber,terms))
all_terms=[]
for _,_,terms in row_terms_map:
all_terms.extend(terms)
all_terms = list(set(all_terms)) # unique
gnd_results=batch_query_gnd(all_terms)
wd_results=batch_query_wikidata(all_terms)
output_rows=[]
for obj_box,urheber,terms in row_terms_map:
for term in terms:
norm_name,norm_id,suggestions = map_to_norm(term,norm_dict, stem_index)
total_terms+=1
if norm_name!="KEIN TREFFER":
total_norm_hits+=1
out_row={
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df=pd.DataFrame(output_rows)
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file,index=False,engine=engine)
log("INFO",f"Auswertung gespeichert: {output_file}")
mark_norm_hits(output_file)
save_cache()
log("INFO",f"Gesamt: {total_terms} Begriffe, {total_norm_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__=="__main__":
process_files()
log("INFO","Fertig")

View File

@ -1,471 +0,0 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
"""
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except:
SPACY_AVAILABLE = False
nlp = None
print("Spacy nicht verfügbar nutze naive Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# =========================
# Kompositum-Zerlegung (erweitert)
# =========================
def compound_split(term, norm_dict):
"""
Zerlegt Komposita durch Prüfen auf Substrings, die im Normvokabular vorkommen.
"""
term_norm = normalize_text(term)
matches = []
for i in range(len(term_norm)):
for j in range(i+3, len(term_norm)+1):
sub = term_norm[i:j]
if sub in norm_dict and sub not in matches:
matches.append(sub)
if not matches:
matches = [term_norm]
return matches
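# Example (assumed vocabulary): for the term "Notenblatt" and a norm_dict that
# contains the key "noten", compound_split returns ["noten"]; if no substring of
# at least 3 characters is found in the vocabulary, the normalized term itself
# is returned unchanged.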
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id,
"Sheet": sheet_name,
"Own_ID": row_id or ""
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Vorschläge & Fuzzy Matching
# =========================
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score_token = fuzz.token_set_ratio(term_lemma, key_lemma)/100
score_partial = fuzz.partial_ratio(term_lemma, key_lemma)/100
score = max(score_token, score_partial)
else:
score_seq = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
score = score_seq
# Substring-Boost
if term_lemma in key_lemma or key_lemma in term_lemma:
score = max(score, 0.9)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Mapping auf Normvokabular
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split & Teilbegriffe prüfen
tokens = compound_split(term, norm_dict)
token_matches = []
all_suggestions = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
all_suggestions.extend(sugg)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_matches = [m[1] for m in token_matches if m[1] != "KEIN TREFFER"]
if combined_matches:
return "KEIN TREFFER", "", combined_matches
elif all_suggestions:
return "KEIN TREFFER", "", all_suggestions
else:
return "KEIN TREFFER", "", []
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try: data = r.json()
except: data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
return data
except:
pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Export mit zweitem Sheet für Begriffe ohne Treffer und Vorschlag
# =========================
def export_results_with_no_hits(out_df, output_file):
"""
Exportiert das Mapping-Ergebnis und zusätzlich ein zweites Sheet
mit allen Begriffen, deren Norm_Treffer == 'KEIN TREFFER' und Norm_Vorschlag leer ist.
"""
# Begriffe ohne Treffer und ohne Vorschlag
no_match_df = out_df[(out_df["Norm_Treffer"]=="KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip()==""))].copy()
ext = output_file.suffix.lower()
if ext in [".xlsx", ".xls"]:
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
out_df.to_excel(writer, index=False, sheet_name="Mapping")
no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
elif ext == ".ods":
# ODS-Export via odf-Engine
with pd.ExcelWriter(output_file, engine="odf") as writer:
out_df.to_excel(writer, index=False, sheet_name="Mapping")
no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
export_results_with_no_hits(out_df, output_file)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

View File

@ -1,509 +0,0 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
"""
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except:
SPACY_AVAILABLE = False
nlp = None
print("Spacy nicht verfügbar nutze naive Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
# Lemma-Cache
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# =========================
# Kompositum-Zerlegung (einfacher Ansatz)
# =========================
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
return parts if parts else [term]
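# Example: the regex only splits on capital letters inside the term, so
# "NotenBlatt" (assumed input) -> ["Noten", "Blatt"], while an ordinary
# lowercase-continued compound such as "Notenblatt" -> ["Notenblatt"].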
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {} # für RapidFuzz preprocessed
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id, # Parent-ID
"Sheet": sheet_name,
"Own_ID": row_id or "" # eigene ID, falls vorhanden
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split
tokens = compound_split(term)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try: data = r.json()
except: data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
return data
except:
pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Verarbeitung Input-Dateien
# =========================
# =========================
# New function: export missing terms to a separate file
# =========================
def export_missing_terms(out_df, output_file):
# Filter: KEIN TREFFER & keine Vorschläge
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
print(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
# Neue Datei erzeugen
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
# Bei vorhandener Datei: Versionsnummer anhängen
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
# Für CSV
missing_df.to_csv(missing_file, index=False, sep=";")
print(f"Fehlende Begriffe gespeichert: {missing_file}")
# =========================
# Processing input files (final)
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file, index=False, engine=engine)
# --- NEU: fehlende Begriffe in separate Datei ---
export_missing_terms(out_df, output_file)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

View File

@ -1,2 +0,0 @@
Macro for the data-entry table that is meant to let you accept and apply term suggestions with a single click.
Does not work in LibreOffice, but it should run in Excel.

View File

@ -1,195 +0,0 @@
# -*- coding: utf-8 -*-
# LibreOffice Calc Makro: Interaktive Termersetzung mit Logging
import os
import re
try:
import uno
from com.sun.star.awt import XActionListener
UNO_AVAILABLE = True
except ImportError:
UNO_AVAILABLE = False
# Log-Datei definieren
BASE_DIR = os.path.expanduser("~/.config/libreoffice/4/user/Scripts/python/")
LOG_FILE = os.path.join(BASE_DIR, "vorschlag_ersetzen_log.txt")
def log(msg):
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
def interactive_term_replace():
"""Makro zum interaktiven Ersetzen von Termen mit Logging."""
if not UNO_AVAILABLE:
log("Fehler: UNO-Bindings nicht verfügbar. Bitte LibreOffice-Python verwenden.")
return
try:
doc = XSCRIPTCONTEXT.getDocument()
controller = doc.getCurrentController()
if controller is None:
log("Fehler: Kein aktiver Controller gefunden.")
return
sheet = controller.ActiveSheet
if sheet is None:
log("Fehler: Kein aktives Sheet gefunden.")
return
# Aktive Zelle abrufen
try:
cell = controller.getActiveCell()
except Exception as e:
log(f"Fehler: Konnte aktive Zelle nicht abrufen: {e}")
return
if cell is None:
log("Fehler: Keine Zelle ausgewählt.")
return
cell_text = str(cell.String).strip()
if not cell_text:
log("Info: Ausgewählte Zelle ist leer.")
return
# Term-Liste erzeugen
clauses = [c.strip() for c in re.split(r",", cell_text) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
terms.extend(parts)
if not terms:
log("Info: Keine Terme in der Zelle gefunden.")
return
# Norm_Vorschlag Spalte finden
header_row = None
norm_sug_col = None
for r in range(5):
for c in range(sheet.Columns.Count):
try:
val = str(sheet.getCellByPosition(c, r).String).strip()
except Exception:
val = ""
if val == "Norm_Vorschlag":
norm_sug_col = c
if val.lower() == "objektbeschreibung":
header_row = r
if header_row is not None and norm_sug_col is not None:
break
if header_row is not None and norm_sug_col is not None:
break
if norm_sug_col is None:
log("Fehler: Spalte 'Norm_Vorschlag' nicht gefunden.")
return
# Vorschläge auslesen
row = cell.RangeAddress.StartRow
sugg_str = str(sheet.getCellByPosition(norm_sug_col, row).String).strip()
all_suggestions = [s.strip() for s in sugg_str.split("|") if s.strip()]
if not all_suggestions:
log(f"Info: Keine Vorschläge in Zeile {row+1}.")
return
# Dialog erstellen (wie vorher)
toolkit = XSCRIPTCONTEXT.getDesktop().getCurrentFrame().getContainerWindow().getToolkit()
dialog_model = ctx.ServiceManager.createInstance("com.sun.star.awt.UnoControlDialogModel")
dialog_model.PositionX = 100
dialog_model.PositionY = 100
dialog_model.Width = 300
dialog_model.Height = 250
dialog_model.Title = "Term ersetzen"
# ListBox: Terme
term_list_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
term_list_model.Name = "term_list"
term_list_model.PositionX = 10
term_list_model.PositionY = 10
term_list_model.Width = 280
term_list_model.Height = 80
term_list_model.StringItemList = tuple(terms)
term_list_model.MultiSelection = False
dialog_model.insertByName("term_list", term_list_model)
# ListBox: Vorschläge
sugg_list_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
sugg_list_model.Name = "sugg_list"
sugg_list_model.PositionX = 10
sugg_list_model.PositionY = 100
sugg_list_model.Width = 280
sugg_list_model.Height = 80
sugg_list_model.StringItemList = tuple(all_suggestions)
sugg_list_model.MultiSelection = False
dialog_model.insertByName("sugg_list", sugg_list_model)
# Checkbox für "Alle Terme ersetzen"
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
cb_model.Name = "replace_all"
cb_model.PositionX = 10
cb_model.PositionY = 190
cb_model.Width = 200
cb_model.Height = 15
cb_model.Label = "Alle Terme ersetzen"
dialog_model.insertByName("replace_all", cb_model)
# OK-Button
ok_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
ok_model.Name = "ok_button"
ok_model.PositionX = 200
ok_model.PositionY = 190
ok_model.Width = 80
ok_model.Height = 25
ok_model.Label = "OK"
dialog_model.insertByName("ok_button", ok_model)
# Dialog erstellen
dialog = ctx.ServiceManager.createInstance("com.sun.star.awt.UnoControlDialog")
dialog.setModel(dialog_model)
dialog.setVisible(False)
dialog.createPeer(toolkit, None)
# ActionListener für OK
class OkListener(XActionListener):
def __init__(self, dialog, cell, terms):
self.dialog = dialog
self.cell = cell
self.terms = terms
def actionPerformed(self, event):
try:
list_box = self.dialog.getControl("term_list")
sugg_box = self.dialog.getControl("sugg_list")
cb = self.dialog.getControl("replace_all")
selected_sugg_idx = sugg_box.getSelectedItemPos()
if selected_sugg_idx < 0:
self.dialog.endExecute()
return
selected_sugg = sugg_box.getItem(selected_sugg_idx)
replace_all = cb.State == 1
if replace_all:
self.cell.String = " ".join([selected_sugg for t in self.terms])
else:
term_idx = list_box.getSelectedItemPos()
if term_idx >= 0:
self.terms[term_idx] = selected_sugg
self.cell.String = " ".join(self.terms)
except Exception as e:
log(f"Fehler während Dialogaktion: {e}")
finally:
self.dialog.endExecute()
def disposing(self, event):
pass
listener = OkListener(dialog, cell, terms)
ok_btn = dialog.getControl("ok_button")
ok_btn.addActionListener(listener)
dialog.execute()
except Exception as e:
log(f"Unbekannter Fehler: {e}")
# Export für LibreOffice
g_exportedScripts = (interactive_term_replace,)

View File

@ -1 +0,0 @@
,jarnold,workPC,17.10.2025 09:20,file:///home/jarnold/.config/libreoffice/4;

View File

@ -1,125 +0,0 @@
= User Guide - NV_MASTER Matching Macro =
'''(mapper_macro_2.x.py)'''
== 1. What the macro does ==
This macro helps you standardize the terms in the evaluation table.
It automatically compares the contents of the "Objektbeschreibung" column against a
controlled-vocabulary reference file called "NV_MASTER.ods".
This shows you which terms are already normalized, where suitable suggestions exist,
and where nothing was recognized at all.
In the evaluation table, the macro color-codes every row under "Objektbeschreibung":
* <span style="color:green;">Green</span>: Everything matches, all terms found
* <span style="color:yellow;">Yellow</span>: Some terms were recognized, others were not
* <span style="color:red;">Red</span>: Not a single term was recognized
Example:
{| class="wikitable"
|+ Tabelle 1
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| (leer) || || ||
|}
Die Spalten „Norm_Treffer“, „Norm_Vorschlag“ und „Kein_Treffer“ legt das Makro
automatisch an, wenn sie fehlen.
! Tipps zur Nutzung !
* Wenn du die NV_MASTER-Datei änderst, starte das Makro neu es liest sie bei
jedem Lauf neu ein.
* Erstelle ein Backup der Auswertungstabelle, bevor du das Makro ausführst.
* Schaue ab und zu in die Logdatei, um zu prüfen, ob alles korrekt läuft.
* Wenn ein Begriff rot markiert wird, aber deiner Meinung nach sinnvoll und zutreffend
für das beschriebene Objekt ist, schreibe den Begriff auf und sprich mit deinen
Vorgesetzten ab, ob er in das Normvokabular aufgenommen werden sollte.
== 2. Where the macro's files must be located ==
'''On Linux:'''
<pre>
/home/&lt;your-username&gt;/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/
</pre>
'''On Windows:'''
<pre>
C:\Users\<your-username>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\
</pre>
This folder must contain:
* mapper_macro_2.x.py (the macro)
* NV_MASTER.ods (the reference file)
* optional: mapper_macro_2.x.log (created automatically)
== 3. How to start the macro ==
# Open your Calc file containing the terms
# In the menu, go to Tools → Macros → Run Macro...
# Select: My Macros → mapper_macro_2.x.py → run_mapper_macro
# Click Run
The macro starts right away. Depending on the size of the table, the comparison takes a few seconds to a few minutes.
If nothing happens, an error has occurred. In that case check the .log file that the macro writes on every run; it is in the same folder as the macro.
== 4. How to read the result ==
After the run, the macro writes the matches and suggestions directly into your table and marks them:
{| class="wikitable"
|+ Table 2
|-
! Objektbeschreibung !! Norm_Treffer !! Norm_Vorschlag !! Kein_Treffer
|-
| Harfe, Noten, Bäume, Geldbeutel, Landschaft, Gewässer || Harfe (2.1) &#124; Noten (3.4) &#124; Landschaft (7.2) &#124; Gewässer (9.1) || Baum || Geldbeutel
|}
Colors:
* 🟩 <span style="color:green;">Green</span>: All terms were recognized directly → perfect!
* 🟨 <span style="color:yellow;">Yellow</span>: Some terms were recognized, others only partially or not at all → check the suggestions in the "Norm_Vorschlag" column
* 🟥 <span style="color:red;">Red</span>: No term was found → adjust the Objektbeschreibung and, if appropriate, propose new terms for the controlled vocabulary
== 5. Where the log file is located ==
The macro writes everything that happens to a log file:
'''Linux:''' /home/<your-username>/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.x.log
'''Windows:''' C:\Users\<your-username>\AppData\Roaming\LibreOffice\4\user\Scripts\python\Vokabular_Abgleich_Makro\mapper_macro_2.x.log
There you can see:
* when the macro was started
* how many rows were processed
* and whether any errors occurred
Typical problems and how to fix them:
{| class="wikitable"
|+ Table 3
|-
! Problem !! Cause !! Solution
|-
| The macro does not show up || Wrong location || Check that the script really is in the Scripts/python folder
|-
| Error message "Module not found" || Missing Python libraries || Install pandas, odfpy, spacy, rapidfuzz
|-
| NV_MASTER is not read || The file is missing or corrupted || Check the file name and location
|-
| LibreOffice crashes || Very large file or a faulty NV_MASTER || Test with a smaller file or a fresh NV_MASTER
|}
== 6. What the macro needs to run properly ==
All of the following packages are required by the macro, whether you use LibreOffice or Excel:
{| class="wikitable"
|+ Table 4
|-
! Package !! Purpose
|-
| pandas || Reads the reference file (NV_MASTER.ods)
|-
| odfpy || Enables reading .ods files (for pandas.read_excel(..., engine="odf"))
|-
| spacy || Lemmatization (optional, but recommended)
|-
| rapidfuzz || Fast fuzzy matching (alternative to difflib)
|-
| openpyxl || Needed if .xlsx files are used
|-
| python-dateutil || Required automatically by pandas
|}
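A minimal installation sketch for these packages (assuming pip is available for the Python interpreter that LibreOffice uses; the exact interpreter path depends on your installation):
<pre>
pip install pandas odfpy spacy rapidfuzz openpyxl python-dateutil
python -m spacy download de_core_news_sm
</pre>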

View File

@ -1,622 +0,0 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:143.0) Gecko/20100101 Firefox/143.0" version="28.2.5">
<diagram name="Page-1" id="aLmyRVYCle99qeRE2JvP">
<mxGraphModel dx="1301" dy="1900" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="WA2_J1DCvVjPXciXSW-M-3" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="85" y="932" width="310" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-2" value="Scan- und Erfassungsprozess" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="WA2_J1DCvVjPXciXSW-M-3" vertex="1">
<mxGeometry x="60" y="-900" width="210" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-4" target="WA2_J1DCvVjPXciXSW-M-6" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-14" value="Makro gibt Vorschläge aus NV zurück" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-13" vertex="1" connectable="0">
<mxGeometry x="0.2678" y="-1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-4" value="&lt;div&gt;Makro (mapper_macro_2.x.py)&lt;/div&gt;" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="575" y="52" width="200" height="100" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.367;exitY=0.988;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-1" target="WA2_J1DCvVjPXciXSW-M-4" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="405" y="97" as="sourcePoint" />
<Array as="points">
<mxPoint x="235" y="91" />
<mxPoint x="235" y="117" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-12" value="Wird vom Makro gelesen und mit NV abgeglichen" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-5" vertex="1" connectable="0">
<mxGeometry x="0.0228" y="4" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-6" value="Anpassung der Erfassungstabelle anhand der Vorschläge" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="80" y="212" width="320" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-10" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="515" y="972" as="sourcePoint" />
<mxPoint x="515" y="12" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-11" value="&lt;h1 style=&quot;margin-top: 0px;&quot;&gt;Workflow Digitalisierung -&lt;/h1&gt;&lt;h1 style=&quot;margin-top: 0px;&quot;&gt;&lt;u&gt;&lt;font style=&quot;font-size: 20px;&quot;&gt;Objekterfassung und Pflege des Normvokabulars&lt;/font&gt;&lt;/u&gt;&lt;/h1&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Erfassung und Verschlagwortung von Bildobjekten&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Abgleich mit internem Normvokabular&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- API-Abgleich mit getty und GND&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;- Pflege und Erweiterung des Normvokabulars&lt;/font&gt;&lt;/div&gt;" style="text;html=1;whiteSpace=wrap;overflow=hidden;rounded=0;" parent="1" vertex="1">
<mxGeometry x="30" y="-1070" width="455" height="220" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-1" value="&lt;div&gt;Scan und Erfassen der Objekte, Erfassung in Tabellen, Spalte &quot;Objektbeschreibung&quot;&lt;/div&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="85" y="32" width="310" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-16" value="Makro 2 (Übernahme von Vorschlägen aus NV per Klick)" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="575" y="292" width="190" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-20" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="460" y="312" width="90" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-23" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.071;entryY=0.25;entryDx=0;entryDy=0;entryPerimeter=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;dashed=1;" parent="1" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="592.495" y="462" as="targetPoint" />
<mxPoint x="232.5700000000001" y="432" as="sourcePoint" />
<Array as="points">
<mxPoint x="233" y="462" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-34" value="Gleiche Funktion wie Makro 1 + API-Abgleich" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-23" vertex="1" connectable="0">
<mxGeometry x="-0.4298" relative="1" as="geometry">
<mxPoint x="53" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-15" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-21">
<mxGeometry relative="1" as="geometry">
<mxPoint x="165.20000000000005" y="510" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-21" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="362" width="320" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-37" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.452;entryY=-0.007;entryDx=0;entryDy=0;entryPerimeter=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-24" target="WA2_J1DCvVjPXciXSW-M-33" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="232" y="480" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-60" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-37" vertex="1" connectable="0">
<mxGeometry x="-0.0997" y="-1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-24" value="NormVokabular_Mapper.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="592.5" y="432" width="175" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-27" value="&lt;u&gt;&lt;b&gt;WHK/Manuell&lt;/b&gt;&lt;/u&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="145" width="100" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-28" value="&lt;b&gt;&lt;u&gt;Programm/automatisiert&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="590" width="160" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-29" value="&lt;div&gt;Mögliche Optimierung, funktioniert aber nicht in LO&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;fontSize=8;" parent="1" vertex="1">
<mxGeometry x="570" y="362" width="200" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-32" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.484;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-6" target="WA2_J1DCvVjPXciXSW-M-21" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-17" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-33">
<mxGeometry relative="1" as="geometry">
<mxPoint x="247.5" y="720" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-33" value="Bereinigte Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="512" width="325" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-35" value="&lt;ul&gt;&lt;li&gt;Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus&lt;/li&gt;&lt;li&gt;Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit&lt;/li&gt;&lt;li&gt;Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe&lt;/li&gt;&lt;li&gt;Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt&lt;/li&gt;&lt;li&gt;API-Abgleich (aktuell GND und wikidata, Top1-Treffer)&lt;/li&gt;&lt;li&gt;Erstellt eine Auswertungsdatei, markiert Begriffe entsprechend ihres Status)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
<mxGeometry x="520" y="532" width="300" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-93" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-39" target="WA2_J1DCvVjPXciXSW-M-45" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-39" value="Aufnahme ins Normvokabular oder Verwerfen des Begriffs" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="85" y="722" width="330" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-43" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="515" y="1192" as="sourcePoint" />
<mxPoint x="515" y="962" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-94" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0;entryY=0.5;entryDx=0;entryDy=0;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-45" target="WA2_J1DCvVjPXciXSW-M-46" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="615" y="832" as="targetPoint" />
<Array as="points">
<mxPoint x="475" y="822" />
<mxPoint x="475" y="822" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-45">
<mxGeometry relative="1" as="geometry">
<mxPoint x="167.66666666666674" y="980" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-45" value="Manuelle Anpassung der Normvokabular-Masterfile" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="802" width="330" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-92" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;dashed=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-46" target="WA2_J1DCvVjPXciXSW-M-52" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="695" y="912" />
<mxPoint x="198" y="912" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-220" value="gibt aus" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-92" vertex="1" connectable="0">
<mxGeometry x="0.3024" y="-2" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-46" value="Masterfile_Editor.py" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="635" y="782" width="120" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-49" value="liest und bereinigt Normvokabular" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="427.5" y="817" width="200" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-58" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-221" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" parent="1" source="WA2_J1DCvVjPXciXSW-M-52" target="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-222" value="=" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="WA2_J1DCvVjPXciXSW-M-221" vertex="1" connectable="0">
<mxGeometry x="-0.3079" y="1" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-52" value="Aktualisierte Masterfile, mit allen Änderungen und in der richtigen Struktur" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="85" y="980" width="225" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-59" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-57" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="70" y="242" as="targetPoint" />
<Array as="points">
<mxPoint x="40" y="1130" />
<mxPoint x="40" y="242" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-57" value="Masterfile Normvokabular Updated" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="117.5" y="1100" width="160" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-64" value="" style="html=1;shadow=0;dashed=0;align=center;verticalAlign=middle;shape=mxgraph.arrows2.arrow;dy=0.6;dx=40;notch=0;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="410" y="1107.5" width="90" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-200" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-65" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="40" y="1140" as="targetPoint" />
<Array as="points">
<mxPoint x="680" y="1180" />
<mxPoint x="40" y="1180" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-65" value="&lt;div&gt;Normvokabular-Masterfile muss&amp;nbsp;&lt;/div&gt;&lt;div&gt;&lt;b&gt;zentral&lt;/b&gt; als &lt;b&gt;SPOT&lt;/b&gt; vorliegen und gepflegt werden können&lt;/div&gt;" style="ellipse;whiteSpace=wrap;html=1;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="575" y="1075" width="210" height="85" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-66" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="485" y="-1046" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-67" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FF6666;" parent="1" vertex="1">
<mxGeometry x="485" y="-1006" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-68" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="485" y="-966" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-69" value="" style="whiteSpace=wrap;html=1;aspect=fixed;fillColor=#FFFF66;" parent="1" vertex="1">
<mxGeometry x="485" y="-926" width="20" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-70" value="&lt;b&gt;Datei&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="518" y="-1050" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-71" value="&lt;b&gt;Fehlender Schritt/Optimierungsmöglichkeit&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="510" y="-1011" width="270" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-72" value="&lt;b&gt;Vorgang, WHK&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="515" y="-971" width="110" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-73" value="&lt;b&gt;Programm&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="520" y="-931" width="80" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-74" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="35" y="-850" as="sourcePoint" />
<mxPoint x="805" y="-850" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-75" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;&lt;u&gt;&lt;font&gt;Probleme/Noch zu klären:&lt;/font&gt;&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;&lt;ul&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Makro 1 und NormVokabular-Mapper&lt;/b&gt; redundant, eine Methode festlegen (Makro benutzerfreundlicher, Treffer/Vorschläge direkt in Erfassung sichtbar, Mapper genauer, API-Abgleich, Auswertungsdatei übersichtlicher)&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Makro 2&lt;/b&gt; (Vorschläge aus Normvokabular können automatisch per Klick in die Erfassungstabelle übernommen werden)&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Normvokabular&lt;/b&gt;: Eine zentrale .json als SPOT etablieren und zentral in alle Prozesse einbinden&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Mapper&lt;/b&gt;&amp;nbsp;oder &lt;b&gt;Makro&lt;/b&gt; benötigt Funktion, Wörter ohne Treffer und Vorschlag in &lt;br&gt;eigene Liste zu übernehmen und auszugeben -&amp;gt; manuelle Prüfung&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Normvokabular&lt;/b&gt;: Regeln, ID-Struktur, Kategorien müssen auf Qualität und Nutzbarkeit geprüft werden; danach Anpassung aller Programme, die sich auf Normvokabular stützen&lt;/font&gt;&lt;/li&gt;&lt;/ul&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="rounded=0;whiteSpace=wrap;html=1;align=left;spacing=2;spacingRight=0;" parent="1" vertex="1">
<mxGeometry x="40" y="1232" width="770" height="190" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-80" value="&lt;ul&gt;&lt;li&gt;Liest Spalte Objektbeschreibung aus, filtert Stopwords und Zahlen raus&lt;/li&gt;&lt;li&gt;Normalisierung, Lemmatisierung, Stemming der Wörter für höhere Trefferwahrscheinlichkeit&lt;/li&gt;&lt;li&gt;Liest das Normvokabular, Berücksichtigt ID-Hierarchie, erstellt Index für gestemmte Begriffe, cache und log&lt;/li&gt;&lt;li&gt;Abgleich mit Normvokabular, generiert Vorschläge wenn kein Treffer vorliegt&lt;/li&gt;&lt;li&gt;Markiert Treffer, Vorschläge und Keine Treffer&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;align=left;" parent="1" vertex="1">
<mxGeometry x="525" y="132" width="300" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-81" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="245" y="322" as="sourcePoint" />
<mxPoint x="455" y="322" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-83" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.055;entryY=0.48;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" target="WA2_J1DCvVjPXciXSW-M-64" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="281" y="1117" as="sourcePoint" />
<mxPoint x="365" y="1002" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-91" value="&lt;ul&gt;&lt;li&gt;Automatische Spaltenerkennung (ID, Name/Wort/Vokabel)&lt;/li&gt;&lt;li&gt;Aufbau einer hierarchischen Struktur (Ober-, Unter-, Unterunterkategorien)&lt;/li&gt;&lt;li&gt;Erstellung eines Mastersheets mit eindeutigen IDs&lt;/li&gt;&lt;li&gt;Sortierte Ausgabe nach vordefinierter Sheet-Reihenfolge&lt;/li&gt;&lt;li&gt;Protokollierung im Terminal (Zeilenanzahl, Warnungen, ID-Zählung)&lt;/li&gt;&lt;li&gt;Speicherung einer neuen, synchronisierten Output-Datei ohne Änderung der Originaldatei&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;fontSize=10;" parent="1" vertex="1">
<mxGeometry x="510" y="902" width="310" height="160" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-96" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="15" y="1460" as="sourcePoint" />
<mxPoint x="815" y="1460" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-192" value="&lt;font style=&quot;font-size: 19px;&quot;&gt;&lt;b&gt;&lt;u&gt;3. Aktuelle Struktur des Normvokabulars (Stand 10/25)&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="35" y="1480" width="510" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-195" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="90" y="1740" width="580" height="380" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-97" value="Assets" style="childLayout=tableLayout;recursiveResize=0;strokeColor=#98bf21;fillColor=#A7C942;shadow=1;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
<mxGeometry x="50" y="40" width="550" height="330" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-98" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry width="550" height="43" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-99" value="ID" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry width="117" height="43" as="geometry">
<mxRectangle width="117" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-100" value="Unterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="117" width="159" height="43" as="geometry">
<mxRectangle width="159" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-101" value="Unterunterkategorie" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="276" width="137" height="43" as="geometry">
<mxRectangle width="137" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-165" value="Wort/Vokabel" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#A7C942;align=center;fontStyle=1;fontColor=#FFFFFF;html=1;" parent="WA2_J1DCvVjPXciXSW-M-98" vertex="1">
<mxGeometry x="413" width="137" height="43" as="geometry">
<mxRectangle width="137" height="43" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-102" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="43" width="550" height="42" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-103" value="7.1.1" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry width="117" height="42" as="geometry">
<mxRectangle width="117" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-104" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="117" width="159" height="42" as="geometry">
<mxRectangle width="159" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-105" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="276" width="137" height="42" as="geometry">
<mxRectangle width="137" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-166" value="Außenarchitektur allgemein" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-102" vertex="1">
<mxGeometry x="413" width="137" height="42" as="geometry">
<mxRectangle width="137" height="42" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-187" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="85" width="550" height="41" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-188" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry width="117" height="41" as="geometry">
<mxRectangle width="117" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-189" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="117" width="159" height="41" as="geometry">
<mxRectangle width="159" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-190" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="276" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-191" value="Hof" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-187" vertex="1">
<mxGeometry x="413" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-106" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="126" width="550" height="41" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-107" value="7.1.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry width="117" height="41" as="geometry">
<mxRectangle width="117" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-108" value="Außenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="117" width="159" height="41" as="geometry">
<mxRectangle width="159" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-109" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="276" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-167" value="Gebäudetypen" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-106" vertex="1">
<mxGeometry x="413" width="137" height="41" as="geometry">
<mxRectangle width="137" height="41" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-110" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=0;strokeColor=inherit;fillColor=#ffffff;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="167" width="550" height="44" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-111" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry width="117" height="44" as="geometry">
<mxRectangle width="117" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-112" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="117" width="159" height="44" as="geometry">
<mxRectangle width="159" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-113" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="276" width="137" height="44" as="geometry">
<mxRectangle width="137" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-168" value="Haus" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;fontStyle=0;align=center;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-110" vertex="1">
<mxGeometry x="413" width="137" height="44" as="geometry">
<mxRectangle width="137" height="44" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-114" value="" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="211" width="550" height="39" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-115" value="7.2" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry width="117" height="39" as="geometry">
<mxRectangle width="117" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-116" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="117" width="159" height="39" as="geometry">
<mxRectangle width="159" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-117" value="" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="276" width="137" height="39" as="geometry">
<mxRectangle width="137" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-169" value="Innenarchitektur" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-114" vertex="1">
<mxGeometry x="413" width="137" height="39" as="geometry">
<mxRectangle width="137" height="39" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-175" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#FFFFFF;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="250" width="550" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-176" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry width="117" height="40" as="geometry">
<mxRectangle width="117" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-177" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="117" width="159" height="40" as="geometry">
<mxRectangle width="159" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-178" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="276" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-179" value="Zimmer" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=inherit;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-175" vertex="1">
<mxGeometry x="413" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-170" style="shape=tableRow;horizontal=0;startSize=0;swimlaneHead=0;swimlaneBody=0;top=0;left=0;bottom=0;right=0;dropTarget=0;collapsible=0;recursiveResize=0;expand=0;fontStyle=1;strokeColor=inherit;fillColor=#EAF2D3;" parent="WA2_J1DCvVjPXciXSW-M-97" vertex="1">
<mxGeometry y="290" width="550" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-171" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry width="117" height="40" as="geometry">
<mxRectangle width="117" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-172" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="117" width="159" height="40" as="geometry">
<mxRectangle width="159" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-173" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="276" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-174" value="Fußboden" style="connectable=0;recursiveResize=0;strokeColor=inherit;fillColor=#E6FFCC;whiteSpace=wrap;html=1;" parent="WA2_J1DCvVjPXciXSW-M-170" vertex="1">
<mxGeometry x="413" width="137" height="40" as="geometry">
<mxRectangle width="137" height="40" as="alternateBounds" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-323" value="&lt;b&gt;&lt;u&gt;b) Beispiel&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="WA2_J1DCvVjPXciXSW-M-195" vertex="1">
<mxGeometry x="-30" width="80" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-197" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="45" y="-110" as="sourcePoint" />
<mxPoint x="815" y="-110" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-198" value="&lt;font style=&quot;font-size: 21px;&quot;&gt;&lt;b&gt;&lt;u&gt;2. Normvokabular-Abgleich&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="45" y="-70" width="290" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-199" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitX=1;exitY=0.5;exitDx=0;exitDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-64" target="WA2_J1DCvVjPXciXSW-M-65" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="440" y="1110" as="sourcePoint" />
<mxPoint x="534" y="1110" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-202" value="Scanvorgang" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="70" y="-670" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-203" value="Erfassen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="240" y="-400" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-207" value="Ebenenstruktur festlegen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="180" y="-490" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-209" value="Erfassungstabelle" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="370" y="-200" width="247.5" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-210" value="&lt;ul&gt;&lt;li&gt;Durchgehen einer Box von vorne bis hinten&lt;/li&gt;&lt;li&gt;Auflegen des Objekts, Ausrichtung der Farbkarte&lt;/li&gt;&lt;li&gt;Manuelles Festlegen des Scanbereichs&lt;/li&gt;&lt;li&gt;Scan der gesamten Box&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="255" y="-690" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-211" value="&lt;ul&gt;&lt;li&gt;Durchgehen der exportierten Scans im Bildviewer&lt;/li&gt;&lt;li&gt;Festlegung der Scanebenen (Umschlag, Vorderseite, Rückseite, etc.)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="367.5" y="-500" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-212" value="&lt;ul&gt;&lt;li&gt;Durchgehen der Scans&lt;/li&gt;&lt;li&gt;Erfassen: Datum, Urheber, Eigner, Material&lt;/li&gt;&lt;li&gt;Vermessen des Objekts&lt;/li&gt;&lt;li&gt;Objektbeschreibung: Verschlagwortung des Bildinhalts&lt;/li&gt;&lt;li&gt;Erfassen etwaiger Inschriften und Anmerkungen&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="438" y="-440" width="300" height="140" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-216" value="&lt;font style=&quot;font-size: 21px;&quot;&gt;&lt;b&gt;&lt;u&gt;1. Ablauf des Scan- und Erfassungsprozesses&lt;/u&gt;&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="35" y="-840" width="490" height="40" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-217" value="Vorbereitung" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="30" y="-760" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-218" value="&lt;ul&gt;&lt;li&gt;PC hochfahren&lt;/li&gt;&lt;li&gt;Scanner starten/Kamera und Beleuchtung vorbereiten, Farbkarte platzieren&lt;/li&gt;&lt;li&gt;Software starten, Scanauftrag wählen&lt;/li&gt;&lt;li&gt;Erfassungstabelle öffnen&lt;/li&gt;&lt;li&gt;Passende Box wählen&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="212.5" y="-790" width="555" height="110" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-236" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" target="WA2_J1DCvVjPXciXSW-M-228" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-318" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-226" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="350" y="1573" as="targetPoint" />
<Array as="points">
<mxPoint x="340" y="1573" />
<mxPoint x="360" y="1573" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-226" value="Kategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="170" y="1562.5" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-237" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.25;exitY=1;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" target="WA2_J1DCvVjPXciXSW-M-229" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-228" value="Unterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="290" y="1605" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-238" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.232;exitY=1.005;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;exitPerimeter=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" target="WA2_J1DCvVjPXciXSW-M-230" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="407.5" y="1652.5" as="sourcePoint" />
<mxPoint x="440" y="1687.5" as="targetPoint" />
<Array as="points">
<mxPoint x="440" y="1700" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-320" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-229" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="590" y="1660" as="targetPoint" />
<Array as="points">
<mxPoint x="580" y="1660" />
<mxPoint x="580" y="1660" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-229" value="Unterunterkategorie" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="405" y="1650" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-230" value="Wort/Vokabel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="513" y="1690" width="150" height="20" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-313" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="352.5" y="1560" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-315" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1.1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="475" y="1602.5" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-316" value="&lt;font style=&quot;font-size: 10px;&quot;&gt;1.1.1&lt;/font&gt;" style="ellipse;whiteSpace=wrap;html=1;rounded=0;" parent="1" vertex="1">
<mxGeometry x="592.5" y="1647.5" width="25" height="25" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-319" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="WA2_J1DCvVjPXciXSW-M-228" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="470" y="1615" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-321" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="50" y="1740" as="sourcePoint" />
<mxPoint x="800" y="1740" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-322" value="&lt;b&gt;&lt;u&gt;a) Hierarchie und ID-Struktur&lt;/u&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
<mxGeometry x="40" y="1530" width="190" height="30" as="geometry" />
</mxCell>
<mxCell id="WA2_J1DCvVjPXciXSW-M-193" value="Blatt 7 - Architektur" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="140" y="2110" width="165" height="30" as="geometry" />
</mxCell>
<mxCell id="B-3lv8s0GtbLfT8x5DVe-1" value="Scan exportieren" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" parent="1" vertex="1">
<mxGeometry x="122.5" y="-580" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="B-3lv8s0GtbLfT8x5DVe-2" value="&lt;ul&gt;&lt;li&gt;Export der gesamten Scans einer Box in einen Ordner&lt;/li&gt;&lt;li&gt;Reihenfolge der Scans checken&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" parent="1" vertex="1">
<mxGeometry x="307.5" y="-590" width="320" height="80" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-1" value="Erfassung prüfen" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#B3FF66;" vertex="1" parent="1">
<mxGeometry x="310" y="-300" width="200" height="60" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-2" value="&lt;ul&gt;&lt;li&gt;Durchgehen der Scans, Vergleich der Nummern mit dem Inhalt der Erfassung&lt;/li&gt;&lt;li&gt;Makro laufen lassen: Prüft Begriffe unter &quot;Objektbschreibung&quot; auf Treffer im Normvokabular (siehe Anleitung)&lt;/li&gt;&lt;/ul&gt;" style="text;strokeColor=none;fillColor=none;html=1;whiteSpace=wrap;verticalAlign=middle;overflow=hidden;" vertex="1" parent="1">
<mxGeometry x="490" y="-310" width="320" height="90" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.3;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-217" target="WA2_J1DCvVjPXciXSW-M-202">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.238;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-202" target="B-3lv8s0GtbLfT8x5DVe-1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.213;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="B-3lv8s0GtbLfT8x5DVe-1" target="WA2_J1DCvVjPXciXSW-M-207">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.2;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-207" target="WA2_J1DCvVjPXciXSW-M-203">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-8" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.148;entryY=-0.056;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="WA2_J1DCvVjPXciXSW-M-203" target="ey7EfLCcf-ExpX1qzLUj-1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-10" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.161;entryY=-0.039;entryDx=0;entryDy=0;entryPerimeter=0;" edge="1" parent="1" source="ey7EfLCcf-ExpX1qzLUj-1" target="WA2_J1DCvVjPXciXSW-M-209">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-11" value="&lt;font style=&quot;font-size: 15px;&quot;&gt;&lt;b&gt;Stand: 14.10.25&lt;/b&gt;&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
<mxGeometry x="745" y="-1090" width="105" height="50" as="geometry" />
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-12" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="470" y="-880" as="sourcePoint" />
<mxPoint x="520" y="-880" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="ey7EfLCcf-ExpX1qzLUj-13" value="&lt;b&gt;Optional/Optimierungsmöglichkeit&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" vertex="1" parent="1">
<mxGeometry x="530" y="-896" width="220" height="30" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

View File

@ -1,97 +0,0 @@
= ExLibris Scanning and Cataloguing Guide =
The digitisation of exlibris objects is split into two main steps: '''scanning''' and '''cataloguing''' the objects.
Every object belongs to a box, and every box belongs to an octavo (°).
A box is scanned completely before cataloguing begins; this avoids errors when, for example, the order of the scans changes because scans are added or deleted later.
== 1. Preparation ==
=== a) Prepare the hardware ===
* Start the PC
* Switch on the lights
* Remove the camera cover
* Switch on the camera
=== b) Prepare the software ===
* Start MultiDotScan by Walter Nagel
* Select the scan job → '''WELCHEN SCANAUFTRAG'''
== 2. Scanning ==
* Place the object on the pad, at right angles to the image frame in the software
* Add the colour chart (to the right or below, depending on the object's format)
* Adjust the image frame to the object (margin approx. 10-20 mm)
* Trigger the camera with the pedal → '''Scan'''
* If a scan is missing or has to be redone: drag it to the correct position in the software → the scans are arranged in the correct order automatically on export
== 3. Exporting the scans ==
* After the whole box has been scanned, export the scan job → it is stored under '''DATEIPFAD'''
== 4. Cataloguing ==
* Open the scans under '''DATEIPFAD'''
(in the image explorer, '''not''' in the scanning software; otherwise the order of the scans may change, which leads to errors in the cataloguing table)
* Each box (= each scan job) starts numerically at 1; each scan is numbered consecutively
* Enter the scan numbers in the cataloguing table (Erfassungstabelle), taking the level structure into account:
=== Level structure ===
* Default: front → Ebene 0, back → Ebene 1
* If the exlibris has a cover, or several exlibris lie in one envelope:
* Cover (Umschlag) = Ebene 0
* Front = Ebene 1
* Back = Ebene 2
* Back of the cover = Ebene 3
→ This makes it clearly visible where a cover begins and ends (a small illustrative sketch follows below).
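A minimal, purely illustrative sketch of this numbering convention. The scan numbers and the snippet itself are invented for illustration; nothing here is read by any tool in this repository.
<pre>
# Hypothetical example: one exlibris inside an envelope, scans 12-15 of a box.
# The level numbers follow the convention listed above.
scans = [
    (12, "Umschlag (cover)", 0),
    (13, "Vorderseite (front)", 1),
    (14, "Rückseite (back)", 2),
    (15, "Rückseite Umschlag (back of the cover)", 3),
]

for nummer, beschreibung, ebene in scans:
    print(f"Scan {nummer}: {beschreibung} -> Ebene {ebene}")
</pre>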
=== Entering the metadata ===
* Jahr: is there a year on the front or back?
* Urheber: the artist
* Eigner: who owns the exlibris?
* Objektbeschreibung: what can be seen? Keywording of the image content
==== Note ====
* Describe from '''general → specific'''
* Example: „Baum“ (tree) instead of „Schwarzeiche“ (black oak)
* „Helm“ (helmet) instead of „Topfhelm 15. Jahrhundert“
* Prefer '''singular forms''', even when several objects are shown
* e.g. „Buch“ instead of „Bücher“, „Figur, weiblich“ instead of „Frauengruppe“
* Give '''activities in the infinitive''': „sitzen“, „lesen“, „fahren“ instead of „sitzt“, „lesend“, „fährt“
* '''Avoid connecting words''' ("stopwords"):
<nowiki>mit, ohne, der, die, das, ein, eine, und, zu, von, im, in, auf, an, als, bei, für, aus, dem, den, des, eines, einer</nowiki>
(the mapper macro filters these out anyway; a minimal sketch of this filtering follows after the metadata list below)
* Material: usually paper
* Maße: height × width in cm (for whole centimetre values append „,0“, e.g. 14,3 × 7,0 cm instead of 14,3 × 7)
* Objekttyp: Exlibris, Rückseite, Umschlag, Zettel
* Inschrift: e.g. coat of arms with a banderole
* Anmerkungen: other notes or remarks (pencil entries etc.)
* AUX: not relevant
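The stopword and number filtering mentioned under "Note" is essentially what the mapper macro does before matching. The following sketch mirrors that step in simplified form (based on mapper_macro_2.3.py below; the sample description is invented):
<pre>
import re

# Same stopword list as in mapper_macro_2.3.py.
STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im",
             "in", "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

def extract_terms(objektbeschreibung: str) -> list[str]:
    """Split a description into candidate terms, dropping stopwords and plain numbers."""
    terms = []
    for clause in objektbeschreibung.split(","):
        for part in clause.split():
            if part.lower() in STOPWORDS or re.fullmatch(r"\d+", part):
                continue
            terms.append(part)
    return terms

print(extract_terms("Baum mit Buch, Figur, weiblich, sitzen"))
# -> ['Baum', 'Buch', 'Figur', 'weiblich', 'sitzen']
</pre>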
== 5. Checking the catalogue entries ==
* Does the numbering of the scans match the corresponding row in the cataloguing table?
* Run the macro over the table (the colour coding it applies is sketched below):
There is a macro that compares the column „Objektbeschreibung“ against the internal controlled vocabulary (Normvokabular) and thereby standardises the keywording.
The macro can be started directly in LibreOffice Calc via the menu:
<pre>
Extras → Makros → Makros verwalten → Python →
Meine Makros → Vokabular_Abgleich_Makro → mapper_macro_2.x → run_mapper_macro → Ausführen
</pre>
'''Note:'''
A detailed manual for using the macro is available under '''DATEIPFAD'''.
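For orientation while checking, the macro colours the „Objektbeschreibung“ cell per row roughly as follows. This is a simplified sketch of the behaviour of mapper_macro_2.3.py, not the macro itself:
<pre>
def cell_colour(terms: list[str], hits: list[str], unmapped: list[str]) -> str:
    """Sketch of the per-row colour decision made by the mapper macro."""
    if terms and hits and not unmapped:
        return "GREEN"   # every term was found in the Normvokabular
    if hits:
        return "YELLOW"  # some terms were found; the rest appears under Kein_Treffer
    return "RED"         # no term was found at all
</pre>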
== 6. Wrapping up ==
* Fill in the form:
* Name
* Date
* Which box
* Up to where scanning/cataloguing was completed
* Anything to be aware of?
== 7. Best practices ==
* Rather scan too much than too little (backs, covers, etc.)
* Rather describe too much than too little (anything that is recognisably identifiable can be keyworded)
* Record notes or inscriptions on the exlibris or their backs completely
* When in doubt: ask

View File

@ -1,379 +0,0 @@
# -*- coding: utf-8 -*-
# LibreOffice/Excel macro: NV_MASTER matching, pandas+odf, cache, colour coding
# Version 2.3 with "Kein_Treffer" column
# Location: libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro/mapper_macro_2.3.py
import os
import re
import json
import traceback
# The UNO context is provided at runtime via XSCRIPTCONTEXT
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Configuration
# ------------------------
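# Note: BASE_DIR is user-specific; adjust the path when the macro is installed for another account.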
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/Vokabular_Abgleich_Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro_2.3.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache_2.3.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # base confidence threshold for suggestions
# ------------------------
# Logging
# ------------------------
def log(msg):
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Load cache
# ------------------------
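# Persistent match cache: per-lemma results are reused across macro runs and written back at the end.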
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Text normalisation & lemmatisation
# ------------------------
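# normalize_text lower-cases, strips punctuation and collapses whitespace;
# lemmatize_term additionally reduces the term to its spaCy lemma (falling back to the normalised form).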
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# Load NV_MASTER
# ------------------------
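# Reads every sheet of NV_MASTER.ods except "Master", auto-detects the ID and word columns,
# inherits the last seen ID for rows without their own, and returns two lookup tables:
# norm_dict keyed by the normalised word and lemma_index keyed by the lemma.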
def build_norm_index(nv_path):
norm_dict = {}
lemma_index = {}
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen NV_MASTER: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
df = df.fillna("")
cols = [str(c).strip().lower() for c in df.columns]
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
if id_val:
current_parent_id = id_val
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
norm_dict.setdefault(norm_name, []).append(entry)
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
# ------------------------
# Matching
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
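# Fuzzy-compares the lemmatised term against all lemma and normalised keys, adds a small bonus
# for prefix matches, keeps candidates at or above the threshold and returns them as "Name (ID)".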
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
candidates.sort(key=lambda t: t[0], reverse=True)
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
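# Returns (hits, suggestions, ids) for a single term: exact match on the normalised form first,
# then on the lemma; fuzzy suggestions are generated only when there is no hit. Results are cached per lemma.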
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_lemma in CACHE:
cached = CACHE[term_lemma]
return cached.get("hits", []), cached.get("suggestions", []), cached.get("ids", [])
hits = []
suggestions = []
ids = []
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
if not hits:
suggestions = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, threshold=CONF_THRESHOLD)
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
ids = unique_preserve(ids)
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# ------------------------
# Main macro
# ------------------------
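# Entry point called from LibreOffice Calc: finds the "Objektbeschreibung" column, ensures the
# output columns exist, matches every term of every row against NV_MASTER and colours the cells.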
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
# Find the header row and the "Objektbeschreibung" column
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
# Locate existing output columns (so that re-running the macro does not add duplicates)
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Kein_Treffer":
existing["Kein_Treffer"] = c
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
# "Kein_Treffer" column (new in 2.3), created only if it does not exist yet
if "Kein_Treffer" not in existing:
last_col += 1
existing["Kein_Treffer"] = last_col
sheet.getCellByPosition(last_col, header_row).String = "Kein_Treffer"
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
kein_tr_col = existing["Kein_Treffer"]
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
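# Cell colours: GREEN = all terms matched, YELLOW = only some terms matched, RED = no term matched.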
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
WHITE = 0xFFFFFF
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
continue
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_hits = []
row_sugs = []
row_ids = []
unmapped_terms = []
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
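# Pad the ID list so zip yields one pair per hit; hits without a stored ID keep just the name.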
row_hits.extend([f"{h} ({id_})" if id_ else h for h,id_ in zip(hits, ids + [""]*len(hits))])
else:
unmapped_terms.append(term)
if sugs:
row_sugs.extend([f"{s}" for s in sugs])
if ids:
row_ids.extend(ids)
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
unmapped_terms = uniq(unmapped_terms)
# Colour logic for the Objektbeschreibung cell
if terms and not unmapped_terms and row_hits:
cell.CellBackColor = GREEN
row_sugs = [] # no suggestions needed when every term matched
elif row_hits:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Norm_Treffer
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
tr_cell.String = " | ".join(row_hits)
tr_cell.CellBackColor = GREEN if row_hits else WHITE
# Norm_Vorschlag
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
sug_cell.String = " | ".join(row_sugs)
sug_cell.CellBackColor = YELLOW if row_sugs else WHITE
# Kein_Treffer
kt_cell = sheet.getCellByPosition(kein_tr_col, r)
kt_cell.String = " | ".join(unmapped_terms)
kt_cell.CellBackColor = RED if unmapped_terms else WHITE
rows_processed += 1
except Exception as e:
log(f"Fehler in Zeile {r}: {e}\n{traceback.format_exc()}")
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export for LibreOffice
g_exportedScripts = (run_mapper_macro,)

View File

@ -1,247 +0,0 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
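# Illustrative usage (not part of the original script):
#   PS> deactivate                    # restore prompt, PATH and PYTHONHOME, then remove this function
#   PS> deactivate -NonDestructive    # restore everything but keep the deactivate function defined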
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
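# Illustrative sketch (not part of the original script): given a pyvenv.cfg containing, say,
#
#   home = /usr/bin
#   include-system-site-packages = false
#   version = 3.11.2
#   prompt = 'my-venv'
#
# Get-PyVenvConfig would return a hashtable roughly equivalent to
#   @{ home = '/usr/bin'; 'include-system-site-packages' = 'false'; version = '3.11.2'; prompt = 'my-venv' }
# with the surrounding quotes stripped from the prompt value, as described above.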
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

View File

@ -1,69 +0,0 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
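# Typical session (illustrative):
#   $ source venv/bin/activate     # or:  . venv/bin/activate
#   (venv) $ which python          # now resolves inside $VIRTUAL_ENV/bin
#   (venv) $ deactivate            # restores the previous PATH and prompt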
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV='/home/jarnold/projects/GND-Skript Test/venv'
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/"bin":$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1='(venv) '"${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT='(venv) '
export VIRTUAL_ENV_PROMPT
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi

View File

@ -1,26 +0,0 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = '(venv) '"$prompt"
setenv VIRTUAL_ENV_PROMPT '(venv) '
endif
alias pydoc python -m pydoc
rehash

View File

@ -1,69 +0,0 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
set -e _OLD_FISH_PROMPT_OVERRIDE
# prevents error when using nested fish instances (Issue #93858)
if functions -q _old_fish_prompt
functions -e fish_prompt
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT '(venv) '
end

View File

@ -1,229 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Agustin Henze -> agustinhenze at gmail.com
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
# Søren Roug
#
# Oct 2014: Georges Khaznadar <georgesk@debian.org>
# - ported to Python3
# - implemented the missing switch -c / --encoding, with an extra
# feature for POSIX platforms which can guess encoding.
from odf.opendocument import OpenDocumentSpreadsheet
from odf.style import Style, TextProperties, ParagraphProperties, TableColumnProperties
from odf.text import P
from odf.table import Table, TableColumn, TableRow, TableCell
from optparse import OptionParser
import sys,csv,re, os, codecs
if sys.version_info[0]==3: unicode=str
if sys.version_info[0]==2:
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
def csvToOds( pathFileCSV, pathFileODS, tableName='table',
delimiter=',', quoting=csv.QUOTE_MINIMAL,
quotechar = '"', escapechar = None,
skipinitialspace = False, lineterminator = '\r\n',
encoding="utf-8"):
textdoc = OpenDocumentSpreadsheet()
# Create a style for the table content. One we can modify
# later in the word processor.
tablecontents = Style(name="Table Contents", family="paragraph")
tablecontents.addElement(ParagraphProperties(numberlines="false", linenumber="0"))
tablecontents.addElement(TextProperties(fontweight="bold"))
textdoc.styles.addElement(tablecontents)
# Start the table
table = Table( name=tableName )
if sys.version_info[0]==3:
reader = csv.reader(open(pathFileCSV, encoding=encoding),
delimiter=delimiter,
quoting=quoting,
quotechar=quotechar,
escapechar=escapechar,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator)
else:
reader = UnicodeReader(open(pathFileCSV),
encoding=encoding,
delimiter=delimiter,
quoting=quoting,
quotechar=quotechar,
escapechar=escapechar,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator)
fltExp = re.compile(r'^\s*[-+]?\d+(\.\d+)?\s*$')  # raw string avoids invalid escape-sequence warnings
for row in reader:
tr = TableRow()
table.addElement(tr)
for val in row:
if fltExp.match(val):
tc = TableCell(valuetype="float", value=val.strip())
else:
tc = TableCell(valuetype="string")
tr.addElement(tc)
p = P(stylename=tablecontents,text=val)
tc.addElement(p)
textdoc.spreadsheet.addElement(table)
textdoc.save( pathFileODS )
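# Illustrative call (file names are placeholders, not part of the original script):
#   csvToOds(pathFileCSV='sample.csv', pathFileODS='sample.ods', tableName='data', delimiter=';')
# Fields matching the fltExp pattern above become float cells; everything else becomes string cells.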
if __name__ == "__main__":
usage = "%prog -i file.csv -o file.ods -d"
parser = OptionParser(usage=usage, version="%prog 0.1")
parser.add_option('-i','--input', action='store',
dest='input', help='File input in csv')
parser.add_option('-o','--output', action='store',
dest='output', help='File output in ods')
parser.add_option('-d','--delimiter', action='store',
dest='delimiter', help='specifies a one-character string to use as the field separator. It defaults to ",".')
parser.add_option('-c','--encoding', action='store',
dest='encoding', help='specifies the encoding of the CSV file. It defaults to utf-8')
parser.add_option('-t','--table', action='store',
dest='tableName', help='The table name in the output file')
parser.add_option('-s','--skipinitialspace',
dest='skipinitialspace', help='''specifies how to interpret whitespace which
immediately follows a delimiter. It defaults to False, which
means that whitespace immediately following a delimiter is part
of the following field.''')
parser.add_option('-l','--lineterminator', action='store',
dest='lineterminator', help='''specifies the character sequence which should
terminate rows.''')
parser.add_option('-q','--quoting', action='store',
dest='quoting', help='''It can take on any of the following module constants:
0 = QUOTE_MINIMAL means only when required, for example, when a field contains either the quotechar or the delimiter
1 = QUOTE_ALL means that quotes are always placed around fields.
2 = QUOTE_NONNUMERIC means that quotes are always placed around fields which do not parse as integers or floating point numbers.
3 = QUOTE_NONE means that quotes are never placed around fields.
It defaults to QUOTE_MINIMAL'''
parser.add_option('-e','--escapechar', action='store',
dest='escapechar', help='''specifies a one-character string used to escape the delimiter when quoting is set to QUOTE_NONE.''')
parser.add_option('-r','--quotechar', action='store',
dest='quotechar', help='''specifies a one-character string to use as the quoting character. It defaults to ".''')
(options, args) = parser.parse_args()
if options.input:
pathFileCSV = options.input
else:
parser.print_help()
exit( 0 )
if options.output:
pathFileODS = options.output
else:
parser.print_help()
exit( 0 )
if options.delimiter:
delimiter = options.delimiter
else:
delimiter = ","
if options.skipinitialspace:
skipinitialspace = True
else:
skipinitialspace=False
if options.lineterminator:
lineterminator = options.lineterminator
else:
lineterminator ="\r\n"
if options.escapechar:
escapechar = options.escapechar
else:
escapechar=None
if options.tableName:
tableName = options.tableName
else:
tableName = "table"
if options.quotechar:
quotechar = options.quotechar
else:
quotechar = "\""
encoding = "utf-8" # default setting
###########################################################
## try to guess the encoding; this is implemented only with
## POSIX platforms. Can it be improved?
output = os.popen('/usr/bin/file ' + pathFileCSV).read()
m=re.match(r'^.*: ([-a-zA-Z0-9]+) text$', output)
if m:
encoding=m.group(1)
if 'ISO-8859' in encoding:
encoding="latin-1"
else:
encoding="utf-8"
############################################################
# when the -c or --encoding switch is used, it takes precedence
if options.encoding:
encoding = options.encoding
csvToOds( pathFileCSV=unicode(pathFileCSV),
pathFileODS=unicode(pathFileODS),
delimiter=delimiter, skipinitialspace=skipinitialspace,
escapechar=escapechar,
lineterminator=unicode(lineterminator),
tableName=tableName, quotechar=quotechar,
encoding=encoding)
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.csv2rdf import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from numpy.f2py.f2py2e import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,95 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
import zipfile
import sys, os, smtplib, getopt
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.encoders import encode_base64
if sys.version_info[0]==3: unicode=str
def usage():
sys.stderr.write("Usage: %s [-f from] [-s subject] inputfile recipients...\n" % sys.argv[0])
try:
opts, args = getopt.getopt(sys.argv[1:], "f:s:", ["from=", "subject="])
except getopt.GetoptError:
usage()
sys.exit(2)
fromaddr = os.getlogin() + "@" + os.getenv('HOSTNAME','localhost')
subject = None
for o, a in opts:
if o in ("-f", "--from"):
fromaddr = a
if o in ("-s", "--subject"):
subject = a
if len(args) < 2:
usage()
sys.exit(2)
suffices = {
'wmf':('image','x-wmf'),
'png':('image','png'),
'gif':('image','gif'),
'jpg':('image','jpeg'),
'jpeg':('image','jpeg')
}
msg = MIMEMultipart('related',type="text/html")
msg['From'] = fromaddr
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
msg['To'] = ','.join(args[1:])
msg.preamble = 'This is a multi-part message in MIME format.'
msg.epilogue = ''
odhandler = ODF2XHTML()
result = odhandler.odf2xhtml(unicode(args[0]))
if subject:
msg['Subject'] = subject
else:
msg['Subject'] = odhandler.title
htmlpart = MIMEText(result,'html','us-ascii')
htmlpart['Content-Location'] = 'index.html'
msg.attach(htmlpart)
z = zipfile.ZipFile(unicode(args[0]))
for file in z.namelist():
if file[0:9] == 'Pictures/':
suffix = file[file.rfind(".")+1:]
main,sub = suffices.get(suffix,('application','octet-stream'))
img = MIMENonMultipart(main,sub)
img.set_payload(z.read(file))
img['Content-Location'] = "" + file
encode_base64(img)
msg.attach(img)
z.close()
server = smtplib.SMTP('localhost')
#server.set_debuglevel(1)
server.sendmail(fromaddr, args[1:], msg.as_string())
server.quit()
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from markdown_it.cli.parse import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from nltk.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer.cli import cli_detect
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli_detect())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from numpy._configtool import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,72 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
from odf.odf2xhtml import ODF2XHTML
import zipfile
import sys
#from time import gmtime, strftime
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email import encoders
if sys.version_info[0]==3: unicode=str
if len(sys.argv) != 2:
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
sys.exit(1)
suffices = {
'wmf':('image','x-wmf'),
'png':('image','png'),
'gif':('image','gif'),
'jpg':('image','jpeg'),
'jpeg':('image','jpeg')
}
msg = MIMEMultipart('related',type="text/html")
# msg['Subject'] = 'Subject here'
# msg['From'] = '<Saved by ODT2MHT>'
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
msg.preamble = 'This is a multi-part message in MIME format.'
msg.epilogue = ''
odhandler = ODF2XHTML()
result = odhandler.odf2xhtml(unicode(sys.argv[1]))
htmlpart = MIMEText(result,'html','us-ascii')
htmlpart['Content-Location'] = 'index.html'
msg.attach(htmlpart)
z = zipfile.ZipFile(sys.argv[1])
for file in z.namelist():
if file[0:9] == 'Pictures/':
suffix = file[file.rfind(".")+1:]
main,sub = suffices.get(suffix,('application','octet-stream'))
img = MIMENonMultipart(main,sub)
img.set_payload(z.read(file))
img['Content-Location'] = "" + file
encoders.encode_base64(img)
msg.attach(img)
z.close()
print (msg.as_string())
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,59 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
import sys, getopt
if sys.version_info[0]==3: unicode=str
from io import StringIO
def usage():
sys.stderr.write("Usage: %s [-p] inputfile\n" % sys.argv[0])
try:
opts, args = getopt.getopt(sys.argv[1:], "ep", ["plain","embedable"])
except getopt.GetoptError:
usage()
sys.exit(2)
generatecss = True
embedable = False
for o, a in opts:
if o in ("-p", "--plain"):
generatecss = False
if o in ("-e", "--embedable"):
embedable = True
if len(args) != 1:
usage()
sys.exit(2)
odhandler = ODF2XHTML(generatecss, embedable)
try:
result = odhandler.odf2xhtml(unicode(args[0]))
except:
sys.stderr.write("Unable to open file %s or file is not OpenDocument\n" % args[0])
sys.exit(1)
sys.stdout.write(result)
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,81 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
#
# OpenDocument can be a complete office document in a single
# XML document. This script will create such a document.
import sys, getopt, base64
from odf.opendocument import load
from odf.draw import Image, ObjectOle
from odf.style import BackgroundImage
from odf.text import ListLevelStyleImage
from odf.office import BinaryData
if sys.version_info[0]==3: unicode=str
def usage():
sys.stderr.write("Usage: %s [-e] [-o outputfile] [inputfile]\n" % sys.argv[0])
if __name__ == "__main__":
embedimage = False
try:
opts, args = getopt.getopt(sys.argv[1:], "o:e", ["output="])
except getopt.GetoptError:
usage()
sys.exit(2)
outputfile = '-'
for o, a in opts:
if o in ("-o", "--output"):
outputfile = a
if o == '-e':
embedimage = True
if len(args) > 1:
usage()
sys.exit(2)
if len(args) == 0:
d = load(sys.stdin)
else:
d = load(unicode(args[0]))
if embedimage:
images = d.getElementsByType(Image) + \
d.getElementsByType(BackgroundImage) + \
d.getElementsByType(ObjectOle) + \
d.getElementsByType(ListLevelStyleImage)
for image in images:
href = image.getAttribute('href')
if href and href[:9] == "Pictures/":
p = d.Pictures[href]
bp = base64.encodebytes(p[1])  # encodestring() was removed in Python 3.9; encodebytes() is its replacement
image.addElement(BinaryData(text=bp))
image.removeAttribute('href')
xml = d.xml()
if outputfile == '-':
print (xml)
else:
open(outputfile,"wb").write(xml)
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,190 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2007-2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
import zipfile, sys, getopt, mimetypes
try:
from urllib2 import urlopen, quote, unquote
except ImportError:
from urllib.request import urlopen, quote, unquote
try:
from urlparse import urlunsplit, urlsplit
except ImportError:
from urllib.parse import urlunsplit, urlsplit
from odf.opendocument import load
from odf.draw import Image
if sys.version_info[0]==3: unicode=str
#sys.tracebacklimit = 0
# Variable to count the number of retrieval failures
failures = 0
# Set to one if quiet behaviour is wanted
quiet = 0
# If set, every url being imported is written to stderr
verbose = 0
# Dictionary with new pictures. Key is original file path
# Item is newfilename
newpictures = {}
doc = None
def importpicture(href):
""" Add the picture to the ZIP file
Returns the new path name to the file in the zip archive
If it is unable to import, then it returns the original href
Side effect: adds a line to the manifest
"""
global doc, newpictures, failures, verbose
# Check that it is not already in the manifest
if href in doc.Pictures: return href
image = None
if verbose: print ("Importing", href, file=sys.stderr)
if href[:7] == "http://" or href[:8] == "https://" or href[:6] == "ftp://":
# There is a bug in urlopen: It can't open urls with non-ascii unicode
# characters. Convert to UTF-8 and then use percent encoding
try:
goodhref = href.encode('ascii')
except:
o = list(urlsplit(href))
o[2] = quote(o[2].encode('utf-8'))
goodhref = urlunsplit(o)
if goodhref in newpictures:
if verbose: print ("already imported", file=sys.stderr)
return newpictures[goodhref] # Already imported
try:
f = urlopen(goodhref.decode("utf-8"))
image = f.read()
headers = f.info()
f.close()
# Get the mimetype from the headerlines
c_t = headers['Content-Type'].split(';')[0].strip()
if c_t: mediatype = c_t.split(';')[0].strip()
if verbose: print ("OK", file=sys.stderr)
except:
failures += 1
if verbose: print ("failed", file=sys.stderr)
return href
# Remove query string
try: href= href[:href.rindex('?')]
except: pass
try:
lastslash = href[href.rindex('/'):]
ext = lastslash[lastslash.rindex('.'):]
except: ext = mimetypes.guess_extension(mediatype)
# Everything is a simple path.
else:
goodhref = href
if href[:3] == '../':
if directory is None:
goodhref = unquote(href[3:])
else:
goodhref = unquote(directory + href[2:])
if goodhref in newpictures:
if verbose: print ("already imported", file=sys.stderr)
return newpictures[goodhref] # Already imported
mediatype, encoding = mimetypes.guess_type(goodhref)
if mediatype is None:
mediatype = ''
try: ext = goodhref[goodhref.rindex('.'):]
except: ext=''
else:
ext = mimetypes.guess_extension(mediatype)
try:
image = open(goodhref, 'rb').read()  # file() does not exist in Python 3; read the picture as bytes
if verbose: print ("OK", file=sys.stderr)
except:
failures += 1
if verbose: print ("failed", file=sys.stderr)
return href
# If we have a picture to import, the image variable contains it
# and manifestfn, ext and mediatype have values
if image:
manifestfn = doc.addPictureFromString(image, unicode(mediatype))
newpictures[goodhref] = manifestfn
return manifestfn
if verbose: print ("not imported", file=sys.stderr)
return href
def exitwithusage(exitcode=2):
""" Print out usage information and exit """
print ("Usage: %s [-q] [-v] [-o output] [inputfile]" % sys.argv[0], file=sys.stderr)
print ("\tInputfile must be OpenDocument format", file=sys.stderr)
sys.exit(exitcode)
outputfile = None
writefile = True
try:
opts, args = getopt.getopt(sys.argv[1:], "qvo:")
except getopt.GetoptError:
exitwithusage()
for o, a in opts:
if o == "-o":
outputfile = a
writefile = True
if o == "-q":
quiet = 1
if o == "-v":
verbose = 1
if len(args) == 0:
try:
doc = load(sys.stdin)
directory = None
except:
print ("Couldn't open OpenDocument file", file=sys.stderr)
exitwithusage()
else:
fn = unicode(args[0])
if not zipfile.is_zipfile(fn):
exitwithusage()
dirinx = max(fn.rfind('\\'), fn.rfind('/'))
if dirinx >= 0: directory = fn[:dirinx]
else: directory = "."
doc = load(fn)
for image in doc.getElementsByType(Image):
href = image.getAttribute('href')
newhref = importpicture(href)
image.setAttribute('href',newhref)
if writefile:
if outputfile is None:
doc.save(fn)
else:
doc.save(unicode(outputfile))
if quiet == 0 and failures > 0:
print ("Couldn't import %d image(s)" % failures, file=sys.stderr)
sys.exit( int(failures > 0) )
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,216 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
import zipfile
from xml.sax import make_parser,handler
from xml.sax.xmlreader import InputSource
import xml.sax.saxutils
import sys
from odf.opendocument import OpenDocument
from odf import element, grammar
from odf.namespaces import *
from odf.attrconverters import attrconverters, cnv_string
from io import BytesIO
if sys.version_info[0]==3: unicode=str
extension_attributes = {
"OpenOffice.org" : {
(METANS,u'template'): (
(XLINKNS,u'role'),
),
(STYLENS,u'graphic-properties'): (
(STYLENS,u'background-transparency'),
),
(STYLENS,u'paragraph-properties'): (
(TEXTNS,u'enable-numbering'),
(STYLENS,u'join-border'),
),
(STYLENS,u'table-cell-properties'): (
(STYLENS,u'writing-mode'),
),
(STYLENS,u'table-row-properties'): (
(STYLENS,u'keep-together'),
),
},
"KOffice" : {
(STYLENS,u'graphic-properties'): (
(KOFFICENS,u'frame-behavior-on-new-page'),
),
(DRAWNS,u'page'): (
(KOFFICENS,u'name'),
),
(PRESENTATIONNS,u'show-shape'): (
(KOFFICENS,u'order-id'),
),
(PRESENTATIONNS,u'hide-shape'): (
(KOFFICENS,u'order-id'),
),
(CHARTNS,u'legend'): (
(KOFFICENS,u'title'),
),
}
}
printed_errors = []
def print_error(str):
if str not in printed_errors:
printed_errors.append(str)
print (str)
def chop_arg(arg):
if len(arg) > 20:
return "%s..." % arg[0:20]
return arg
def make_qname(tag):
return "%s:%s" % (nsdict.get(tag[0],tag[0]), tag[1])
def allowed_attributes(tag):
return grammar.allowed_attributes.get(tag)
class ODFElementHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, document):
self.doc = document
self.tagstack = []
self.data = []
self.currtag = None
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
""" Pseudo-create an element
"""
allowed_attrs = grammar.allowed_attributes.get(tag)
attrdict = {}
for (att,value) in attrs.items():
prefix = nsdict.get(att[0],att[0])
# Check if it is a known extension
notan_extension = True
for product, ext_attrs in extension_attributes.items():
allowed_ext_attrs = ext_attrs.get(tag)
if allowed_ext_attrs and att in allowed_ext_attrs:
print_error("Warning: Attribute %s in element <%s> is illegal - %s extension" % ( make_qname(att), make_qname(tag), product))
notan_extension = False
# Check if it is an allowed attribute
if notan_extension and allowed_attrs and att not in allowed_attrs:
print_error("Error: Attribute %s:%s is not allowed in element <%s>" % ( prefix, att[1], make_qname(tag)))
# Check the value
try:
convert = attrconverters.get(att, cnv_string)
convert(att, value, tag)
except ValueError as res:
print_error("Error: Bad value '%s' for attribute %s:%s in tag: <%s> - %s" %
(chop_arg(value), prefix, att[1], make_qname(tag), res))
self.tagstack.append(tag)
self.data = []
# Check that the parent allows this child element
if tag not in ( (OFFICENS, 'document'), (OFFICENS, 'document-content'), (OFFICENS, 'document-styles'),
(OFFICENS, 'document-meta'), (OFFICENS, 'document-settings'),
(MANIFESTNS,'manifest')):
try:
parent = self.tagstack[-2]
allowed_children = grammar.allowed_children.get(parent)
except:
print_error("Error: This document starts with the wrong tag: <%s>" % make_qname(tag))
allowed_children = None
if allowed_children and tag not in allowed_children:
print_error("Error: Element %s is not allowed in element %s" % ( make_qname(tag), make_qname(parent)))
# Test that all mandatory attributes have been added.
required = grammar.required_attributes.get(tag)
if required:
for r in required:
if attrs.get(r) is None:
print_error("Error: Required attribute missing: %s in <%s>" % (make_qname(r), make_qname(tag)))
def endElementNS(self, tag, qname):
self.currtag = self.tagstack.pop()
str = ''.join(self.data).strip()
# Check that only elements that can take text have text
# But only elements we know exist in grammar
if tag in grammar.allowed_children:
if str != '' and tag not in grammar.allows_text:
print_error("Error: %s does not allow text data" % make_qname(tag))
self.data = []
class ODFDTDHandler(handler.DTDHandler):
def notationDecl(self, name, public_id, system_id):
""" Ignore DTDs """
print_error("Warning: ODF doesn't use DOCTYPEs")
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
def lint(odffile):
if not zipfile.is_zipfile(odffile):
print_error("Error: This is not a zipped file")
return
zfd = zipfile.ZipFile(odffile)
try:
mimetype = zfd.read('mimetype')
except:
mimetype=''
d = OpenDocument(unicode(mimetype))
first = True
for zi in zfd.infolist():
if first:
if zi.filename == 'mimetype':
if zi.compress_type != zipfile.ZIP_STORED:
print_error("Error: The 'mimetype' member must be stored - not deflated")
if zi.comment != "":
print_error("Error: The 'mimetype' member must not have extra header info")
else:
print_error("Warning: The first member in the archive should be the mimetype")
first = False
if zi.filename in ('META-INF/manifest.xml', 'content.xml', 'meta.xml', 'styles.xml', 'settings.xml'):
content = zfd.read(zi.filename)
parser = make_parser()
parser.setFeature(handler.feature_namespaces, True)
parser.setFeature(handler.feature_external_ges, False)
parser.setContentHandler(ODFElementHandler(d))
dtdh = ODFDTDHandler()
parser.setDTDHandler(dtdh)
parser.setErrorHandler(handler.ErrorHandler())
inpsrc = InputSource()
if not isinstance(content, str):
content=content
inpsrc.setByteStream(BytesIO(content))
parser.parse(inpsrc)
if len(sys.argv) != 2:
exitwithusage()
lint(unicode(sys.argv[1]))
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,266 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006-2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
import zipfile, time, sys, getopt, re
import xml.sax, xml.sax.saxutils
from odf.namespaces import TOOLSVERSION, OFFICENS, XLINKNS, DCNS, METANS
from io import BytesIO
OUTENCODING="utf-8"
whitespace = re.compile(r'\s+')
fields = {
'title': (DCNS,u'title'),
'description': (DCNS,u'description'),
'subject': (DCNS,u'subject'),
'creator': (DCNS,u'creator'),
'date': (DCNS,u'date'),
'language': (DCNS,u'language'),
'generator': (METANS,u'generator'),
'initial-creator': (METANS,u'initial-creator'),
'keyword': (METANS,u'keyword'),
'editing-duration': (METANS,u'editing-duration'),
'editing-cycles': (METANS,u'editing-cycles'),
'printed-by': (METANS,u'printed-by'),
'print-date': (METANS,u'print-date'),
'creation-date': (METANS,u'creation-date'),
'user-defined': (METANS,u'user-defined'),
#'template': (METANS,u'template'),
}
xfields = []
Xfields = []
addfields = {}
deletefields = {}
yieldfields = {}
showversion = None
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s [-cdlvV] [-xXaAI metafield]... [-o output] [inputfile]\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
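# Illustrative invocations (odfpy typically installs this script as odfmeta; file names are placeholders):
#   odfmeta -l report.odt                              # list all known metadata fields as field:value
#   odfmeta -x title report.odt                        # print only the dc:title value
#   odfmeta -a "keyword:budget" -o out.odt report.odt  # add a keyword and write the result to out.odt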
def normalize(str):
"""
The normalize-space function returns the argument string with whitespace
normalized by stripping leading and trailing whitespace and replacing
sequences of whitespace characters by a single space.
"""
return whitespace.sub(' ', str).strip()
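# For example (illustrative): normalize("  editing \t cycles\n") returns "editing cycles".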
class MetaCollector:
"""
The MetaCollector is a pseudo file object, that can temporarily ignore write-calls
It could probably be replaced with a StringIO object.
"""
def __init__(self):
self._content = []
self.dowrite = True
def write(self, str):
if self.dowrite:
self._content.append(str)
def content(self):
return ''.join(self._content)
base = xml.sax.saxutils.XMLGenerator
class odfmetaparser(base):
""" Parse a meta.xml file with an event-driven parser and replace elements.
It would probably be a cleaner approach to use a DOM based parser and
then manipulate in memory.
Small issue: Reorders elements
"""
version = 'Unknown'
def __init__(self):
self._mimetype = ''
self.output = MetaCollector()
self._data = []
self.seenfields = {}
base.__init__(self, self.output, OUTENCODING)
def startElementNS(self, name, qname, attrs):
self._data = []
field = name
# I can't modify the template until the tool replaces elements at the same
# location and not at the end
# if name == (METANS,u'template'):
# self._data = [attrs.get((XLINKNS,u'title'),'')]
if showversion and name == (OFFICENS,u'document-meta'):
if showversion == '-V':
print ("version:%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
else:
print ("%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
if name == (METANS,u'user-defined'):
field = attrs.get((METANS,u'name'))
if field in deletefields:
self.output.dowrite = False
elif field in yieldfields:
del addfields[field]
base.startElementNS(self, name, qname, attrs)
else:
base.startElementNS(self, name, qname, attrs)
self._tag = field
def endElementNS(self, name, qname):
field = name
if name == (METANS,u'user-defined'):
field = self._tag
if name == (OFFICENS,u'meta'):
for k,v in addfields.items():
if len(v) > 0:
if type(k) == type(''):
base.startElementNS(self,(METANS,u'user-defined'),None,{(METANS,u'name'):k})
base.characters(self, v)
base.endElementNS(self, (METANS,u'user-defined'),None)
else:
base.startElementNS(self, k, None, {})
base.characters(self, v)
base.endElementNS(self, k, None)
if name in xfields:
print ("%s" % self.data())
if name in Xfields:
if isinstance(self._tag, tuple):
texttag = self._tag[1]
else:
texttag = self._tag
print ("%s:%s" % (texttag, self.data()))
if field in deletefields:
self.output.dowrite = True
else:
base.endElementNS(self, name, qname)
def characters(self, content):
base.characters(self, content)
self._data.append(content)
def meta(self):
return self.output.content()
def data(self):
if usenormalize:
return normalize(''.join(self._data))
else:
return ''.join(self._data)
now = time.localtime()[:6]
outputfile = "-"
writemeta = False # Do we change any meta data?
usenormalize = False
try:
opts, args = getopt.getopt(sys.argv[1:], "cdlvVI:A:a:o:x:X:")
except getopt.GetoptError:
exitwithusage()
if len(opts) == 0:
opts = [ ('-l','') ]
for o, a in opts:
if o in ('-a','-A','-I'):
writemeta = True
if a.find(":") >= 0:
k,v = a.split(":",1)
else:
k,v = (a, "")
if len(k) == 0:
exitwithusage()
k = fields.get(k,k)
addfields[k] = unicode(v,'utf-8')
if o == '-a':
yieldfields[k] = True
if o == '-I':
deletefields[k] = True
if o == '-d':
writemeta = True
addfields[(DCNS,u'date')] = "%04d-%02d-%02dT%02d:%02d:%02d" % now
deletefields[(DCNS,u'date')] = True
if o == '-c':
usenormalize = True
if o in ('-v', '-V'):
showversion = o
if o == '-l':
Xfields = fields.values()
if o == "-x":
xfields.append(fields.get(a,a))
if o == "-X":
Xfields.append(fields.get(a,a))
if o == "-o":
outputfile = a
# The specification says we should change the element to our own,
# and must not export the original identifier.
if writemeta:
addfields[(METANS,u'generator')] = TOOLSVERSION
deletefields[(METANS,u'generator')] = True
odfs = odfmetaparser()
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
parser.setContentHandler(odfs)
if len(args) == 0:
zin = zipfile.ZipFile(sys.stdin,'r')
else:
if not zipfile.is_zipfile(args[0]):
exitwithusage()
zin = zipfile.ZipFile(args[0], 'r')
try:
content = zin.read('meta.xml').decode('utf-8')
except:
sys.stderr.write("File has no meta data\n")
sys.exit(1)
parser.parse(BytesIO(content.encode('utf-8')))
if writemeta:
if outputfile == '-':
if sys.stdout.isatty():
sys.stderr.write("Won't write ODF file to terminal\n")
sys.exit(1)
zout = zipfile.ZipFile(sys.stdout,"w")
else:
zout = zipfile.ZipFile(outputfile,"w")
# Loop through the input zipfile and copy the content to the output until we
# get to the meta.xml. Then substitute.
for zinfo in zin.infolist():
if zinfo.filename == "meta.xml":
# Write meta
zi = zipfile.ZipInfo("meta.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
zout.writestr(zi,odfs.meta() )
else:
payload = zin.read(zinfo.filename)
zout.writestr(zinfo, payload)
zout.close()
zin.close()
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,144 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
import zipfile
from xml.sax import make_parser,handler
from xml.sax.xmlreader import InputSource
import xml.sax.saxutils
import sys
from odf.namespaces import TEXTNS, TABLENS, DRAWNS
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO
def getxmlpart(odffile, xmlfile):
""" Get the content out of the ODT file"""
z = zipfile.ZipFile(odffile)
content = z.read(xmlfile)
z.close()
return content
#
# Extract headings from content.xml
#
class ODTHeadingHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, eater):
self.r = eater
self.data = []
self.level = 0
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
if tag == (TEXTNS, 'h'):
self.level = 0
for (att,value) in attrs.items():
if att == (TEXTNS, 'outline-level'):
self.level = int(value)
self.data = []
def endElementNS(self, tag, qname):
if tag == (TEXTNS, 'h'):
str = ''.join(self.data)
self.data = []
self.r.append("%d%*s%s" % (self.level, self.level, '', str))
class ODTSheetHandler(handler.ContentHandler):
""" Extract sheet names from content.xml of an ODS file """
def __init__(self, eater):
self.r = eater
def startElementNS(self, tag, qname, attrs):
if tag == (TABLENS, 'table'):
sheetname = attrs.get((TABLENS, 'name'))
if sheetname:
self.r.append(sheetname)
class ODTSlideHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, eater):
self.r = eater
self.data = []
self.pagenum = 0
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
if tag == (DRAWNS, 'page'):
self.pagenum = self.pagenum + 1
self.r.append("SLIDE %d: %s" % ( self.pagenum, attrs.get((DRAWNS, 'name'),'')))
if tag == (TEXTNS, 'p'):
self.data = []
def endElementNS(self, tag, qname):
if tag == (TEXTNS, 'p'):
str = ''.join(self.data)
self.data = []
if len(str) > 0:
self.r.append(" " + str)
def odtheadings(odtfile):
mimetype = getxmlpart(odtfile,'mimetype')
content = getxmlpart(odtfile,'content.xml')
lines = []
parser = make_parser()
parser.setFeature(handler.feature_namespaces, 1)
if not isinstance(mimetype, str):
mimetype=mimetype.decode("utf-8")
if mimetype in ('application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.text-template'):
parser.setContentHandler(ODTHeadingHandler(lines))
elif mimetype in ('application/vnd.oasis.opendocument.spreadsheet',
'application/vnd.oasis.opendocument.spreadsheet-template'):
parser.setContentHandler(ODTSheetHandler(lines))
elif mimetype in ('application/vnd.oasis.opendocument.presentation',
'application/vnd.oasis.opendocument.presentation-template'):
parser.setContentHandler(ODTSlideHandler(lines))
else:
print ("Unsupported fileformat")
sys.exit(2)
parser.setErrorHandler(handler.ErrorHandler())
inpsrc = InputSource()
if not isinstance(content, str):
content=content.decode("utf-8")
inpsrc.setByteStream(StringIO(content))
parser.parse(inpsrc)
return lines
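# Illustrative output for a text document, one line per heading, indented by outline level:
#   1 Introduction
#   2  Motivation
#   2  Related work
#   1 Methods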
if __name__ == "__main__":
filler = " "
for heading in odtheadings(sys.argv[1]):
print (heading)
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,101 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006-2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s): Michael Howitz, gocept gmbh & co. kg
import sys
import getopt
import odf.userfield
if sys.version_info[0]==3: unicode=str
listfields = False
Listfields = False
xfields = []
Xfields = []
setfields = {}
outputfile = None
inputfile = None
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s [-lL] [-xX metafield] [-s metafield:value]... "
"[-o output] [inputfile]\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
try:
opts, args = getopt.getopt(sys.argv[1:], "lLs:o:x:X:")
except getopt.GetoptError:
exitwithusage()
if len(opts) == 0:
exitwithusage()
for o, a in opts:
if o == '-s':
if a.find(":") >= 0:
k,v = a.split(":",1)
else:
k,v = (a, "")
if len(k) == 0:
exitwithusage()
setfields[unicode(k)] = unicode(v)
if o == '-l':
listfields = True
Listfields = False
if o == '-L':
Listfields = True
listfields = False
if o == "-x":
xfields.append(unicode(a))
if o == "-X":
Xfields.append(unicode(a))
if o == "-o":
outputfile = unicode(a)
if len(args) != 0:
inputfile = unicode(args[0])
user_fields = odf.userfield.UserFields(inputfile, outputfile)
if xfields:
for value in user_fields.list_values(xfields):
print (value)
if Listfields or Xfields:
if Listfields:
Xfields = None
for field_name, value_type, value in user_fields.list_fields_and_values(
Xfields):
print ("%s#%s:%s" % (field_name, value_type, value))
if listfields:
for value in user_fields.list_fields():
print (value)
if setfields:
user_fields.update(setfields)
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pygments.cmdline import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1 +0,0 @@
python3

View File

@ -1 +0,0 @@
/usr/bin/python3

View File

@ -1 +0,0 @@
python3

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdf2dot import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.graphisomorphism import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfpipe import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfs2dot import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from SPARQLWrapper.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from spacy.cli import setup_cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(setup_cli())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from tqdm.cli import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from typer.cli import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())

View File

@ -1,10 +0,0 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from weasel.cli import app
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(app())

View File

@ -1,241 +0,0 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
#
# OpenDocument can be a complete office document in a single
# XML document. This script will take such a document and create
# a package
import io
import zipfile,time, sys, getopt
import xml.sax, xml.sax.saxutils
from odf import manifest
class SplitWriter:
def __init__(self):
self.activefiles = []
self._content = []
self._meta = []
self._styles = []
self._settings = []
self.files = {'content': self._content, 'meta': self._meta,
'styles':self._styles, 'settings': self._settings }
def write(self, str):
for f in self.activefiles:
f.append(str)
def activate(self, filename):
file = self.files[filename]
if file not in self.activefiles:
self.activefiles.append(file)
def deactivate(self, filename):
file = self.files[filename]
if file in self.activefiles:
self.activefiles.remove(file)
odmimetypes = {
'application/vnd.oasis.opendocument.text': '.odt',
'application/vnd.oasis.opendocument.text-template': '.ott',
'application/vnd.oasis.opendocument.graphics': '.odg',
'application/vnd.oasis.opendocument.graphics-template': '.otg',
'application/vnd.oasis.opendocument.presentation': '.odp',
'application/vnd.oasis.opendocument.presentation-template': '.otp',
'application/vnd.oasis.opendocument.spreadsheet': '.ods',
'application/vnd.oasis.opendocument.spreadsheet-template': '.ots',
'application/vnd.oasis.opendocument.chart': '.odc',
'application/vnd.oasis.opendocument.chart-template': '.otc',
'application/vnd.oasis.opendocument.image': '.odi',
'application/vnd.oasis.opendocument.image-template': '.oti',
'application/vnd.oasis.opendocument.formula': '.odf',
'application/vnd.oasis.opendocument.formula-template': '.otf',
'application/vnd.oasis.opendocument.text-master': '.odm',
'application/vnd.oasis.opendocument.text-web': '.oth',
}
OFFICENS = u"urn:oasis:names:tc:opendocument:xmlns:office:1.0"
base = xml.sax.saxutils.XMLGenerator
class odfsplitter(base):
def __init__(self):
self._mimetype = ''
self.output = SplitWriter()
self._prefixes = []
base.__init__(self, self.output, 'utf-8')
def startPrefixMapping(self, prefix, uri):
base.startPrefixMapping(self, prefix, uri)
self._prefixes.append('xmlns:%s="%s"' % (prefix, uri))
def startElementNS(self, name, qname, attrs):
if name == (OFFICENS, u"document"):
self._mimetype = attrs.get((OFFICENS, "mimetype"))
elif name == (OFFICENS, u"meta"):
self.output.activate('meta')
elif name == (OFFICENS, u"settings"):
self.output.activate('settings')
elif name == (OFFICENS, u"scripts"):
self.output.activate('content')
elif name == (OFFICENS, u"font-face-decls"):
self.output.activate('content')
self.output.activate('styles')
elif name == (OFFICENS, u"styles"):
self.output.activate('styles')
elif name == (OFFICENS, u"automatic-styles"):
self.output.activate('content')
self.output.activate('styles')
elif name == (OFFICENS, u"master-styles"):
self.output.activate('styles')
elif name == (OFFICENS, u"body"):
self.output.activate('content')
base.startElementNS(self, name, qname, attrs)
def endElementNS(self, name, qname):
base.endElementNS(self, name, qname)
if name == (OFFICENS, u"meta"):
self.output.deactivate('meta')
elif name == (OFFICENS, u"settings"):
self.output.deactivate('settings')
elif name == (OFFICENS, u"scripts"):
self.output.deactivate('content')
elif name == (OFFICENS, u"font-face-decls"):
self.output.deactivate('content')
self.output.deactivate('styles')
elif name == (OFFICENS, u"styles"):
self.output.deactivate('styles')
elif name == (OFFICENS, u"automatic-styles"):
self.output.deactivate('content')
self.output.deactivate('styles')
elif name == (OFFICENS, u"master-styles"):
self.output.deactivate('styles')
elif name == (OFFICENS, u"body"):
self.output.deactivate('content')
def content(self):
""" Return the content inside a wrapper called <office:document-content>
"""
prefixes = ' '.join(self._prefixes)
return ''.join(['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-content %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._content)) + ['</office:document-content>'])
def settings(self):
prefixes = ' '.join(self._prefixes).encode('utf-8')
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-settings %s office:version="1.0">' % prefixes] + self.output._settings + ['''</office:document-settings>'''])
def styles(self):
prefixes = ' '.join(self._prefixes)
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-styles %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._styles)) + ['''</office:document-styles>'''])
def meta(self):
prefixes = ' '.join(self._prefixes)
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-meta %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._meta)) + ['''</office:document-meta>'''])
def usage():
sys.stderr.write("Usage: %s [-o outputfile] [-s] inputfile\n" % sys.argv[0])
def manifestxml(m):
""" Generates the content of the manifest.xml file """
xml=io.StringIO()
xml.write(u"<?xml version='1.0' encoding='UTF-8'?>\n")
m.toXml(0,xml)
return xml.getvalue()
try:
opts, args = getopt.getopt(sys.argv[1:], "o:s", ["output=","suffix"])
except getopt.GetoptError:
usage()
sys.exit(2)
outputfile = '-'
addsuffix = False
for o, a in opts:
if o in ("-o", "--output"):
outputfile = a
if o in ("-s", "--suffix"):
addsuffix = True
if len(args) > 1:
usage()
sys.exit(2)
odfs = odfsplitter()
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
parser.setContentHandler(odfs)
if len(args) == 0:
parser.parse(sys.stdin)
else:
parser.parse(open(args[0],"r"))
mimetype = odfs._mimetype
suffix = odmimetypes.get(mimetype,'.xxx')
if outputfile == '-':
if sys.stdout.isatty():
sys.stderr.write("Won't write ODF file to terminal\n")
sys.exit(1)
z = zipfile.ZipFile(sys.stdout,"w")
else:
if addsuffix:
outputfile = outputfile + suffix
z = zipfile.ZipFile(outputfile,"w")
now = time.localtime()[:6]
# Write mimetype
zi = zipfile.ZipInfo('mimetype', now)
zi.compress_type = zipfile.ZIP_STORED
z.writestr(zi,mimetype)
# Write content
zi = zipfile.ZipInfo("content.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.content() )
# Write styles
zi = zipfile.ZipInfo("styles.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.styles() )
# Write meta
zi = zipfile.ZipInfo("meta.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.meta() )
m = manifest.Manifest()
m.addElement(manifest.FileEntry(fullpath="/", mediatype=mimetype))
m.addElement(manifest.FileEntry(fullpath="content.xml",mediatype="text/xml"))
m.addElement(manifest.FileEntry(fullpath="styles.xml", mediatype="text/xml"))
m.addElement(manifest.FileEntry(fullpath="meta.xml", mediatype="text/xml"))
# Write manifest
zi = zipfile.ZipInfo("META-INF/manifest.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi, manifestxml(m).encode("utf-8") )
z.close()
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -1,28 +0,0 @@
Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,92 +0,0 @@
Metadata-Version: 2.1
Name: MarkupSafe
Version: 3.0.2
Summary: Safely add untrusted strings to HTML/XML markup.
Maintainer-email: Pallets <contact@palletsprojects.com>
License: Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Project-URL: Donate, https://palletsprojects.com/donate
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
Project-URL: Source, https://github.com/pallets/markupsafe/
Project-URL: Chat, https://discord.gg/pallets
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE.txt
# MarkupSafe
MarkupSafe implements a text object that escapes characters so it is
safe to use in HTML and XML. Characters that have special meanings are
replaced so that they display as the actual characters. This mitigates
injection attacks, meaning untrusted user input can safely be displayed
on a page.
## Examples
```pycon
>>> from markupsafe import Markup, escape
>>> # escape replaces special characters and wraps in Markup
>>> escape("<script>alert(document.cookie);</script>")
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')
>>> # wrap in Markup to mark text "safe" and prevent escaping
>>> Markup("<strong>Hello</strong>")
Markup('<strong>Hello</strong>')
>>> escape(Markup("<strong>Hello</strong>"))
Markup('<strong>Hello</strong>')
>>> # Markup is a str subclass
>>> # methods and operators escape their arguments
>>> template = Markup("Hello <em>{name}</em>")
>>> template.format(name='"World"')
Markup('Hello <em>&#34;World&#34;</em>')
```
## Donate
The Pallets organization develops and supports MarkupSafe and other
popular packages. In order to grow the community of contributors and
users, and allow the maintainers to devote more time to the projects,
[please donate today][].
[please donate today]: https://palletsprojects.com/donate

View File

@ -1,14 +0,0 @@
MarkupSafe-3.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
MarkupSafe-3.0.2.dist-info/LICENSE.txt,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
MarkupSafe-3.0.2.dist-info/METADATA,sha256=aAwbZhSmXdfFuMM-rEHpeiHRkBOGESyVLJIuwzHP-nw,3975
MarkupSafe-3.0.2.dist-info/RECORD,,
MarkupSafe-3.0.2.dist-info/WHEEL,sha256=_kVlewavvOSnwZE_whBk3jlE_Ob-nL5GvlVcLkpXSD8,151
MarkupSafe-3.0.2.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
markupsafe/__init__.py,sha256=sr-U6_27DfaSrj5jnHYxWN-pvhM27sjlDplMDPZKm7k,13214
markupsafe/__pycache__/__init__.cpython-310.pyc,,
markupsafe/__pycache__/_native.cpython-310.pyc,,
markupsafe/_native.py,sha256=hSLs8Jmz5aqayuengJJ3kdT5PwNpBWpKrmQSdipndC8,210
markupsafe/_speedups.c,sha256=O7XulmTo-epI6n2FtMVOrJXl8EAaIwD2iNYmBI5SEoQ,4149
markupsafe/_speedups.cpython-310-x86_64-linux-gnu.so,sha256=x4RoxWgyqAEokk-AZrWvrLDxLE-dm-zZSZYV_gOiLJA,34976
markupsafe/_speedups.pyi,sha256=ENd1bYe7gbBUf2ywyYWOGUpnXOHNJ-cgTNqetlW8h5k,41
markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -1,6 +0,0 @@
Wheel-Version: 1.0
Generator: setuptools (75.2.0)
Root-Is-Purelib: false
Tag: cp310-cp310-manylinux_2_17_x86_64
Tag: cp310-cp310-manylinux2014_x86_64

View File

@ -1,37 +0,0 @@
# Authors
* Ivan Herman ([@iherman](http://github.com/iherman))
* Sergio Fernández ([@wikier](http://github.com/wikier))
* Carlos Tejo ([@dayures](http://github.com/dayures))
* Alexey Zakhlestin ([@indeyets](http://github.com/indeyets))
# Contributors
See https://github.com/RDFLib/sparqlwrapper/graphs/contributors
* [@eggplants](https://github.com/eggplants): most things to make 2.0.0 happen
* Obey Arthur Liu ([@ArthurLiu](http://github.com/ArthurLiu)): different patches
* Christopher Lenz ([@cmlenz](http://github.com/cmlenz)): feature to allow developers to choose the json module
* Pēteris Caune ([@cuu508](http://github.com/cuu508)): great feedback and patches
* Bogdan Benea ([bugdone@users.sourceforge.net](mailto:bugdone@users.sourceforge.net)): patch for the query regular expression
* William Waites ([@wwaites](http://github.com/wwaites)): patches for RDFLib3
* Christoph Burgmer ([@cburgmer](http://github.com/cburgmer)): patches for RDFLib3
* Thomas Kluyver ([@takluyver](http://github.com/takluyver)): patches for Python 3.x
* Diego Berrueta ([@berrueta](http://github.com/berrueta)): new function for printing results as table
* Olivier Berger ([@olberger](http://github.com/olberger)): patch regarding raw response for unknown formats
* Benjamin Cogrel ([@bcogrel](http://github.com/bcogrel)): standard query types
* Urs Holzer ([@uholzer](http://github.com/uholzer)): features, patches and testing
* Alf Lervåg ([@alf](http://github.com/alf)): setup patch
* Nolan Nichols ([@nicholsn](http://github.com/nicholsn)): HTTP digest auth support
* Kevin Turner ([@keturn](https://github.com/keturn)): `SmartWrapper.Value.__repr__()` implementation
* Marcelo Jorge Vieira ([@marcelometal](https://github.com/marcelometal)): typos
* Trevor Andersen ([@trevorandersen](https://github.com/trevorandersen)): patches for Python 3.x
* Carlos Martinez-Ortiz ([@cmartinez](https://github.com/cmartinez)): improves support for return format HTTP parameter
* Christian Amsüss ([@chrysn](https://github.com/chrysn)): dependency fixes
* Chris Lamb ([@lamby](https://github.com/lamby)): typo
* Hugo van Kemenade ([@hugovk](https://github.com/hugovk)): update classifiers (Python 3.6)
* Edward Betts ([@EdwardBetts](https://github.com/EdwardBetts)): Correct spelling mistakes
* Carlos Martínez ([@c-martinez](https://github.com/c-martinez)): Mainly support for CSV and TSV results in SPARQL SELECT queries
* Dan Michael O. Heggø ([@danmichaelo](https://github.com/danmichaelo)): update README with SPARQLWrapper2 example
* Sam Clements ([@borntyping](https://github.com/borntyping)): Provide hints about setting properly the timeout
* Marc Feger ([@MaFeg100](https://github.com/MaFeg100)): Improve/tests for development

View File

@ -1,18 +0,0 @@
SPARQL Python Wrapper is released under the W3C® SOFTWARE NOTICE AND LICENSE.
This work (and included software, documentation such as READMEs, or other related items) is being provided by the copyright holders under the following license. By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
1. The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
2. Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
3. Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
See also http://www.w3.org/Consortium/Legal/copyright-software for further details

View File

@ -1,45 +0,0 @@
Metadata-Version: 2.1
Name: SPARQLWrapper
Version: 2.0.0
Summary: SPARQL Endpoint interface to Python
Home-page: http://rdflib.github.io/sparqlwrapper
Download-URL: https://github.com/RDFLib/sparqlwrapper/releases
Author: Ivan Herman, Sergio Fernández, Carlos Tejo Alonso, Alexey Zakhlestin
Author-email: rdflib-dev@googlegroups.com
License: W3C SOFTWARE NOTICE AND LICENSE
Project-URL: Home, https://rdflib.github.io/sparqlwrapper
Project-URL: Documentation, https://sparqlwrapper.readthedocs.io
Project-URL: Source, https://github.com/RDFLib/sparqlwrapper
Project-URL: Tracker, https://github.com/RDFLib/sparqlwrapper/issues
Keywords: python,sparql,rdf,rdflib
Platform: any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: W3C License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.7
License-File: LICENSE.txt
License-File: AUTHORS.md
Requires-Dist: rdflib (>=6.1.1)
Provides-Extra: dev
Requires-Dist: setuptools (>=3.7.1) ; extra == 'dev'
Requires-Dist: mypy (>=0.931) ; extra == 'dev'
Requires-Dist: pandas (>=1.3.5) ; extra == 'dev'
Requires-Dist: pandas-stubs (>=1.2.0.48) ; extra == 'dev'
Provides-Extra: docs
Requires-Dist: sphinx (<5) ; extra == 'docs'
Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
Provides-Extra: keepalive
Requires-Dist: keepalive (>=0.5) ; extra == 'keepalive'
Provides-Extra: pandas
Requires-Dist: pandas (>=1.3.5) ; extra == 'pandas'
This is a wrapper around a SPARQL service. It helps in creating the query URI and, possibly, converting the result into a more manageable format.

View File

@ -1,25 +0,0 @@
../../../bin/rqw,sha256=qf6Nvwhjovp_uPIPeeMNocB3j7iZ_YnskuMQcUK6DYY,291
SPARQLWrapper-2.0.0.dist-info/AUTHORS.md,sha256=7oV4hamlTbjfsaWy15f3BVH2h90Nf5mJ-rR0Z1azy9s,2725
SPARQLWrapper-2.0.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
SPARQLWrapper-2.0.0.dist-info/LICENSE.txt,sha256=Z1IX12CEodcefDAOAMJ7irELJAX-huUCOiuzio5G8Ik,2134
SPARQLWrapper-2.0.0.dist-info/METADATA,sha256=kU92L4KNVjo9aP6-jm4FXVAUpNScd5mIWWbIGHu_D_I,2020
SPARQLWrapper-2.0.0.dist-info/RECORD,,
SPARQLWrapper-2.0.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
SPARQLWrapper-2.0.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
SPARQLWrapper-2.0.0.dist-info/entry_points.txt,sha256=aIYAzonEA7winfiw8NydOLNu406HC6aRBlKLI2H5kEQ,48
SPARQLWrapper-2.0.0.dist-info/top_level.txt,sha256=3KluNiTwOkX16hLJwC3UEYzKdEscknK--UV5q56mYWY,14
SPARQLWrapper/KeyCaseInsensitiveDict.py,sha256=JF83-6EPbcm9F4gg0GQ11vTVuLzdJ7sDsubEP9j-3zw,1377
SPARQLWrapper/SPARQLExceptions.py,sha256=qFlU175hp61gO6bvgQsCdSTEGOFnJwJNBQlIGS5W7-o,2595
SPARQLWrapper/SmartWrapper.py,sha256=GxZiMGZpGppPZX54W-YdUtcdAAa83GJjPLdyfLWPK-4,15557
SPARQLWrapper/Wrapper.py,sha256=M9lTPkpvRU2xAUbrHiKYK0mEV8pkycNS3lPoO__0gSE,58238
SPARQLWrapper/__init__.py,sha256=6kU9hD9FnlFbk2c8uFkpGb1arB3268nN74RUh91e60s,1213
SPARQLWrapper/__pycache__/KeyCaseInsensitiveDict.cpython-310.pyc,,
SPARQLWrapper/__pycache__/SPARQLExceptions.cpython-310.pyc,,
SPARQLWrapper/__pycache__/SmartWrapper.cpython-310.pyc,,
SPARQLWrapper/__pycache__/Wrapper.cpython-310.pyc,,
SPARQLWrapper/__pycache__/__init__.cpython-310.pyc,,
SPARQLWrapper/__pycache__/main.cpython-310.pyc,,
SPARQLWrapper/__pycache__/sparql_dataframe.cpython-310.pyc,,
SPARQLWrapper/main.py,sha256=MKNPMrFxIGN_A7-UwyMS_AycjswscgKsP37h2K2df8k,4330
SPARQLWrapper/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
SPARQLWrapper/sparql_dataframe.py,sha256=-oM7_eXbwGgeNkFv9mSxe3JWHM3xQQk90nNrbhthnrI,2429

View File

@ -1,5 +0,0 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -1,2 +0,0 @@
[console_scripts]
rqw = SPARQLWrapper.main:main

View File

@ -1,46 +0,0 @@
# -*- coding: utf-8 -*-
"""
A simple implementation of a key case-insensitive dictionary.
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
"""
from typing import Dict, Mapping, TypeVar

_V = TypeVar("_V")


class KeyCaseInsensitiveDict(Dict[str, _V]):
    """
    A simple implementation of a key case-insensitive dictionary
    """

    def __init__(self, d: Mapping[str, _V] = {}) -> None:
        """
        :param dict d: The source dictionary.
        """
        for k, v in d.items():
            self[k] = v

    def __setitem__(self, key: str, value: _V) -> None:
        if hasattr(key, "lower"):
            key = key.lower()
        dict.__setitem__(self, key, value)

    def __getitem__(self, key: str) -> _V:
        if hasattr(key, "lower"):
            key = key.lower()
        return dict.__getitem__(self, key)

    def __delitem__(self, key: str) -> None:
        if hasattr(key, "lower"):
            key = key.lower()
        dict.__delitem__(self, key)

View File

@ -1,94 +0,0 @@
# -*- coding: utf-8 -*-
"""
SPARQL Wrapper exceptions
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
"""
from typing import Optional


class SPARQLWrapperException(Exception):
    """
    Base class for SPARQL Wrapper exceptions
    """

    msg = "an exception has occurred"

    def __init__(self, response: Optional[bytes] = None):
        """
        :param string response: The server response
        """
        if response:
            formatted_msg = "%s: %s. \n\nResponse:\n%r" % (
                self.__class__.__name__,
                self.msg,
                response,
            )
        else:
            formatted_msg = "%s: %s." % (self.__class__.__name__, self.msg)
        super(SPARQLWrapperException, self).__init__(formatted_msg)


class EndPointInternalError(SPARQLWrapperException):
    """
    Exception type for Internal Server Error responses. Usually HTTP response status code ``500``.
    """

    msg = "The endpoint returned the HTTP status code 500"


class QueryBadFormed(SPARQLWrapperException):
    """
    Query Bad Formed exception. Usually HTTP response status code ``400``.
    """

    msg = "A bad request has been sent to the endpoint: probably the SPARQL query is badly formed"


class EndPointNotFound(SPARQLWrapperException):
    """
    End Point Not Found exception. Usually HTTP response status code ``404``.
    """

    msg = "It was not possible to connect to the given endpoint: check it is correct"


class Unauthorized(SPARQLWrapperException):
    """
    Access is denied due to invalid credentials (unauthorized). Usually HTTP response status code ``401``.

    .. versionadded:: 1.8.2
    """

    msg = "Access to that endpoint is denied due to invalid credentials (unauthorized). Check the credentials"


class URITooLong(SPARQLWrapperException):
    """
    The URI requested by the client is longer than the server is willing to interpret. Usually HTTP response
    status code ``414``.

    .. versionadded:: 1.8.3
    """

    msg = (
        "The URI requested by the client is longer than the server is willing to interpret. "
        "Check if the request was sent using GET method instead of POST method."
    )

View File

@ -1,366 +0,0 @@
# -*- coding: utf-8 -*-
"""
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
:requires: `RDFLib <https://rdflib.readthedocs.io>`_ package.
"""
from typing import Any, Dict, List, Optional, Tuple, Union
from SPARQLWrapper.Wrapper import JSON, SELECT, QueryResult
from SPARQLWrapper.Wrapper import SPARQLWrapper as SW
######################################################################################
class Value(object):
"""
Class encapsulating a single binding for a variable.
:ivar variable: The original variable, stored for an easier reference.
:vartype variable: string
:ivar value: Value of the binding.
:vartype value: string
:ivar type: Type of the binding. One of :attr:`Value.URI`, :attr:`Value.Literal`, :attr:`Value.TypedLiteral`, or
:attr:`Value.BNODE`.
:vartype type: string
:ivar lang: Language tag of the binding, or ``None`` if not set.
:vartype lang: string
:ivar datatype: Datatype of the binding, or ``None`` if not set. It is an URI.
:vartype datatype: string
"""
URI = "uri"
"""the string denoting a URI variable."""
Literal = "literal"
"""the string denoting a Literal variable."""
TypedLiteral = "typed-literal"
"""the string denoting a typed literal variable."""
BNODE = "bnode"
"""the string denoting a blank node variable."""
def __init__(self, variable: str, binding: Dict[str, str]) -> None:
"""
:param variable: the variable for that binding. Stored for an easier reference.
:type variable: string
:param binding: the binding dictionary part of the return result for a specific binding.
:type binding: dict
"""
self.variable = variable
self.value = binding["value"]
self.type = binding["type"]
self.lang = None
self.datatype = None
try:
self.lang = binding["xml:lang"]
except:
# no lang is set
pass
try:
self.datatype = binding["datatype"]
except:
pass
def __repr__(self) -> str:
cls = self.__class__.__name__
return "%s(%s:%r)" % (cls, self.type, self.value)
######################################################################################
class Bindings(object):
"""
Class encapsulating one query result, based on the JSON return format. It decodes the
return values to make it a bit more usable for a standard usage. The class consumes the
return value and instantiates a number of attributes that can be consulted directly. See
the list of variables.
The `Serializing SPARQL Query Results in JSON <http://www.w3.org/TR/rdf-sparql-json-res/>`_ explains the details of
the JSON return structures. Very succinctly: the return data has "bindings", which means a list of dictionaries.
Each dictionary is a possible binding of the SELECT variables to :class:`Value` instances. This structure is made a
bit more usable by this class.
:ivar fullResult: The original dictionary of the results, stored for an easier reference.
:vartype fullResult: dict
:ivar head: Header part of the return, see the JSON return format document for details.
:vartype head: dict
:ivar variables: List of unbounds (variables) of the original query. It is a list of strings. ``None`` in the case
of an ASK query.
:vartype variables: list
:ivar bindings: The final bindings: list of dictionaries, mapping variables to :class:`Value` instances. \
If unbound, then no value is set in the dictionary; that can be easily checked with \
``var in res.bindings[..]``, for example.
:vartype bindings: list
:ivar askResult: by default, set to **False**; in case of an ASK query, the result of the query.
:vartype askResult: bool
"""
def __init__(self, retval: QueryResult):
"""
:param retval: the query result.
:type retval: :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>`
"""
self.fullResult = retval._convertJSON()
self.head = self.fullResult["head"]
self.variables: Optional[List[str]] = None
try:
self.variables = self.fullResult["head"]["vars"]
except:
pass
self.bindings: List[Dict[str, Value]] = []
try:
for b in self.fullResult["results"]["bindings"]:
# This is a single binding. It is a dictionary per variable; each value is a dictionary again
# that has to be converted into a Value instance
newBind = {}
# type error: Item "None" of "Union[List[str], Any, None]" has no attribute "__iter__" (not iterable)
for key in self.variables: # type: ignore [union-attr]
if key in b:
# there is a real binding for this key
newBind[key] = Value(key, b[key])
self.bindings.append(newBind)
except:
pass
self.askResult = False
try:
self.askResult = self.fullResult["boolean"]
except:
pass
def getValues(self, key: str) -> Optional[List[Value]]:
"""A shorthand for the retrieval of all bindings for a single key. It is
equivalent to ``[b[key] for b in self[key]]``
:param key: possible variable name.
:type key: string
:return: list of :class:`Value` instances.
:rtype: list
"""
try:
return [b[key] for b in self[key]]
except:
return []
def __contains__(self, key: Union[str, List[str], Tuple[str]]) -> bool:
"""Emulation of the "``key in obj``" operator. Key can be a string for a variable or an array/tuple
of strings.
If ``key`` is a variable, the return value is ``True`` if there is at least one binding where ``key`` is
bound. If ``key`` is an array or tuple, the return value is ``True`` if there is at least one binding
where *all* variables in ``key`` are bound.
:param key: possible variable, or array/tuple of variables
:return: whether there is a binding of the variable in the return
:rtype: Boolean
"""
if len(self.bindings) == 0:
return False
if type(key) is list or type(key) is tuple:
# check first whether they are all really variables
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if False in [k in self.variables for k in key]: # type: ignore [operator]
return False
for b in self.bindings:
# try to find a binding where all key elements are present
if False in [k in b for k in key]:
# this is not a binding for the key combination, move on...
continue
else:
# yep, this one is good!
return True
return False
else:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if key not in self.variables: # type: ignore [operator]
return False
for b in self.bindings:
if key in b:
return True
return False
def __getitem__(self, key: Union[slice, str, List[str]]) -> List[Dict[str, Value]]:
"""Emulation of the ``obj[key]`` operator. Slice notation is also available.
The goal is to choose the right bindings among the available ones. The return values are always
arrays of bindings, ie, arrays of dictionaries mapping variable keys to :class:`Value` instances.
The different value settings mean the following:
- ``obj[key]`` returns the bindings where ``key`` has a valid value
- ``obj[key1,key2,...]`` returns the bindings where *all* ``key1,key2,...`` have valid values
- ``obj[(key1,key2,...):(nkey1,nkey2,...)]`` returns the bindings where all ``key1,key2,...`` have
valid values and *none* of the ``nkey1,nkey2,...`` have valid values
- ``obj[:(nkey1,nkey2,...)]`` returns the bindings where *none* of the ``nkey1,nkey2,...`` have valid values
In all cases complete bindings are returned, ie, the values for other variables, not present among
the keys in the call, may or may not be present depending on the query results.
:param key: possible variable or array/tuple of keys with possible slice notation
:return: list of bindings
:rtype: array of variable -> :class:`Value` dictionaries
"""
def _checkKeys(keys: Union[List[Any], Tuple[Any, ...]]) -> bool:
if len(keys) == 0:
return False
for k in keys:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if (
not isinstance(k, str)
or k not in self.variables # type: ignore [operator]
):
return False
return True
def _nonSliceCase(
key: Union[
str,
List[Any],
Tuple[Any],
]
) -> Union[List[Any], bool, Tuple[Any]]:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if isinstance(key, str) and key != "" and key in self.variables: # type: ignore[operator]
# unicode or string:
return [key]
elif type(key) is list or type(key) is tuple:
if _checkKeys(key):
return key
return False
# The arguments should be reduced to arrays of variables, ie, unicode strings
yes_keys: Union[List[Any], bool, Tuple[Any]] = []
no_keys: Union[List[Any], bool, Tuple[Any]] = []
if type(key) is slice:
# Note: None for start or stop is all right
if key.start:
yes_keys = _nonSliceCase(key.start)
if not yes_keys:
raise TypeError
if key.stop:
no_keys = _nonSliceCase(key.stop)
if not no_keys:
raise TypeError
else:
yes_keys = _nonSliceCase(key)
# got it right, now get the right binding line with the constraints
retval: List[Dict[str, Value]] = []
for b in self.bindings:
# first check whether the 'yes' part is all there:
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
if False in [k in b for k in yes_keys]: # type: ignore[union-attr]
continue
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
if True in [k in b for k in no_keys]: # type: ignore[union-attr]
continue
# if we got that far, we should be all right!
retval.append(b)
# if retval is of zero length, no hit; an exception should be raised to stay within the python style
if len(retval) == 0:
raise IndexError
return retval
def convert(self) -> "Bindings":
"""This is just a convenience method, returns ``self``.
Although :class:`SPARQLWrapper2.Bindings` is not a subclass of
:class:`SPARQLWrapper.QueryResult<SPARQLWrapper.Wrapper.QueryResult>`, it is returned as a result by
:func:`SPARQLWrapper2.query`, just like :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>` is returned by
:func:`SPARQLWrapper.query()<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`. Consequently,
having an empty :func:`convert` method to imitate
:class:`QueryResult's convert() method<SPARQLWrapper.Wrapper.QueryResult.convert>`
may avoid unnecessary problems.
"""
return self
##############################################################################################################
class SPARQLWrapper2(SW):
"""Subclass of :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper` that works with a JSON SELECT return result only. The
query result is automatically set to a :class:`Bindings` instance. Makes the average query processing a bit
simpler..."""
def __init__(self, baseURI: str, defaultGraph: Optional[str] = None):
"""
Class encapsulating a full SPARQL call. In contrast to the :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper`
superclass, the return format cannot be set (it is defaulted to
:attr:`~SPARQLWrapper.Wrapper.SPARQLWrapper.JSON`).
:param baseURI: string of the SPARQL endpoint's URI.
:type baseURI: string
:param defaultGraph: URI for the default graph. Default is ``None``, can be set via an explicit call, too.
:type defaultGraph: string
"""
super(SPARQLWrapper2, self).__init__(
baseURI, returnFormat=JSON, defaultGraph=defaultGraph
)
def setReturnFormat(self, format: Optional[str]) -> None:
"""
Set the return format (:meth:`overriding the inherited method
<SPARQLWrapper.Wrapper.SPARQLWrapper.setReturnFormat>`).
.. warning::
This method does nothing; this class instance should work with JSON only. The method is defined \
just to avoid possible errors by erroneously setting the return format. \
When using this class, the user can safely ignore this call.
:param format: return format
:type format: string
"""
pass
def query(self) -> Union[Bindings, QueryResult]: # type: ignore[override]
"""
Execute the query and do an automatic conversion.
Exceptions can be raised if either the URI is wrong or the HTTP sends back an error.
The usual urllib2 exceptions are raised, which cover possible SPARQL errors, too.
If the query type is *not* SELECT, the method falls back to the
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`.
:return: query result
:rtype: :class:`Bindings` instance
"""
res = super(SPARQLWrapper2, self).query()
if self.queryType == SELECT:
return Bindings(res)
else:
return res
def queryAndConvert( # type: ignore[override]
self,
) -> Union[Union[Bindings, QueryResult], QueryResult.ConvertResult]:
"""This is here to override the inherited method; it is equivalent to :class:`query`.
If the query type is *not* SELECT, the method falls back to the
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.queryAndConvert>`.
:return: the converted query result.
"""
if self.queryType == SELECT:
return self.query()
else:
return super(SPARQLWrapper2, self).queryAndConvert()

File diff suppressed because it is too large

View File

@ -1,72 +0,0 @@
# -*- coding: utf8 -*-
"""
**SPARQLWrapper** is a simple Python wrapper around a `SPARQL <https://www.w3.org/TR/sparql11-overview/>`_ service to
remotely execute your queries. It helps in creating the query
invocation and, possibly, converting the result into a more manageable
format.
"""
__version__ = "2.0.0"
"""The version of SPARQLWrapper"""
__agent__: str = f"sparqlwrapper {__version__} (rdflib.github.io/sparqlwrapper)"
from .SmartWrapper import SPARQLWrapper2
from .sparql_dataframe import get_sparql_dataframe
from .Wrapper import (
ASK,
BASIC,
CONSTRUCT,
CSV,
DELETE,
DESCRIBE,
DIGEST,
GET,
INSERT,
JSON,
JSONLD,
N3,
POST,
POSTDIRECTLY,
RDF,
RDFXML,
SELECT,
TSV,
TURTLE,
URLENCODED,
XML,
QueryResult,
SPARQLWrapper,
)
__all__ = [
"SPARQLWrapper2",
"get_sparql_dataframe",
"ASK",
"BASIC",
"CONSTRUCT",
"CSV",
"DELETE",
"DESCRIBE",
"DIGEST",
"GET",
"INSERT",
"JSON",
"JSONLD",
"N3",
"POST",
"POSTDIRECTLY",
"RDF",
"RDFXML",
"SELECT",
"TSV",
"TURTLE",
"URLENCODED",
"XML",
"QueryResult",
"SPARQLWrapper",
]

View File

@ -1,157 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import shutil
import sys
import xml
from typing import List, Optional
import rdflib
from . import __version__
from .Wrapper import SPARQLWrapper, _allowedAuth, _allowedFormats, _allowedRequests
class SPARQLWrapperFormatter(
argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
):
pass
def check_file(v: str) -> str:
if os.path.isfile(v):
return v
elif v == "-":
return "-" # stdin
else:
raise argparse.ArgumentTypeError("file '%s' is not found" % v)
def choicesDescriptions() -> str:
d = "\n - ".join(["allowed FORMAT:"] + _allowedFormats)
d += "\n - ".join(["\n\nallowed METHOD:"] + _allowedRequests)
d += "\n - ".join(["\n\nallowed AUTH:"] + _allowedAuth)
return d
def parse_args(test: Optional[List[str]] = None) -> argparse.Namespace:
"""Parse arguments."""
parser = argparse.ArgumentParser(
prog="rqw",
formatter_class=(
lambda prog: SPARQLWrapperFormatter(
prog,
**{
"width": shutil.get_terminal_size(fallback=(120, 50)).columns,
"max_help_position": 30,
},
)
),
description="sparqlwrapper CLI",
epilog=choicesDescriptions(),
)
input_group = parser.add_mutually_exclusive_group(required=True)
input_group.add_argument(
"-f",
"--file",
metavar="FILE",
type=check_file,
help="query with sparql file (stdin: -)",
)
input_group.add_argument("-Q", "--query", metavar="QUERY", help="query with string")
parser.add_argument(
"-F",
"--format",
default="json",
metavar="FORMAT",
choices=_allowedFormats,
help="response format",
)
parser.add_argument(
"-e",
"--endpoint",
metavar="URI",
help="sparql endpoint",
default="http://dbpedia.org/sparql",
)
parser.add_argument(
"-m",
"--method",
metavar="METHOD",
choices=_allowedRequests,
help="request method",
)
parser.add_argument(
"-a", "--auth", metavar="AUTH", choices=_allowedAuth, help="HTTP auth"
)
parser.add_argument(
"-u", "--username", metavar="ID", default="guest", help="username for auth"
)
parser.add_argument(
"-p", "--password", metavar="PW", default="", help="password for auth"
)
parser.add_argument("-q", "--quiet", action="store_true", help="suppress warnings")
parser.add_argument(
"-V", "--version", action="version", version="%(prog)s {}".format(__version__)
)
if test is None:
return parser.parse_args()
else:
return parser.parse_args(test)
def main(test: Optional[List[str]] = None) -> None:
args = parse_args(test)
if args.quiet:
import warnings
warnings.filterwarnings("ignore")
q = ""
if args.query is not None:
q = args.query
elif args.file == "-":
q = sys.stdin.read()
else:
q = open(args.file, "r").read()
sparql = SPARQLWrapper(
args.endpoint,
agent=(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/96.0.4664.110 Safari/537.36"
),
)
if args.auth is not None:
sparql.setHTTPAuth(args.auth)
sparql.setCredentials(args.username, args.password)
if args.method is not None:
sparql.setMethod(args.method)
sparql.setQuery(q)
sparql.setReturnFormat(args.format)
results = sparql.query().convert()
if isinstance(results, dict):
# "json"
print(json.dumps(results, indent=4))
elif isinstance(results, xml.dom.minidom.Document):
# "xml"
print(results.toxml())
elif isinstance(results, bytes):
# "csv", "tsv", "turtle", "n3"
print(results.decode("utf-8"))
elif isinstance(results, rdflib.graph.ConjunctiveGraph):
# "rdf"
print(results.serialize())
else:
# unknown type
raise TypeError(f"Unsupported result of type {type(results)}: {results!r}")
if __name__ == "__main__":
main()

View File

@ -1,74 +0,0 @@
"""
Query a SPARQL endpoint and return results as a Pandas dataframe.
"""
import io
from typing import TYPE_CHECKING, Any, Dict, List, Union
from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value
from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper
if TYPE_CHECKING:
import pandas as pd
class QueryException(Exception):
pass
def get_sparql_dataframe_orig(
endpoint: str, query: Union[str, bytes]
) -> "pd.DataFrame":
"""copy paste from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
if isinstance(results, bytes):
_csv = io.StringIO(results.decode("utf-8"))
return pd.read_csv(_csv, sep=",")
else:
raise TypeError(type(results))
def get_sparql_typed_dict(
endpoint: str, query: Union[str, bytes]
) -> List[Dict[str, Value]]:
"""modified from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
# rdflib in here because there is some meta stuff in the setup.py and Travis fails because rdflib is installed later
import rdflib.term
sparql = SPARQLWrapper2(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
# sparql.setReturnFormat(JSON)
results = sparql.query()
if not isinstance(results, Bindings):
raise TypeError(type(results))
# consider perf hacking later, probably slow
# convert list of dicts to python types
d = []
for x in results.bindings:
row = {}
for k in x:
v = x[k]
vv = rdflib.term.Literal(v.value, datatype=v.datatype).toPython() # type: ignore[no-untyped-call]
row[k] = vv
d.append(row)
return d
def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame":
# pandas inside to avoid requiring it
import pandas as pd
d = get_sparql_typed_dict(endpoint, query)
# TODO: will nan fill somehow, make more strict if there is way of getting the nan types from rdflib
df = pd.DataFrame(d)
return df

View File

@ -1,132 +0,0 @@
import sys
import os
import re
import importlib
import warnings
is_pypy = '__pypy__' in sys.builtin_module_names
warnings.filterwarnings('ignore',
r'.+ distutils\b.+ deprecated',
DeprecationWarning)
def warn_distutils_present():
if 'distutils' not in sys.modules:
return
if is_pypy and sys.version_info < (3, 7):
# PyPy for 3.6 unconditionally imports distutils, so bypass the warning
# https://foss.heptapod.net/pypy/pypy/-/blob/be829135bc0d758997b3566062999ee8b23872b4/lib-python/3/site.py#L250
return
warnings.warn(
"Distutils was imported before Setuptools, but importing Setuptools "
"also replaces the `distutils` module in `sys.modules`. This may lead "
"to undesirable behaviors or errors. To avoid these issues, avoid "
"using distutils directly, ensure that setuptools is installed in the "
"traditional way (e.g. not an editable install), and/or make sure "
"that setuptools is always imported before distutils.")
def clear_distutils():
if 'distutils' not in sys.modules:
return
warnings.warn("Setuptools is replacing distutils.")
mods = [name for name in sys.modules if re.match(r'distutils\b', name)]
for name in mods:
del sys.modules[name]
def enabled():
"""
Allow selection of distutils by environment variable.
"""
which = os.environ.get('SETUPTOOLS_USE_DISTUTILS', 'stdlib')
return which == 'local'
def ensure_local_distutils():
clear_distutils()
# With the DistutilsMetaFinder in place,
# perform an import to cause distutils to be
# loaded from setuptools._distutils. Ref #2906.
add_shim()
importlib.import_module('distutils')
remove_shim()
# check that submodules load as expected
core = importlib.import_module('distutils.core')
assert '_distutils' in core.__file__, core.__file__
def do_override():
"""
Ensure that the local copy of distutils is preferred over stdlib.
See https://github.com/pypa/setuptools/issues/417#issuecomment-392298401
for more motivation.
"""
if enabled():
warn_distutils_present()
ensure_local_distutils()
class DistutilsMetaFinder:
def find_spec(self, fullname, path, target=None):
if path is not None:
return
method_name = 'spec_for_{fullname}'.format(**locals())
method = getattr(self, method_name, lambda: None)
return method()
def spec_for_distutils(self):
import importlib.abc
import importlib.util
class DistutilsLoader(importlib.abc.Loader):
def create_module(self, spec):
return importlib.import_module('setuptools._distutils')
def exec_module(self, module):
pass
return importlib.util.spec_from_loader('distutils', DistutilsLoader())
def spec_for_pip(self):
"""
Ensure stdlib distutils when running under pip.
See pypa/pip#8761 for rationale.
"""
if self.pip_imported_during_build():
return
clear_distutils()
self.spec_for_distutils = lambda: None
@staticmethod
def pip_imported_during_build():
"""
Detect if pip is being imported in a build script. Ref #2355.
"""
import traceback
return any(
frame.f_globals['__file__'].endswith('setup.py')
for frame, line in traceback.walk_stack(None)
)
DISTUTILS_FINDER = DistutilsMetaFinder()
def add_shim():
sys.meta_path.insert(0, DISTUTILS_FINDER)
def remove_shim():
try:
sys.meta_path.remove(DISTUTILS_FINDER)
except ValueError:
pass

View File

@ -1 +0,0 @@
__import__('_distutils_hack').do_override()

View File

@ -1,295 +0,0 @@
Metadata-Version: 2.3
Name: annotated-types
Version: 0.7.0
Summary: Reusable constraint types to use with typing.Annotated
Project-URL: Homepage, https://github.com/annotated-types/annotated-types
Project-URL: Source, https://github.com/annotated-types/annotated-types
Project-URL: Changelog, https://github.com/annotated-types/annotated-types/releases
Author-email: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>, Samuel Colvin <s@muelcolvin.com>, Zac Hatfield-Dodds <zac@zhd.dev>
License-File: LICENSE
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Environment :: MacOS X
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Information Technology
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: POSIX :: Linux
Classifier: Operating System :: Unix
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Typing :: Typed
Requires-Python: >=3.8
Requires-Dist: typing-extensions>=4.0.0; python_version < '3.9'
Description-Content-Type: text/markdown
# annotated-types
[![CI](https://github.com/annotated-types/annotated-types/workflows/CI/badge.svg?event=push)](https://github.com/annotated-types/annotated-types/actions?query=event%3Apush+branch%3Amain+workflow%3ACI)
[![pypi](https://img.shields.io/pypi/v/annotated-types.svg)](https://pypi.python.org/pypi/annotated-types)
[![versions](https://img.shields.io/pypi/pyversions/annotated-types.svg)](https://github.com/annotated-types/annotated-types)
[![license](https://img.shields.io/github/license/annotated-types/annotated-types.svg)](https://github.com/annotated-types/annotated-types/blob/main/LICENSE)
[PEP-593](https://peps.python.org/pep-0593/) added `typing.Annotated` as a way of
adding context-specific metadata to existing types, and specifies that
`Annotated[T, x]` _should_ be treated as `T` by any tool or library without special
logic for `x`.
This package provides metadata objects which can be used to represent common
constraints such as upper and lower bounds on scalar values and collection sizes,
a `Predicate` marker for runtime checks, and
descriptions of how we intend these metadata to be interpreted. In some cases,
we also note alternative representations which do not require this package.
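As a quick, non-normative illustration of that idea (not taken from the upstream examples), the metadata travels with the type and can be read back with the standard `typing` helpers:
```python
from typing import Annotated, get_args

from annotated_types import Gt

Age = Annotated[int, Gt(18)]

base, *metadata = get_args(Age)
print(base)      # <class 'int'>  -- tools without special logic simply treat Age as int
print(metadata)  # [Gt(gt=18)]    -- tools that know annotated-types can act on this
```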
## Install
```bash
pip install annotated-types
```
## Examples
```python
from typing import Annotated
from annotated_types import Gt, Len, Predicate
class MyClass:
age: Annotated[int, Gt(18)] # Valid: 19, 20, ...
# Invalid: 17, 18, "19", 19.0, ...
factors: list[Annotated[int, Predicate(is_prime)]] # Valid: 2, 3, 5, 7, 11, ...
# Invalid: 4, 8, -2, 5.0, "prime", ...
my_list: Annotated[list[int], Len(0, 10)] # Valid: [], [10, 20, 30, 40, 50]
# Invalid: (1, 2), ["abc"], [0] * 20
```
## Documentation
_While `annotated-types` avoids runtime checks for performance, users should not
construct invalid combinations such as `MultipleOf("non-numeric")` or `Annotated[int, Len(3)]`.
Downstream implementors may choose to raise an error, emit a warning, silently ignore
a metadata item, etc., if the metadata objects described below are used with an
incompatible type - or for any other reason!_
### Gt, Ge, Lt, Le
Express inclusive and/or exclusive bounds on orderable values - which may be numbers,
dates, times, strings, sets, etc. Note that the boundary value need not be of the
same type that was annotated, so long as they can be compared: `Annotated[int, Gt(1.5)]`
is fine, for example, and implies that the value is an integer x such that `x > 1.5`.
We suggest that implementors may also interpret `functools.partial(operator.le, 1.5)`
as being equivalent to `Gt(1.5)`, for users who wish to avoid a runtime dependency on
the `annotated-types` package.
To be explicit, these types have the following meanings:
* `Gt(x)` - value must be "Greater Than" `x` - equivalent to exclusive minimum
* `Ge(x)` - value must be "Greater than or Equal" to `x` - equivalent to inclusive minimum
* `Lt(x)` - value must be "Less Than" `x` - equivalent to exclusive maximum
* `Le(x)` - value must be "Less than or Equal" to `x` - equivalent to inclusive maximum
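A minimal sketch of what a downstream validator might do with these bounds; the `check_bounds` helper is purely illustrative and not part of `annotated-types`:
```python
from typing import Annotated, get_args

from annotated_types import Ge, Gt, Le, Lt

def check_bounds(value, annotated_type) -> bool:
    """Illustrative only: enforce Gt/Ge/Lt/Le metadata at runtime."""
    _, *metadata = get_args(annotated_type)
    for m in metadata:
        if isinstance(m, Gt) and not value > m.gt:
            return False
        if isinstance(m, Ge) and not value >= m.ge:
            return False
        if isinstance(m, Lt) and not value < m.lt:
            return False
        if isinstance(m, Le) and not value <= m.le:
            return False
    return True

Price = Annotated[float, Gt(0), Le(1000)]
print(check_bounds(10.0, Price))   # True
print(check_bounds(-1.0, Price))   # False (violates Gt(0))
```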
### Interval
`Interval(gt, ge, lt, le)` allows you to specify an upper and lower bound with a single
metadata object. `None` attributes should be ignored, and non-`None` attributes
treated as per the single bounds above.
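For instance, a hypothetical consumer could expand an `Interval` into the equivalent single-bound checks (the `in_interval` helper below is illustrative only):
```python
from typing import Annotated

from annotated_types import Interval

Rating = Annotated[int, Interval(ge=1, le=5)]

def in_interval(value, iv: Interval) -> bool:
    # Treat each non-None attribute as the corresponding single bound; ignore None ones.
    return (
        (iv.gt is None or value > iv.gt)
        and (iv.ge is None or value >= iv.ge)
        and (iv.lt is None or value < iv.lt)
        and (iv.le is None or value <= iv.le)
    )

print(in_interval(5, Interval(ge=1, le=5)))  # True
print(in_interval(0, Interval(ge=1, le=5)))  # False
```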
### MultipleOf
`MultipleOf(multiple_of=x)` might be interpreted in two ways:
1. Python semantics, implying `value % multiple_of == 0`, or
2. [JSONschema semantics](https://json-schema.org/draft/2020-12/json-schema-validation.html#rfc.section.6.2.1),
where `int(value / multiple_of) == value / multiple_of`.
We encourage users to be aware of these two common interpretations and their
distinct behaviours, especially since very large or non-integer numbers make
it easy to cause silent data corruption due to floating-point imprecision.
We encourage libraries to carefully document which interpretation they implement.
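The gap between the two readings is easiest to see with floating-point values; the snippet below only illustrates the ambiguity and is not prescribed by this package:
```python
from decimal import Decimal

from annotated_types import MultipleOf

m = MultipleOf(multiple_of=0.1)
value = 0.3

# 1. Python semantics: modulo arithmetic on floats
print(value % m.multiple_of == 0)                            # False (floating-point error)

# 2. JSON Schema semantics: the division must be integral
print(int(value / m.multiple_of) == value / m.multiple_of)   # also False here

# With an exact numeric type the imprecision (and the silent-corruption risk) disappears:
print(Decimal("0.3") % Decimal("0.1") == 0)                  # True
```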
### MinLen, MaxLen, Len
`Len()` implies that `min_length <= len(value) <= max_length` - lower and upper bounds are inclusive.
As well as `Len()` which can optionally include upper and lower bounds, we also
provide `MinLen(x)` and `MaxLen(y)` which are equivalent to `Len(min_length=x)`
and `Len(max_length=y)` respectively.
`Len`, `MinLen`, and `MaxLen` may be used with any type which supports `len(value)`.
Examples of usage:
* `Annotated[list, MaxLen(10)]` (or `Annotated[list, Len(max_length=10)]`) - list must have a length of 10 or less
* `Annotated[str, MaxLen(10)]` - string must have a length of 10 or less
* `Annotated[list, MinLen(3)]` (or `Annotated[list, Len(min_length=3)]`) - list must have a length of 3 or more
* `Annotated[list, Len(4, 6)]` - list must have a length of 4, 5, or 6
* `Annotated[list, Len(8, 8)]` - list must have a length of exactly 8
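A sketch of how a consumer could enforce these length bounds at runtime; `length_ok` is an illustrative helper, not an `annotated-types` API:
```python
from typing import Annotated, get_args

from annotated_types import Len, MaxLen, MinLen

Tags = Annotated[list, Len(4, 6)]

def length_ok(value, annotated_type) -> bool:
    """Illustrative only: apply Len/MinLen/MaxLen metadata to anything supporting len()."""
    _, *metadata = get_args(annotated_type)
    for m in metadata:
        lo = getattr(m, "min_length", None)
        hi = getattr(m, "max_length", None)
        if lo is not None and len(value) < lo:
            return False
        if hi is not None and len(value) > hi:
            return False
    return True

print(length_ok(["a", "b", "c", "d"], Tags))  # True  (length 4)
print(length_ok([], Tags))                    # False (below min_length=4)
```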
#### Changed in v0.4.0
* `min_inclusive` has been renamed to `min_length`, no change in meaning
* `max_exclusive` has been renamed to `max_length`, upper bound is now **inclusive** instead of **exclusive**
* The recommendation that slices are interpreted as `Len` has been removed due to ambiguity and different semantic
meaning of the upper bound in slices vs. `Len`
See [issue #23](https://github.com/annotated-types/annotated-types/issues/23) for discussion.
### Timezone
`Timezone` can be used with a `datetime` or a `time` to express which timezones
are allowed. `Annotated[datetime, Timezone(None)]` must be a naive datetime.
`Timezone[...]` ([literal ellipsis](https://docs.python.org/3/library/constants.html#Ellipsis))
expresses that any timezone-aware datetime is allowed. You may also pass a specific
timezone string or [`tzinfo`](https://docs.python.org/3/library/datetime.html#tzinfo-objects)
object such as `Timezone(timezone.utc)` or `Timezone("Africa/Abidjan")` to express that you only
allow a specific timezone, though we note that this is often a symptom of fragile design.
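As a simplified, hedged sketch (assuming the metadata keeps its argument on a `tz` attribute;
real consumers will want more robust timezone comparison than matching `str(tzinfo)`):
```python
from datetime import datetime, timezone, tzinfo

from annotated_types import Timezone


def check_timezone(value: datetime, meta: Timezone) -> bool:
    aware = value.tzinfo is not None
    if meta.tz is None:        # Timezone(None): value must be naive
        return not aware
    if meta.tz is ...:         # Timezone(...): any aware value is fine
        return aware
    if isinstance(meta.tz, tzinfo):
        return value.tzinfo == meta.tz       # a specific tzinfo object
    # Otherwise a string key such as "Africa/Abidjan"; a simplistic comparison:
    return aware and str(value.tzinfo) == meta.tz


print(check_timezone(datetime.now(timezone.utc), Timezone(...)))  # True
```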
#### Changed in v0.x.x
* `Timezone` accepts [`tzinfo`](https://docs.python.org/3/library/datetime.html#tzinfo-objects) objects instead of
`timezone`, extending compatibility to [`zoneinfo`](https://docs.python.org/3/library/zoneinfo.html) and third party libraries.
### Unit
`Unit(unit: str)` expresses that the annotated numeric value is the magnitude of
a quantity with the specified unit. For example, `Annotated[float, Unit("m/s")]`
would be a float representing a velocity in meters per second.
Please note that `annotated_types` itself makes no attempt to parse or validate
the unit string in any way. That is left entirely to downstream libraries,
such as [`pint`](https://pint.readthedocs.io) or
[`astropy.units`](https://docs.astropy.org/en/stable/units/).
An example of how a library might use this metadata:
```python
from annotated_types import Unit
from typing import Annotated, TypeVar, Callable, Any, get_origin, get_args

# given a type annotated with a unit:
Meters = Annotated[float, Unit("m")]

# you can cast the annotation to a specific unit type with any
# callable that accepts a string and returns the desired type
T = TypeVar("T")


def cast_unit(tp: Any, unit_cls: Callable[[str], T]) -> T | None:
    if get_origin(tp) is Annotated:
        for arg in get_args(tp):
            if isinstance(arg, Unit):
                return unit_cls(arg.unit)
    return None


# using `pint`
import pint

pint_unit = cast_unit(Meters, pint.Unit)

# using `astropy.units`
import astropy.units as u

astropy_unit = cast_unit(Meters, u.Unit)
```
### Predicate
`Predicate(func: Callable)` expresses that `func(value)` is truthy for valid values.
Users should prefer the statically inspectable metadata above, but if you need
the full power and flexibility of arbitrary runtime predicates... here it is.
For some common constraints, we provide generic types:
* `IsLower = Annotated[T, Predicate(str.islower)]`
* `IsUpper = Annotated[T, Predicate(str.isupper)]`
* `IsDigit = Annotated[T, Predicate(str.isdigit)]`
* `IsFinite = Annotated[T, Predicate(math.isfinite)]`
* `IsNotFinite = Annotated[T, Predicate(Not(math.isfinite))]`
* `IsNan = Annotated[T, Predicate(math.isnan)]`
* `IsNotNan = Annotated[T, Predicate(Not(math.isnan))]`
* `IsInfinite = Annotated[T, Predicate(math.isinf)]`
* `IsNotInfinite = Annotated[T, Predicate(Not(math.isinf))]`
so that you can write e.g. `x: IsFinite[float] = 2.0` instead of the longer
(but exactly equivalent) `x: Annotated[float, Predicate(math.isfinite)] = 2.0`.
Some libraries might have special logic to handle known or understandable predicates,
for example by checking for `str.isdigit` and using its presence to both call custom
logic to enforce digit-only strings, and customise some generated external schema.
Users are therefore encouraged to avoid indirection like `lambda s: s.lower()`, in
favor of introspectable methods such as `str.lower` or `re.compile("pattern").search`.
To enable basic negation of commonly used predicates like `math.isnan` without forcing users to write a `lambda` (which implementers cannot introspect), we provide a `Not` wrapper that simply negates the predicate in an introspectable manner. Several of the predicates listed above are defined this way.
We do not specify what behaviour should be expected for predicates that raise
an exception. For example `Annotated[int, Predicate(str.isdigit)]` might silently
skip invalid constraints, or statically raise an error; or it might try calling it
and then propagate or discard the resulting
`TypeError: descriptor 'isdigit' for 'str' objects doesn't apply to a 'int' object`
exception. We encourage libraries to document the behaviour they choose.
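A minimal sketch of one possible policy, assuming the predicate callable is stored on a
`func` attribute and that `Not` instances are themselves callable:
```python
import math

from annotated_types import Not, Predicate


def check_predicate(value, meta: Predicate) -> bool:
    """Apply a Predicate; treating exceptions as failures is just one possible policy."""
    try:
        return bool(meta.func(value))
    except TypeError:
        # e.g. Predicate(str.isdigit) applied to an int - see the note above
        return False


print(check_predicate("123", Predicate(str.isdigit)))    # True
print(check_predicate(2.0, Predicate(Not(math.isnan))))  # True: 2.0 is not NaN
print(check_predicate(42, Predicate(str.isdigit)))       # False under this policy
```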
### Doc
`doc()` can be used to add documentation information in `Annotated`, for function and method parameters, variables, class attributes, return types, and any place where `Annotated` can be used.
It expects a value that can be statically analyzed, as the main use case is for static analysis, editors, documentation generators, and similar tools.
It returns a `DocInfo` class with a single attribute `documentation` containing the value passed to `doc()`.
This is the early adopter's alternative form of the [`typing-doc` proposal](https://github.com/tiangolo/fastapi/blob/typing-doc/typing_doc.md).
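A small usage sketch:
```python
from typing import Annotated

from annotated_types import doc


def create_user(name: Annotated[str, doc("The new user's display name")]) -> None:
    ...

# doc(...) returns a DocInfo instance; tools can read its `documentation` attribute.
```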
### Integrating downstream types with `GroupedMetadata`
Implementers may choose to provide a convenience wrapper that groups multiple pieces of metadata.
This can help reduce verbosity and cognitive overhead for users.
For example, an implementer like Pydantic might provide a `Field` or `Meta` type that accepts keyword arguments and transforms these into low-level metadata:
```python
from dataclasses import dataclass
from typing import Iterator
from annotated_types import GroupedMetadata, Ge


@dataclass
class Field(GroupedMetadata):
    ge: int | None = None
    description: str | None = None

    def __iter__(self) -> Iterator[object]:
        # Iterating over a GroupedMetadata object should yield annotated-types
        # constraint metadata objects which describe it as fully as possible,
        # and may include other unknown objects too.
        if self.ge is not None:
            yield Ge(self.ge)
        if self.description is not None:
            # `Description` stands in for whatever documentation metadata the
            # downstream library defines; it is not provided by annotated-types.
            yield Description(self.description)
```
Libraries consuming annotated-types constraints should check for `GroupedMetadata` and unpack it by iterating over the object and treating the results as if they had been "unpacked" in the `Annotated` type. The same logic should be applied to the [PEP 646 `Unpack` type](https://peps.python.org/pep-0646/), so that `Annotated[T, Field(...)]`, `Annotated[T, Unpack[Field(...)]]` and `Annotated[T, *Field(...)]` are all treated consistently.
Libraries consuming annotated-types should also ignore any metadata they do not recognize that came from unpacking a `GroupedMetadata`, just as they ignore unrecognized metadata in `Annotated` itself.
Our own `annotated_types.Interval` class is a `GroupedMetadata` which unpacks itself into `Gt`, `Lt`, etc., so this is not an abstract concern. Similarly, `annotated_types.Len` is a `GroupedMetadata` which unpacks itself into `MinLen` (optionally) and `MaxLen`.
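A hedged sketch of that consumer-side unpacking (ignoring the `Unpack`/`*` spellings for brevity):
```python
from typing import Annotated, get_args, get_origin

from annotated_types import GroupedMetadata, Interval


def collect_metadata(tp) -> list[object]:
    """Flatten Annotated metadata, expanding GroupedMetadata into its parts."""
    if get_origin(tp) is not Annotated:
        return []
    collected: list[object] = []
    for meta in get_args(tp)[1:]:
        if isinstance(meta, GroupedMetadata):
            collected.extend(meta)  # unpack, e.g. Interval(ge=0, lt=10) -> Ge(0), Lt(10)
        else:
            collected.append(meta)  # unknown metadata is kept (or could be ignored)
    return collected


print(collect_metadata(Annotated[int, Interval(ge=0, lt=10)]))  # [Ge(ge=0), Lt(lt=10)]
```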
### Consuming metadata
We do not intend to be prescriptive as to _how_ the metadata and constraints are used, but as an example of how one might parse constraints from type annotations, see our [implementation in `test_main.py`](https://github.com/annotated-types/annotated-types/blob/f59cf6d1b5255a0fe359b93896759a180bec30ae/tests/test_main.py#L94-L103).
It is up to the implementer to determine how this metadata is used.
You could use the metadata for runtime type checking, for generating schemas or to generate example data, amongst other use cases.
## Design & History
This package was designed at the PyCon 2022 sprints by the maintainers of Pydantic
and Hypothesis, with the goal of making it as easy as possible for end-users to
provide more informative annotations for use by runtime libraries.
It is deliberately minimal and, following PEP-593, leaves considerable discretion to
downstream libraries in what (if anything!) they choose to support. Nonetheless, we expect
that staying simple and covering _only_ the most common use-cases will give users
and maintainers the best experience we can. If you'd like more constraints for your
types - follow our lead, by defining them and documenting them downstream!

View File

@ -1,10 +0,0 @@
annotated_types-0.7.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
annotated_types-0.7.0.dist-info/METADATA,sha256=7ltqxksJJ0wCYFGBNIQCWTlWQGeAH0hRFdnK3CB895E,15046
annotated_types-0.7.0.dist-info/RECORD,,
annotated_types-0.7.0.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
annotated_types-0.7.0.dist-info/licenses/LICENSE,sha256=_hBJiEsaDZNCkB6I4H8ykl0ksxIdmXK2poBfuYJLCV0,1083
annotated_types/__init__.py,sha256=RynLsRKUEGI0KimXydlD1fZEfEzWwDo0Uon3zOKhG1Q,13819
annotated_types/__pycache__/__init__.cpython-310.pyc,,
annotated_types/__pycache__/test_cases.cpython-310.pyc,,
annotated_types/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
annotated_types/test_cases.py,sha256=zHFX6EpcMbGJ8FzBYDbO56bPwx_DYIVSKbZM-4B3_lg,6421

Some files were not shown because too many files have changed in this diff.