initial commit

Commit 41e8b7103e by gumuArnold, 2025-10-10 09:46:41 +02:00
15310 changed files with 5698406 additions and 0 deletions

File diff suppressed because it is too large.

BIN  Box Ha-Ho.ods (new file; binary content not shown)
BIN  Box Ha-Klinc.ods (new file; binary content not shown)
BIN  Box Hu-J.ods (new file; binary content not shown)
BIN  Box K - Klinc.ods (new file; binary content not shown)
BIN  Input CSV/Box Ha-Klinc.ods (new file; binary content not shown)

(file name not shown; 1 line added)

@@ -0,0 +1 @@
,jarnold,workPC,10.10.2025 09:26,file:///home/jarnold/.config/libreoffice/4;


Masterfile_Editor.py (new file, 212 lines)

@@ -0,0 +1,212 @@
import os
import re
import logging
import pandas as pd
import ezodf
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
# -------------------------------------------------
# KONFIGURATION
# -------------------------------------------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"
MASTER_SHEET_NAME = "Masterstruktur"
SHEET_ORDER = [
"Masterstruktur",
"1 Figur",
"2 Objekt",
"3 Flora",
"4 Fauna",
"5 Landschaft",
"6 Phänomene, Erscheinungen",
"7 Architektur",
"8 Verzierungen, Ornamentik",
"9 Aktivität, Handlung, Pose"
]
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# -------------------------------------------------
# HELFERFUNKTIONEN
# -------------------------------------------------
def detect_id_and_name(df):
df_cols = [str(c).strip().lower() for c in df.columns]
id_col, name_col = None, None
for idx, col in enumerate(df_cols):
if col == "id":
id_col = df.columns[idx]
elif col in ["name", "wort", "wort/vokabel"]:
name_col = df.columns[idx]
if id_col is None or name_col is None:
logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
return id_col, name_col
def parse_id_level(id_val):
if pd.isna(id_val):
return None
id_str = str(id_val).strip()
if re.match(r'^\d+(\.\d+){0,2}$', id_str):
return len(id_str.split("."))
return None
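# Illustration (hypothetical IDs, not taken from NV_MASTER.ods): parse_id_level maps
# dotted hierarchy IDs to their depth and everything else to None, e.g.
#   parse_id_level("2")     -> 1   (category)
#   parse_id_level("2.1")   -> 2   (Unterkategorie)
#   parse_id_level("2.1.3") -> 3   (Unterunterkategorie)
#   parse_id_level("Vase")  -> None (plain vocabulary row)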
def process_category_df(df, sheet_name):
id_col, name_col = detect_id_and_name(df)
if id_col is None or name_col is None:
return None
current_level = {2: None, 3: None}
new_rows = []
for _, row in df.iterrows():
id_val = row[id_col] if pd.notna(row[id_col]) else ""
name_val = row[name_col] if pd.notna(row[name_col]) else ""
if not id_val and not name_val:
continue
level = parse_id_level(id_val)
if level:
if level >= 2:
current_level[level] = name_val
for deeper in range(level+1, 4):
current_level[deeper] = None
new_rows.append({
"ID": id_val,
"Unterkategorie": current_level[2] if level >= 2 else "",
"Unterunterkategorie": current_level[3] if level >= 3 else "",
"Wort/Vokabel": name_val
})
else:
new_rows.append({
"ID": "",
"Unterkategorie": "",
"Unterunterkategorie": "",
"Wort/Vokabel": name_val
})
df_new = pd.DataFrame(new_rows, columns=["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"])
logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
return df_new
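# Minimal sketch of the output of process_category_df(). Illustrative only: the frame
# below is hypothetical sample data, not part of NV_MASTER.ods, and this helper is never
# called by the script itself.
def _demo_process_category_df():
    df = pd.DataFrame({
        "ID": ["2.1", "", ""],
        "Wort/Vokabel": ["Gefäß", "Vase", "Krug"],
    })
    # The first row keeps ID "2.1" and fills Unterkategorie with "Gefäß";
    # "Vase" and "Krug" follow as plain vocabulary rows without an ID.
    return process_category_df(df, "2 Objekt")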
def merge_new_terms(original_df, processed_df):
"""Fügt neue Wörter aus original_df (ohne ID) in processed_df ein, wenn sie noch nicht vorhanden sind."""
_, orig_name_col = detect_id_and_name(original_df)
if orig_name_col is None or orig_name_col not in original_df.columns:
return processed_df
existing_words = set(str(x).strip().lower() for x in processed_df["Wort/Vokabel"].dropna())
new_rows = []
for _, row in original_df.iterrows():
name = str(row.get(orig_name_col, "")).strip()
id_val = str(row.get("ID", "")).strip() if "ID" in row else ""
if not name:
continue
if not id_val and name.lower() not in existing_words:
new_rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": name})
if new_rows:
df_new = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
logging.info(f"{len(new_rows)} neue Wörter übernommen.")
return df_new
return processed_df
def build_master_df(category_dfs):
seen_ids = set()
master_rows = []
for df in category_dfs:
for _, row in df.iterrows():
id_val = row["ID"]
name_val = row["Wort/Vokabel"]
if id_val and id_val not in seen_ids:
seen_ids.add(id_val)
master_rows.append({"ID": id_val, "Name": name_val})
master_df = pd.DataFrame(master_rows)
logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
return master_df
# -------------------------------------------------
# FORMATIERUNG UND SPEICHERN
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
df.to_excel(writer, sheet_name=sheet_name, index=False)
worksheet = writer.sheets[sheet_name]
for col_idx, col in enumerate(df.columns, 1):
        max_len = max([len(str(cell)) if cell is not None else 0 for cell in df[col]], default=0)  # default=0 tolerates empty columns
        max_len = max(max_len, len(col)) + 2
worksheet.column_dimensions[get_column_letter(col_idx)].width = max_len
for row_idx in range(1, len(df) + 2):
worksheet.cell(row=row_idx, column=col_idx).alignment = Alignment(horizontal='left')
def save_ods(processed_sheets, output_file):
doc = ezodf.newdoc(doctype="ods")
for name, df in processed_sheets.items():
df = df.fillna("")
sheet = ezodf.Sheet(name, size=(len(df) + 1, len(df.columns)))
doc.sheets += sheet
for col_idx, col_name in enumerate(df.columns):
sheet[0, col_idx].set_value(str(col_name))
for row_idx, row in enumerate(df.itertuples(index=False), start=1):
for col_idx, value in enumerate(row):
if value is None or str(value).lower() == "nan":
value = ""
sheet[row_idx, col_idx].set_value(str(value))
doc.saveas(output_file)
logging.info(f"ODS-Datei gespeichert: {output_file}")
# -------------------------------------------------
# HAUPTPROGRAMM
# -------------------------------------------------
def main():
if not os.path.exists(INPUT_FILE):
logging.error(f"Datei {INPUT_FILE} existiert nicht.")
return
ext = os.path.splitext(INPUT_FILE)[1].lower()
engine = None
if ext in [".xlsx", ".xls"]:
engine = "openpyxl"
elif ext == ".ods":
engine = "odf"
else:
logging.error("Nicht unterstütztes Dateiformat")
return
logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
xls = pd.ExcelFile(INPUT_FILE, engine=engine)
processed_sheets = {}
category_dfs = []
for sheet_name in xls.sheet_names:
if sheet_name == MASTER_SHEET_NAME:
continue
df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
df_new = process_category_df(df, sheet_name)
if df_new is not None:
df_merged = merge_new_terms(df, df_new)
processed_sheets[sheet_name] = df_merged
category_dfs.append(df_merged)
else:
processed_sheets[sheet_name] = df
master_df = build_master_df(category_dfs)
processed_sheets[MASTER_SHEET_NAME] = master_df
ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}
ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
if ext_out in [".xlsx", ".xls"]:
with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
for name, df in ordered_sheets.items():
format_excel_sheet(df, name, writer)
logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
elif ext_out == ".ods":
save_ods(ordered_sheets, OUTPUT_FILE)
if __name__ == "__main__":
main()

BIN  NV_MASTER.ods (new file; binary content not shown)
BIN  NV_MASTER_Updated.ods (new file; binary content not shown)

NV_Master_EditorFAIL.py (new file, 171 lines)

@@ -0,0 +1,171 @@
import os
import re
import logging
import datetime
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf
# ----------------- KONFIGURATION -----------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
MASTER_SHEET_NAME = "Masterstruktur"
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# ----------------- HILFSFUNKTIONEN -----------------
def load_file(input_file):
"""
Prüft Dateiformat und gibt für Excel: pd.ExcelFile + Engine zurück,
für ODS: None + "odf" (da ODS direkt über ezodf gelesen wird).
"""
ext = os.path.splitext(input_file)[1].lower()
if ext in [".xlsx", ".xls"]:
engine = "openpyxl"
xls = pd.ExcelFile(input_file, engine=engine)
elif ext == ".ods":
engine = "odf"
xls = None # ODS wird direkt über ezodf gelesen
else:
raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")
logging.info(f"Lade Datei {input_file} mit Engine '{engine}'")
return xls, engine
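# Usage note (assuming an .ods master file): load_file("NV_MASTER.ods") returns
# (None, "odf") and the sheets are then read with read_ods_sheet() below, whereas
# .xlsx/.xls inputs come back as (pd.ExcelFile, "openpyxl").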
def read_ods_sheet(filename, sheet_name):
"""Liests ODS Sheet sauber ein, inklusive Header."""
doc = ezodf.opendoc(filename)
sheet = doc.sheets[sheet_name]
data = []
headers = [str(sheet[0, col].value).strip() for col in range(sheet.ncols())]
for row_idx in range(1, sheet.nrows()):
row = {}
empty_row = True
for col_idx, col_name in enumerate(headers):
cell_val = sheet[row_idx, col_idx].value
val = "" if cell_val is None else str(cell_val).strip()
row[col_name] = val
if val:
empty_row = False
if not empty_row:
data.append(row)
df = pd.DataFrame(data, columns=headers)
return df
def process_category_sheet(df):
"""Erstellt die treppenartige Hierarchie."""
df = df.copy()
for col in ["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"]:
if col not in df.columns:
df[col] = ""
rows = []
current_id = ""
current_uuk = ""
for _, r in df.iterrows():
id_val = str(r.get("ID","")).strip()
uuk_val = str(r.get("Unterunterkategorie","")).strip()
word_val = str(r.get("Wort/Vokabel","")).strip()
if id_val: # Kategoriezeile
current_id = id_val
current_uuk = uuk_val or word_val
rows.append({"ID": current_id, "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
continue
if uuk_val: # Unterunterkategorie
current_uuk = uuk_val
rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
continue
if word_val: # Vokabel
rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": word_val})
continue
return pd.DataFrame(rows, columns=["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"])
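# Sketch of the staircase layout this produces (hypothetical input rows):
#   ID "3.2" with word "Rose"              -> one category row (ID + Unterunterkategorie filled)
#   empty ID, Unterunterkategorie "Blüte"  -> one sub-subcategory row
#   empty ID, word "Knospe"                -> one vocabulary row (only Wort/Vokabel filled)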
def remove_empty_vocabulary_rows(df):
"""Entfernt Zeilen, die nur leere Wort/Vokabel-Spalte haben."""
return df[df["Wort/Vokabel"].astype(str).str.strip() != ""].copy().reset_index(drop=True)
def sync_master_and_sheets(master_df, category_dfs):
"""Synchronisiert Kategorien nach Master, Vokabeln bleiben erhalten."""
master_df = master_df.copy()
master_df["ID"] = master_df["ID"].astype(str).str.strip()
master_dict = dict(zip(master_df["ID"], master_df["Kategorie"]))
updated_dfs = {}
summary = {}
for sheet_name, df in category_dfs.items():
rows_out = []
changes = {"removed":0}
for _, row in df.iterrows():
id_val = str(row.get("ID","")).strip()
if id_val and id_val not in master_dict:
changes["removed"] +=1
continue
rows_out.append(row.to_dict())
updated_dfs[sheet_name] = pd.DataFrame(rows_out, columns=df.columns)
summary[sheet_name] = changes
new_master = pd.DataFrame([{"ID":k,"Kategorie":v} for k,v in sorted(master_dict.items())])
return new_master, updated_dfs, summary
def save_excel(processed_sheets, output_file):
from openpyxl import Workbook
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
for sheet_name, df in processed_sheets.items():
df.to_excel(writer, sheet_name=sheet_name, index=False)
ws = writer.sheets[sheet_name]
for col_idx, col in enumerate(df.columns,1):
max_len = max(df[col].astype(str).map(len).max() if len(df)>0 else 0,len(col))+2
ws.column_dimensions[get_column_letter(col_idx)].width = max_len
for row_idx in range(1,len(df)+2):
ws.cell(row=row_idx,column=col_idx).alignment = Alignment(horizontal='left')
def save_ods(processed_sheets, output_file):
doc = ezodf.newdoc(doctype="ods", filename=output_file)
for name, df in processed_sheets.items():
sheet = ezodf.Sheet(name, size=(len(df)+1,len(df.columns)))
doc.sheets += sheet
for col_idx, col_name in enumerate(df.columns):
sheet[0,col_idx].set_value(col_name)
for row_idx,row in enumerate(df.itertuples(index=False),start=1):
for col_idx,value in enumerate(row):
sheet[row_idx,col_idx].set_value("" if pd.isna(value) else value)
doc.save()
# ----------------- HAUPTPROGRAMM -----------------
def main():
xls, engine = load_file(INPUT_FILE)
if engine == "odf":
doc = ezodf.opendoc(INPUT_FILE)
sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
category_dfs = {name: process_category_sheet(read_ods_sheet(INPUT_FILE,name)) for name in sheet_names}
master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
else:
sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
category_dfs = {}
for sheet_name in sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
df.columns = [str(c).strip() for c in df.columns]
category_dfs[sheet_name] = process_category_sheet(df)
master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
master_df.columns = [str(c).strip() for c in master_df.columns]
new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)
processed_sheets = {MASTER_SHEET_NAME:new_master}
processed_sheets.update({k:remove_empty_vocabulary_rows(v) for k,v in updated_dfs.items()})
ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
if ext_out in [".xlsx",".xls"]:
save_excel(processed_sheets, OUTPUT_FILE)
else:
save_ods(processed_sheets, OUTPUT_FILE)
logging.info(f"Datei gespeichert: {OUTPUT_FILE}")
logging.info("===== SYNC SUMMARY =====")
for sheet, info in summary.items():
logging.info(f"{sheet}: {info}")
if __name__ == "__main__":
main()

NV_Master_to_SPOT.py (new file, 192 lines)

@@ -0,0 +1,192 @@
import os
import json
import datetime
import pandas as pd
import ezodf
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# ---------------- SPOT-Baumstruktur ----------------
class Node:
def __init__(self, name, node_type="category", id=None):
self.name = name
self.id = id
self.type = node_type # "category", "subcategory", "word"
self.children = []
def add_child(self, child):
self.children.append(child)
def to_dict(self):
if self.type == "word":
return self.name
return {
"id": self.id,
"name": self.name,
"type": self.type,
"children": [c.to_dict() for c in self.children]
}
@staticmethod
def from_dict(d):
if isinstance(d, str):
return Node(d, "word")
node = Node(d["name"], d.get("type", "category"), d.get("id"))
node.children = [Node.from_dict(c) for c in d.get("children", [])]
return node
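# Illustrative only (hypothetical names and IDs; never called by the script): shows how
# a SPOT subtree round-trips through to_dict()/from_dict(). Word nodes are stored as
# plain strings to keep the JSON compact.
def _demo_node_roundtrip():
    cat = Node("Flora", "category", id="3")
    cat.add_child(Node("Rose", "word"))
    d = cat.to_dict()
    # d == {"id": "3", "name": "Flora", "type": "category", "children": ["Rose"]}
    return Node.from_dict(d)  # restores an equivalent Node tree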
# ---------------- Funktionen zum Laden ----------------
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
ext = os.path.splitext(input_file)[1].lower()
engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
xls = pd.ExcelFile(input_file, engine=engine)
sheet_names = [s for s in xls.sheet_names if s != master_sheet]
dfs = {s: pd.read_excel(xls, sheet_name=s, engine=engine) for s in sheet_names}
master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
return master_df, dfs
# ---------------- Baum aus Sheet erstellen ----------------
def process_sheet_to_tree(df):
df = df.fillna("").astype(str)
tree_nodes = []
current_cat = None
current_sub = None
for idx, row in df.iterrows():
id_val = row.get("ID", "").strip()
uk_val = row.get("Unterkategorie", "").strip()
uuk_val = row.get("Unterunterkategorie", "").strip()
word_val = row.get("Wort/Vokabel", "").strip()
if id_val:
current_cat = Node(uk_val or word_val, "category", id=id_val)
tree_nodes.append(current_cat)
current_sub = None
elif uuk_val:
current_sub = Node(uuk_val, "subcategory")
if current_cat:
current_cat.add_child(current_sub)
elif word_val:
word_node = Node(word_val, "word")
if current_sub:
current_sub.add_child(word_node)
elif current_cat:
current_cat.add_child(word_node)
return tree_nodes
# ---------------- SPOT laden/speichern ----------------
def save_spot_json(tree_nodes, file_path):
with open(file_path, "w", encoding="utf-8") as f:
json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
logging.info(f"SPOT gespeichert: {file_path}")
def load_spot_json(file_path):
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
return [Node.from_dict(n) for n in data]
# ---------------- Export in Excel ----------------
def export_spot_to_excel(tree_nodes, output_file):
wb = Workbook()
wb.remove(wb.active)
for node in tree_nodes:
ws = wb.create_sheet(title=node.name[:31])
row_idx = 1
# Kategorie
ws.cell(row=row_idx, column=1, value=node.id)
ws.cell(row=row_idx, column=2, value=node.name)
row_idx += 1
for sub in node.children:
if sub.type == "subcategory":
ws.cell(row=row_idx, column=3, value=sub.name)
row_idx += 1
for word in sub.children:
ws.cell(row=row_idx, column=4, value=word.name)
row_idx += 1
elif sub.type == "word":
ws.cell(row=row_idx, column=4, value=sub.name)
row_idx += 1
# Spaltenbreiten anpassen
for col_idx, col_letter in enumerate(["A","B","C","D"],1):
ws.column_dimensions[col_letter].width = 20
for r in range(1,row_idx):
ws.cell(r,col_idx).alignment = Alignment(horizontal='left')
wb.save(output_file)
logging.info(f"Excel exportiert: {output_file}")
# ---------------- Export in ODS ----------------
def export_spot_to_ods(tree_nodes, output_file):
doc = ezodf.newdoc(doctype="ods", filename=output_file)
for node in tree_nodes:
        # Pre-compute the required rows: words under subcategories count too, otherwise
        # row_idx can run past the allocated sheet size.
        n_rows = 2 + sum(1 + (len(sub.children) if sub.type == "subcategory" else 0) for sub in node.children)
        sheet = ezodf.Sheet(node.name[:31], size=(n_rows + 1, 4))
doc.sheets += sheet
sheet[0,0].set_value("ID")
sheet[0,1].set_value("Unterkategorie")
sheet[0,2].set_value("Unterunterkategorie")
sheet[0,3].set_value("Wort/Vokabel")
row_idx = 1
sheet[row_idx,0].set_value(node.id)
sheet[row_idx,1].set_value(node.name)
row_idx +=1
for sub in node.children:
if sub.type == "subcategory":
sheet[row_idx,2].set_value(sub.name)
row_idx +=1
for word in sub.children:
sheet[row_idx,3].set_value(word.name)
row_idx +=1
elif sub.type == "word":
sheet[row_idx,3].set_value(sub.name)
row_idx +=1
doc.save()
logging.info(f"ODS exportiert: {output_file}")
# ---------------- CLI-Funktionen zum Editieren ----------------
def add_category(tree_nodes, cat_id, cat_name):
tree_nodes.append(Node(cat_name, "category", id=cat_id))
logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}")
def add_subcategory(tree_nodes, cat_id, sub_name):
for cat in tree_nodes:
if cat.id == cat_id:
cat.add_child(Node(sub_name, "subcategory"))
logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}")
return
def add_word(tree_nodes, cat_id, sub_name, word_name):
for cat in tree_nodes:
if cat.id == cat_id:
for sub in cat.children:
if sub.name == sub_name:
sub.add_child(Node(word_name, "word"))
logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}")
return
# ---------------- HAUPTPROGRAMM ----------------
def main():
INPUT_FILE = "NV_MASTER.ods" # Beispielpfad
OUTPUT_SPOT = "nv_spot.json"
today = datetime.datetime.today().strftime("%y.%m.%d")
OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx"
OUTPUT_ODS = f"NV_MASTER_SPOT_{today}.ods"
master_df, dfs = load_excel_or_ods(INPUT_FILE)
spot_tree = []
for sheet, df in dfs.items():
spot_tree.extend(process_sheet_to_tree(df))
save_spot_json(spot_tree, OUTPUT_SPOT)
# Beispiel: Editieren
# add_category(spot_tree, "10.1", "Neue Kategorie")
# add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie")
# add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort")
export_spot_to_excel(spot_tree, OUTPUT_EXCEL)
export_spot_to_ods(spot_tree, OUTPUT_ODS)
logging.info("SPOT-Workflow abgeschlossen.")
if __name__ == "__main__":
main()

NormVokabular_Mapper_1.1.py (new file, 449 lines)

@@ -0,0 +1,449 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
**Nutzung:**
```bash
python normvokabular_mapper.py
python normvokabular_mapper.py --dry-run # nur Simulation der API-Abfragen
"""
import os
import sys
import time
import json
import re
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
from collections import defaultdict
# =========================
# Argumente / Dry-Run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
# =========================
# Konfiguration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd":0, "wikidata":0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
CONF_THRESHOLD = 0.75 # für Vorschläge
# =========================
# Logging
# =========================
def log(level, msg):
ts = time.strftime("%Y-%m-%d %H:%M:%S")
print(f"[{ts}] [{level}] {msg}")
# =========================
# Cache laden / speichern
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
log("INFO", f"Cache geladen: {CACHE_FILE}")
except:
CACHE = {}
else:
CACHE = {}
def save_cache():
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
log("DEBUG","Cache gespeichert")
except Exception as e:
log("ERROR", f"Cache speichern fehlgeschlagen: {e}")
# =========================
# Normalisierung / Stemming
# =========================
try:
from nltk.stem.snowball import GermanStemmer
STEMMER = GermanStemmer()
log("INFO","NLTK GermanStemmer verfügbar")
except:
STEMMER = None
log("WARNING","NLTK nicht verfügbar, naive Pluralreduktion wird genutzt")
def normalize_text(s):
if s is None:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
def naive_stem(w):
for ending in ("ern","nen","en","er","e","n","s"):
if w.endswith(ending) and len(w)-len(ending)>=3:
return w[:-len(ending)]
return w
def stem_word(word):
w = normalize_text(word)
try:
return STEMMER.stem(w) if STEMMER else naive_stem(w)
except:
return naive_stem(w)
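# Behaviour sketch for the naive fallback stemmer (illustrative words, not project data):
#   naive_stem("blumen") -> "blum"  (strips "en" while at least 3 characters remain)
#   naive_stem("rose")   -> "ros"   (strips the trailing "e")
#   naive_stem("ast")    -> "ast"   (too short, nothing is stripped)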
# =========================
# Normvokabular laden (NV_MASTER) mit Parent-ID & Stem-Index
# =========================
def load_normvokabular(file_path):
log("INFO", f"Normvokabular laden: {file_path}")
engine = "odf" if file_path.suffix.lower() == ".ods" else None
sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)
norm_dict = {}
stem_index = defaultdict(list)
count = 0
for sheet_name, df in sheets.items():
df.columns = [str(c).strip() for c in df.columns]
current_parent_id = None
for _, row in df.iterrows():
# Spaltennamen flexibel anpassen
id_val = str(row.get("ID","")).strip() if "ID" in df.columns else ""
wort = str(row.get("Wort/Vokabel","")).strip() if "Wort/Vokabel" in df.columns else ""
# Zeilen mit ID, aber ohne Vokabel → Update Parent-ID
if id_val:
current_parent_id = id_val
# Skip leere Vokabeln
if not wort:
continue
assigned_id = current_parent_id # Parent-ID übernehmen
key = normalize_text(wort)
entry = {
"Name": wort,
"ID": assigned_id,
"Sheet": sheet_name
}
norm_dict[key] = entry
stem_index[stem_word(key)].append(entry)
count += 1
log("INFO", f"{count} Begriffe aus Normvokabular geladen")
return norm_dict, stem_index
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index):
tnorm = normalize_text(term)
tstem = stem_word(tnorm)
# Exakter Treffer
if tnorm in norm_dict:
e = norm_dict[tnorm]
return e["Name"], e["ID"], []
# Gestemmter Treffer
if tstem in stem_index:
e = stem_index[tstem][0]
return e["Name"], e["ID"], []
# Kein Treffer → Vorschläge
suggestions = get_suggestions(tnorm, norm_dict)
return "KEIN TREFFER", "", suggestions
def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
t = term.lower()
scores = []
for key, val in norm_dict.items():
score = SequenceMatcher(None, t, key).ratio()
if score >= threshold:
scores.append((score, val["Name"], val["ID"]))
scores.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]
# =========================
# API-Abgleich (Top1) unverändert
# =========================
def request_with_retries(api_name,url,params=None):
if DRY_RUN:
return None
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries<MAX_RETRIES:
try:
r = requests.get(url,params=params,timeout=TIMEOUT,headers=HEADERS)
if r.status_code==200:
try: data=r.json()
except: data=r.text
CACHE[cache_key]=data
FAIL_COUNTER[api_name]=0
return data
except:
pass
retries+=1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name]+=1
if FAIL_COUNTER[api_name]>=10:
API_ACTIVE[api_name]=False
return None
def compute_min_conf(term,api_name):
l=len(term.strip())
if l<=3: return 0.90
if l<=6: return 0.85 if api_name=='gnd' else 0.80
return 0.75 if api_name=='gnd' else 0.70
def batch_query_gnd(terms):
results={}
if DRY_RUN or not API_ACTIVE.get("gnd",False):
for t in terms: results[t]="TEST_GND"
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data=request_with_retries("gnd",url,params)
top=""
if data and "member" in data:
min_conf=compute_min_conf(t,'gnd')
cands=[]
for doc in data["member"]:
name=doc.get("preferredName","") or doc.get("name","")
if not name: continue
conf=SequenceMatcher(None,t.lower(),name.lower()).ratio()
if conf>=min_conf: cands.append((name,conf))
if cands:
top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t]=top
return results
def batch_query_wikidata(terms):
results={}
if DRY_RUN or not API_ACTIVE.get("wikidata",False):
for t in terms: results[t]="TEST_WD"
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data=request_with_retries("wikidata",url,params)
top=""
if data and "search" in data:
min_conf=compute_min_conf(t,'wikidata')
cands=[]
for e in data["search"]:
label=e.get("label","")
if not label: continue
conf=SequenceMatcher(None,t.lower(),label.lower()).ratio()
if conf>=min_conf: cands.append((label,conf))
if cands:
top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t]=top
return results
# =========================
# Formatabhängige Markierung / Status
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
log("WARNING","Spalte 'Norm_Treffer' nicht gefunden, keine Markierung möglich")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value!="KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
log("INFO","Excel: Treffer farblich markiert (grün=Treffer, rot=kein Treffer)")
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
log("INFO","ODS: Spalte 'Norm_Status' eingefügt (Treffer / Kein Treffer)")
else:
log("WARNING","Unbekanntes Dateiformat, keine Markierung durchgeführt")
# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
total_terms=0
total_norm_hits=0
if not INPUT_DIR.exists():
log("CRITICAL",f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files=list(INPUT_DIR.glob("*"))
if not files:
log("WARNING","Keine Dateien gefunden")
for file_path in files:
if not file_path.suffix.lower() in [".ods",".xlsx",".csv",".xls"]:
continue
log("INFO",f"Verarbeite Datei: {file_path.name}")
# Output-Datei für diese Input-Datei erzeugen
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
try:
if file_path.suffix.lower()==".csv":
df=pd.read_csv(file_path)
elif file_path.suffix.lower()==".ods":
df=pd.read_excel(file_path, engine="odf")
else:
df=pd.read_excel(file_path)
except Exception as e:
log("ERROR",f"Datei {file_path.name} konnte nicht gelesen werden: {e}")
continue
df.columns=[str(c).strip() for c in df.columns]
row_terms_map=[]
for _,row in df.iterrows():
besch=row.get("Objektbeschreibung","")
if pd.isna(besch) or not str(besch).strip(): continue
besch=str(besch).strip()
clauses=[c.strip() for c in re.split(r",",besch) if c.strip()]
terms=[]
for clause in clauses:
parts=[p.strip() for p in re.split(r"\s+",clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+",p): continue
terms.append(p)
obj_box=row.get("Objekt/Ebene","")
urheber=row.get("Urheber","")
row_terms_map.append((obj_box,urheber,terms))
all_terms=[]
for _,_,terms in row_terms_map:
all_terms.extend(terms)
all_terms = list(set(all_terms)) # unique
gnd_results=batch_query_gnd(all_terms)
wd_results=batch_query_wikidata(all_terms)
output_rows=[]
for obj_box,urheber,terms in row_terms_map:
for term in terms:
norm_name,norm_id,suggestions = map_to_norm(term,norm_dict, stem_index)
total_terms+=1
if norm_name!="KEIN TREFFER":
total_norm_hits+=1
out_row={
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df=pd.DataFrame(output_rows)
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file,index=False,engine=engine)
log("INFO",f"Auswertung gespeichert: {output_file}")
mark_norm_hits(output_file)
save_cache()
log("INFO",f"Gesamt: {total_terms} Begriffe, {total_norm_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__=="__main__":
process_files()
log("INFO","Fertig")

NormVokabular_Mapper_1.2.py (new file, 471 lines)

@@ -0,0 +1,471 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
"""
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except:
SPACY_AVAILABLE = False
nlp = None
print("Spacy nicht verfügbar nutze naive Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
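# Note: lemmas are memoised in lemma_cache, keyed by the normalized term, so a spelling
# that repeats across many "Objektbeschreibung" cells only passes through spaCy once.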
# =========================
# Kompositum-Zerlegung (erweitert)
# =========================
def compound_split(term, norm_dict):
"""
Zerlegt Komposita durch Prüfen auf Substrings, die im Normvokabular vorkommen.
"""
term_norm = normalize_text(term)
matches = []
for i in range(len(term_norm)):
for j in range(i+3, len(term_norm)+1):
sub = term_norm[i:j]
if sub in norm_dict and sub not in matches:
matches.append(sub)
if not matches:
matches = [term_norm]
return matches
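# Example of the substring-based split (hypothetical vocabulary whose normalized keys
# include "blume" and "vase"):
#   compound_split("Blumenvase", norm_dict) -> ["blume", "vase"]
#   compound_split("Teller", norm_dict)     -> ["teller"]   # no known substring, term kept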
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id,
"Sheet": sheet_name,
"Own_ID": row_id or ""
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Vorschläge & Fuzzy Matching
# =========================
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score_token = fuzz.token_set_ratio(term_lemma, key_lemma)/100
score_partial = fuzz.partial_ratio(term_lemma, key_lemma)/100
score = max(score_token, score_partial)
else:
score_seq = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
score = score_seq
# Substring-Boost
if term_lemma in key_lemma or key_lemma in term_lemma:
score = max(score, 0.9)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Mapping auf Normvokabular
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split & Teilbegriffe prüfen
tokens = compound_split(term, norm_dict)
token_matches = []
all_suggestions = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
all_suggestions.extend(sugg)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_matches = [m[1] for m in token_matches if m[1] != "KEIN TREFFER"]
if combined_matches:
return "KEIN TREFFER", "", combined_matches
elif all_suggestions:
return "KEIN TREFFER", "", all_suggestions
else:
return "KEIN TREFFER", "", []
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try: data = r.json()
except: data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
return data
except:
pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Export mit zweitem Sheet für Begriffe ohne Treffer und Vorschlag
# =========================
def export_results_with_no_hits(out_df, output_file):
"""
Exportiert das Mapping-Ergebnis und zusätzlich ein zweites Sheet
mit allen Begriffen, deren Norm_Treffer == 'KEIN TREFFER' und Norm_Vorschlag leer ist.
"""
# Begriffe ohne Treffer und ohne Vorschlag
no_match_df = out_df[(out_df["Norm_Treffer"]=="KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip()==""))].copy()
ext = output_file.suffix.lower()
if ext in [".xlsx", ".xls"]:
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
out_df.to_excel(writer, index=False, sheet_name="Mapping")
no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
elif ext == ".ods":
# ODS-Export via odf-Engine
with pd.ExcelWriter(output_file, engine="odf") as writer:
out_df.to_excel(writer, index=False, sheet_name="Mapping")
no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
export_results_with_no_hits(out_df, output_file)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

NormVokabular_Mapper_1.3.py (new file, 509 lines)

@@ -0,0 +1,509 @@
"""
========================================================================
NormVokabular Mapper Übersicht
========================================================================
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
einen Abgleich mit externen APIs (GND, Wikidata).
Hauptfunktionen:
1. **Input verarbeiten**
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
filtert Stopwords und Zahlen.
2. **Normvokabular laden**
- Liest die Masterdatei NV_MASTER.ods ein.
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
- Erstellt ein Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
3. **Mapping auf Normvokabular**
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
4. **API-Abgleich (optional)**
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
5. **Ergebnis speichern**
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
6. **Logging**
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
"""
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except:
SPACY_AVAILABLE = False
nlp = None
print("Spacy nicht verfügbar nutze naive Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
# Lemma-Cache
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# =========================
# Kompositum-Zerlegung (einfacher Ansatz)
# =========================
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
return parts if parts else [term]
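# This simpler split only looks at capitalised segments (illustrative):
#   compound_split("BlumenVase") -> ["Blumen", "Vase"]
#   compound_split("Blumenvase") -> ["Blumenvase"]  # single capital, no split
#   compound_split("vase")       -> ["vase"]        # no capital, raw term kept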
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {} # für RapidFuzz preprocessed
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id, # Parent-ID
"Sheet": sheet_name,
"Own_ID": row_id or "" # eigene ID, falls vorhanden
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split
tokens = compound_split(term)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try: data = r.json()
except: data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
return data
except:
pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Neue Funktion: fehlende Begriffe in separate Datei exportieren
# =========================
def export_missing_terms(out_df, output_file):
# Filter: KEIN TREFFER & keine Vorschläge
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
print(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
# Neue Datei erzeugen
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
# Bei vorhandener Datei: Versionsnummer anhängen
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
# Für CSV
missing_df.to_csv(missing_file, index=False, sep=";")
print(f"Fehlende Begriffe gespeichert: {missing_file}")
# =========================
# Verarbeitung Input-Dateien (final)
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file, index=False, engine=engine)
# --- NEU: fehlende Begriffe in separate Datei ---
export_missing_terms(out_df, output_file)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

747
NormVokabular_Mapper_1.4.py Normal file
View File

@ -0,0 +1,747 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
NormVokabular Mapper Version 1.4.1
- Detailliertes (DEBUG) Batch-Logging: gepufferte Logs werden periodisch in Konsole + Datei geschrieben
- Getty AAT (SPARQL via requests) API-polite, timeout/retries/backoff
- Fehlertoleranz: API-Ausfälle führen nicht zum Totalabsturz
- Fehlende Begriffe -> separate Datei (gleiches Format wie Output)
- Bestehende Normalisierung/Lemmatisierung/Stemming wird weiterverwendet
- Batch-Logging-Modus (konfigurierbar)
"""
from __future__ import annotations
import os
import sys
import re
import time
import json
import threading
import queue
import requests
import pandas as pd
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
from datetime import datetime
# Optional libs
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
# =========================
# Config & Pfade
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT_DEFAULT = 5
MAX_RETRIES_DEFAULT = 3
BACKOFF_FACTOR_DEFAULT = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0}
# Logging file
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
# Batch logging parameters
LOG_BATCH_SIZE = 100 # flush wenn >= Einträge
LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes (Batch-Logging)
LOG_LEVEL = "DEBUG" # ausführlich gewünscht
# =========================
# Buffered/Batched Logger
# =========================
class BatchLogger:
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
self.logfile = logfile
self.flush_interval = flush_interval
self.batch_size = batch_size
self.level = level
self.q = queue.Queue()
self._stop_event = threading.Event()
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
# Ensure logfile exists
try:
logfile.parent.mkdir(parents=True, exist_ok=True)
logfile.touch(exist_ok=True)
except Exception:
pass
self._thread.start()
def _format(self, level: str, msg: str) -> str:
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"{ts} - {level} - {msg}"
def log(self, level: str, msg: str):
if self._stop_event.is_set():
return
formatted = self._format(level, msg)
self.q.put((level, formatted))
# If queue too big, trigger immediate flush by putting a special token
if self.q.qsize() >= self.batch_size:
self.q.put(("__FLUSH__", "__FLUSH__"))
def debug(self, msg: str):
if LOG_LEVEL in ("DEBUG",):
self.log("DEBUG", msg)
def info(self, msg: str):
self.log("INFO", msg)
def warning(self, msg: str):
self.log("WARNING", msg)
def error(self, msg: str):
self.log("ERROR", msg)
def exception(self, msg: str):
self.log("EXCEPTION", msg)
def _worker(self):
buffer = []
last_flush = time.time()
while not self._stop_event.is_set() or not self.q.empty():
try:
item = None
try:
item = self.q.get(timeout=self.flush_interval)
except queue.Empty:
# time-based flush
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
if item is None:
continue
level, formatted = item
if level == "__FLUSH__":
if buffer:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
continue
buffer.append((level, formatted))
# flush conditions
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
self._flush_buffer(buffer)
buffer = []
last_flush = time.time()
except Exception as e:
# As a last resort, write error immediately to stderr
try:
sys.stderr.write(f"BatchLogger worker error: {e}\n")
except Exception:
pass
time.sleep(0.5)
# final flush
if buffer:
self._flush_buffer(buffer)
def _flush_buffer(self, buffer):
if not buffer:
return
# write to console and file
try:
# console
out_lines = [f"{line}\n" for _, line in buffer]
# write to stdout
try:
sys.stdout.writelines(out_lines)
sys.stdout.flush()
except Exception:
pass
# append to file
try:
with open(self.logfile, "a", encoding="utf-8") as f:
f.writelines(out_lines)
except Exception as e:
try:
sys.stderr.write(f"BatchLogger file write error: {e}\n")
except Exception:
pass
except Exception:
pass
def stop(self):
self._stop_event.set()
# put sentinel to wake worker
try:
self.q.put(("__FLUSH__", "__FLUSH__"))
except Exception:
pass
self._thread.join(timeout=5.0)
# Instantiate logger
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)")
# =========================
# Cache laden/speichern
# =========================
if os.path.exists(CACHE_FILE):
try:
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
except Exception as e:
logger.warning(f"Cache konnte nicht geladen werden: {e}")
CACHE = {}
else:
CACHE = {}
def save_cache():
try:
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
logger.debug("Cache gespeichert.")
except Exception as e:
logger.error(f"Cache konnte nicht gespeichert werden: {e}")
# =========================
# Normalisierung / Lemma / Tokenization
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
if not term:
return []
parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
return parts if parts else [term]
# =========================
# Normvokabular laden & Index
# =========================
def load_normvokabular(file_path):
try:
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
raise
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {}
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
return norm_dict, stem_index, lemma_norm_map
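# Illustration of the parent-ID inheritance above (invented sample rows, assumption):
# a sheet fragment such as
#     ID   | Wort/Vokabel
#     2.3  | Wappen
#          | Wappenschild
# produces {"wappen": {"Name": "Wappen", "ID": "2.3", ...},
#           "wappenschild": {"Name": "Wappenschild", "ID": "2.3", ...}},
# i.e. rows without their own ID are attached to the most recent parent ID.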
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
if term_norm in norm_dict:
e = norm_dict[term_norm]
logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
return e["Name"], e["ID"], []
tokens = compound_split(term_norm)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# Generic request with retries & caching
# =========================
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
if cache_key in CACHE:
logger.debug(f"[Cache] {api_name}: {cache_key}")
return CACHE[cache_key]
retries = 0
while retries < max_retries:
try:
r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
if r.status_code == 200:
try:
data = r.json()
except Exception:
data = r.text
CACHE[cache_key] = data
FAIL_COUNTER[api_name] = 0
logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
return data
else:
logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
raise ValueError(f"HTTP {r.status_code}")
except Exception as e:
retries += 1
wait = backoff ** retries
logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
time.sleep(wait)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
return None
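# Behaviour sketch (assumption: the default constants above): identical URL+params
# combinations are answered from CACHE without a network call; failed attempts back
# off exponentially (2s, 4s, 8s with BACKOFF_FACTOR_DEFAULT=2), and after ten
# consecutive exhausted calls the API is disabled via API_ACTIVE[api_name] = False.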
# =========================
# GND / Wikidata (bestehend)
# =========================
def batch_query_gnd(terms):
results = {}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://lobid.org/gnd/search"
params = {"q": t, "format": "json"}
data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""),
SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1] >= 0.75]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
return results
def batch_query_wikidata(terms):
results = {}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
start = time.time()
for idx, t in enumerate(terms, start=1):
logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
url = "https://www.wikidata.org/w/api.php"
params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
top = ""
try:
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1] >= 0.70]
if cands:
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
except Exception as e:
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
results[t] = top
elapsed = time.time() - start
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
return results
# =========================
# Getty AAT Abfrage robust & API-polite (requests)
# =========================
def batch_query_getty_aat(terms):
results = {}
if not API_ACTIVE.get("aat", False):
for t in terms: results[t] = ""
return results
endpoint = "https://vocab.getty.edu/sparql"
headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
TIMEOUT = 8
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
FAIL_LIMIT = 5
fail_counter_local = 0
logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
start_all = time.time()
for idx, term in enumerate(terms, start=1):
term_norm = lemmatize_term(normalize_text(term))
tokens = compound_split(term_norm)
logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")
query_fragments = []
for tkn in tokens:
t_escaped = tkn.replace('"', '\\"')
qf = f"""
?concept skos:prefLabel ?label .
FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
"""
query_fragments.append(f"{{ {qf} }}")
query_body = " UNION ".join(query_fragments) if query_fragments else ""
query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"
retries = 0
success = False
start_term = time.time()
while retries < MAX_RETRIES and not success:
try:
logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
if r.status_code != 200:
raise ValueError(f"HTTP {r.status_code}")
ret = r.json()
candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
if candidates:
scored = [
(c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
for c in candidates
]
top = max(scored, key=lambda x: x[2])
results[term] = top[0]
logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
else:
results[term] = ""
logger.debug(f"[AAT] Kein Treffer für '{term}'")
success = True
except Exception as e:
retries += 1
wait = BACKOFF_FACTOR ** retries
logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} warte {wait}s")
time.sleep(wait)
if retries == MAX_RETRIES:
results[term] = ""
fail_counter_local += 1
# polite delay
time.sleep(1.0)
elapsed_term = time.time() - start_term
logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")
if fail_counter_local >= FAIL_LIMIT:
logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
for t_rem in terms[idx:]:
results[t_rem] = ""
FAIL_COUNTER["aat"] += fail_counter_local
API_ACTIVE["aat"] = False
break
elapsed_all = time.time() - start_all
logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
return results
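# Example of the SPARQL string assembled above for the single token "wappen"
# (whitespace condensed; the real query keeps the f-string line breaks):
#   PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
#   SELECT ?label ?concept WHERE {
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("wappen"))) }
#   } LIMIT 10
# Multi-token terms contribute one such block per token, joined with UNION.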
# =========================
# Markierung / Export (Excel/ODS)
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
try:
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
wb.save(file_path)
return
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
except Exception as e:
logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
# =========================
# Fehlende Begriffe -> separate Datei
# =========================
def export_missing_terms(out_df, output_file):
missing_df = out_df[
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
][["Begriff"]].drop_duplicates()
count_missing = len(missing_df)
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
if count_missing == 0:
return
ext = output_file.suffix.lower()
base_name = output_file.stem
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
version = 1
while missing_file.exists():
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
version += 1
try:
if ext in [".xlsx", ".xls"]:
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
elif ext == ".ods":
missing_df.to_excel(missing_file, index=False, engine="odf")
else:
missing_df.to_csv(missing_file, index=False, sep=";")
logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
except Exception as e:
logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")
# =========================
# Haupt-Loop: Verarbeitung Input-Dateien
# =========================
def process_files():
overall_start = time.time()
try:
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
except Exception as e:
logger.error("Normvokabular konnte nicht geladen werden. Beende.")
raise
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
raise SystemExit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
logger.info("Keine Dateien gefunden")
return
logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
for file_idx, file_path in enumerate(files, start=1):
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
continue
logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
file_start = time.time()
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col:
logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
continue
row_terms_map = []
for r_idx, row in enumerate(df.itertuples(index=False), start=1):
try:
besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
except Exception:
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch:
continue
obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
if (r_idx % 200) == 0:
logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
total_unique_terms = len(all_terms)
# API-Abfragen
t0 = time.time()
gnd_results = batch_query_gnd(all_terms)
t1 = time.time()
logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
wd_results = batch_query_wikidata(all_terms)
t2 = time.time()
logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t:"" for t in all_terms}
t3 = time.time()
logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")
# Build output rows
output_rows = []
processed_count = 0
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,""),
"AAT_Top1": aat_results.get(term,"")
}
output_rows.append(out_row)
processed_count += 1
if (processed_count % 200) == 0:
logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
try:
out_df.to_excel(output_file, index=False, engine=engine)
logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
except Exception as e:
logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
continue
export_missing_terms(out_df, output_file)
mark_norm_hits(output_file)
file_elapsed = time.time() - file_start
logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")
overall_elapsed = time.time() - overall_start
logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")
# =========================
# Main
# =========================
if __name__ == "__main__":
try:
process_files()
except KeyboardInterrupt:
logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
except SystemExit:
logger.warning("SystemExit aufgetreten.")
except Exception as e:
logger.exception(f"Ungefangener Fehler: {e}")
finally:
# Stop logger (flush remaining logs)
try:
save_cache()
except Exception:
pass
try:
logger.info("Beende.")
logger.stop()
except Exception:
pass

View File

@ -0,0 +1,46 @@
import subprocess
import json
import sys
from pathlib import Path
def run_mapper(term):
"""
Ruft das bestehende mapper script auf und liefert Vorschläge zurück.
Erwartet, dass das mapper script eine JSON-Ausgabe liefert:
{
"term": "Begriff",
"norm_name": "Normierter Treffer oder KEIN TREFFER",
"norm_id": "ID",
"suggestions": ["Vorschlag1", "Vorschlag2", "Vorschlag3"]
}
"""
mapper_script = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_1.2.py") # dein bestehendes Mapper-Skript
if not mapper_script.exists():
raise FileNotFoundError(f"{mapper_script} nicht gefunden")
# Übergabe als JSON-String
input_json = json.dumps({"term": term})
# Aufruf via subprocess
result = subprocess.run(
[sys.executable, str(mapper_script), input_json],
capture_output=True,
text=True
)
if result.returncode != 0:
raise RuntimeError(f"Mapper Fehler: {result.stderr}")
try:
output = json.loads(result.stdout)
except Exception as e:
raise ValueError(f"Ungültige Ausgabe vom Mapper: {e}")
return output
if __name__ == "__main__":
if len(sys.argv) > 1:
term = sys.argv[1]
output = run_mapper(term)
print(json.dumps(output, ensure_ascii=False))
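# Example invocation (assumption: illustrative only; none of the mapper scripts in this
# commit currently parse a JSON term argument or print JSON, so the protocol described
# in the docstring is the intended interface rather than working behaviour):
#   python3 <this_script>.py "Wappen"
#   -> {"term": "Wappen", "norm_name": "...", "norm_id": "...", "suggestions": [...]}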

Binary file not shown.

BIN
Test API.ods Normal file

Binary file not shown.

101
Tryout/NVTest.py Normal file
View File

@ -0,0 +1,101 @@
import pandas as pd
import requests
import time
import os
def match_gnd(token, delay=0.3):
"""GND-Abfrage für ein Schlagwort, gibt erstes Ergebnis zurück"""
url = f"https://lobid.org/gnd/search?q={token}&format=json"
try:
resp = requests.get(url, timeout=5)
if resp.status_code == 200:
data = resp.json()
if 'member' in data and data['member']:
first = data['member'][0]
return first.get('preferredName'), first.get('gndIdentifier')
except Exception as e:
print(f"Fehler bei GND-Abfrage für '{token}': {e}")
time.sleep(delay)
return None, None
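# Usage sketch (illustrative values only, not a real lookup result):
#   match_gnd("Wappen")  -> ("Wappen", "1234567-8")   # preferredName, gndIdentifier
#   match_gnd("qqqq999") -> (None, None)              # no 'member' hits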
def load_exlibris_refs(path):
"""CSV einlesen, Scan-Zuordnung, Platzhalter-Inventarnummer, GND-Abgleich"""
df = pd.read_csv(path, dtype=str, header=0)
    # erste Spalte leer oder automatisch benannt ("Unnamed: 0")? → "Kürzel"
    if df.columns[0].strip() == '' or str(df.columns[0]).startswith('Unnamed'):
df.rename(columns={df.columns[0]: 'Kürzel'}, inplace=True)
df.fillna('', inplace=True)
# Scan-Level-Spalten
level_cols = [c for c in df.columns if c.strip() in ['0','1','2','3','4']]
obj_list = []
current_obj = None
placeholder_counter = 1
for _, row in df.iterrows():
has_0 = row['0'].strip() if '0' in df.columns else ''
row_refs = []
for c in level_cols:
val = row[c].strip()
if val:
row_refs.append({'level': c, 'scan_ref': val})
if has_0:
if current_obj:
obj_list.append(current_obj)
core_data = {col: row[col] for col in df.columns if col not in level_cols}
# Inventarnummer prüfen
inv = core_data.get('Inventarnummer','').strip()
if not inv:
core_data['Inventarnummer'] = f'PL-{placeholder_counter:04d}'
placeholder_counter += 1
# GND-Abgleich
obj_descr = core_data.get('Objektbeschreibung','')
gnd_name, gnd_id = None, None
if obj_descr:
tokens = [t.strip() for t in obj_descr.split(',') if t.strip()]
for t in tokens:
name, gid = match_gnd(t)
if gid:
gnd_name = name
gnd_id = gid
break
core_data['GND_Name'] = gnd_name
core_data['GND_ID'] = gnd_id
current_obj = core_data
current_obj['ScanReferenzen'] = row_refs
else:
if current_obj:
current_obj['ScanReferenzen'].extend(row_refs)
if current_obj:
obj_list.append(current_obj)
out_df = pd.DataFrame(obj_list)
core_fields = ['Kürzel','Inventarnummer','Standort','Jahr','Urheber','Eigner',
'Objektbeschreibung','Material','Maße (in cm)',
'Objekttyp','Inschrift','Anmerkungen','ScanReferenzen',
'GND_Name','GND_ID']
available = [c for c in core_fields if c in out_df.columns]
return out_df[available]
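# Grouping sketch (invented rows, assumption): a value in scan column '0' starts a new
# object; values in columns '1'-'4' on the same or following rows are appended to that
# object's ScanReferenzen, and objects without Inventarnummer receive the placeholders
# PL-0001, PL-0002, ...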
# ====================
# Hauptteil
# ====================
if __name__ == "__main__":
# CSV im gleichen Ordner suchen
csv_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
if not csv_files:
print("Keine CSV-Datei im aktuellen Ordner gefunden.")
exit(1)
# nimm die erste gefundene CSV
input_csv = csv_files[0]
print(f"Verwende CSV-Datei: {input_csv}")
df = load_exlibris_refs(input_csv)
# Ergebnis als Testergebnis.csv speichern
output_file = "Testergebnis.csv"
df.to_csv(output_file, index=False)
print(f"Aufbereitete Daten gespeichert als {output_file}")

190
VLG.py Normal file
View File

@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
VLG_AAT.py Gruppierung, Auflösung "Objektbeschreibung"
NOCH OHNE AAT-ABGLEICH
- Prüft ezodf in aktueller Umgebung
- Liest ODS aus "Input CSV/"
- Extrahiert Begriffe aus "Objektbeschreibung"
- Lemmatisierung (Spacy) + Stopwortfilter
- Subtokenisierung komplexer Phrasen
- Zählt Häufigkeiten
- Ausgabe ODS / CSV-Fallback in "Auswertung Ergebnisse"
"""
import os
import sys
import logging
from collections import Counter
import pandas as pd
import spacy
# ---------------------------
# Logging
# ---------------------------
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# ---------------------------
# ezodf prüfen
# ---------------------------
try:
import ezodf
EZODF_AVAILABLE = True
logging.info(f"ezodf erkannt")
except ImportError:
EZODF_AVAILABLE = False
logging.error("ezodf konnte nicht importiert werden!")
logging.error("Möglicherweise nutzen Sie nicht die Python-Umgebung, in der ezodf installiert ist.")
logging.error(f"Aktuelle Python-Executable: {sys.executable}")
logging.error("Bitte prüfen Sie Ihre venv oder installieren Sie ezodf in dieser Umgebung:")
logging.error(" python -m pip install ezodf")
sys.exit(1)
# ---------------------------
# Spacy laden
# ---------------------------
try:
nlp = spacy.load("de_core_news_sm")
logging.info("Spacy-Modell geladen.")
except Exception as e:
logging.error(f"Spacy-Modell konnte nicht geladen werden: {e}")
sys.exit(1)
# ---------------------------
# Konfiguration
# ---------------------------
INPUT_FOLDER = "Input CSV"
OUTPUT_FOLDER = "Auswertung Ergebnisse"
INPUT_FILENAME = None
TARGET_COLUMN = "Objektbeschreibung"
STOPWORDS = {"mit", "auf", "von", "und", "der", "die", "das"} # erweiterbar
MAPPING = { # Projektinterne Sonderfälle
"exlibris": "exlibris",
"wappen": "wappen"
}
# ---------------------------
# Funktionen
# ---------------------------
def find_input_file(folder: str, filename_hint: str = None):
if not os.path.isdir(folder):
raise FileNotFoundError(f"Input-Ordner '{folder}' existiert nicht.")
files = [f for f in os.listdir(folder) if f.lower().endswith(".ods")]
if filename_hint:
for f in files:
if f == filename_hint or filename_hint in f:
return os.path.join(folder, f)
if not files:
raise FileNotFoundError(f"Keine .ods-Dateien in '{folder}' gefunden.")
return os.path.join(folder, files[0])
def read_ods_first_sheet(path: str) -> pd.DataFrame:
"""Lädt ODS, erkennt automatisch Header-Zeile."""
try:
df = pd.read_excel(path, engine="odf", header=None)
logging.info("ODS mit pandas + odfpy geladen.")
except Exception as e1:
logging.warning(f"pandas + odfpy konnte ODS nicht lesen ({e1}).")
if not EZODF_AVAILABLE:
raise RuntimeError("ezodf nicht installiert und pandas + odfpy fehlgeschlagen.")
doc = ezodf.opendoc(path)
sheet = doc.sheets[0]
data = []
for row in sheet.rows():
values = [c.value if hasattr(c, "value") else "" for c in row]
data.append(values)
df = pd.DataFrame(data)
logging.info("ODS mit ezodf geladen.")
# Header-Zeile automatisch finden
header_row_index = None
for i, row in df.iterrows():
row_str = row.fillna("").astype(str).str.lower()
if any("objektbeschreibung" in str(cell) for cell in row_str):
header_row_index = i
break
if header_row_index is None:
raise KeyError("Keine Header-Zeile mit 'Objektbeschreibung' gefunden.")
df.columns = df.iloc[header_row_index]
df = df.iloc[header_row_index + 1:].reset_index(drop=True)
return df
def tokenize_and_lemmatize(series: pd.Series) -> list:
"""Tokenisiert, entfernt Stopwords, wendet Mapping + Spacy-Lemmatisierung an."""
series = series.fillna("").astype(str).str.strip().str.lower()
all_terms = []
for text in series:
if not text:
continue
# Komma-Split
for part in [p.strip() for p in text.split(",") if p.strip()]:
# Subtokenisierung via Spacy
doc = nlp(part)
for token in doc:
lemma = token.lemma_.lower()
if lemma in STOPWORDS:
continue
lemma = MAPPING.get(lemma, lemma)
if lemma:
all_terms.append(lemma)
return all_terms
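# Example (assumption: lemmas depend on the loaded spaCy model, so exact output may
# differ): "Wappen mit Löwe, Exlibris" is split at the comma, spaCy-tokenised,
# stopwords such as "mit" are dropped, and the remaining lowercased lemmas
# (e.g. "wappen", "löwe", "exlibris") end up in one flat list.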
def write_output(rows: list, outpath: str):
if EZODF_AVAILABLE:
if not rows:
logging.warning("Keine Daten zum Schreiben.")
return
keys = list(rows[0].keys())
doc = ezodf.newdoc(doctype="ods", filename=outpath)
sheet = ezodf.Sheet("Auswertung", size=(len(rows)+1, len(keys)))
doc.sheets += sheet
for ci, k in enumerate(keys):
sheet[0, ci].set_value(k)
for ri, row in enumerate(rows, start=1):
for ci, k in enumerate(keys):
sheet[ri, ci].set_value(row.get(k, ""))
doc.save()
logging.info(f"ODS geschrieben: {outpath}")
else:
csv_path = os.path.splitext(outpath)[0] + ".csv"
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False, sep=";", encoding="utf-8")
logging.info(f"CSV-Fallback geschrieben: {csv_path}")
# ---------------------------
# Hauptfunktion
# ---------------------------
def main(input_folder=INPUT_FOLDER, input_filename=INPUT_FILENAME):
input_path = find_input_file(input_folder, filename_hint=input_filename)
input_basename = os.path.splitext(os.path.basename(input_path))[0]
logging.info(f"Verarbeite Datei: {input_path}")
df = read_ods_first_sheet(input_path)
logging.info(f"Geladene Spalten: {list(df.columns)}")
if TARGET_COLUMN.lower() not in [str(c).lower() for c in df.columns]:
raise KeyError(f"Spalte '{TARGET_COLUMN}' nicht gefunden.")
terms = tokenize_and_lemmatize(df[TARGET_COLUMN])
logging.info(f"Gefundene Begriffe: {len(terms)}")
counts = Counter(terms)
sorted_terms = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
rows = [{"Begriff": term, "Anzahl": freq} for term, freq in sorted_terms]
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
out_name = f"{input_basename} Auswertung.ods"
out_path = os.path.join(OUTPUT_FOLDER, out_name)
write_output(rows, out_path)
logging.info("Fertig.")
if __name__ == "__main__":
argv = sys.argv[1:]
folder = INPUT_FOLDER
fname = INPUT_FILENAME
if len(argv) >= 1:
folder = argv[0]
if len(argv) >= 2:
fname = argv[1]
main(input_folder=folder, input_filename=fname)

262
VLG_API_multi.py Normal file
View File

@ -0,0 +1,262 @@
import os
import sys
import time
import json
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
# =========================
# Argumente / Dry-Run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
# =========================
# Konfiguration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
MAX_CONSECUTIVE_FAILURES = 10
CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
# =========================
# Logging
# =========================
def log(level, msg):
print(f"[{level}] {msg}")
# =========================
# Cache speichern
# =========================
def save_cache():
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Request mit Retry & Backoff
# =========================
def request_with_retries(api_name, url, params=None):
if DRY_RUN:
return {"dummy": True}
if not API_ACTIVE[api_name]:
return None
cache_key = url + (str(params) if params else "")
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
try:
data = r.json()
except:
data = r.text
CACHE[cache_key] = data
save_cache()
FAIL_COUNTER[api_name] = 0
return data
elif r.status_code in [403, 429]:
log("ERROR", f"{api_name.upper()} HTTP {r.status_code} Stopschalter aktiviert")
API_ACTIVE[api_name] = False
return None
else:
log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
except requests.exceptions.Timeout:
log("ERROR", f"Timeout bei {api_name.upper()}")
except Exception as e:
log("ERROR", f"Fehler bei {api_name.upper()}: {e}")
retries += 1
sleep_time = min(BACKOFF_FACTOR ** retries, 30)
time.sleep(sleep_time)
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} Stopschalter aktiviert")
API_ACTIVE[api_name] = False
return None
# =========================
# API-Abfragen mit Confidence
# =========================
def query_gnd(term, min_conf=0.6):
if DRY_RUN or not API_ACTIVE["gnd"]:
return "TEST_GND", 1.0
url = f"https://lobid.org/gnd/search?q={term}&format=json"
data = request_with_retries("gnd", url)
if not data:
return "API nicht erreichbar", 0.0
results = []
scores = []
for doc in data.get("member", []):
name = doc.get("preferredName", "")
conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
if conf >= min_conf:
results.append(name)
scores.append(conf)
if results:
return ", ".join(results), max(scores)
return "ohne Ergebnis", 0.0
def query_wikidata(term, min_conf=0.5):
    if DRY_RUN or not API_ACTIVE["wikidata"]:
        return "TEST_WD", 1.0
    url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
    data = request_with_retries("wikidata", url, params)
    if not data:
        return "API nicht erreichbar", 0.0
    results = []
    scores = []
    for entry in data.get("search", []):
        label = entry.get("label", "")
        if not label:
            continue
        # wbsearchentities returns no numeric score, so the confidence is
        # estimated via string similarity to the search term (as for GND)
        score = SequenceMatcher(None, term.lower(), label.lower()).ratio()
        if score >= min_conf:
            results.append(label)
            scores.append(score)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0
# =========================
# Input laden
# =========================
def load_input_file(file_path):
try:
if file_path.suffix.lower() == ".ods":
df = pd.read_excel(file_path, engine="odf", header=None)
elif file_path.suffix.lower() == ".xlsx":
df = pd.read_excel(file_path, engine="openpyxl", header=None)
elif file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path, header=None)
else:
log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
return None
return df
except Exception as e:
log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
return None
# =========================
# Header-Zeile suchen
# =========================
def find_header_row(df, keywords=["objektbeschreibung", "objekt/ebene"]):
for i, row in df.iterrows():
row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
if any(kw in cell for kw in keywords for cell in row_lower):
return i, row_lower
return None, None
# =========================
# Verarbeitung
# =========================
def process_files():
all_terms = []
output_rows = []
for file_path in INPUT_DIR.glob("*"):
if not file_path.suffix.lower() in [".csv", ".xlsx", ".ods"]:
continue
log("INFO", f"Verarbeite {file_path.name}")
df = load_input_file(file_path)
if df is None:
continue
header_idx, header_row = find_header_row(df)
if header_idx is None:
log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
continue
df.columns = header_row
df = df.iloc[header_idx+1:].reset_index(drop=True)
col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
if not col_objdesc:
log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
continue
term_list = []
obj_level_list = []
for _, row in df.iterrows():
terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
if not terms:
continue
for term in [t.strip() for t in terms.split(",") if t.strip()]:
term_list.append(term)
obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")
# API-Abfragen
gnd_results = []
gnd_scores = []
wikidata_results = []
wikidata_scores = []
for term in term_list:
gnd_res, gnd_conf = query_gnd(term)
wikidata_res, wd_conf = query_wikidata(term)
gnd_results.append(gnd_res)
gnd_scores.append(gnd_conf)
wikidata_results.append(wikidata_res)
wikidata_scores.append(wd_conf)
for idx, term in enumerate(term_list):
output_rows.append({
"Begriff": term,
"Quelle": file_path.name,
"Objekt/Ebene": obj_level_list[idx],
"GND": gnd_results[idx],
"GND_Confidence": gnd_scores[idx],
"Wikidata": wikidata_results[idx],
"Wikidata_Confidence": wikidata_scores[idx]
})
all_terms.extend(term_list)
# Hauptoutput
out_df = pd.DataFrame(output_rows)
out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
out_df.to_excel(out_file, index=False, engine="odf")
log("INFO", f"Hauptauswertung gespeichert: {out_file}")
# Rohdatei
raw_terms = pd.Series(all_terms).value_counts().reset_index()
raw_terms.columns = ["Begriff", "Häufigkeit"]
raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
raw_terms.to_excel(raw_file, index=False, engine="odf")
log("INFO", f"Rohbegriffe gespeichert: {raw_file}")
# =========================
# Main
# =========================
if __name__ == "__main__":
if not INPUT_DIR.exists():
log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
sys.exit(1)
process_files()

2815369
api_cache.json Normal file

File diff suppressed because it is too large Load Diff

9
config.json Normal file
View File

@ -0,0 +1,9 @@
{
"normvokabular_path": "/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods",
"max_suggestions": 3,
"color_hit": "#C6EFCE",
"color_miss": "#FFC7CE",
"use_rapidfuzz": false,
"use_spacy": false,
"autosave": false
}

371
mapper.py Normal file
View File

@ -0,0 +1,371 @@
import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher
# RapidFuzz für Token-basierte Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
print("RapidFuzz verfügbar")
except ImportError:
RAPIDFUZZ_AVAILABLE = False
print("RapidFuzz nicht verfügbar nutze SequenceMatcher")
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
print("Spacy Lemmatizer aktiviert")
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
    print("Spacy nicht verfügbar - nutze naiven Stemmer")
# =========================
# Pfade & Config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
# Cache
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
# =========================
# Normalisierung / Lemma
# =========================
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
# Lemma-Cache
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# =========================
# Kompositum-Zerlegung (einfacher Ansatz)
# =========================
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
return parts if parts else [term]
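# Examples of this capital-letter based split (note: it only separates parts that are
# themselves capitalised, so ordinary German compounds stay in one piece):
#   compound_split("WappenSchild") -> ["Wappen", "Schild"]
#   compound_split("Wappenschild") -> ["Wappenschild"]
#   compound_split("wappenschild") -> ["wappenschild"]   (no regex match, fallback)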
# =========================
# Normvokabular laden & Lemma vorbereiten
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
norm_dict = {}
stem_index = defaultdict(list)
lemma_norm_map = {} # für RapidFuzz preprocessed
for sheet_name, df in sheets.items():
if sheet_name.lower() in ["master", "übersicht"]:
continue
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
id_col = next((c for c in df.columns if "ID" in c), None)
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
if not id_col or not word_col:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
if row_id:
current_parent_id = row_id
if not row_word:
continue
assigned_parent_id = current_parent_id
entry = {
"Name": row_word,
"ID": assigned_parent_id, # Parent-ID
"Sheet": sheet_name,
"Own_ID": row_id or "" # eigene ID, falls vorhanden
}
key = normalize_text(row_word)
norm_dict[key] = entry
lemma = lemmatize_term(key)
stem_index[lemma].append(entry)
if lemma not in lemma_norm_map:
lemma_norm_map[lemma] = entry
return norm_dict, stem_index, lemma_norm_map
# =========================
# Mapping & Vorschläge
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# Exakter Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
return e["Name"], e["ID"], []
# Lemma-Treffer
if term_lemma in stem_index:
e = stem_index[term_lemma][0]
return e["Name"], e["ID"], []
# KEIN TREFFER → Kompositum-Split
tokens = compound_split(term)
if len(tokens) == 1:
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
return "KEIN TREFFER", "", suggestions
else:
token_matches = []
for t in tokens:
t_lemma = lemmatize_term(t)
if t_lemma in stem_index:
e = stem_index[t_lemma][0]
token_matches.append((t, e["Name"], e["ID"]))
else:
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
token_matches.append((t, "KEIN TREFFER", "", sugg))
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
return "KEIN TREFFER", "", combined_suggestions
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key_lemma, entry in lemma_norm_map.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
if key_lemma.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
# =========================
# API-Abfragen
# =========================
def request_with_retries(api_name,url,params=None):
cache_key = url + str(params)
if cache_key in CACHE:
return CACHE[cache_key]
retries = 0
while retries < MAX_RETRIES:
try:
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except Exception:
            pass
retries += 1
time.sleep(min(BACKOFF_FACTOR**retries,30))
FAIL_COUNTER[api_name] += 1
if FAIL_COUNTER[api_name] >= 10:
API_ACTIVE[api_name] = False
return None
def batch_query_gnd(terms):
results={}
if not API_ACTIVE.get("gnd", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://lobid.org/gnd/search"
params={"q":t,"format":"json"}
data = request_with_retries("gnd", url, params)
top = ""
if data and "member" in data:
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
cands = [c for c in cands if c[1]>=0.75]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
def batch_query_wikidata(terms):
results={}
if not API_ACTIVE.get("wikidata", False):
for t in terms: results[t] = ""
return results
for t in terms:
url="https://www.wikidata.org/w/api.php"
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
data = request_with_retries("wikidata", url, params)
top = ""
if data and "search" in data:
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
cands = [c for c in cands if c[1]>=0.70]
if cands:
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
results[t] = top
return results
# =========================
# Markierung / Export
# =========================
def mark_norm_hits(file_path):
ext = file_path.suffix.lower()
if ext in [".xlsx", ".xls"]:
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
wb = load_workbook(file_path)
ws = wb.active
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
norm_col = col_map.get("Norm_Treffer", None)
if not norm_col:
print("Spalte 'Norm_Treffer' nicht gefunden")
return
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
cell = row[0]
if cell.value and cell.value != "KEIN TREFFER":
cell.fill = green_fill
else:
cell.fill = red_fill
wb.save(file_path)
elif ext==".ods":
df = pd.read_excel(file_path, engine="odf")
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
df.to_excel(file_path, index=False, engine="odf")
# =========================
# Verarbeitung Input-Dateien
# =========================
def process_files():
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
total_terms = 0
total_hits = 0
if not INPUT_DIR.exists():
print(f"Eingabeordner {INPUT_DIR} fehlt")
sys.exit(1)
files = list(INPUT_DIR.glob("*"))
if not files:
print("Keine Dateien gefunden")
return
for file_path in files:
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
continue
print(f"Verarbeite Datei: {file_path.name}")
try:
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
except Exception as e:
print(f"Fehler beim Lesen von {file_path.name}: {e}")
continue
df = df.dropna(how="all")
df.columns = [str(c).strip() for c in df.columns]
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
urh_col = next((c for c in df.columns if "Urheber" in c), None)
if not besch_col: continue
row_terms_map = []
for _, row in df.iterrows():
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
if not besch: continue
obj_box = row[box_col] if box_col else ""
urheber = row[urh_col] if urh_col else ""
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
terms = []
for clause in clauses:
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS: continue
if re.fullmatch(r"\d+", p): continue
terms.append(p)
row_terms_map.append((obj_box, urheber, terms))
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
gnd_results = batch_query_gnd(all_terms)
wd_results = batch_query_wikidata(all_terms)
output_rows = []
for obj_box, urheber, terms in row_terms_map:
for term in terms:
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
total_terms += 1
if norm_name != "KEIN TREFFER":
total_hits += 1
out_row = {
"Box": obj_box,
"Objekt/Ebene": obj_box,
"Urheber": urheber,
"Begriff": term,
"Norm_Treffer": norm_name,
"Norm_ID": norm_id,
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
"GND_Top1": gnd_results.get(term,""),
"WD_Top1": wd_results.get(term,"")
}
output_rows.append(out_row)
out_df = pd.DataFrame(output_rows)
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
version = 1
while output_file.exists():
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
version += 1
engine = "odf" if output_file.suffix.lower()==".ods" else None
out_df.to_excel(output_file, index=False, engine=engine)
mark_norm_hits(output_file)
print(f"Auswertung gespeichert: {output_file}")
save_cache()
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
# =========================
# Main
# =========================
if __name__ == "__main__":
process_files()
print("Fertig")

237
mapper_macro.py Normal file
View File

@ -0,0 +1,237 @@
import uno
import os
import re
import traceback
import json
# Optional: spaCy lemmatizer
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:  # spaCy or the German model may be missing
    SPACY_AVAILABLE = False
    nlp = None
# Optional: fuzzy matching via rapidfuzz, with difflib as fallback
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except Exception:
    from difflib import SequenceMatcher
    RAPIDFUZZ_AVAILABLE = False
import odf.opendocument
import odf.table
import odf.text
# ------------------------
# Konfiguration absolute Pfade
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
# ------------------------
# Logging
# ------------------------
def log(msg):
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
# ------------------------
# Cache laden
# ------------------------
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
# ------------------------
# Normalisierung / Lemma
# ------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER einlesen
# ------------------------
def load_nv_master(path):
norm_dict = {}
try:
doc = odf.opendocument.load(path)
except Exception as e:
log(f"Fehler beim Laden von NV_MASTER: {e}")
return norm_dict
for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
sheet_name = sheet.getAttribute("name")
if sheet_name.lower() == "master":
continue
current_parent_id = None
for row in sheet.getElementsByType(odf.table.TableRow):
cells = row.getElementsByType(odf.table.TableCell)
cell_values = []
for cell in cells:
texts = cell.getElementsByType(odf.text.P)
if texts and texts[0].firstChild:
cell_values.append(str(texts[0].firstChild.data).strip())
else:
cell_values.append("")
if not cell_values or len(cell_values)<4:
continue
id_val, unterk, unterunterk, word = cell_values[:4]
if id_val:
current_parent_id = id_val.strip()
if not word:
continue
key = lemmatize_term(word)
norm_dict[key] = {
"Name": word.strip(),
"ID": current_parent_id,
"Sheet": sheet_name,
"Unterkategorie": unterk.strip(),
"Unterunterkategorie": unterunterk.strip()
}
log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
return norm_dict
# ------------------------
# Matching
# ------------------------
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
candidates = []
for key, entry in norm_dict.items():
if RAPIDFUZZ_AVAILABLE:
score = fuzz.token_set_ratio(term_lemma, key)/100
else:
score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
if key.lower().startswith(term_lemma.lower()):
score = min(score + 0.1, 1.0)
if score >= threshold:
candidates.append((score, entry["Name"], entry["ID"]))
candidates.sort(reverse=True)
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
def map_word(word, norm_dict):
key = lemmatize_term(word)
if key in CACHE:
cached = CACHE[key]
return cached["Norm"], cached["Suggestion"], cached["ID"]
if key in norm_dict:
entry = norm_dict[key]
tr, sug, wid = entry["Name"], "", entry["ID"]
else:
suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
if suggestions:
tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
else:
tr, sug, wid = "KEIN TREFFER", "", ""
CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
return tr, sug, wid
# ------------------------
# Makro-Hauptfunktion
# ------------------------
def run_mapper_macro():
try:
doc = XSCRIPTCONTEXT.getDocument()
sheets = doc.getSheets()
sheet = sheets.getByIndex(0)
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
header_row = 0
objekt_col = None
# Header prüfen
for col in range(data_range.EndColumn+1):
val = sheet.getCellByPosition(col, header_row).String.strip().lower()
if val == "objektbeschreibung":
objekt_col = col
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden")
return
# Neue Spalten am rechten Tabellenende erstellen
max_col = data_range.EndColumn
norm_tr_col = max_col + 1
norm_sug_col = max_col + 2
norm_id_col = max_col + 3
sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"
norm_dict = load_nv_master(NV_MASTER_PATH)
# Farben
GREEN = 0xC6EFCE
YELLOW = 0xFFEB9C
RED = 0xFFC7CE
for row in range(1, data_range.EndRow+1):
cell = sheet.getCellByPosition(objekt_col, row)
val = cell.String.strip()
if not val:
continue
words = [w.strip() for w in re.split(r"\s+", val) if w.strip() and w.lower() not in STOPWORDS]
tr_list, sug_list, id_list = [], [], []
for w in words:
tr, sug, wid = map_word(w, norm_dict)
if tr != "KEIN TREFFER":
tr_list.append(tr)
if sug:
sug_list.append(sug)
if wid:
id_list.append(wid)
sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
# Farbmarkierung
if tr_list:
cell.CellBackColor = GREEN
elif sug_list:
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
# Cache speichern
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
log("Makro erfolgreich ausgeführt")
except Exception as e:
log("Fehler in run_mapper_macro:")
log(traceback.format_exc())
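# --- Illustrative sketch (not part of the original macro) ---
# run_mapper_macro() splits every "Objektbeschreibung" cell into words and
# maps each word with map_word(). The helper below performs the same mapping
# for a free-text string without LibreOffice, e.g. for quick testing; the
# function name and the example usage are invented for illustration.
def _demo_map_text(text):
    norm_dict = load_nv_master(NV_MASTER_PATH)
    results = {}
    for w in re.split(r"\s+", text):
        w = w.strip()
        if not w or w.lower() in STOPWORDS:
            continue
        results[w] = map_word(w, norm_dict)  # (hit, suggestion, ID)
    return results
# Example: _demo_map_text("Vase mit Deckel") maps "Vase" and "Deckel".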

448
mapper_macro_1.1.py Normal file
View File

@ -0,0 +1,448 @@
# -*- coding: utf-8 -*-
# LibreOffice Calc macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
# Pfade: BASE_DIR muss auf das Verzeichnis zeigen, in dem NV_MASTER.ods + Makro liegen.
# Speichern: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
import os
import re
import json
import traceback
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
# Third-party libs: pandas, odfpy, optional: spacy, rapidfuzz
try:
import pandas as pd
PANDAS_AVAILABLE = True
except Exception:
PANDAS_AVAILABLE = False
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:
SPACY_AVAILABLE = False
nlp = None
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except Exception:
RAPIDFUZZ_AVAILABLE = False
from difflib import SequenceMatcher
# ------------------------
# Konfiguration
# ------------------------
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
# ------------------------
# Utilities: Logging & safe I/O
# ------------------------
def log(msg):
try:
with open(LOG_FILE, "a", encoding="utf-8") as f:
f.write(msg + "\n")
except Exception:
pass
# ------------------------
# Cache laden
# ------------------------
try:
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE, "r", encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
except Exception:
CACHE = {}
# ------------------------
# Text-Normalisierung & Lemma
# ------------------------
def normalize_text(s):
if not s:
return ""
s = str(s).strip().lower()
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
s = re.sub(r"\s+", " ", s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
try:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
except Exception:
lemma = term_norm
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
# ------------------------
# NV_MASTER robust laden (pandas + odf)
# ------------------------
def build_norm_index(nv_path):
norm_dict = {} # normalized_name -> list of entries (Name, ID, Sheet)
lemma_index = {} # lemma -> list of entries
if not PANDAS_AVAILABLE:
log("Pandas nicht verfügbar. NV_MASTER kann nicht zuverlässig gelesen werden.")
return norm_dict, lemma_index
try:
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
except Exception as e:
log(f"Fehler beim Einlesen von NV_MASTER mit pandas: {e}")
return norm_dict, lemma_index
for sheet_name, df in sheets.items():
if str(sheet_name).strip().lower() == "master":
continue
# normalize columns names to find ID and Wort columns
df = df.fillna("") # leere Zellen als ""
cols = [str(c).strip().lower() for c in df.columns]
# try to find columns
id_col = None
word_col = None
for i, c in enumerate(cols):
if "id" in c:
id_col = df.columns[i]
if "wort" in c or "vokabel" in c:
word_col = df.columns[i]
# fallback: if not found, try first/last
if word_col is None and len(df.columns) >= 1:
word_col = df.columns[-1]
if id_col is None and len(df.columns) >= 1:
id_col = df.columns[0]
current_parent_id = None
for _, row in df.iterrows():
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
# if row defines an ID, set as current parent
if id_val:
current_parent_id = id_val
# skip empty word cells
if not word_val:
continue
norm_name = normalize_text(word_val)
lemma = lemmatize_term(word_val)
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
# add to norm_dict by normalized name (exact matching)
norm_dict.setdefault(norm_name, []).append(entry)
# add to lemma_index
lemma_index.setdefault(lemma, []).append(entry)
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
return norm_dict, lemma_index
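# --- Structure note (illustrative, not part of the original macro) ---
# build_norm_index() returns two parallel lookup tables:
#   norm_dict:   normalized name -> list of {"Name", "ID", "Sheet"} entries
#   lemma_index: lemma           -> list of {"Name", "ID", "Sheet"} entries
# e.g. norm_dict["vase"] == [{"Name": "Vase", "ID": "2.1", "Sheet": "2 Objekt"}]
# (the sample entry is invented; real content comes from NV_MASTER.ods).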
# ------------------------
# Matching: exakter Treffer, Lemma-Treffer, Fuzzy-Vorschläge
# ------------------------
def fuzzy_score(a, b):
if RAPIDFUZZ_AVAILABLE:
try:
return fuzz.token_set_ratio(a, b) / 100.0
except Exception:
return 0.0
else:
try:
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
except Exception:
return 0.0
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
# collect candidates from lemma_index keys and norm_dict keys
candidates = []
# iterate over lemma_index keys for candidate names
for key_lemma, entries in lemma_index.items():
score = fuzzy_score(term_lemma, key_lemma)
if key_lemma.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# also check norm_dict keys (exact-normalized names) as additional candidates
for norm_key, entries in norm_dict.items():
score = fuzzy_score(term_lemma, norm_key)
if norm_key.startswith(term_lemma):
score = min(score + 0.1, 1.0)
if score >= threshold:
for e in entries:
candidates.append((score, e["Name"], e["ID"]))
# sort by score descending
candidates.sort(key=lambda t: t[0], reverse=True)
# unique by (Name, ID) preserve score order
seen = set()
results = []
for score, name, id_ in candidates:
key = (name, id_)
if key in seen:
continue
seen.add(key)
results.append({"score": score, "name": name, "id": id_})
# return all candidates (no limit) as "Name (ID)"
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
def map_term_with_indexes(term, norm_dict, lemma_index):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term)
# cache lookup
if term_lemma in CACHE:
return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
hits = []
suggestions = []
ids = []
# 1) exact normalized name match
if term_norm in norm_dict:
for e in norm_dict[term_norm]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# 2) lemma match (if not already hits)
if not hits and term_lemma in lemma_index:
for e in lemma_index[term_lemma]:
hits.append(e["Name"])
if e["ID"]:
ids.append(e["ID"])
# 3) suggestions via fuzzy (always compute even if hits exist, but suggestions empty if exact)
suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
# If there are exact hits, we still may present suggestions (user wanted unlimited), but suggestions are secondary
suggestions = suggs
# deduplicate lists preserving order
def unique_preserve(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
hits = unique_preserve(hits)
suggestions = unique_preserve(suggestions)
ids = unique_preserve(ids)
# cache result
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# ------------------------
# Haupt-Makro
# ------------------------
def run_mapper_macro():
try:
# UNO doc/sheet
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
data_range = cursor.getRangeAddress()
except Exception as e:
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
return
# find header row and Objektbeschreibung column (search first 5 rows)
header_row = None
objekt_col = None
max_col = data_range.EndColumn
for r in range(0, min(5, data_range.EndRow+1)):
for c in range(0, max_col+1):
try:
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
except Exception:
val = ""
if val == "objektbeschreibung":
header_row = r
objekt_col = c
break
if objekt_col is not None:
break
if objekt_col is None:
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
return
# determine or create result columns: search if exist anywhere; otherwise append at right end
existing = {}
for c in range(0, data_range.EndColumn+1):
try:
h = str(sheet.getCellByPosition(c, header_row).String).strip()
except Exception:
h = ""
if h == "Norm_Treffer":
existing["Norm_Treffer"] = c
if h == "Norm_Vorschlag":
existing["Norm_Vorschlag"] = c
if h == "Norm_ID":
existing["Norm_ID"] = c
# append columns at right end if missing
last_col = data_range.EndColumn
if "Norm_Treffer" not in existing:
last_col += 1
existing["Norm_Treffer"] = last_col
try:
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
except Exception:
pass
if "Norm_Vorschlag" not in existing:
last_col += 1
existing["Norm_Vorschlag"] = last_col
try:
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
except Exception:
pass
if "Norm_ID" not in existing:
last_col += 1
existing["Norm_ID"] = last_col
try:
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
except Exception:
pass
norm_tr_col = existing["Norm_Treffer"]
norm_sug_col = existing["Norm_Vorschlag"]
norm_id_col = existing["Norm_ID"]
# Build norm indexes
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
if not norm_dict and not lemma_index:
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
return
# colors
GREEN = 0xADFF2F
YELLOW = 0xFFA500
RED = 0xCC0000
# iterate rows
rows_processed = 0
for r in range(header_row + 1, data_range.EndRow + 1):
try:
cell = sheet.getCellByPosition(objekt_col, r)
txt = str(cell.String).strip()
if not txt:
# clear any previous outputs? keep existing per spec; skip empty
continue
# tokenize: split by commas first, then whitespace; filter stopwords and pure numbers
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
terms = []
for cl in clauses:
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
for p in parts:
if p.lower() in STOPWORDS:
continue
if re.fullmatch(r"\d+", p):
continue
terms.append(p)
# for each term, get hits/suggestions/ids
row_hits = []
row_sugs = []
row_ids = []
any_unmapped = False # at least one term without hit and without suggestion
# We will record for each term
for term in terms:
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
if hits:
row_hits.extend(hits)
if sugs:
row_sugs.extend(sugs)
if ids:
row_ids.extend(ids)
if (not hits) and (not sugs):
any_unmapped = True
# deduplicate preserving order
def uniq(seq):
seen = set()
out = []
for x in seq:
if x not in seen:
seen.add(x)
out.append(x)
return out
row_hits = uniq(row_hits)
row_sugs = uniq(row_sugs)
row_ids = uniq(row_ids)
# write outputs (unlimited lists, joined with " | ")
try:
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
except Exception:
pass
# Coloring rules per new spec:
# - Objektbeschreibung cell: RED if any_unmapped else no change (we do not color green/yellow here)
# - Norm_Treffer cell: GREEN if all terms matched (i.e., terms non-empty and no term unmapped and at least one hit per term)
# - Norm_Vorschlag cell: YELLOW if at least one suggestion exists
# Determine "all matched": terms non-empty and every term has at least one hit (we approximated by checking any_unmapped and hits length)
all_matched = False
if terms:
# all_matched if no term without hit and there is at least one hit overall
if (not any_unmapped) and row_hits:
all_matched = True
# apply colors
try:
if any_unmapped:
cell.CellBackColor = RED
else:
# clear red if previously set? We'll leave unchanged if not set. Optionally set to default 16777215 (white)
pass
# Norm_Treffer coloring
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
if all_matched:
tr_cell.CellBackColor = GREEN
else:
# clear color if needed -> set to white
tr_cell.CellBackColor = 0xFFFFFF
# Norm_Vorschlag coloring
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
if row_sugs:
sug_cell.CellBackColor = YELLOW
else:
sug_cell.CellBackColor = 0xFFFFFF
except Exception:
pass
rows_processed += 1
except Exception as e:
# continue processing other rows; log once
log(f"Fehler in Zeile {r}: {e}")
# persist cache
try:
with open(CACHE_FILE, "w", encoding="utf-8") as f:
json.dump(CACHE, f, ensure_ascii=False, indent=2)
except Exception:
pass
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
# Export for LO
g_exportedScripts = (run_mapper_macro,)
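# --- Illustrative sketch (not part of the original macro) ---
# get_suggestions_for_term() combines fuzzy_score() with a small prefix bonus
# (+0.1, capped at 1.0) before applying CONF_THRESHOLD. The helper below
# isolates that scoring rule so it can be tested on its own; the function
# name and the example terms are invented for illustration.
def _demo_candidate_score(term_lemma, candidate):
    score = fuzzy_score(term_lemma, candidate)
    if candidate.startswith(term_lemma):
        score = min(score + 0.1, 1.0)  # prefix matches get a small boost
    return score
# Example: _demo_candidate_score("blume", "blumenvase") receives the prefix
# bonus, while _demo_candidate_score("blume", "vase") must clear
# CONF_THRESHOLD on similarity alone.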

297
mapper_macro_1.2.py Normal file
View File

@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-
import os
import uno
import unohelper
import re
import json
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
# RapidFuzz für Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:  # spaCy or the German model may be missing
SPACY_AVAILABLE = False
nlp = None
# =========================
# Pfade & Config
# =========================
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
# =========================
# Cache & Logging
# =========================
if CACHE_FILE.exists():
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
def log(msg):
with open(LOG_FILE,"a",encoding="utf-8") as f:
f.write(msg + "\n")
# =========================
# Textverarbeitung
# =========================
def normalize_text(s):
if not s: return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
return parts if parts else [term]
# =========================
# NV_MASTER laden
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
norm_dict = {}
for sheet_name, df in sheets.items():
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
if row_id: current_parent_id = row_id
if not row_word: continue
norm_dict[normalize_text(row_word)] = {
"ID": current_parent_id,
"Wort/Vokabel": row_word
}
return norm_dict
# =========================
# Mapping
# =========================
def map_term_with_indexes(term, norm_dict):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term_norm)
# Cache prüfen
if term_lemma in CACHE:
cached = CACHE[term_lemma]
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
return cached["hits"], cached["suggestions"], cached["ids"]
else:
CACHE.pop(term_lemma, None)
hits = []
suggestions = []
ids = []
# Exakte Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
elif term_lemma in norm_dict:
e = norm_dict[term_lemma]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
else:
# Fuzzy Matching
for key, e in norm_dict.items():
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
if score >= 0.75:
suggestions.append(e["Wort/Vokabel"])
ids.append(e["ID"])
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# =========================
# LibreOffice Dialog (ListBox + Checkbox)
# =========================
def apply_proposals_dialog():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
log("Kein Calc-Dokument aktiv")
return
selection = doc.CurrentSelection
sheet = doc.CurrentController.ActiveSheet
# Prüfen ob eine Zelle ausgewählt ist
if selection is None or not hasattr(selection, "getCellAddress"):
log("Keine Zelle ausgewählt")
return
cell = selection
# Spalte überprüfen
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
objekt_col = None
norm_vorschlag_col = None
for col_idx in range(sheet.Columns.Count):
val = sheet.getCellByPosition(col_idx,0).String
if val.strip().lower() == "objektbeschreibung":
objekt_col = col_idx
elif val.strip().lower() == "norm_vorschlag":
norm_vorschlag_col = col_idx
if norm_vorschlag_col is None or objekt_col is None:
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
return
# Vorschläge auslesen
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
if not proposals_str.strip():
log("Keine Vorschläge in der ausgewählten Zelle")
return
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
# Dialog erstellen
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
dialog_model.Width = 180
dialog_model.Height = 150
dialog_model.Title = "Vorschläge übernehmen"
# ListBox
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
lb_model.Name = "ProposalList"
lb_model.PositionX = 10
lb_model.PositionY = 10
lb_model.Width = 160
lb_model.Height = 80
lb_model.StringItemList = tuple(proposals)
dialog_model.insertByName("ProposalList", lb_model)
# Checkbox
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
cb_model.Name = "AllCheck"
cb_model.PositionX = 10
cb_model.PositionY = 95
cb_model.Width = 160
cb_model.Height = 15
cb_model.Label = "Alle Vorschläge übernehmen"
dialog_model.insertByName("AllCheck", cb_model)
# OK-Button
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = "OKButton"
btn_model.PositionX = 10
btn_model.PositionY = 115
btn_model.Width = 80
btn_model.Height = 20
btn_model.Label = "OK"
dialog_model.insertByName("OKButton", btn_model)
# Abbrechen-Button
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
cancel_model.Name = "CancelButton"
cancel_model.PositionX = 100
cancel_model.PositionY = 115
cancel_model.Width = 80
cancel_model.Height = 20
cancel_model.Label = "Abbrechen"
dialog_model.insertByName("CancelButton", cancel_model)
# Control Dialog
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
dialog.setModel(dialog_model)
dialog.setVisible(True)
toolkit.createPeer(dialog, None)
# Wait for OK / Cancel (simple polling loop)
import time
while True:
    time.sleep(0.1)
    # check which button was clicked
if dialog.getControl("OKButton").Pressed:
all_flag = dialog.getControl("AllCheck").State == 1
selected_idx = dialog.getControl("ProposalList").SelectedItems
if selected_idx:
selected_proposal = proposals[selected_idx[0]]
else:
selected_proposal = None
break
elif dialog.getControl("CancelButton").Pressed:
dialog.endExecute()
return
# Anwenden
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
obj_text = obj_cell.String
if all_flag:
for prop in proposals:
idx = obj_text.lower().find(prop.lower())
if idx != -1:
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
else:
if selected_proposal:
idx = obj_text.lower().find(selected_proposal.lower())
if idx != -1:
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
obj_cell.String = obj_text
obj_cell.CellBackColor = 0x00FF00 # grün
dialog.endExecute()
save_cache()
log(f"Vorschlag übernommen: {obj_text}")
# =========================
# Automatische Button-Registrierung
# =========================
def register_toolbar_button():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
frame = doc.CurrentController.Frame
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
# Hier wird nur das Makro selbst registriert
# Symbolleiste muss in LO einmalig erstellt werden
# =========================
# Hauptmakro
# =========================
def run_mapper_macro():
try:
norm_dict = load_normvokabular(NV_MASTER_FILE)
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
apply_proposals_dialog()
except Exception as e:
log(f"Fehler in run_mapper_macro: {e}")

297
mapper_macro_1.3.py Normal file
View File

@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-
import os
import uno
import unohelper
import re
import json
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
# RapidFuzz für Fuzzy-Suche
try:
from rapidfuzz import fuzz
RAPIDFUZZ_AVAILABLE = True
except ImportError:
RAPIDFUZZ_AVAILABLE = False
# Spacy Lemmatizer
try:
import spacy
nlp = spacy.load("de_core_news_sm")
SPACY_AVAILABLE = True
except Exception:  # spaCy or the German model may be missing
SPACY_AVAILABLE = False
nlp = None
# =========================
# Pfade & Config
# =========================
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
# =========================
# Cache & Logging
# =========================
if CACHE_FILE.exists():
with open(CACHE_FILE,"r",encoding="utf-8") as f:
CACHE = json.load(f)
else:
CACHE = {}
def save_cache():
with open(CACHE_FILE,"w",encoding="utf-8") as f:
json.dump(CACHE, f, indent=2, ensure_ascii=False)
def log(msg):
with open(LOG_FILE,"a",encoding="utf-8") as f:
f.write(msg + "\n")
# =========================
# Textverarbeitung
# =========================
def normalize_text(s):
if not s: return ""
s = str(s).lower().strip()
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
s = re.sub(r"\s+"," ",s)
return s
lemma_cache = {}
def lemmatize_term(term):
term_norm = normalize_text(term)
if term_norm in lemma_cache:
return lemma_cache[term_norm]
if SPACY_AVAILABLE and nlp:
doc = nlp(term_norm)
lemma = " ".join([token.lemma_ for token in doc])
else:
lemma = term_norm
lemma_cache[term_norm] = lemma
return lemma
def compound_split(term):
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
return parts if parts else [term]
# =========================
# NV_MASTER laden
# =========================
def load_normvokabular(file_path):
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
norm_dict = {}
for sheet_name, df in sheets.items():
df = df.dropna(how="all", axis=1)
df.columns = [str(c).strip() for c in df.columns]
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
continue
current_parent_id = None
for _, row in df.iterrows():
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
if row_id: current_parent_id = row_id
if not row_word: continue
norm_dict[normalize_text(row_word)] = {
"ID": current_parent_id,
"Wort/Vokabel": row_word
}
return norm_dict
# =========================
# Mapping
# =========================
def map_term_with_indexes(term, norm_dict):
term_norm = normalize_text(term)
term_lemma = lemmatize_term(term_norm)
# Cache prüfen
if term_lemma in CACHE:
cached = CACHE[term_lemma]
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
return cached["hits"], cached["suggestions"], cached["ids"]
else:
CACHE.pop(term_lemma, None)
hits = []
suggestions = []
ids = []
# Exakte Treffer
if term_norm in norm_dict:
e = norm_dict[term_norm]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
elif term_lemma in norm_dict:
e = norm_dict[term_lemma]
hits.append(e["Wort/Vokabel"])
ids.append(e["ID"])
else:
# Fuzzy Matching
for key, e in norm_dict.items():
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
if score >= 0.75:
suggestions.append(e["Wort/Vokabel"])
ids.append(e["ID"])
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
return hits, suggestions, ids
# =========================
# LibreOffice Dialog (ListBox + Checkbox)
# =========================
def apply_proposals_dialog():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
log("Kein Calc-Dokument aktiv")
return
selection = doc.CurrentSelection
sheet = doc.CurrentController.ActiveSheet
# Prüfen ob eine Zelle ausgewählt ist
if selection is None or not hasattr(selection, "getCellAddress"):
log("Keine Zelle ausgewählt")
return
cell = selection
# Spalte überprüfen
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
objekt_col = None
norm_vorschlag_col = None
for col_idx in range(sheet.Columns.Count):
val = sheet.getCellByPosition(col_idx,0).String
if val.strip().lower() == "objektbeschreibung":
objekt_col = col_idx
elif val.strip().lower() == "norm_vorschlag":
norm_vorschlag_col = col_idx
if norm_vorschlag_col is None or objekt_col is None:
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
return
# Vorschläge auslesen
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
if not proposals_str.strip():
log("Keine Vorschläge in der ausgewählten Zelle")
return
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
# Dialog erstellen
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
dialog_model.Width = 180
dialog_model.Height = 150
dialog_model.Title = "Vorschläge übernehmen"
# ListBox
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
lb_model.Name = "ProposalList"
lb_model.PositionX = 10
lb_model.PositionY = 10
lb_model.Width = 160
lb_model.Height = 80
lb_model.StringItemList = tuple(proposals)
dialog_model.insertByName("ProposalList", lb_model)
# Checkbox
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
cb_model.Name = "AllCheck"
cb_model.PositionX = 10
cb_model.PositionY = 95
cb_model.Width = 160
cb_model.Height = 15
cb_model.Label = "Alle Vorschläge übernehmen"
dialog_model.insertByName("AllCheck", cb_model)
# OK-Button
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = "OKButton"
btn_model.PositionX = 10
btn_model.PositionY = 115
btn_model.Width = 80
btn_model.Height = 20
btn_model.Label = "OK"
dialog_model.insertByName("OKButton", btn_model)
# Abbrechen-Button
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
cancel_model.Name = "CancelButton"
cancel_model.PositionX = 100
cancel_model.PositionY = 115
cancel_model.Width = 80
cancel_model.Height = 20
cancel_model.Label = "Abbrechen"
dialog_model.insertByName("CancelButton", cancel_model)
# Control Dialog
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
dialog.setModel(dialog_model)
dialog.setVisible(True)
toolkit.createPeer(dialog, None)
# Wait for OK / Cancel (simple polling loop)
import time
while True:
    time.sleep(0.1)
    # check which button was clicked
if dialog.getControl("OKButton").Pressed:
all_flag = dialog.getControl("AllCheck").State == 1
selected_idx = dialog.getControl("ProposalList").SelectedItems
if selected_idx:
selected_proposal = proposals[selected_idx[0]]
else:
selected_proposal = None
break
elif dialog.getControl("CancelButton").Pressed:
dialog.endExecute()
return
# Anwenden
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
obj_text = obj_cell.String
if all_flag:
for prop in proposals:
idx = obj_text.lower().find(prop.lower())
if idx != -1:
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
else:
if selected_proposal:
idx = obj_text.lower().find(selected_proposal.lower())
if idx != -1:
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
obj_cell.String = obj_text
obj_cell.CellBackColor = 0x00FF00 # grün
dialog.endExecute()
save_cache()
log(f"Vorschlag übernommen: {obj_text}")
# =========================
# Automatische Button-Registrierung
# =========================
def register_toolbar_button():
ctx = uno.getComponentContext()
smgr = ctx.ServiceManager
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
doc = desktop.getCurrentComponent()
frame = doc.CurrentController.Frame
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
# Hier wird nur das Makro selbst registriert
# Symbolleiste muss in LO einmalig erstellt werden
# =========================
# Hauptmakro
# =========================
def run_mapper_macro():
try:
norm_dict = load_normvokabular(NV_MASTER_FILE)
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
apply_proposals_dialog()
except Exception as e:
log(f"Fehler in run_mapper_macro: {e}")

121
normmapper_macro.py Normal file
View File

@ -0,0 +1,121 @@
import uno
import json
import subprocess
from pathlib import Path
from com.sun.star.awt import XActionListener
# Farbwerte (BGR)
GREEN = 0xC6EFCE
RED = 0xFFC7CE
YELLOW = 0xFFEB9C
def get_objektbeschreibung_column(sheet, max_header_rows=5):
    """Finds the 'Objektbeschreibung' column in the first few rows."""
    # Scanning every row of the sheet would touch millions of cells;
    # the header is expected near the top.
    for row in range(max_header_rows):
        for col in range(sheet.Columns.Count):
            cell = sheet.getCellByPosition(col, row)
            if cell.String.strip().lower() == "objektbeschreibung":
                return col
    return None
def update_cell_color(cell, status):
"""Färbt die Zelle."""
if status == "grün":
cell.CellBackColor = GREEN
elif status == "gelb":
cell.CellBackColor = YELLOW
else:
cell.CellBackColor = RED
def call_mapper(term):
"""Ruft den lokalen Wrapper auf."""
wrapper = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_Wrapper.py")
if not wrapper.exists():
return {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
result = subprocess.run(
["python3", str(wrapper), term],
capture_output=True,
text=True
)
try:
output = json.loads(result.stdout)
except Exception:
output = {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
return output
class SuggestionListener(XActionListener):
"""Listener für Klick auf Vorschlag-Button."""
def __init__(self, cell, suggestion, dialog):
self.cell = cell
self.suggestion = suggestion
self.dialog = dialog
def actionPerformed(self, event):
self.cell.String = self.suggestion
update_cell_color(self.cell, "grün")
self.dialog.endExecute() # schließt das Dialogfenster
def disposing(self, event):
pass
def show_suggestion_dialog(cell, term, suggestions):
"""Zeigt ein Dialog-Fenster mit klickbaren Vorschlägen."""
ctx = XSCRIPTCONTEXT.getComponentContext()
smgr = ctx.getServiceManager()
toolkit = smgr.createInstance("com.sun.star.awt.Toolkit")
dialog_model = smgr.createInstance("com.sun.star.awt.UnoControlDialogModel")
dialog_model.PositionX = 100
dialog_model.PositionY = 100
dialog_model.Width = 200
dialog_model.Height = 30 + 25*len(suggestions)
dialog_model.Title = f"Vorschläge für '{term}'"
for i, sugg in enumerate(suggestions[:3]):
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
btn_model.Name = f"btn_{i}"
btn_model.Label = sugg
btn_model.PositionX = 10
btn_model.PositionY = 10 + i*25
btn_model.Width = 180
btn_model.Height = 20
dialog_model.insertByName(btn_model.Name, btn_model)
dialog = smgr.createInstance("com.sun.star.awt.UnoControlDialog")
dialog.setModel(dialog_model)
dialog.setVisible(True)
for i, sugg in enumerate(suggestions[:3]):
btn = dialog.getControl(f"btn_{i}")
listener = SuggestionListener(cell, sugg, dialog)
btn.addActionListener(listener)
dialog.createPeer(toolkit, None)
dialog.execute()
def mapper_process_column():
"""Verarbeitet alle Zellen unter 'Objektbeschreibung' in der aktiven Tabelle."""
doc = XSCRIPTCONTEXT.getDocument()
sheet = doc.CurrentController.ActiveSheet
col_index = get_objektbeschreibung_column(sheet)
if col_index is None:
return
# Walk only the used area instead of every row of the sheet.
cursor = sheet.createCursor()
cursor.gotoStartOfUsedArea(False)
cursor.gotoEndOfUsedArea(True)
last_row = cursor.getRangeAddress().EndRow
for row in range(last_row + 1):
    cell = sheet.getCellByPosition(col_index, row)
    term = cell.String.strip()
    if not term or term.lower() == "objektbeschreibung":
        continue  # skip empty cells and the header cell itself
result = call_mapper(term)
if result["norm_name"] != "KEIN TREFFER":
cell.String = result["norm_name"]
update_cell_color(cell, "grün")
elif result["suggestions"]:
update_cell_color(cell, "gelb")
show_suggestion_dialog(cell, term, result["suggestions"])
else:
update_cell_color(cell, "rot")
show_suggestion_dialog(cell, term, [])
# Export
g_exportedScripts = mapper_process_column,
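# --- Interface note (illustrative, not part of the original macro) ---
# call_mapper() expects the wrapper script to print a single JSON object on
# stdout with the keys "term", "norm_name", "norm_id" and "suggestions".
# A minimal wrapper satisfying that contract could look like the sketch
# below; the real NormVokabular_Mapper_Wrapper.py may do considerably more.
#
#   import json, sys
#   term = sys.argv[1] if len(sys.argv) > 1 else ""
#   print(json.dumps({"term": term, "norm_name": "KEIN TREFFER",
#                     "norm_id": "", "suggestions": []}, ensure_ascii=False))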

0
test.py Normal file
View File

247
venv/bin/Activate.ps1 Normal file
View File

@ -0,0 +1,247 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

69
venv/bin/activate Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV='/home/jarnold/projects/GND-Skript Test/venv'
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/"bin":$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1='(venv) '"${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT='(venv) '
export VIRTUAL_ENV_PROMPT
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi

26
venv/bin/activate.csh Normal file
View File

@ -0,0 +1,26 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = '(venv) '"$prompt"
setenv VIRTUAL_ENV_PROMPT '(venv) '
endif
alias pydoc python -m pydoc
rehash

69
venv/bin/activate.fish Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
set -e _OLD_FISH_PROMPT_OVERRIDE
# prevents error when using nested fish instances (Issue #93858)
if functions -q _old_fish_prompt
functions -e fish_prompt
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT '(venv) '
end

229
venv/bin/csv2ods Executable file
View File

@ -0,0 +1,229 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Agustin Henze -> agustinhenze at gmail.com
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
# Søren Roug
#
# Oct 2014: Georges Khaznadar <georgesk@debian.org>
# - ported to Python3
# - imlemented the missing switch -c / --encoding, with an extra
# feature for POSIX platforms which can guess encoding.
from odf.opendocument import OpenDocumentSpreadsheet
from odf.style import Style, TextProperties, ParagraphProperties, TableColumnProperties
from odf.text import P
from odf.table import Table, TableColumn, TableRow, TableCell
from optparse import OptionParser
import sys,csv,re, os, codecs
if sys.version_info[0]==3: unicode=str
if sys.version_info[0]==2:
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
def csvToOds( pathFileCSV, pathFileODS, tableName='table',
delimiter=',', quoting=csv.QUOTE_MINIMAL,
quotechar = '"', escapechar = None,
skipinitialspace = False, lineterminator = '\r\n',
encoding="utf-8"):
textdoc = OpenDocumentSpreadsheet()
# Create a style for the table content. One we can modify
# later in the word processor.
tablecontents = Style(name="Table Contents", family="paragraph")
tablecontents.addElement(ParagraphProperties(numberlines="false", linenumber="0"))
tablecontents.addElement(TextProperties(fontweight="bold"))
textdoc.styles.addElement(tablecontents)
# Start the table
table = Table( name=tableName )
if sys.version_info[0]==3:
reader = csv.reader(open(pathFileCSV, encoding=encoding),
delimiter=delimiter,
quoting=quoting,
quotechar=quotechar,
escapechar=escapechar,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator)
else:
reader = UnicodeReader(open(pathFileCSV),
encoding=encoding,
delimiter=delimiter,
quoting=quoting,
quotechar=quotechar,
escapechar=escapechar,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator)
fltExp = re.compile(r'^\s*[-+]?\d+(\.\d+)?\s*$')
for row in reader:
tr = TableRow()
table.addElement(tr)
for val in row:
if fltExp.match(val):
tc = TableCell(valuetype="float", value=val.strip())
else:
tc = TableCell(valuetype="string")
tr.addElement(tc)
p = P(stylename=tablecontents,text=val)
tc.addElement(p)
textdoc.spreadsheet.addElement(table)
textdoc.save( pathFileODS )
if __name__ == "__main__":
usage = "%prog -i file.csv -o file.ods -d"
parser = OptionParser(usage=usage, version="%prog 0.1")
parser.add_option('-i','--input', action='store',
dest='input', help='File input in csv')
parser.add_option('-o','--output', action='store',
dest='output', help='File output in ods')
parser.add_option('-d','--delimiter', action='store',
dest='delimiter', help='specifies a one-character string to use as the field separator. It defaults to ",".')
parser.add_option('-c','--encoding', action='store',
dest='encoding', help='specifies the encoding of the csv file. It defaults to utf-8')
parser.add_option('-t','--table', action='store',
dest='tableName', help='The table name in the output file')
parser.add_option('-s','--skipinitialspace',
dest='skipinitialspace', help='''specifies how to interpret whitespace which
immediately follows a delimiter. It defaults to False, which
means that whitespace immediately following a delimiter is part
of the following field.''')
parser.add_option('-l','--lineterminator', action='store',
dest='lineterminator', help='''specifies the character sequence which should
terminate rows.''')
parser.add_option('-q','--quoting', action='store',
dest='quoting', help='''It can take on any of the following module constants:
0 = QUOTE_MINIMAL means only when required, for example, when a field contains either the quotechar or the delimiter
1 = QUOTE_ALL means that quotes are always placed around fields.
2 = QUOTE_NONNUMERIC means that quotes are always placed around fields which do not parse as integers or floating point numbers.
3 = QUOTE_NONE means that quotes are never placed around fields.
It defaults to QUOTE_MINIMAL''')
parser.add_option('-e','--escapechar', action='store',
dest='escapechar', help='''specifies a one-character string used to escape the delimiter when quoting is set to QUOTE_NONE.''')
parser.add_option('-r','--quotechar', action='store',
dest='quotechar', help='''specifies a one-character string to use as the quoting character. It defaults to '"'.''')
(options, args) = parser.parse_args()
if options.input:
pathFileCSV = options.input
else:
parser.print_help()
exit( 0 )
if options.output:
pathFileODS = options.output
else:
parser.print_help()
exit( 0 )
if options.delimiter:
delimiter = options.delimiter
else:
delimiter = ","
if options.skipinitialspace:
skipinitialspace = True
else:
skipinitialspace=False
if options.lineterminator:
lineterminator = options.lineterminator
else:
lineterminator ="\r\n"
if options.escapechar:
escapechar = options.escapechar
else:
escapechar=None
if options.tableName:
tableName = options.tableName
else:
tableName = "table"
if options.quotechar:
quotechar = options.quotechar
else:
quotechar = "\""
encoding = "utf-8" # default setting
###########################################################
## try to guess the encoding; this is implemented only with
## POSIX platforms. Can it be improved?
output = os.popen('/usr/bin/file ' + pathFileCSV).read()
m=re.match(r'^.*: ([-a-zA-Z0-9]+) text$', output)
if m:
encoding=m.group(1)
if 'ISO-8859' in encoding:
encoding="latin-1"
else:
encoding="utf-8"
############################################################
# when the -c or --coding switch is used, it takes precedence
if options.encoding:
encoding = options.encoding
csvToOds( pathFileCSV=unicode(pathFileCSV),
pathFileODS=unicode(pathFileODS),
delimiter=delimiter, skipinitialspace=skipinitialspace,
escapechar=escapechar,
lineterminator=unicode(lineterminator),
tableName=tableName, quotechar=quotechar,
encoding=encoding)
# Local Variables: ***
# mode: python ***
# End: ***

10
venv/bin/csv2rdf Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.csv2rdf import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/f2py Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from numpy.f2py.f2py2e import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

95
venv/bin/mailodf Executable file
View File

@ -0,0 +1,95 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
import zipfile
import sys, os, smtplib, getopt
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.encoders import encode_base64
if sys.version_info[0]==3: unicode=str
def usage():
sys.stderr.write("Usage: %s [-f from] [-s subject] inputfile recipients...\n" % sys.argv[0])
try:
opts, args = getopt.getopt(sys.argv[1:], "f:s:", ["from=", "subject="])
except getopt.GetoptError:
usage()
sys.exit(2)
fromaddr = os.getlogin() + "@" + os.getenv('HOSTNAME','localhost')
subject = None
for o, a in opts:
if o in ("-f", "--from"):
fromaddr = a
if o in ("-s", "--subject"):
subject = a
if len(args) < 2:
usage()
sys.exit(2)
suffices = {
'wmf':('image','x-wmf'),
'png':('image','png'),
'gif':('image','gif'),
'jpg':('image','jpeg'),
'jpeg':('image','jpeg')
}
msg = MIMEMultipart('related',type="text/html")
msg['From'] = fromaddr
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
msg['To'] = ','.join(args[1:])
msg.preamble = 'This is a multi-part message in MIME format.'
msg.epilogue = ''
odhandler = ODF2XHTML()
result = odhandler.odf2xhtml(unicode(args[0]))
if subject:
msg['Subject'] = subject
else:
msg['Subject'] = odhandler.title
htmlpart = MIMEText(result,'html','us-ascii')
htmlpart['Content-Location'] = 'index.html'
msg.attach(htmlpart)
z = zipfile.ZipFile(unicode(args[0]))
for file in z.namelist():
if file[0:9] == 'Pictures/':
suffix = file[file.rfind(".")+1:]
main,sub = suffices.get(suffix,('application','octet-stream'))
img = MIMENonMultipart(main,sub)
img.set_payload(z.read(file))
img['Content-Location'] = "" + file
encode_base64(img)
msg.attach(img)
z.close()
server = smtplib.SMTP('localhost')
#server.set_debuglevel(1)
server.sendmail(fromaddr, args[1:], msg.as_string())
server.quit()
# Local Variables: ***
# mode: python ***
# End: ***

10
venv/bin/markdown-it Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from markdown_it.cli.parse import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/nltk Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from nltk.cli import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli())

10
venv/bin/normalizer Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer.cli import cli_detect
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli_detect())

10
venv/bin/numpy-config Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from numpy._configtool import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

72
venv/bin/odf2mht Executable file
View File

@ -0,0 +1,72 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
from odf.odf2xhtml import ODF2XHTML
import zipfile
import sys
#from time import gmtime, strftime
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email import encoders
if sys.version_info[0]==3: unicode=str
if len(sys.argv) != 2:
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
sys.exit(1)
suffices = {
'wmf':('image','x-wmf'),
'png':('image','png'),
'gif':('image','gif'),
'jpg':('image','jpeg'),
'jpeg':('image','jpeg')
}
msg = MIMEMultipart('related',type="text/html")
# msg['Subject'] = 'Subject here'
# msg['From'] = '<Saved by ODT2MHT>'
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
msg.preamble = 'This is a multi-part message in MIME format.'
msg.epilogue = ''
odhandler = ODF2XHTML()
result = odhandler.odf2xhtml(unicode(sys.argv[1]))
htmlpart = MIMEText(result,'html','us-ascii')
htmlpart['Content-Location'] = 'index.html'
msg.attach(htmlpart)
z = zipfile.ZipFile(sys.argv[1])
for file in z.namelist():
if file[0:9] == 'Pictures/':
suffix = file[file.rfind(".")+1:]
main,sub = suffices.get(suffix,('application','octet-stream'))
img = MIMENonMultipart(main,sub)
img.set_payload(z.read(file))
img['Content-Location'] = "" + file
encoders.encode_base64(img)
msg.attach(img)
z.close()
print (msg.as_string())
# Local Variables: ***
# mode: python ***
# End: ***

59
venv/bin/odf2xhtml Executable file
View File

@ -0,0 +1,59 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
import sys, getopt
if sys.version_info[0]==3: unicode=str
from io import StringIO
def usage():
sys.stderr.write("Usage: %s [-p] inputfile\n" % sys.argv[0])
try:
opts, args = getopt.getopt(sys.argv[1:], "ep", ["plain","embedable"])
except getopt.GetoptError:
usage()
sys.exit(2)
generatecss = True
embedable = False
for o, a in opts:
if o in ("-p", "--plain"):
generatecss = False
if o in ("-e", "--embedable"):
embedable = True
if len(args) != 1:
usage()
sys.exit(2)
odhandler = ODF2XHTML(generatecss, embedable)
try:
result = odhandler.odf2xhtml(unicode(args[0]))
except:
sys.stderr.write("Unable to open file %s or file is not OpenDocument\n" % args[0])
sys.exit(1)
sys.stdout.write(result)
# Local Variables: ***
# mode: python ***
# End: ***

81
venv/bin/odf2xml Executable file
View File

@ -0,0 +1,81 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
#
# OpenDocument can be a complete office document in a single
# XML document. This script will create such a document.
import sys, getopt, base64
from odf.opendocument import load
from odf.draw import Image, ObjectOle
from odf.style import BackgroundImage
from odf.text import ListLevelStyleImage
from odf.office import BinaryData
if sys.version_info[0]==3: unicode=str
def usage():
sys.stderr.write("Usage: %s [-e] [-o outputfile] [inputfile]\n" % sys.argv[0])
if __name__ == "__main__":
embedimage = False
try:
opts, args = getopt.getopt(sys.argv[1:], "o:e", ["output="])
except getopt.GetoptError:
usage()
sys.exit(2)
outputfile = '-'
for o, a in opts:
if o in ("-o", "--output"):
outputfile = a
if o == '-e':
embedimage = True
if len(args) > 1:
usage()
sys.exit(2)
if len(args) == 0:
d = load(sys.stdin)
else:
d = load(unicode(args[0]))
if embedimage:
images = d.getElementsByType(Image) + \
d.getElementsByType(BackgroundImage) + \
d.getElementsByType(ObjectOle) + \
d.getElementsByType(ListLevelStyleImage)
for image in images:
href = image.getAttribute('href')
if href and href[:9] == "Pictures/":
p = d.Pictures[href]
bp = base64.encodebytes(p[1])
image.addElement(BinaryData(text=bp))
image.removeAttribute('href')
xml = d.xml()
if outputfile == '-':
print (xml)
else:
open(outputfile,"wb").write(xml)
# Local Variables: ***
# mode: python ***
# End: ***

190
venv/bin/odfimgimport Executable file
View File

@ -0,0 +1,190 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2007-2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
import zipfile, sys, getopt, mimetypes
try:
from urllib2 import urlopen, quote, unquote
except ImportError:
from urllib.request import urlopen, quote, unquote
try:
from urlparse import urlunsplit, urlsplit
except ImportError:
from urllib.parse import urlunsplit, urlsplit
from odf.opendocument import load
from odf.draw import Image
if sys.version_info[0]==3: unicode=str
#sys.tracebacklimit = 0
# Variable to count the number of retrieval failures
failures = 0
# Set to one if quiet behaviour is wanted
quiet = 0
# If set will write every url to import
verbose = 0
# Dictionary with new pictures. Key is original file path
# Item is newfilename
newpictures = {}
doc = None
def importpicture(href):
""" Add the picture to the ZIP file
Returns the new path name to the file in the zip archive
If it is unable to import, then it returns the original href
Sideeffect: add line to manifest
"""
global doc, newpictures, failures, verbose
# Check that it is not already in the manifest
if href in doc.Pictures: return href
image = None
if verbose: print ("Importing", href, file=sys.stderr)
if href[:7] == "http://" or href[:8] == "https://" or href[:6] == "ftp://":
# There is a bug in urlopen: It can't open urls with non-ascii unicode
# characters. Convert to UTF-8 and then use percent encoding
try:
goodhref = href.encode('ascii')
except:
o = list(urlsplit(href))
o[2] = quote(o[2].encode('utf-8'))
goodhref = urlunsplit(o)
if goodhref in newpictures:
if verbose: print ("already imported", file=sys.stderr)
return newpictures[goodhref] # Already imported
try:
f = urlopen(goodhref.decode("utf-8"))
image = f.read()
headers = f.info()
f.close()
# Get the mimetype from the headerlines
c_t = headers['Content-Type'].split(';')[0].strip()
if c_t: mediatype = c_t.split(';')[0].strip()
if verbose: print ("OK", file=sys.stderr)
except:
failures += 1
if verbose: print ("failed", file=sys.stderr)
return href
# Remove query string
try: href= href[:href.rindex('?')]
except: pass
try:
lastslash = href[href.rindex('/'):]
ext = lastslash[lastslash.rindex('.'):]
except: ext = mimetypes.guess_extension(mediatype)
# Everything is a simple path.
else:
goodhref = href
if href[:3] == '../':
if directory is None:
goodhref = unquote(href[3:])
else:
goodhref = unquote(directory + href[2:])
if goodhref in newpictures:
if verbose: print ("already imported", file=sys.stderr)
return newpictures[goodhref] # Already imported
mediatype, encoding = mimetypes.guess_type(goodhref)
if mediatype is None:
mediatype = ''
try: ext = goodhref[goodhref.rindex('.'):]
except: ext=''
else:
ext = mimetypes.guess_extension(mediatype)
try:
image = open(goodhref, 'rb').read()
if verbose: print ("OK", file=sys.stderr)
except:
failures += 1
if verbose: print ("failed", file=sys.stderr)
return href
# If we have a picture to import, the image variable contains it
# and manifestfn, ext and mediatype has a value
if image:
manifestfn = doc.addPictureFromString(image, unicode(mediatype))
newpictures[goodhref] = manifestfn
return manifestfn
if verbose: print ("not imported", file=sys.stderr)
return href
def exitwithusage(exitcode=2):
""" Print out usage information and exit """
print ("Usage: %s [-q] [-v] [-o output] [inputfile]" % sys.argv[0], file=sys.stderr)
print ("\tInputfile must be OpenDocument format", file=sys.stderr)
sys.exit(exitcode)
outputfile = None
writefile = True
try:
opts, args = getopt.getopt(sys.argv[1:], "qvo:")
except getopt.GetoptError:
exitwithusage()
for o, a in opts:
if o == "-o":
outputfile = a
writefile = True
if o == "-q":
quiet = 1
if o == "-v":
verbose = 1
if len(args) == 0:
try:
doc = load(sys.stdin)
directory = None
except:
print ("Couldn't open OpenDocument file", file=sys.stderr)
exitwithusage()
else:
fn = unicode(args[0])
if not zipfile.is_zipfile(fn):
exitwithusage()
dirinx = max(fn.rfind('\\'), fn.rfind('/'))
if dirinx >= 0: directory = fn[:dirinx]
else: directory = "."
doc = load(fn)
for image in doc.getElementsByType(Image):
href = image.getAttribute('href')
newhref = importpicture(href)
image.setAttribute('href',newhref)
if writefile:
if outputfile is None:
doc.save(fn)
else:
doc.save(unicode(outputfile))
if quiet == 0 and failures > 0:
print ("Couldn't import %d image(s)" % failures, file=sys.stderr)
sys.exit( int(failures > 0) )
# Local Variables: ***
# mode: python ***
# End: ***

216
venv/bin/odflint Executable file
View File

@ -0,0 +1,216 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
import zipfile
from xml.sax import make_parser,handler
from xml.sax.xmlreader import InputSource
import xml.sax.saxutils
import sys
from odf.opendocument import OpenDocument
from odf import element, grammar
from odf.namespaces import *
from odf.attrconverters import attrconverters, cnv_string
from io import BytesIO
if sys.version_info[0]==3: unicode=str
extension_attributes = {
"OpenOffice.org" : {
(METANS,u'template'): (
(XLINKNS,u'role'),
),
(STYLENS,u'graphic-properties'): (
(STYLENS,u'background-transparency'),
),
(STYLENS,u'paragraph-properties'): (
(TEXTNS,u'enable-numbering'),
(STYLENS,u'join-border'),
),
(STYLENS,u'table-cell-properties'): (
(STYLENS,u'writing-mode'),
),
(STYLENS,u'table-row-properties'): (
(STYLENS,u'keep-together'),
),
},
"KOffice" : {
(STYLENS,u'graphic-properties'): (
(KOFFICENS,u'frame-behavior-on-new-page'),
),
(DRAWNS,u'page'): (
(KOFFICENS,u'name'),
),
(PRESENTATIONNS,u'show-shape'): (
(KOFFICENS,u'order-id'),
),
(PRESENTATIONNS,u'hide-shape'): (
(KOFFICENS,u'order-id'),
),
(CHARTNS,u'legend'): (
(KOFFICENS,u'title'),
),
}
}
printed_errors = []
def print_error(str):
if str not in printed_errors:
printed_errors.append(str)
print (str)
def chop_arg(arg):
if len(arg) > 20:
return "%s..." % arg[0:20]
return arg
def make_qname(tag):
return "%s:%s" % (nsdict.get(tag[0],tag[0]), tag[1])
def allowed_attributes(tag):
return grammar.allowed_attributes.get(tag)
class ODFElementHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, document):
self.doc = document
self.tagstack = []
self.data = []
self.currtag = None
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
""" Pseudo-create an element
"""
allowed_attrs = grammar.allowed_attributes.get(tag)
attrdict = {}
for (att,value) in attrs.items():
prefix = nsdict.get(att[0],att[0])
# Check if it is a known extension
notan_extension = True
for product, ext_attrs in extension_attributes.items():
allowed_ext_attrs = ext_attrs.get(tag)
if allowed_ext_attrs and att in allowed_ext_attrs:
print_error("Warning: Attribute %s in element <%s> is illegal - %s extension" % ( make_qname(att), make_qname(tag), product))
notan_extension = False
# Check if it is an allowed attribute
if notan_extension and allowed_attrs and att not in allowed_attrs:
print_error("Error: Attribute %s:%s is not allowed in element <%s>" % ( prefix, att[1], make_qname(tag)))
# Check the value
try:
convert = attrconverters.get(att, cnv_string)
convert(att, value, tag)
except ValueError as res:
print_error("Error: Bad value '%s' for attribute %s:%s in tag: <%s> - %s" %
(chop_arg(value), prefix, att[1], make_qname(tag), res))
self.tagstack.append(tag)
self.data = []
# Check that the parent allows this child element
if tag not in ( (OFFICENS, 'document'), (OFFICENS, 'document-content'), (OFFICENS, 'document-styles'),
(OFFICENS, 'document-meta'), (OFFICENS, 'document-settings'),
(MANIFESTNS,'manifest')):
try:
parent = self.tagstack[-2]
allowed_children = grammar.allowed_children.get(parent)
except:
print_error("Error: This document starts with the wrong tag: <%s>" % make_qname(tag))
allowed_children = None
if allowed_children and tag not in allowed_children:
print_error("Error: Element %s is not allowed in element %s" % ( make_qname(tag), make_qname(parent)))
# Test that all mandatory attributes have been added.
required = grammar.required_attributes.get(tag)
if required:
for r in required:
if attrs.get(r) is None:
print_error("Error: Required attribute missing: %s in <%s>" % (make_qname(r), make_qname(tag)))
def endElementNS(self, tag, qname):
self.currtag = self.tagstack.pop()
str = ''.join(self.data).strip()
# Check that only elements that can take text have text
# But only elements we know exist in grammar
if tag in grammar.allowed_children:
if str != '' and tag not in grammar.allows_text:
print_error("Error: %s does not allow text data" % make_qname(tag))
self.data = []
class ODFDTDHandler(handler.DTDHandler):
def notationDecl(self, name, public_id, system_id):
""" Ignore DTDs """
print_error("Warning: ODF doesn't use DOCTYPEs")
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
def lint(odffile):
if not zipfile.is_zipfile(odffile):
print_error("Error: This is not a zipped file")
return
zfd = zipfile.ZipFile(odffile)
try:
mimetype = zfd.read('mimetype')
except:
mimetype=''
d = OpenDocument(unicode(mimetype))
first = True
for zi in zfd.infolist():
if first:
if zi.filename == 'mimetype':
if zi.compress_type != zipfile.ZIP_STORED:
print_error("Error: The 'mimetype' member must be stored - not deflated")
if zi.comment != "":
print_error("Error: The 'mimetype' member must not have extra header info")
else:
print_error("Warning: The first member in the archive should be the mimetype")
first = False
if zi.filename in ('META-INF/manifest.xml', 'content.xml', 'meta.xml', 'styles.xml', 'settings.xml'):
content = zfd.read(zi.filename)
parser = make_parser()
parser.setFeature(handler.feature_namespaces, True)
parser.setFeature(handler.feature_external_ges, False)
parser.setContentHandler(ODFElementHandler(d))
dtdh = ODFDTDHandler()
parser.setDTDHandler(dtdh)
parser.setErrorHandler(handler.ErrorHandler())
inpsrc = InputSource()
if not isinstance(content, str):
content=content
inpsrc.setByteStream(BytesIO(content))
parser.parse(inpsrc)
if len(sys.argv) != 2:
exitwithusage()
lint(unicode(sys.argv[1]))
# Local Variables: ***
# mode: python ***
# End: ***

266
venv/bin/odfmeta Executable file
View File

@ -0,0 +1,266 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006-2009 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
import zipfile, time, sys, getopt, re
import xml.sax, xml.sax.saxutils
from odf.namespaces import TOOLSVERSION, OFFICENS, XLINKNS, DCNS, METANS
from io import BytesIO
OUTENCODING="utf-8"
whitespace = re.compile(r'\s+')
fields = {
'title': (DCNS,u'title'),
'description': (DCNS,u'description'),
'subject': (DCNS,u'subject'),
'creator': (DCNS,u'creator'),
'date': (DCNS,u'date'),
'language': (DCNS,u'language'),
'generator': (METANS,u'generator'),
'initial-creator': (METANS,u'initial-creator'),
'keyword': (METANS,u'keyword'),
'editing-duration': (METANS,u'editing-duration'),
'editing-cycles': (METANS,u'editing-cycles'),
'printed-by': (METANS,u'printed-by'),
'print-date': (METANS,u'print-date'),
'creation-date': (METANS,u'creation-date'),
'user-defined': (METANS,u'user-defined'),
#'template': (METANS,u'template'),
}
xfields = []
Xfields = []
addfields = {}
deletefields = {}
yieldfields = {}
showversion = None
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s [-cdlvV] [-xXaAI metafield]... [-o output] [inputfile]\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
def normalize(str):
"""
The normalize-space function returns the argument string with whitespace
normalized by stripping leading and trailing whitespace and replacing
sequences of whitespace characters by a single space.
"""
return whitespace.sub(' ', str).strip()
class MetaCollector:
"""
The MetaCollector is a pseudo file object, that can temporarily ignore write-calls
It could probably be replaced with a StringIO object.
"""
def __init__(self):
self._content = []
self.dowrite = True
def write(self, str):
if self.dowrite:
self._content.append(str)
def content(self):
return ''.join(self._content)
base = xml.sax.saxutils.XMLGenerator
class odfmetaparser(base):
""" Parse a meta.xml file with an event-driven parser and replace elements.
It would probably be a cleaner approach to use a DOM based parser and
then manipulate in memory.
Small issue: Reorders elements
"""
version = 'Unknown'
def __init__(self):
self._mimetype = ''
self.output = MetaCollector()
self._data = []
self.seenfields = {}
base.__init__(self, self.output, OUTENCODING)
def startElementNS(self, name, qname, attrs):
self._data = []
field = name
# I can't modify the template until the tool replaces elements at the same
# location and not at the end
# if name == (METANS,u'template'):
# self._data = [attrs.get((XLINKNS,u'title'),'')]
if showversion and name == (OFFICENS,u'document-meta'):
if showversion == '-V':
print ("version:%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
else:
print ("%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
if name == (METANS,u'user-defined'):
field = attrs.get((METANS,u'name'))
if field in deletefields:
self.output.dowrite = False
elif field in yieldfields:
del addfields[field]
base.startElementNS(self, name, qname, attrs)
else:
base.startElementNS(self, name, qname, attrs)
self._tag = field
def endElementNS(self, name, qname):
field = name
if name == (METANS,u'user-defined'):
field = self._tag
if name == (OFFICENS,u'meta'):
for k,v in addfields.items():
if len(v) > 0:
if type(k) == type(''):
base.startElementNS(self,(METANS,u'user-defined'),None,{(METANS,u'name'):k})
base.characters(self, v)
base.endElementNS(self, (METANS,u'user-defined'),None)
else:
base.startElementNS(self, k, None, {})
base.characters(self, v)
base.endElementNS(self, k, None)
if name in xfields:
print ("%s" % self.data())
if name in Xfields:
if isinstance(self._tag, tuple):
texttag = self._tag[1]
else:
texttag = self._tag
print ("%s:%s" % (texttag, self.data()))
if field in deletefields:
self.output.dowrite = True
else:
base.endElementNS(self, name, qname)
def characters(self, content):
base.characters(self, content)
self._data.append(content)
def meta(self):
return self.output.content()
def data(self):
if usenormalize:
return normalize(''.join(self._data))
else:
return ''.join(self._data)
now = time.localtime()[:6]
outputfile = "-"
writemeta = False # Do we change any meta data?
usenormalize = False
try:
opts, args = getopt.getopt(sys.argv[1:], "cdlvVI:A:a:o:x:X:")
except getopt.GetoptError:
exitwithusage()
if len(opts) == 0:
opts = [ ('-l','') ]
for o, a in opts:
if o in ('-a','-A','-I'):
writemeta = True
if a.find(":") >= 0:
k,v = a.split(":",1)
else:
k,v = (a, "")
if len(k) == 0:
exitwithusage()
k = fields.get(k,k)
addfields[k] = v
if o == '-a':
yieldfields[k] = True
if o == '-I':
deletefields[k] = True
if o == '-d':
writemeta = True
addfields[(DCNS,u'date')] = "%04d-%02d-%02dT%02d:%02d:%02d" % now
deletefields[(DCNS,u'date')] = True
if o == '-c':
usenormalize = True
if o in ('-v', '-V'):
showversion = o
if o == '-l':
Xfields = fields.values()
if o == "-x":
xfields.append(fields.get(a,a))
if o == "-X":
Xfields.append(fields.get(a,a))
if o == "-o":
outputfile = a
# The specification says we should change the element to our own,
# and must not export the original identifier.
if writemeta:
addfields[(METANS,u'generator')] = TOOLSVERSION
deletefields[(METANS,u'generator')] = True
odfs = odfmetaparser()
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
parser.setContentHandler(odfs)
if len(args) == 0:
zin = zipfile.ZipFile(sys.stdin,'r')
else:
if not zipfile.is_zipfile(args[0]):
exitwithusage()
zin = zipfile.ZipFile(args[0], 'r')
try:
content = zin.read('meta.xml').decode('utf-8')
except:
sys.stderr.write("File has no meta data\n")
sys.exit(1)
parser.parse(BytesIO(content.encode('utf-8')))
if writemeta:
if outputfile == '-':
if sys.stdout.isatty():
sys.stderr.write("Won't write ODF file to terminal\n")
sys.exit(1)
zout = zipfile.ZipFile(sys.stdout,"w")
else:
zout = zipfile.ZipFile(outputfile,"w")
# Loop through the input zipfile and copy the content to the output until we
# get to the meta.xml. Then substitute.
for zinfo in zin.infolist():
if zinfo.filename == "meta.xml":
# Write meta
zi = zipfile.ZipInfo("meta.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
zout.writestr(zi,odfs.meta() )
else:
payload = zin.read(zinfo.filename)
zout.writestr(zinfo, payload)
zout.close()
zin.close()
# Local Variables: ***
# mode: python ***
# End: ***

144
venv/bin/odfoutline Executable file
View File

@ -0,0 +1,144 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import print_function
import zipfile
from xml.sax import make_parser,handler
from xml.sax.xmlreader import InputSource
import xml.sax.saxutils
import sys
from odf.namespaces import TEXTNS, TABLENS, DRAWNS
try:
from cStringIO import StringIO
except ImportError:
from io import StringIO
def getxmlpart(odffile, xmlfile):
""" Get the content out of the ODT file"""
z = zipfile.ZipFile(odffile)
content = z.read(xmlfile)
z.close()
return content
#
# Extract headings from content.xml
#
class ODTHeadingHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, eater):
self.r = eater
self.data = []
self.level = 0
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
if tag == (TEXTNS, 'h'):
self.level = 0
for (att,value) in attrs.items():
if att == (TEXTNS, 'outline-level'):
self.level = int(value)
self.data = []
def endElementNS(self, tag, qname):
if tag == (TEXTNS, 'h'):
str = ''.join(self.data)
self.data = []
self.r.append("%d%*s%s" % (self.level, self.level, '', str))
class ODTSheetHandler(handler.ContentHandler):
""" Extract sheet names from content.xml of an ODS file """
def __init__(self, eater):
self.r = eater
def startElementNS(self, tag, qname, attrs):
if tag == (TABLENS, 'table'):
sheetname = attrs.get((TABLENS, 'name'))
if sheetname:
self.r.append(sheetname)
class ODTSlideHandler(handler.ContentHandler):
""" Extract headings from content.xml of an ODT file """
def __init__(self, eater):
self.r = eater
self.data = []
self.pagenum = 0
def characters(self, data):
self.data.append(data)
def startElementNS(self, tag, qname, attrs):
if tag == (DRAWNS, 'page'):
self.pagenum = self.pagenum + 1
self.r.append("SLIDE %d: %s" % ( self.pagenum, attrs.get((DRAWNS, 'name'),'')))
if tag == (TEXTNS, 'p'):
self.data = []
def endElementNS(self, tag, qname):
if tag == (TEXTNS, 'p'):
str = ''.join(self.data)
self.data = []
if len(str) > 0:
self.r.append(" " + str)
def odtheadings(odtfile):
mimetype = getxmlpart(odtfile,'mimetype')
content = getxmlpart(odtfile,'content.xml')
lines = []
parser = make_parser()
parser.setFeature(handler.feature_namespaces, 1)
if not isinstance(mimetype, str):
mimetype=mimetype.decode("utf-8")
if mimetype in ('application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.text-template'):
parser.setContentHandler(ODTHeadingHandler(lines))
elif mimetype in ('application/vnd.oasis.opendocument.spreadsheet',
'application/vnd.oasis.opendocument.spreadsheet-template'):
parser.setContentHandler(ODTSheetHandler(lines))
elif mimetype in ('application/vnd.oasis.opendocument.presentation',
'application/vnd.oasis.opendocument.presentation-template'):
parser.setContentHandler(ODTSlideHandler(lines))
else:
print ("Unsupported fileformat")
sys.exit(2)
parser.setErrorHandler(handler.ErrorHandler())
inpsrc = InputSource()
if not isinstance(content, str):
content=content.decode("utf-8")
inpsrc.setByteStream(StringIO(content))
parser.parse(inpsrc)
return lines
if __name__ == "__main__":
filler = " "
for heading in odtheadings(sys.argv[1]):
print (heading)
# Local Variables: ***
# mode: python ***
# End: ***

101
venv/bin/odfuserfield Executable file
View File

@ -0,0 +1,101 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006-2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s): Michael Howitz, gocept gmbh & co. kg
import sys
import getopt
import odf.userfield
if sys.version_info[0]==3: unicode=str
listfields = False
Listfields = False
xfields = []
Xfields = []
setfields = {}
outputfile = None
inputfile = None
def exitwithusage(exitcode=2):
""" print out usage information """
sys.stderr.write("Usage: %s [-lL] [-xX metafield] [-s metafield:value]... "
"[-o output] [inputfile]\n" % sys.argv[0])
sys.stderr.write("\tInputfile must be OpenDocument format\n")
sys.exit(exitcode)
try:
opts, args = getopt.getopt(sys.argv[1:], "lLs:o:x:X:")
except getopt.GetoptError:
exitwithusage()
if len(opts) == 0:
exitwithusage()
for o, a in opts:
if o == '-s':
if a.find(":") >= 0:
k,v = a.split(":",1)
else:
k,v = (a, "")
if len(k) == 0:
exitwithusage()
setfields[unicode(k)] = unicode(v)
if o == '-l':
listfields = True
Listfields = False
if o == '-L':
Listfields = True
listfields = False
if o == "-x":
xfields.append(unicode(a))
if o == "-X":
Xfields.append(unicode(a))
if o == "-o":
outputfile = unicode(a)
if len(args) != 0:
inputfile = unicode(args[0])
user_fields = odf.userfield.UserFields(inputfile, outputfile)
if xfields:
for value in user_fields.list_values(xfields):
print (value)
if Listfields or Xfields:
if Listfields:
Xfields = None
for field_name, value_type, value in user_fields.list_fields_and_values(
Xfields):
print ("%s#%s:%s" % (field_name, value_type, value))
if listfields:
for value in user_fields.list_fields():
print (value)
if setfields:
user_fields.update(setfields)
# Local Variables: ***
# mode: python ***
# End: ***

10
venv/bin/pip Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/pip3 Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/pip3.10 Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/pygmentize Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pygments.cmdline import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

1
venv/bin/python Symbolic link
View File

@ -0,0 +1 @@
python3

1
venv/bin/python3 Symbolic link
View File

@ -0,0 +1 @@
/usr/bin/python3

1
venv/bin/python3.10 Symbolic link
View File

@ -0,0 +1 @@
python3

10
venv/bin/rdf2dot Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdf2dot import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/rdfgraphisomorphism Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.graphisomorphism import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/rdfpipe Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfpipe import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/rdfs2dot Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfs2dot import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/rqw Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from SPARQLWrapper.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/spacy Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from spacy.cli import setup_cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(setup_cli())

10
venv/bin/tqdm Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from tqdm.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/typer Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from typer.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

10
venv/bin/weasel Executable file
View File

@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from weasel.cli import app
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(app())

241
venv/bin/xml2odf Executable file
View File

@ -0,0 +1,241 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
#
# OpenDocument can be a complete office document in a single
# XML document. This script will take such a document and create
# a package
import io
import zipfile,time, sys, getopt
import xml.sax, xml.sax.saxutils
from odf import manifest
class SplitWriter:
def __init__(self):
self.activefiles = []
self._content = []
self._meta = []
self._styles = []
self._settings = []
self.files = {'content': self._content, 'meta': self._meta,
'styles':self._styles, 'settings': self._settings }
def write(self, str):
for f in self.activefiles:
f.append(str)
def activate(self, filename):
file = self.files[filename]
if file not in self.activefiles:
self.activefiles.append(file)
def deactivate(self, filename):
file = self.files[filename]
if file in self.activefiles:
self.activefiles.remove(file)
odmimetypes = {
'application/vnd.oasis.opendocument.text': '.odt',
'application/vnd.oasis.opendocument.text-template': '.ott',
'application/vnd.oasis.opendocument.graphics': '.odg',
'application/vnd.oasis.opendocument.graphics-template': '.otg',
'application/vnd.oasis.opendocument.presentation': '.odp',
'application/vnd.oasis.opendocument.presentation-template': '.otp',
'application/vnd.oasis.opendocument.spreadsheet': '.ods',
'application/vnd.oasis.opendocument.spreadsheet-template': '.ots',
'application/vnd.oasis.opendocument.chart': '.odc',
'application/vnd.oasis.opendocument.chart-template': '.otc',
'application/vnd.oasis.opendocument.image': '.odi',
'application/vnd.oasis.opendocument.image-template': '.oti',
'application/vnd.oasis.opendocument.formula': '.odf',
'application/vnd.oasis.opendocument.formula-template': '.otf',
'application/vnd.oasis.opendocument.text-master': '.odm',
'application/vnd.oasis.opendocument.text-web': '.oth',
}
OFFICENS = u"urn:oasis:names:tc:opendocument:xmlns:office:1.0"
base = xml.sax.saxutils.XMLGenerator
class odfsplitter(base):
def __init__(self):
self._mimetype = ''
self.output = SplitWriter()
self._prefixes = []
base.__init__(self, self.output, 'utf-8')
def startPrefixMapping(self, prefix, uri):
base.startPrefixMapping(self, prefix, uri)
self._prefixes.append('xmlns:%s="%s"' % (prefix, uri))
def startElementNS(self, name, qname, attrs):
if name == (OFFICENS, u"document"):
self._mimetype = attrs.get((OFFICENS, "mimetype"))
elif name == (OFFICENS, u"meta"):
self.output.activate('meta')
elif name == (OFFICENS, u"settings"):
self.output.activate('settings')
elif name == (OFFICENS, u"scripts"):
self.output.activate('content')
elif name == (OFFICENS, u"font-face-decls"):
self.output.activate('content')
self.output.activate('styles')
elif name == (OFFICENS, u"styles"):
self.output.activate('styles')
elif name == (OFFICENS, u"automatic-styles"):
self.output.activate('content')
self.output.activate('styles')
elif name == (OFFICENS, u"master-styles"):
self.output.activate('styles')
elif name == (OFFICENS, u"body"):
self.output.activate('content')
base.startElementNS(self, name, qname, attrs)
def endElementNS(self, name, qname):
base.endElementNS(self, name, qname)
if name == (OFFICENS, u"meta"):
self.output.deactivate('meta')
elif name == (OFFICENS, u"settings"):
self.output.deactivate('settings')
elif name == (OFFICENS, u"scripts"):
self.output.deactivate('content')
elif name == (OFFICENS, u"font-face-decls"):
self.output.deactivate('content')
self.output.deactivate('styles')
elif name == (OFFICENS, u"styles"):
self.output.deactivate('styles')
elif name == (OFFICENS, u"automatic-styles"):
self.output.deactivate('content')
self.output.deactivate('styles')
elif name == (OFFICENS, u"master-styles"):
self.output.deactivate('styles')
elif name == (OFFICENS, u"body"):
self.output.deactivate('content')
def content(self):
""" Return the content inside a wrapper called <office:document-content>
"""
prefixes = ' '.join(self._prefixes)
return ''.join(['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-content %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._content)) + ['</office:document-content>'])
def settings(self):
prefixes = ' '.join(self._prefixes).encode('utf-8')
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-settings %s office:version="1.0">' % prefixes] + self.output._settings + ['''</office:document-settings>'''])
def styles(self):
prefixes = ' '.join(self._prefixes)
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-styles %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._styles)) + ['''</office:document-styles>'''])
def meta(self):
prefixes = ' '.join(self._prefixes)
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-meta %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._meta)) + ['''</office:document-meta>'''])
def usage():
sys.stderr.write("Usage: %s [-o outputfile] [-s] inputfile\n" % sys.argv[0])
def manifestxml(m):
""" Generates the content of the manifest.xml file """
xml=io.StringIO()
xml.write(u"<?xml version='1.0' encoding='UTF-8'?>\n")
m.toXml(0,xml)
return xml.getvalue()
try:
opts, args = getopt.getopt(sys.argv[1:], "o:s", ["output=","suffix"])
except getopt.GetoptError:
usage()
sys.exit(2)
outputfile = '-'
addsuffix = False
for o, a in opts:
if o in ("-o", "--output"):
outputfile = a
if o in ("-s", "--suffix"):
addsuffix = True
if len(args) > 1:
usage()
sys.exit(2)
odfs = odfsplitter()
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
parser.setContentHandler(odfs)
if len(args) == 0:
parser.parse(sys.stdin)
else:
parser.parse(open(args[0],"r"))
mimetype = odfs._mimetype
suffix = odmimetypes.get(mimetype,'.xxx')
if outputfile == '-':
if sys.stdout.isatty():
sys.stderr.write("Won't write ODF file to terminal\n")
sys.exit(1)
z = zipfile.ZipFile(sys.stdout,"w")
else:
if addsuffix:
outputfile = outputfile + suffix
z = zipfile.ZipFile(outputfile,"w")
now = time.localtime()[:6]
# Write mimetype
zi = zipfile.ZipInfo('mimetype', now)
zi.compress_type = zipfile.ZIP_STORED
z.writestr(zi,mimetype)
# Write content
zi = zipfile.ZipInfo("content.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.content() )
# Write styles
zi = zipfile.ZipInfo("styles.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.styles() )
# Write meta
zi = zipfile.ZipInfo("meta.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi,odfs.meta() )
m = manifest.Manifest()
m.addElement(manifest.FileEntry(fullpath="/", mediatype=mimetype))
m.addElement(manifest.FileEntry(fullpath="content.xml",mediatype="text/xml"))
m.addElement(manifest.FileEntry(fullpath="styles.xml", mediatype="text/xml"))
m.addElement(manifest.FileEntry(fullpath="meta.xml", mediatype="text/xml"))
# Write manifest
zi = zipfile.ZipInfo("META-INF/manifest.xml", now)
zi.compress_type = zipfile.ZIP_DEFLATED
z.writestr(zi, manifestxml(m).encode("utf-8") )
z.close()
# Local Variables: ***
# mode: python ***
# End: ***

View File

@ -0,0 +1,28 @@
Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,92 @@
Metadata-Version: 2.1
Name: MarkupSafe
Version: 3.0.2
Summary: Safely add untrusted strings to HTML/XML markup.
Maintainer-email: Pallets <contact@palletsprojects.com>
License: Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Project-URL: Donate, https://palletsprojects.com/donate
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
Project-URL: Source, https://github.com/pallets/markupsafe/
Project-URL: Chat, https://discord.gg/pallets
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE.txt
# MarkupSafe
MarkupSafe implements a text object that escapes characters so it is
safe to use in HTML and XML. Characters that have special meanings are
replaced so that they display as the actual characters. This mitigates
injection attacks, meaning untrusted user input can safely be displayed
on a page.
## Examples
```pycon
>>> from markupsafe import Markup, escape
>>> # escape replaces special characters and wraps in Markup
>>> escape("<script>alert(document.cookie);</script>")
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')
>>> # wrap in Markup to mark text "safe" and prevent escaping
>>> Markup("<strong>Hello</strong>")
Markup('<strong>hello</strong>')
>>> escape(Markup("<strong>Hello</strong>"))
Markup('<strong>hello</strong>')
>>> # Markup is a str subclass
>>> # methods and operators escape their arguments
>>> template = Markup("Hello <em>{name}</em>")
>>> template.format(name='"World"')
Markup('Hello <em>&#34;World&#34;</em>')
```
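As a further illustration of the operator behaviour noted in the comments above (this snippet is not part of the upstream README; the outputs assume MarkupSafe's documented escaping of operator arguments):

```pycon
>>> from markupsafe import Markup
>>> # % and + also escape their operands
>>> Markup("<em>%s</em>") % "<untrusted>"
Markup('<em>&lt;untrusted&gt;</em>')
>>> Markup("<p>") + "<untrusted>"
Markup('<p>&lt;untrusted&gt;')
```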
## Donate
The Pallets organization develops and supports MarkupSafe and other
popular packages. In order to grow the community of contributors and
users, and allow the maintainers to devote more time to the projects,
[please donate today][].
[please donate today]: https://palletsprojects.com/donate

View File

@ -0,0 +1,14 @@
MarkupSafe-3.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
MarkupSafe-3.0.2.dist-info/LICENSE.txt,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
MarkupSafe-3.0.2.dist-info/METADATA,sha256=aAwbZhSmXdfFuMM-rEHpeiHRkBOGESyVLJIuwzHP-nw,3975
MarkupSafe-3.0.2.dist-info/RECORD,,
MarkupSafe-3.0.2.dist-info/WHEEL,sha256=_kVlewavvOSnwZE_whBk3jlE_Ob-nL5GvlVcLkpXSD8,151
MarkupSafe-3.0.2.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
markupsafe/__init__.py,sha256=sr-U6_27DfaSrj5jnHYxWN-pvhM27sjlDplMDPZKm7k,13214
markupsafe/__pycache__/__init__.cpython-310.pyc,,
markupsafe/__pycache__/_native.cpython-310.pyc,,
markupsafe/_native.py,sha256=hSLs8Jmz5aqayuengJJ3kdT5PwNpBWpKrmQSdipndC8,210
markupsafe/_speedups.c,sha256=O7XulmTo-epI6n2FtMVOrJXl8EAaIwD2iNYmBI5SEoQ,4149
markupsafe/_speedups.cpython-310-x86_64-linux-gnu.so,sha256=x4RoxWgyqAEokk-AZrWvrLDxLE-dm-zZSZYV_gOiLJA,34976
markupsafe/_speedups.pyi,sha256=ENd1bYe7gbBUf2ywyYWOGUpnXOHNJ-cgTNqetlW8h5k,41
markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: setuptools (75.2.0)
Root-Is-Purelib: false
Tag: cp310-cp310-manylinux_2_17_x86_64
Tag: cp310-cp310-manylinux2014_x86_64

View File

@ -0,0 +1 @@
markupsafe

View File

@ -0,0 +1,37 @@
# Authors
* Ivan Herman ([@iherman](http://github.com/iherman))
* Sergio Fernández ([@wikier](http://github.com/wikier))
* Carlos Tejo ([@dayures](http://github.com/dayures))
* Alexey Zakhlestin ([@indeyets](http://github.com/indeyets))
# Contributors
See https://github.com/RDFLib/sparqlwrapper/graphs/contributors
* [@eggplants](https://github.com/eggplants): most things to make 2.0.0 happen
* Obey Arthur Liu ([@ArthurLiu](http://github.com/ArthurLiu)): different patches
* Christopher Lenz ([@cmlenz](http://github.com/cmlenz)): feature to allow developers to choose the json module
* Pēteris Caune ([@cuu508](http://github.com/cuu508)): great feedback and patches
* Bogdan Benea ([bugdone@users.sourceforge.net](mailto:bugdone@users.sourceforge.net)): patch for the query regular expression
* William Waites ([@wwaites](http://github.com/wwaites)): patches for RDFLib3
* Christoph Burgmer ([@cburgmer](http://github.com/cburgmer)): patches for RDFLib3
* Thomas Kluyver ([@takluyver](http://github.com/takluyver)): patches for Python 3.x
* Diego Berrueta ([@berrueta](http://github.com/berrueta)): new function for printing results as table
* Olivier Berger ([@olberger](http://github.com/olberger)): patch regarding raw response for unknown formats
* Benjamin Cogrel ([@bcogrel](http://github.com/bcogrel)): standard query types
* Urs Holzer ([@uholzer](http://github.com/uholzer)): features, patches and testing
* Alf Lervåg ([@alf](http://github.com/alf)): setup patch
* Nolan Nichols ([@nicholsn](http://github.com/nicholsn)): HTTP digest auth support
* Kevin Turner ([@keturn](https://github.com/keturn)): `SmartWrapper.Value.__repr__()` implementation
* Marcelo Jorge Vieira ([@marcelometal](https://github.com/marcelometal)): typos
* Trevor Andersen ([@trevorandersen](https://github.com/trevorandersen)): patches for Python 3.x
* Carlos Martinez-Ortiz ([@cmartinez](https://github.com/cmartinez)): improved support for the return format HTTP parameter
* Christian Amsüss ([@chrysn](https://github.com/chrysn)): dependency fixes
* Chris Lamb ([@lamby](https://github.com/lamby)): typo
* Hugo van Kemenade ([@hugovk](https://github.com/hugovk)): update classifiers (Python 3.6)
* Edward Betts ([@EdwardBetts](https://github.com/EdwardBetts)): Correct spelling mistakes
* Carlos Martínez ([@c-martinez](https://github.com/c-martinez)): Mainly support for CSV and TSV results in SPARQL SELECT queries
* Dan Michael O. Heggø ([@danmichaelo](https://github.com/danmichaelo)): update README with SPARQLWrapper2 example
* Sam Clements ([@borntyping](https://github.com/borntyping)): provide hints about setting the timeout properly
* Marc Feger ([@MaFeg100](https://github.com/MaFeg100)): improvements and tests for development

View File

@ -0,0 +1,18 @@
SPARQL Python Wrapper is released under the W3C® SOFTWARE NOTICE AND LICENSE.
This work (and included software, documentation such as READMEs, or other related items) is being provided by the copyright holders under the following license. By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
1. The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
2. Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
3. Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
See also http://www.w3.org/Consortium/Legal/copyright-software for further details

View File

@ -0,0 +1,45 @@
Metadata-Version: 2.1
Name: SPARQLWrapper
Version: 2.0.0
Summary: SPARQL Endpoint interface to Python
Home-page: http://rdflib.github.io/sparqlwrapper
Download-URL: https://github.com/RDFLib/sparqlwrapper/releases
Author: Ivan Herman, Sergio Fernández, Carlos Tejo Alonso, Alexey Zakhlestin
Author-email: rdflib-dev@googlegroups.com
License: W3C SOFTWARE NOTICE AND LICENSE
Project-URL: Home, https://rdflib.github.io/sparqlwrapper
Project-URL: Documentation, https://sparqlwrapper.readthedocs.io
Project-URL: Source, https://github.com/RDFLib/sparqlwrapper
Project-URL: Tracker, https://github.com/RDFLib/sparqlwrapper/issues
Keywords: python,sparql,rdf,rdflib
Platform: any
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: W3C License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.7
License-File: LICENSE.txt
License-File: AUTHORS.md
Requires-Dist: rdflib (>=6.1.1)
Provides-Extra: dev
Requires-Dist: setuptools (>=3.7.1) ; extra == 'dev'
Requires-Dist: mypy (>=0.931) ; extra == 'dev'
Requires-Dist: pandas (>=1.3.5) ; extra == 'dev'
Requires-Dist: pandas-stubs (>=1.2.0.48) ; extra == 'dev'
Provides-Extra: docs
Requires-Dist: sphinx (<5) ; extra == 'docs'
Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
Provides-Extra: keepalive
Requires-Dist: keepalive (>=0.5) ; extra == 'keepalive'
Provides-Extra: pandas
Requires-Dist: pandas (>=1.3.5) ; extra == 'pandas'
This is a wrapper around a SPARQL service. It helps in creating the query URI and, possibly, converting the result into a more manageable format.
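For orientation, a minimal usage sketch of the wrapper described here, following the API used by SPARQLWrapper/main.py later in this diff; the endpoint and query are illustrative only:

```python
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://dbpedia.org/sparql")  # illustrative endpoint
sparql.setQuery("SELECT ?s WHERE { ?s ?p ?o } LIMIT 5")
sparql.setReturnFormat(JSON)

results = sparql.query().convert()  # JSON results arrive as a dict
for binding in results["results"]["bindings"]:
    print(binding["s"]["value"])
```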

View File

@ -0,0 +1,25 @@
../../../bin/rqw,sha256=qf6Nvwhjovp_uPIPeeMNocB3j7iZ_YnskuMQcUK6DYY,291
SPARQLWrapper-2.0.0.dist-info/AUTHORS.md,sha256=7oV4hamlTbjfsaWy15f3BVH2h90Nf5mJ-rR0Z1azy9s,2725
SPARQLWrapper-2.0.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
SPARQLWrapper-2.0.0.dist-info/LICENSE.txt,sha256=Z1IX12CEodcefDAOAMJ7irELJAX-huUCOiuzio5G8Ik,2134
SPARQLWrapper-2.0.0.dist-info/METADATA,sha256=kU92L4KNVjo9aP6-jm4FXVAUpNScd5mIWWbIGHu_D_I,2020
SPARQLWrapper-2.0.0.dist-info/RECORD,,
SPARQLWrapper-2.0.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
SPARQLWrapper-2.0.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
SPARQLWrapper-2.0.0.dist-info/entry_points.txt,sha256=aIYAzonEA7winfiw8NydOLNu406HC6aRBlKLI2H5kEQ,48
SPARQLWrapper-2.0.0.dist-info/top_level.txt,sha256=3KluNiTwOkX16hLJwC3UEYzKdEscknK--UV5q56mYWY,14
SPARQLWrapper/KeyCaseInsensitiveDict.py,sha256=JF83-6EPbcm9F4gg0GQ11vTVuLzdJ7sDsubEP9j-3zw,1377
SPARQLWrapper/SPARQLExceptions.py,sha256=qFlU175hp61gO6bvgQsCdSTEGOFnJwJNBQlIGS5W7-o,2595
SPARQLWrapper/SmartWrapper.py,sha256=GxZiMGZpGppPZX54W-YdUtcdAAa83GJjPLdyfLWPK-4,15557
SPARQLWrapper/Wrapper.py,sha256=M9lTPkpvRU2xAUbrHiKYK0mEV8pkycNS3lPoO__0gSE,58238
SPARQLWrapper/__init__.py,sha256=6kU9hD9FnlFbk2c8uFkpGb1arB3268nN74RUh91e60s,1213
SPARQLWrapper/__pycache__/KeyCaseInsensitiveDict.cpython-310.pyc,,
SPARQLWrapper/__pycache__/SPARQLExceptions.cpython-310.pyc,,
SPARQLWrapper/__pycache__/SmartWrapper.cpython-310.pyc,,
SPARQLWrapper/__pycache__/Wrapper.cpython-310.pyc,,
SPARQLWrapper/__pycache__/__init__.cpython-310.pyc,,
SPARQLWrapper/__pycache__/main.cpython-310.pyc,,
SPARQLWrapper/__pycache__/sparql_dataframe.cpython-310.pyc,,
SPARQLWrapper/main.py,sha256=MKNPMrFxIGN_A7-UwyMS_AycjswscgKsP37h2K2df8k,4330
SPARQLWrapper/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
SPARQLWrapper/sparql_dataframe.py,sha256=-oM7_eXbwGgeNkFv9mSxe3JWHM3xQQk90nNrbhthnrI,2429

View File

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.37.1)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,2 @@
[console_scripts]
rqw = SPARQLWrapper.main:main

View File

@ -0,0 +1 @@
SPARQLWrapper

View File

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
A simple implementation of a key case-insensitive dictionary.
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
"""
from typing import Dict, Mapping, TypeVar
_V = TypeVar("_V")
class KeyCaseInsensitiveDict(Dict[str, _V]):
"""
A simple implementation of a key case-insensitive dictionary
"""
def __init__(self, d: Mapping[str, _V]={}) -> None:
"""
:param dict d: The source dictionary.
"""
for k, v in d.items():
self[k] = v
def __setitem__(self, key: str, value: _V) -> None:
if hasattr(key, "lower"):
key = key.lower()
dict.__setitem__(self, key, value)
def __getitem__(self, key: str) -> _V:
if hasattr(key, "lower"):
key = key.lower()
return dict.__getitem__(self, key)
def __delitem__(self, key: str) -> None:
if hasattr(key, "lower"):
key = key.lower()
dict.__delitem__(self, key)
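A brief sketch of the behaviour implemented above: keys are lower-cased both when stored and when looked up, so mixed-case lookups resolve to the same entry.

```python
from SPARQLWrapper.KeyCaseInsensitiveDict import KeyCaseInsensitiveDict

d = KeyCaseInsensitiveDict({"Content-Type": "application/sparql-results+json"})
print(d["content-type"])  # 'application/sparql-results+json'
print(d["CONTENT-TYPE"])  # same entry: lookup keys are lower-cased too
d["Accept"] = "text/turtle"
print(d["accept"])        # 'text/turtle'
# Note: __contains__ is not overridden, so `"Accept" in d` is False
# while `"accept" in d` and d["Accept"] both work.
```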

View File

@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
"""
SPARQL Wrapper exceptions
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
"""
from typing import Optional
class SPARQLWrapperException(Exception):
"""
Base class for SPARQL Wrapper exceptions
"""
msg = "an exception has occurred"
def __init__(self, response: Optional[bytes] = None):
"""
:param string response: The server response
"""
if response:
formatted_msg = "%s: %s. \n\nResponse:\n%r" % (
self.__class__.__name__,
self.msg,
response,
)
else:
formatted_msg = "%s: %s." % (self.__class__.__name__, self.msg)
super(SPARQLWrapperException, self).__init__(formatted_msg)
class EndPointInternalError(SPARQLWrapperException):
"""
Exception type for Internal Server Error responses. Usually HTTP response status code ``500``.
"""
msg = "The endpoint returned the HTTP status code 500"
class QueryBadFormed(SPARQLWrapperException):
"""
Query Bad Formed exception. Usually HTTP response status code ``400``.
"""
msg = "A bad request has been sent to the endpoint: probably the SPARQL query is badly formed"
class EndPointNotFound(SPARQLWrapperException):
"""
End Point Not Found exception. Usually HTTP response status code ``404``.
"""
msg = "It was not possible to connect to the given endpoint: check it is correct"
class Unauthorized(SPARQLWrapperException):
"""
Access is denied due to invalid credentials (unauthorized). Usually HTTP response status code ``401``.
.. versionadded:: 1.8.2
"""
msg = "Access to that endpoint is denied due to invalid credentials (unauthorized). Check the credentials"
class URITooLong(SPARQLWrapperException):
"""
The URI requested by the client is longer than the server is willing to interpret. Usually HTTP response
status code ``414``.
.. versionadded:: 1.8.3
"""
msg = (
"The URI requested by the client is longer than the server is willing to interpret. "
"Check if the request was sent using GET method instead of POST method."
)
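A sketch of how calling code typically handles these exceptions; the endpoint and query are placeholders, and the HTTP-status mapping in the comments follows the class docstrings above:

```python
from SPARQLWrapper import SPARQLWrapper, JSON
from SPARQLWrapper.SPARQLExceptions import (
    EndPointNotFound,
    QueryBadFormed,
    SPARQLWrapperException,
)

sparql = SPARQLWrapper("http://example.org/sparql")  # placeholder endpoint
sparql.setQuery("SELECT * WHERE { ?s ?p ?o } LIMIT 1")
sparql.setReturnFormat(JSON)

try:
    results = sparql.query().convert()
except QueryBadFormed as err:          # usually HTTP 400
    print("Bad query:", err)
except EndPointNotFound as err:        # usually HTTP 404
    print("Endpoint not found:", err)
except SPARQLWrapperException as err:  # any other wrapper-level error
    print("SPARQL error:", err)
```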

View File

@ -0,0 +1,366 @@
# -*- coding: utf-8 -*-
"""
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
:requires: `RDFLib <https://rdflib.readthedocs.io>`_ package.
"""
from typing import Any, Dict, List, Optional, Tuple, Union
from SPARQLWrapper.Wrapper import JSON, SELECT, QueryResult
from SPARQLWrapper.Wrapper import SPARQLWrapper as SW
######################################################################################
class Value(object):
"""
Class encapsulating a single binding for a variable.
:ivar variable: The original variable, stored for an easier reference.
:vartype variable: string
:ivar value: Value of the binding.
:vartype value: string
:ivar type: Type of the binding. One of :attr:`Value.URI`, :attr:`Value.Literal`, :attr:`Value.TypedLiteral`, or
:attr:`Value.BNODE`.
:vartype type: string
:ivar lang: Language tag of the binding, or ``None`` if not set.
:vartype lang: string
:ivar datatype: Datatype of the binding, or ``None`` if not set. It is a URI.
:vartype datatype: string
"""
URI = "uri"
"""the string denoting a URI variable."""
Literal = "literal"
"""the string denoting a Literal variable."""
TypedLiteral = "typed-literal"
"""the string denoting a typed literal variable."""
BNODE = "bnode"
"""the string denoting a blank node variable."""
def __init__(self, variable: str, binding: Dict[str, str]) -> None:
"""
:param variable: the variable for that binding. Stored for an easier reference.
:type variable: string
:param binding: the binding dictionary part of the return result for a specific binding.
:type binding: dict
"""
self.variable = variable
self.value = binding["value"]
self.type = binding["type"]
self.lang = None
self.datatype = None
try:
self.lang = binding["xml:lang"]
except:
# no lang is set
pass
try:
self.datatype = binding["datatype"]
except:
pass
def __repr__(self) -> str:
cls = self.__class__.__name__
return "%s(%s:%r)" % (cls, self.type, self.value)
######################################################################################
class Bindings(object):
"""
Class encapsulating one query result, based on the JSON return format. It decodes the
return values to make it a bit more usable for a standard usage. The class consumes the
return value and instantiates a number of attributes that can be consulted directly. See
the list of variables.
The `Serializing SPARQL Query Results in JSON <http://www.w3.org/TR/rdf-sparql-json-res/>`_ explains the details of
the JSON return structures. Very succinctly: the return data has "bindings", which means a list of dictionaries.
Each dictionary is a possible binding of the SELECT variables to :class:`Value` instances. This structure is made a
bit more usable by this class.
:ivar fullResult: The original dictionary of the results, stored for an easier reference.
:vartype fullResult: dict
:ivar head: Header part of the return, see the JSON return format document for details.
:vartype head: dict
:ivar variables: List of the unbound variables of the original query. It is a list of strings. ``None`` in the case
of an ASK query.
:vartype variables: list
:ivar bindings: The final bindings: list of dictionaries, mapping variables to :class:`Value` instances. \
If unbound, then no value is set in the dictionary; that can be easily checked with \
``var in res.bindings[..]``, for example.
:vartype bindings: list
:ivar askResult: by default, set to **False**; in case of an ASK query, the result of the query.
:vartype askResult: bool
"""
def __init__(self, retval: QueryResult):
"""
:param retval: the query result.
:type retval: :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>`
"""
self.fullResult = retval._convertJSON()
self.head = self.fullResult["head"]
self.variables: Optional[List[str]] = None
try:
self.variables = self.fullResult["head"]["vars"]
except:
pass
self.bindings: List[Dict[str, Value]] = []
try:
for b in self.fullResult["results"]["bindings"]:
# This is a single binding. It is a dictionary per variable; each value is a dictionary again
# that has to be converted into a Value instance
newBind = {}
# type error: Item "None" of "Union[List[str], Any, None]" has no attribute "__iter__" (not iterable)
for key in self.variables: # type: ignore [union-attr]
if key in b:
# there is a real binding for this key
newBind[key] = Value(key, b[key])
self.bindings.append(newBind)
except:
pass
self.askResult = False
try:
self.askResult = self.fullResult["boolean"]
except:
pass
def getValues(self, key: str) -> Optional[List[Value]]:
"""A shorthand for the retrieval of all bindings for a single key. It is
equivalent to ``[b[key] for b in self[key]]``
:param key: possible variable name.
:type key: string
:return: list of :class:`Value` instances.
:rtype: list
"""
try:
return [b[key] for b in self[key]]
except:
return []
def __contains__(self, key: Union[str, List[str], Tuple[str]]) -> bool:
"""Emulation of the "``key in obj``" operator. Key can be a string for a variable or an array/tuple
of strings.
If ``key`` is a variable, the return value is ``True`` if there is at least one binding where ``key`` is
bound. If ``key`` is an array or tuple, the return value is ``True`` if there is at least one binding
where *all* variables in ``key`` are bound.
:param key: possible variable, or array/tuple of variables
:return: whether there is a binding of the variable in the return
:rtype: Boolean
"""
if len(self.bindings) == 0:
return False
if type(key) is list or type(key) is tuple:
# check first whether they are all really variables
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if False in [k in self.variables for k in key]: # type: ignore [operator]
return False
for b in self.bindings:
# try to find a binding where all key elements are present
if False in [k in b for k in key]:
# this is not a binding for the key combination, move on...
continue
else:
# yep, this one is good!
return True
return False
else:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if key not in self.variables: # type: ignore [operator]
return False
for b in self.bindings:
if key in b:
return True
return False
def __getitem__(self, key: Union[slice, str, List[str]]) -> List[Dict[str, Value]]:
"""Emulation of the ``obj[key]`` operator. Slice notation is also available.
The goal is to choose the right bindings among the available ones. The return values are always
arrays of bindings, ie, arrays of dictionaries mapping variable keys to :class:`Value` instances.
The different value settings mean the following:
- ``obj[key]`` returns the bindings where ``key`` has a valid value
- ``obj[key1,key2,...]`` returns the bindings where *all* ``key1,key2,...`` have valid values
- ``obj[(key1,key2,...):(nkey1,nkey2,...)]`` returns the bindings where all ``key1,key2,...`` have
valid values and *none* of the ``nkey1,nkey2,...`` have valid values
- ``obj[:(nkey1,nkey2,...)]`` returns the bindings where *none* of the ``nkey1,nkey2,...`` have valid values
In all cases complete bindings are returned, ie, the values for other variables, not present among
the keys in the call, may or may not be present depending on the query results.
:param key: possible variable or array/tuple of keys with possible slice notation
:return: list of bindings
:rtype: array of variable -> :class:`Value` dictionaries
"""
def _checkKeys(keys: Union[List[Any], Tuple[Any, ...]]) -> bool:
if len(keys) == 0:
return False
for k in keys:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if (
not isinstance(k, str)
or k not in self.variables # type: ignore [operator]
):
return False
return True
def _nonSliceCase(
key: Union[
str,
List[Any],
Tuple[Any],
]
) -> Union[List[Any], bool, Tuple[Any]]:
# type error: Unsupported right operand type for in ("Optional[List[str]]")
if isinstance(key, str) and key != "" and key in self.variables: # type: ignore[operator]
# unicode or string:
return [key]
elif type(key) is list or type(key) is tuple:
if _checkKeys(key):
return key
return False
# The arguments should be reduced to arrays of variables, ie, unicode strings
yes_keys: Union[List[Any], bool, Tuple[Any]] = []
no_keys: Union[List[Any], bool, Tuple[Any]] = []
if type(key) is slice:
# Note: None for start or stop is all right
if key.start:
yes_keys = _nonSliceCase(key.start)
if not yes_keys:
raise TypeError
if key.stop:
no_keys = _nonSliceCase(key.stop)
if not no_keys:
raise TypeError
else:
yes_keys = _nonSliceCase(key)
# got it right, now get the right binding line with the constraints
retval: List[Dict[str, Value]] = []
for b in self.bindings:
# first check whether the 'yes' part is all there:
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
if False in [k in b for k in yes_keys]: # type: ignore[union-attr]
continue
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
if True in [k in b for k in no_keys]: # type: ignore[union-attr]
continue
# if we got that far, we should be all right!
retval.append(b)
# if retval is of zero length, no hit; an exception should be raised to stay within the python style
if len(retval) == 0:
raise IndexError
return retval
def convert(self) -> "Bindings":
"""This is just a convenience method, returns ``self``.
Although :class:`SPARQLWrapper2.Bindings` is not a subclass of
:class:`SPARQLWrapper.QueryResult<SPARQLWrapper.Wrapper.QueryResult>`, it is returned as a result by
:func:`SPARQLWrapper2.query`, just like :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>` is returned by
:func:`SPARQLWrapper.query()<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`. Consequently,
having an empty :func:`convert` method to imitate
:class:`QueryResult's convert() method<SPARQLWrapper.Wrapper.QueryResult.convert>`
may avoid unnecessary problems.
"""
return self
##############################################################################################################
class SPARQLWrapper2(SW):
"""Subclass of :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper` that works with a JSON SELECT return result only. The
query result is automatically set to a :class:`Bindings` instance. Makes the average query processing a bit
simpler..."""
def __init__(self, baseURI: str, defaultGraph: Optional[str] = None):
"""
Class encapsulating a full SPARQL call. In contrast to the :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper`
superclass, the return format cannot be set (it is defaulted to
:attr:`~SPARQLWrapper.Wrapper.SPARQLWrapper.JSON`).
:param baseURI: string of the SPARQL endpoint's URI.
:type baseURI: string
:param defaultGraph: URI for the default graph. Default is ``None``, can be set via an explicit call, too.
:type defaultGraph: string
"""
super(SPARQLWrapper2, self).__init__(
baseURI, returnFormat=JSON, defaultGraph=defaultGraph
)
def setReturnFormat(self, format: Optional[str]) -> None:
"""
Set the return format (:meth:`overriding the inherited method
<SPARQLWrapper.Wrapper.SPARQLWrapper.setReturnFormat>`).
.. warning::
This method does nothing; this class instance should work with JSON only. The method is defined \
just to avoid possible errors by erroneously setting the return format. \
When using this class, the user can safely ignore this call.
:param format: return format
:type format: string
"""
pass
def query(self) -> Union[Bindings, QueryResult]: # type: ignore[override]
"""
Execute the query and do an automatic conversion.
Exceptions can be raised if either the URI is wrong or the HTTP server sends back an error.
The usual urllib2 exceptions are raised, which cover possible SPARQL errors, too.
If the query type is *not* SELECT, the method falls back to the
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`.
:return: query result
:rtype: :class:`Bindings` instance
"""
res = super(SPARQLWrapper2, self).query()
if self.queryType == SELECT:
return Bindings(res)
else:
return res
def queryAndConvert( # type: ignore[override]
self,
) -> Union[Union[Bindings, QueryResult], QueryResult.ConvertResult]:
"""This is here to override the inherited method; it is equivalent to :class:`query`.
If the query type is *not* SELECT, the method falls back to the
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.queryAndConvert>`.
:return: the converted query result.
"""
if self.queryType == SELECT:
return self.query()
else:
return super(SPARQLWrapper2, self).queryAndConvert()
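Putting the classes above together, a small usage sketch of SPARQLWrapper2 and Bindings; the endpoint and query are illustrative (the DBpedia endpoint is assumed to predefine the rdfs prefix):

```python
from SPARQLWrapper import SPARQLWrapper2

sparql = SPARQLWrapper2("http://dbpedia.org/sparql")  # illustrative endpoint
sparql.setQuery(
    "SELECT ?label WHERE { <http://dbpedia.org/resource/Asturias> rdfs:label ?label } LIMIT 3"
)

result = sparql.query()           # Bindings instance for SELECT queries
if "label" in result:             # __contains__: at least one binding for ?label
    for row in result.bindings:   # list of {variable: Value} dictionaries
        print(row["label"].value, row["label"].lang)
```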

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,72 @@
# -*- coding: utf8 -*-
"""
**SPARQLWrapper** is a simple Python wrapper around a `SPARQL <https://www.w3.org/TR/sparql11-overview/>`_ service to
remotely execute your queries. It helps in creating the query
invocation and, possibly, converting the result into a more manageable
format.
"""
__version__ = "2.0.0"
"""The version of SPARQLWrapper"""
__agent__: str = f"sparqlwrapper {__version__} (rdflib.github.io/sparqlwrapper)"
from .SmartWrapper import SPARQLWrapper2
from .sparql_dataframe import get_sparql_dataframe
from .Wrapper import (
ASK,
BASIC,
CONSTRUCT,
CSV,
DELETE,
DESCRIBE,
DIGEST,
GET,
INSERT,
JSON,
JSONLD,
N3,
POST,
POSTDIRECTLY,
RDF,
RDFXML,
SELECT,
TSV,
TURTLE,
URLENCODED,
XML,
QueryResult,
SPARQLWrapper,
)
__all__ = [
"SPARQLWrapper2",
"get_sparql_dataframe",
"ASK",
"BASIC",
"CONSTRUCT",
"CSV",
"DELETE",
"DESCRIBE",
"DIGEST",
"GET",
"INSERT",
"JSON",
"JSONLD",
"N3",
"POST",
"POSTDIRECTLY",
"RDF",
"RDFXML",
"SELECT",
"TSV",
"TURTLE",
"URLENCODED",
"XML",
"QueryResult",
"SPARQLWrapper",
]

View File

@ -0,0 +1,157 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import shutil
import sys
import xml
from typing import List, Optional
import rdflib
from . import __version__
from .Wrapper import SPARQLWrapper, _allowedAuth, _allowedFormats, _allowedRequests
class SPARQLWrapperFormatter(
argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
):
pass
def check_file(v: str) -> str:
if os.path.isfile(v):
return v
elif v == "-":
return "-" # stdin
else:
raise argparse.ArgumentTypeError("file '%s' is not found" % v)
def choicesDescriptions() -> str:
d = "\n - ".join(["allowed FORMAT:"] + _allowedFormats)
d += "\n - ".join(["\n\nallowed METHOD:"] + _allowedRequests)
d += "\n - ".join(["\n\nallowed AUTH:"] + _allowedAuth)
return d
def parse_args(test: Optional[List[str]] = None) -> argparse.Namespace:
"""Parse arguments."""
parser = argparse.ArgumentParser(
prog="rqw",
formatter_class=(
lambda prog: SPARQLWrapperFormatter(
prog,
**{
"width": shutil.get_terminal_size(fallback=(120, 50)).columns,
"max_help_position": 30,
},
)
),
description="sparqlwrapper CLI",
epilog=choicesDescriptions(),
)
input_group = parser.add_mutually_exclusive_group(required=True)
input_group.add_argument(
"-f",
"--file",
metavar="FILE",
type=check_file,
help="query with sparql file (stdin: -)",
)
input_group.add_argument("-Q", "--query", metavar="QUERY", help="query with string")
parser.add_argument(
"-F",
"--format",
default="json",
metavar="FORMAT",
choices=_allowedFormats,
help="response format",
)
parser.add_argument(
"-e",
"--endpoint",
metavar="URI",
help="sparql endpoint",
default="http://dbpedia.org/sparql",
)
parser.add_argument(
"-m",
"--method",
metavar="METHOD",
choices=_allowedRequests,
help="request method",
)
parser.add_argument(
"-a", "--auth", metavar="AUTH", choices=_allowedAuth, help="HTTP auth"
)
parser.add_argument(
"-u", "--username", metavar="ID", default="guest", help="username for auth"
)
parser.add_argument(
"-p", "--password", metavar="PW", default="", help="password for auth"
)
parser.add_argument("-q", "--quiet", action="store_true", help="supress warnings")
parser.add_argument(
"-V", "--version", action="version", version="%(prog)s {}".format(__version__)
)
if test is None:
return parser.parse_args()
else:
return parser.parse_args(test)
def main(test: Optional[List[str]] = None) -> None:
args = parse_args(test)
if args.quiet:
import warnings
warnings.filterwarnings("ignore")
q = ""
if args.query is not None:
q = args.query
elif args.file == "-":
q = sys.stdin.read()
else:
q = open(args.file, "r").read()
sparql = SPARQLWrapper(
args.endpoint,
agent=(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/96.0.4664.110 Safari/537.36"
),
)
if args.auth is not None:
sparql.setHTTPAuth(args.auth)
sparql.setCredentials(args.username, args.password)
if args.method is not None:
sparql.setMethod(args.method)
sparql.setQuery(q)
sparql.setReturnFormat(args.format)
results = sparql.query().convert()
if isinstance(results, dict):
# "json"
print(json.dumps(results, indent=4))
elif isinstance(results, xml.dom.minidom.Document):
# "xml"
print(results.toxml())
elif isinstance(results, bytes):
# "csv", "tsv", "turtle", "n3"
print(results.decode("utf-8"))
elif isinstance(results, rdflib.graph.ConjunctiveGraph):
# "rdf"
print(results.serialize())
else:
# unknown type
raise TypeError(f"Unsupported result of type {type(results)}: {results!r}")
if __name__ == "__main__":
main()
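Because parse_args() accepts an optional argument list, the rqw entry point (see entry_points.txt above) can also be driven from Python for quick tests; a sketch with an illustrative endpoint and query (this performs a live HTTP request):

```python
from SPARQLWrapper.main import main

# Equivalent to: rqw -e http://dbpedia.org/sparql -F json -Q 'SELECT ...'
main([
    "-e", "http://dbpedia.org/sparql",
    "-F", "json",
    "-Q", "SELECT ?s WHERE { ?s ?p ?o } LIMIT 3",
])
```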

View File

@ -0,0 +1,74 @@
"""
Query a SPARQL endpoint and return results as a Pandas dataframe.
"""
import io
from typing import TYPE_CHECKING, Any, Dict, List, Union
from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value
from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper
if TYPE_CHECKING:
import pandas as pd
class QueryException(Exception):
pass
def get_sparql_dataframe_orig(
endpoint: str, query: Union[str, bytes]
) -> "pd.DataFrame":
"""copy paste from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
sparql.setReturnFormat(CSV)
results = sparql.query().convert()
if isinstance(results, bytes):
_csv = io.StringIO(results.decode("utf-8"))
return pd.read_csv(_csv, sep=",")
else:
raise TypeError(type(results))
def get_sparql_typed_dict(
endpoint: str, query: Union[str, bytes]
) -> List[Dict[str, Value]]:
"""modified from: https://github.com/lawlesst/sparql-dataframe"""
# pandas inside to avoid requiring it
import pandas as pd
# rdflib in here because there is some meta stuff in the setup.py and Travis fails because rdflib is installed later
import rdflib.term
sparql = SPARQLWrapper2(endpoint)
sparql.setQuery(query)
if sparql.queryType != SELECT:
raise QueryException("Only SPARQL SELECT queries are supported.")
# sparql.setReturnFormat(JSON)
results = sparql.query()
if not isinstance(results, Bindings):
raise TypeError(type(results))
# consider perf hacking later, probably slow
# convert list of dicts to python types
d = []
for x in results.bindings:
row = {}
for k in x:
v = x[k]
vv = rdflib.term.Literal(v.value, datatype=v.datatype).toPython() # type: ignore[no-untyped-call]
row[k] = vv
d.append(row)
return d
def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame":
# pandas inside to avoid requiring it
import pandas as pd
d = get_sparql_typed_dict(endpoint, query)
# TODO: missing values are NaN-filled by pandas; make this stricter if there is a way of getting the NaN types from rdflib
df = pd.DataFrame(d)
return df
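A usage sketch for the helper above; pandas is imported lazily inside the function, so it only needs to be installed when this path is used. Endpoint and query are illustrative:

```python
from SPARQLWrapper import get_sparql_dataframe

df = get_sparql_dataframe(
    "http://dbpedia.org/sparql",
    "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10",
)
print(df.head())    # one column per SELECT variable
print(df.dtypes)    # values converted via rdflib.term.Literal(...).toPython()
```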

Some files were not shown because too many files have changed in this diff