initial commit
This commit is contained in: commit 41e8b7103e

1979  Auswertung Ergebnisse/mapper_log.txt  (Normal file)
File diff suppressed because it is too large.
BIN  Box Ha-Ho.ods  (Normal file)
Binary file not shown.

BIN  Box Ha-Klinc.ods  (Normal file)
Binary file not shown.

BIN  Box Hu-J.ods  (Normal file)
Binary file not shown.

BIN  Box K - Klinc.ods  (Normal file)
Binary file not shown.

BIN  Input CSV/Box Ha-Klinc.ods  (Normal file)
Binary file not shown.

1  Input CSV/Normvokabular_INTERN/.~lock.NV_MASTER.ods#  (Normal file)
@@ -0,0 +1 @@
,jarnold,workPC,10.10.2025 09:26,file:///home/jarnold/.config/libreoffice/4;

BIN  Input CSV/Normvokabular_INTERN/NV_MASTER.ods  (Normal file)
Binary file not shown.
212  Masterfile_Editor.py  (Normal file)
@@ -0,0 +1,212 @@
import os
import re
import logging
import pandas as pd
import ezodf

from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"

MASTER_SHEET_NAME = "Masterstruktur"
SHEET_ORDER = [
    "Masterstruktur",
    "1 Figur",
    "2 Objekt",
    "3 Flora",
    "4 Fauna",
    "5 Landschaft",
    "6 Phänomene, Erscheinungen",
    "7 Architektur",
    "8 Verzierungen, Ornamentik",
    "9 Aktivität, Handlung, Pose"
]

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# -------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------
def detect_id_and_name(df):
    df_cols = [str(c).strip().lower() for c in df.columns]
    id_col, name_col = None, None
    for idx, col in enumerate(df_cols):
        if col == "id":
            id_col = df.columns[idx]
        elif col in ["name", "wort", "wort/vokabel"]:
            name_col = df.columns[idx]
    if id_col is None or name_col is None:
        logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
    return id_col, name_col


def parse_id_level(id_val):
    if pd.isna(id_val):
        return None
    id_str = str(id_val).strip()
    if re.match(r'^\d+(\.\d+){0,2}$', id_str):
        return len(id_str.split("."))
    return None


def process_category_df(df, sheet_name):
    id_col, name_col = detect_id_and_name(df)
    if id_col is None or name_col is None:
        return None

    current_level = {2: None, 3: None}
    new_rows = []

    for _, row in df.iterrows():
        id_val = row[id_col] if pd.notna(row[id_col]) else ""
        name_val = row[name_col] if pd.notna(row[name_col]) else ""
        if not id_val and not name_val:
            continue

        level = parse_id_level(id_val)
        if level:
            if level >= 2:
                current_level[level] = name_val
                for deeper in range(level + 1, 4):
                    current_level[deeper] = None
            new_rows.append({
                "ID": id_val,
                "Unterkategorie": current_level[2] if level >= 2 else "",
                "Unterunterkategorie": current_level[3] if level >= 3 else "",
                "Wort/Vokabel": name_val
            })
        else:
            new_rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": name_val
            })
    df_new = pd.DataFrame(new_rows, columns=["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"])
    logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
    return df_new


def merge_new_terms(original_df, processed_df):
    """Adds new words from original_df (rows without an ID) to processed_df if they are not present yet."""
    _, orig_name_col = detect_id_and_name(original_df)
    if orig_name_col is None or orig_name_col not in original_df.columns:
        return processed_df

    existing_words = set(str(x).strip().lower() for x in processed_df["Wort/Vokabel"].dropna())
    new_rows = []

    for _, row in original_df.iterrows():
        name = str(row.get(orig_name_col, "")).strip()
        id_val = str(row.get("ID", "")).strip() if "ID" in row else ""
        if not name:
            continue
        if not id_val and name.lower() not in existing_words:
            new_rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": name})

    if new_rows:
        df_new = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
        logging.info(f"{len(new_rows)} neue Wörter übernommen.")
        return df_new
    return processed_df


def build_master_df(category_dfs):
    seen_ids = set()
    master_rows = []
    for df in category_dfs:
        for _, row in df.iterrows():
            id_val = row["ID"]
            name_val = row["Wort/Vokabel"]
            if id_val and id_val not in seen_ids:
                seen_ids.add(id_val)
                master_rows.append({"ID": id_val, "Name": name_val})
    master_df = pd.DataFrame(master_rows)
    logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
    return master_df


# -------------------------------------------------
# FORMATTING AND SAVING
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    worksheet = writer.sheets[sheet_name]

    for col_idx, col in enumerate(df.columns, 1):
        max_len = max([len(str(cell)) if cell is not None else 0 for cell in df[col]])
        max_len = max(max_len, len(col)) + 2
        worksheet.column_dimensions[get_column_letter(col_idx)].width = max_len
        for row_idx in range(1, len(df) + 2):
            worksheet.cell(row=row_idx, column=col_idx).alignment = Alignment(horizontal='left')


def save_ods(processed_sheets, output_file):
    doc = ezodf.newdoc(doctype="ods")
    for name, df in processed_sheets.items():
        df = df.fillna("")
        sheet = ezodf.Sheet(name, size=(len(df) + 1, len(df.columns)))
        doc.sheets += sheet

        for col_idx, col_name in enumerate(df.columns):
            sheet[0, col_idx].set_value(str(col_name))

        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                if value is None or str(value).lower() == "nan":
                    value = ""
                sheet[row_idx, col_idx].set_value(str(value))
    doc.saveas(output_file)
    logging.info(f"ODS-Datei gespeichert: {output_file}")


# -------------------------------------------------
# MAIN PROGRAM
# -------------------------------------------------
def main():
    if not os.path.exists(INPUT_FILE):
        logging.error(f"Datei {INPUT_FILE} existiert nicht.")
        return

    ext = os.path.splitext(INPUT_FILE)[1].lower()
    engine = None
    if ext in [".xlsx", ".xls"]:
        engine = "openpyxl"
    elif ext == ".ods":
        engine = "odf"
    else:
        logging.error("Nicht unterstütztes Dateiformat")
        return

    logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
    xls = pd.ExcelFile(INPUT_FILE, engine=engine)

    processed_sheets = {}
    category_dfs = []

    for sheet_name in xls.sheet_names:
        if sheet_name == MASTER_SHEET_NAME:
            continue
        df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
        df_new = process_category_df(df, sheet_name)
        if df_new is not None:
            df_merged = merge_new_terms(df, df_new)
            processed_sheets[sheet_name] = df_merged
            category_dfs.append(df_merged)
        else:
            processed_sheets[sheet_name] = df

    master_df = build_master_df(category_dfs)
    processed_sheets[MASTER_SHEET_NAME] = master_df

    ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx", ".xls"]:
        with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
            for name, df in ordered_sheets.items():
                format_excel_sheet(df, name, writer)
        logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
    elif ext_out == ".ods":
        save_ods(ordered_sheets, OUTPUT_FILE)


if __name__ == "__main__":
    main()
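For quick reference, a minimal sketch of how `parse_id_level` above interprets the dotted hierarchy IDs; the sample IDs are hypothetical and not taken from NV_MASTER.ods.

```python
import re

def parse_id_level(id_val):
    # Same regex logic as in Masterfile_Editor.py, minus the pandas NaN check:
    # "1" -> level 1, "1.2" -> level 2, "1.2.3" -> level 3, anything else -> None.
    id_str = str(id_val).strip()
    if re.match(r'^\d+(\.\d+){0,2}$', id_str):
        return len(id_str.split("."))
    return None

# Hypothetical IDs
for sample in ["1", "1.2", "1.2.3", "1.2.3.4", "Adler"]:
    print(sample, "->", parse_id_level(sample))
# prints 1, 2, 3, None (more than three parts), None (not numeric)
```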
BIN  NV_MASTER.ods  (Normal file)
Binary file not shown.

BIN  NV_MASTER_Updated.ods  (Normal file)
Binary file not shown.
171  NV_Master_EditorFAIL.py  (Normal file)
@@ -0,0 +1,171 @@
import os
import re
import logging
import datetime
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf

# ----------------- CONFIGURATION -----------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
MASTER_SHEET_NAME = "Masterstruktur"
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ----------------- HELPER FUNCTIONS -----------------

def load_file(input_file):
    """
    Checks the file format. For Excel it returns pd.ExcelFile plus the engine,
    for ODS it returns None plus "odf" (ODS is read directly via ezodf).
    """
    ext = os.path.splitext(input_file)[1].lower()
    if ext in [".xlsx", ".xls"]:
        engine = "openpyxl"
        xls = pd.ExcelFile(input_file, engine=engine)
    elif ext == ".ods":
        engine = "odf"
        xls = None  # ODS is read directly via ezodf
    else:
        raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")
    logging.info(f"Lade Datei {input_file} mit Engine '{engine}'")
    return xls, engine


def read_ods_sheet(filename, sheet_name):
    """Reads an ODS sheet cleanly, including the header row."""
    doc = ezodf.opendoc(filename)
    sheet = doc.sheets[sheet_name]
    data = []
    headers = [str(sheet[0, col].value).strip() for col in range(sheet.ncols())]
    for row_idx in range(1, sheet.nrows()):
        row = {}
        empty_row = True
        for col_idx, col_name in enumerate(headers):
            cell_val = sheet[row_idx, col_idx].value
            val = "" if cell_val is None else str(cell_val).strip()
            row[col_name] = val
            if val:
                empty_row = False
        if not empty_row:
            data.append(row)
    df = pd.DataFrame(data, columns=headers)
    return df


def process_category_sheet(df):
    """Builds the staircase-style hierarchy."""
    df = df.copy()
    for col in ["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"]:
        if col not in df.columns:
            df[col] = ""
    rows = []
    current_id = ""
    current_uuk = ""
    for _, r in df.iterrows():
        id_val = str(r.get("ID","")).strip()
        uuk_val = str(r.get("Unterunterkategorie","")).strip()
        word_val = str(r.get("Wort/Vokabel","")).strip()

        if id_val:  # category row
            current_id = id_val
            current_uuk = uuk_val or word_val
            rows.append({"ID": current_id, "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
            continue
        if uuk_val:  # sub-subcategory row
            current_uuk = uuk_val
            rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": current_uuk, "Wort/Vokabel": ""})
            continue
        if word_val:  # vocabulary row
            rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": word_val})
            continue
    return pd.DataFrame(rows, columns=["ID","Unterkategorie","Unterunterkategorie","Wort/Vokabel"])


def remove_empty_vocabulary_rows(df):
    """Removes rows whose Wort/Vokabel column is empty."""
    return df[df["Wort/Vokabel"].astype(str).str.strip() != ""].copy().reset_index(drop=True)


def sync_master_and_sheets(master_df, category_dfs):
    """Synchronizes the categories against the master sheet; vocabulary rows are kept."""
    master_df = master_df.copy()
    master_df["ID"] = master_df["ID"].astype(str).str.strip()
    master_dict = dict(zip(master_df["ID"], master_df["Kategorie"]))
    updated_dfs = {}
    summary = {}

    for sheet_name, df in category_dfs.items():
        rows_out = []
        changes = {"removed":0}
        for _, row in df.iterrows():
            id_val = str(row.get("ID","")).strip()
            if id_val and id_val not in master_dict:
                changes["removed"] +=1
                continue
            rows_out.append(row.to_dict())
        updated_dfs[sheet_name] = pd.DataFrame(rows_out, columns=df.columns)
        summary[sheet_name] = changes

    new_master = pd.DataFrame([{"ID":k,"Kategorie":v} for k,v in sorted(master_dict.items())])
    return new_master, updated_dfs, summary


def save_excel(processed_sheets, output_file):
    from openpyxl import Workbook
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in processed_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            ws = writer.sheets[sheet_name]
            for col_idx, col in enumerate(df.columns,1):
                max_len = max(df[col].astype(str).map(len).max() if len(df)>0 else 0,len(col))+2
                ws.column_dimensions[get_column_letter(col_idx)].width = max_len
                for row_idx in range(1,len(df)+2):
                    ws.cell(row=row_idx,column=col_idx).alignment = Alignment(horizontal='left')


def save_ods(processed_sheets, output_file):
    doc = ezodf.newdoc(doctype="ods", filename=output_file)
    for name, df in processed_sheets.items():
        sheet = ezodf.Sheet(name, size=(len(df)+1,len(df.columns)))
        doc.sheets += sheet
        for col_idx, col_name in enumerate(df.columns):
            sheet[0,col_idx].set_value(col_name)
        for row_idx,row in enumerate(df.itertuples(index=False),start=1):
            for col_idx,value in enumerate(row):
                sheet[row_idx,col_idx].set_value("" if pd.isna(value) else value)
    doc.save()


# ----------------- MAIN PROGRAM -----------------
def main():
    xls, engine = load_file(INPUT_FILE)
    if engine == "odf":
        doc = ezodf.opendoc(INPUT_FILE)
        sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
        category_dfs = {name: process_category_sheet(read_ods_sheet(INPUT_FILE,name)) for name in sheet_names}
        master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
    else:
        sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
        category_dfs = {}
        for sheet_name in sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
            df.columns = [str(c).strip() for c in df.columns]
            category_dfs[sheet_name] = process_category_sheet(df)
        master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
        master_df.columns = [str(c).strip() for c in master_df.columns]

    new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)
    processed_sheets = {MASTER_SHEET_NAME:new_master}
    processed_sheets.update({k:remove_empty_vocabulary_rows(v) for k,v in updated_dfs.items()})

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx",".xls"]:
        save_excel(processed_sheets, OUTPUT_FILE)
    else:
        save_ods(processed_sheets, OUTPUT_FILE)

    logging.info(f"Datei gespeichert: {OUTPUT_FILE}")
    logging.info("===== SYNC SUMMARY =====")
    for sheet, info in summary.items():
        logging.info(f"{sheet}: {info}")


if __name__ == "__main__":
    main()
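A small sketch of the synchronization rule implemented by `sync_master_and_sheets` above: category rows whose ID is missing from the Masterstruktur sheet are dropped, all other rows (including vocabulary rows without an ID) are kept. The sample data is hypothetical.

```python
import pandas as pd

# Hypothetical Masterstruktur and category sheet data
master_df = pd.DataFrame({"ID": ["1", "2"], "Kategorie": ["Figur", "Objekt"]})
category_df = pd.DataFrame({
    "ID": ["1", "3", ""],                      # "3" does not exist in the master sheet
    "Unterkategorie": ["", "", ""],
    "Unterunterkategorie": ["", "", ""],
    "Wort/Vokabel": ["", "", "Adler"],         # vocabulary rows carry no ID and are always kept
})

master_ids = set(master_df["ID"].astype(str).str.strip())
ids = category_df["ID"].astype(str).str.strip()
kept = category_df[(ids == "") | ids.isin(master_ids)]
print(kept)   # the row with ID "3" is dropped, the other two rows remain
```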
192  NV_Master_to_SPOT.py  (Normal file)
@@ -0,0 +1,192 @@
import os
import json
import datetime
import pandas as pd
import ezodf
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ---------------- SPOT tree structure ----------------
class Node:
    def __init__(self, name, node_type="category", id=None):
        self.name = name
        self.id = id
        self.type = node_type  # "category", "subcategory", "word"
        self.children = []

    def add_child(self, child):
        self.children.append(child)

    def to_dict(self):
        if self.type == "word":
            return self.name
        return {
            "id": self.id,
            "name": self.name,
            "type": self.type,
            "children": [c.to_dict() for c in self.children]
        }

    @staticmethod
    def from_dict(d):
        if isinstance(d, str):
            return Node(d, "word")
        node = Node(d["name"], d.get("type", "category"), d.get("id"))
        node.children = [Node.from_dict(c) for c in d.get("children", [])]
        return node


# ---------------- Loading functions ----------------
def load_excel_or_ods(input_file, master_sheet="Masterstruktur"):
    ext = os.path.splitext(input_file)[1].lower()
    engine = "openpyxl" if ext in [".xlsx", ".xls"] else "odf"
    xls = pd.ExcelFile(input_file, engine=engine)
    sheet_names = [s for s in xls.sheet_names if s != master_sheet]
    dfs = {s: pd.read_excel(xls, sheet_name=s, engine=engine) for s in sheet_names}
    master_df = pd.read_excel(xls, sheet_name=master_sheet, engine=engine)
    return master_df, dfs


# ---------------- Build the tree from a sheet ----------------
def process_sheet_to_tree(df):
    df = df.fillna("").astype(str)
    tree_nodes = []
    current_cat = None
    current_sub = None
    for idx, row in df.iterrows():
        id_val = row.get("ID", "").strip()
        uk_val = row.get("Unterkategorie", "").strip()
        uuk_val = row.get("Unterunterkategorie", "").strip()
        word_val = row.get("Wort/Vokabel", "").strip()

        if id_val:
            current_cat = Node(uk_val or word_val, "category", id=id_val)
            tree_nodes.append(current_cat)
            current_sub = None
        elif uuk_val:
            current_sub = Node(uuk_val, "subcategory")
            if current_cat:
                current_cat.add_child(current_sub)
        elif word_val:
            word_node = Node(word_val, "word")
            if current_sub:
                current_sub.add_child(word_node)
            elif current_cat:
                current_cat.add_child(word_node)
    return tree_nodes


# ---------------- Load/save SPOT ----------------
def save_spot_json(tree_nodes, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump([n.to_dict() for n in tree_nodes], f, indent=2, ensure_ascii=False)
    logging.info(f"SPOT gespeichert: {file_path}")


def load_spot_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [Node.from_dict(n) for n in data]


# ---------------- Export to Excel ----------------
def export_spot_to_excel(tree_nodes, output_file):
    wb = Workbook()
    wb.remove(wb.active)
    for node in tree_nodes:
        ws = wb.create_sheet(title=node.name[:31])
        row_idx = 1
        # category row
        ws.cell(row=row_idx, column=1, value=node.id)
        ws.cell(row=row_idx, column=2, value=node.name)
        row_idx += 1
        for sub in node.children:
            if sub.type == "subcategory":
                ws.cell(row=row_idx, column=3, value=sub.name)
                row_idx += 1
                for word in sub.children:
                    ws.cell(row=row_idx, column=4, value=word.name)
                    row_idx += 1
            elif sub.type == "word":
                ws.cell(row=row_idx, column=4, value=sub.name)
                row_idx += 1
        # adjust column widths
        for col_idx, col_letter in enumerate(["A","B","C","D"],1):
            ws.column_dimensions[col_letter].width = 20
            for r in range(1,row_idx):
                ws.cell(r,col_idx).alignment = Alignment(horizontal='left')
    wb.save(output_file)
    logging.info(f"Excel exportiert: {output_file}")


# ---------------- Export to ODS ----------------
def export_spot_to_ods(tree_nodes, output_file):
    doc = ezodf.newdoc(doctype="ods", filename=output_file)
    for node in tree_nodes:
        sheet = ezodf.Sheet(node.name[:31], size=(len(node.children)+10,4))
        doc.sheets += sheet
        sheet[0,0].set_value("ID")
        sheet[0,1].set_value("Unterkategorie")
        sheet[0,2].set_value("Unterunterkategorie")
        sheet[0,3].set_value("Wort/Vokabel")
        row_idx = 1
        sheet[row_idx,0].set_value(node.id)
        sheet[row_idx,1].set_value(node.name)
        row_idx +=1
        for sub in node.children:
            if sub.type == "subcategory":
                sheet[row_idx,2].set_value(sub.name)
                row_idx +=1
                for word in sub.children:
                    sheet[row_idx,3].set_value(word.name)
                    row_idx +=1
            elif sub.type == "word":
                sheet[row_idx,3].set_value(sub.name)
                row_idx +=1
    doc.save()
    logging.info(f"ODS exportiert: {output_file}")


# ---------------- CLI editing functions ----------------
def add_category(tree_nodes, cat_id, cat_name):
    tree_nodes.append(Node(cat_name, "category", id=cat_id))
    logging.info(f"Kategorie hinzugefügt: {cat_id} {cat_name}")


def add_subcategory(tree_nodes, cat_id, sub_name):
    for cat in tree_nodes:
        if cat.id == cat_id:
            cat.add_child(Node(sub_name, "subcategory"))
            logging.info(f"Unterkategorie hinzugefügt: {sub_name} in {cat_id}")
            return


def add_word(tree_nodes, cat_id, sub_name, word_name):
    for cat in tree_nodes:
        if cat.id == cat_id:
            for sub in cat.children:
                if sub.name == sub_name:
                    sub.add_child(Node(word_name, "word"))
                    logging.info(f"Wort hinzugefügt: {word_name} unter {sub_name}")
                    return


# ---------------- MAIN PROGRAM ----------------
def main():
    INPUT_FILE = "NV_MASTER.ods"  # example path
    OUTPUT_SPOT = "nv_spot.json"
    today = datetime.datetime.today().strftime("%y.%m.%d")
    OUTPUT_EXCEL = f"NV_MASTER_SPOT_{today}.xlsx"
    OUTPUT_ODS = f"NV_MASTER_SPOT_{today}.ods"

    master_df, dfs = load_excel_or_ods(INPUT_FILE)
    spot_tree = []
    for sheet, df in dfs.items():
        spot_tree.extend(process_sheet_to_tree(df))
    save_spot_json(spot_tree, OUTPUT_SPOT)

    # Example: editing
    # add_category(spot_tree, "10.1", "Neue Kategorie")
    # add_subcategory(spot_tree, "10.1", "Neue Unterunterkategorie")
    # add_word(spot_tree, "10.1", "Neue Unterunterkategorie", "Neues Wort")

    export_spot_to_excel(spot_tree, OUTPUT_EXCEL)
    export_spot_to_ods(spot_tree, OUTPUT_ODS)
    logging.info("SPOT-Workflow abgeschlossen.")


if __name__ == "__main__":
    main()
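To illustrate the SPOT structure written by `save_spot_json` above: `Node.to_dict` serializes categories and subcategories as objects and word nodes as bare strings. The category used here is hypothetical.

```python
import json

# Hypothetical tree, mirroring what Node.to_dict produces
spot = [{
    "id": "2.1",
    "name": "Gefäße",
    "type": "category",
    "children": [
        {
            "id": None,                     # subcategories are created without an ID
            "name": "Trinkgefäße",
            "type": "subcategory",
            "children": ["Becher", "Krug"]  # word nodes serialize as plain strings
        }
    ]
}]
print(json.dumps(spot, indent=2, ensure_ascii=False))
```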
449  NormVokabular_Mapper_1.1.py  (Normal file)
@@ -0,0 +1,449 @@
"""
========================================================================
NormVokabular Mapper – Overview
========================================================================

This script matches terms from data holdings against a predefined
controlled vocabulary (Normvokabular). It identifies hits, outputs
suggestions when no hit is found, and optionally cross-checks terms
against external APIs (GND, Wikidata).

Main features:

1. **Process input**
   - Reads CSV, Excel, and ODS files from the folder "Input CSV".
   - Extracts relevant terms from columns such as "Objektbeschreibung",
     filtering out stopwords and numbers.

2. **Load the controlled vocabulary**
   - Reads the master file NV_MASTER.ods.
   - Takes hierarchy IDs into account so that parent terms can be assigned.
   - Builds an index of stemmed terms so that similar spellings are recognized as well.

3. **Mapping onto the controlled vocabulary**
   - Checks whether a term occurs in the vocabulary exactly or in stemmed form.
   - If there is no hit, alternative suggestions are generated.

4. **API matching (optional)**
   - Queries GND and Wikidata to determine the top-1 hit for each term.
   - Uses a cache to avoid repeated requests.
   - Offers a dry-run mode for tests without internet access.

5. **Save results**
   - Stores the evaluation in a dedicated folder "Auswertung Ergebnisse".
   - Marks hits visually: green = hit, red = no hit (for Excel),
     or adds a status column for ODS files.
   - Contains all relevant information per term: original term, normalized term,
     norm ID, suggestions, GND/Wikidata top-1 hits.

6. **Logging**
   - Reports progress, number of terms, hits, and possible errors.


**Usage:**
```bash
python normvokabular_mapper.py
python normvokabular_mapper.py --dry-run   # only simulate the API queries
```
"""

import os
import sys
import time
import json
import re
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
from collections import defaultdict

# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run

# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")

TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}

API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd":0, "wikidata":0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

CONF_THRESHOLD = 0.75  # for suggestions

# =========================
# Logging
# =========================
def log(level, msg):
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{ts}] [{level}] {msg}")

# =========================
# Load / save cache
# =========================
if os.path.exists(CACHE_FILE):
    try:
        with open(CACHE_FILE,"r",encoding="utf-8") as f:
            CACHE = json.load(f)
        log("INFO", f"Cache geladen: {CACHE_FILE}")
    except:
        CACHE = {}
else:
    CACHE = {}

def save_cache():
    try:
        with open(CACHE_FILE,"w",encoding="utf-8") as f:
            json.dump(CACHE, f, indent=2, ensure_ascii=False)
        log("DEBUG","Cache gespeichert")
    except Exception as e:
        log("ERROR", f"Cache speichern fehlgeschlagen: {e}")

# =========================
# Normalization / stemming
# =========================
try:
    from nltk.stem.snowball import GermanStemmer
    STEMMER = GermanStemmer()
    log("INFO","NLTK GermanStemmer verfügbar")
except:
    STEMMER = None
    log("WARNING","NLTK nicht verfügbar, naive Pluralreduktion wird genutzt")

def normalize_text(s):
    if s is None:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+"," ",s)
    return s

def naive_stem(w):
    for ending in ("ern","nen","en","er","e","n","s"):
        if w.endswith(ending) and len(w)-len(ending)>=3:
            return w[:-len(ending)]
    return w

def stem_word(word):
    w = normalize_text(word)
    try:
        return STEMMER.stem(w) if STEMMER else naive_stem(w)
    except:
        return naive_stem(w)

from collections import defaultdict
from difflib import SequenceMatcher

CONF_THRESHOLD = 0.75  # confidence for suggestions

# =========================
# Load controlled vocabulary (NV_MASTER) with parent ID & stem index
# =========================
def load_normvokabular(file_path):
    import pandas as pd
    import re
    log("INFO", f"Normvokabular laden: {file_path}")

    engine = "odf" if file_path.suffix.lower() == ".ods" else None
    sheets = pd.read_excel(file_path, sheet_name=None, engine=engine)

    norm_dict = {}
    stem_index = defaultdict(list)
    count = 0

    for sheet_name, df in sheets.items():
        df.columns = [str(c).strip() for c in df.columns]
        current_parent_id = None

        for _, row in df.iterrows():
            # handle column names flexibly
            id_val = str(row.get("ID","")).strip() if "ID" in df.columns else ""
            wort = str(row.get("Wort/Vokabel","")).strip() if "Wort/Vokabel" in df.columns else ""

            # rows with an ID but no vocabulary entry: update the parent ID
            if id_val:
                current_parent_id = id_val

            # skip empty vocabulary cells
            if not wort:
                continue

            assigned_id = current_parent_id  # inherit the parent ID
            key = normalize_text(wort)
            entry = {
                "Name": wort,
                "ID": assigned_id,
                "Sheet": sheet_name
            }
            norm_dict[key] = entry
            stem_index[stem_word(key)].append(entry)
            count += 1

    log("INFO", f"{count} Begriffe aus Normvokabular geladen")
    return norm_dict, stem_index

# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index):
    tnorm = normalize_text(term)
    tstem = stem_word(tnorm)

    # exact hit
    if tnorm in norm_dict:
        e = norm_dict[tnorm]
        return e["Name"], e["ID"], []

    # stemmed hit
    if tstem in stem_index:
        e = stem_index[tstem][0]
        return e["Name"], e["ID"], []

    # no hit: generate suggestions
    suggestions = get_suggestions(tnorm, norm_dict)
    return "KEIN TREFFER", "", suggestions

def get_suggestions(term, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
    t = term.lower()
    scores = []
    for key, val in norm_dict.items():
        score = SequenceMatcher(None, t, key).ratio()
        if score >= threshold:
            scores.append((score, val["Name"], val["ID"]))
    scores.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in scores[:top_n]]


# =========================
# API matching (top-1), unchanged
# =========================
def request_with_retries(api_name,url,params=None):
    if DRY_RUN:
        return None
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries<MAX_RETRIES:
        try:
            r = requests.get(url,params=params,timeout=TIMEOUT,headers=HEADERS)
            if r.status_code==200:
                try: data=r.json()
                except: data=r.text
                CACHE[cache_key]=data
                FAIL_COUNTER[api_name]=0
                return data
        except:
            pass
        retries+=1
        time.sleep(min(BACKOFF_FACTOR**retries,30))
    FAIL_COUNTER[api_name]+=1
    if FAIL_COUNTER[api_name]>=10:
        API_ACTIVE[api_name]=False
    return None

def compute_min_conf(term,api_name):
    l=len(term.strip())
    if l<=3: return 0.90
    if l<=6: return 0.85 if api_name=='gnd' else 0.80
    return 0.75 if api_name=='gnd' else 0.70

def batch_query_gnd(terms):
    results={}
    if DRY_RUN or not API_ACTIVE.get("gnd",False):
        for t in terms: results[t]="TEST_GND"
        return results
    for t in terms:
        url="https://lobid.org/gnd/search"
        params={"q":t,"format":"json"}
        data=request_with_retries("gnd",url,params)
        top=""
        if data and "member" in data:
            min_conf=compute_min_conf(t,'gnd')
            cands=[]
            for doc in data["member"]:
                name=doc.get("preferredName","") or doc.get("name","")
                if not name: continue
                conf=SequenceMatcher(None,t.lower(),name.lower()).ratio()
                if conf>=min_conf: cands.append((name,conf))
            if cands:
                top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t]=top
    return results

def batch_query_wikidata(terms):
    results={}
    if DRY_RUN or not API_ACTIVE.get("wikidata",False):
        for t in terms: results[t]="TEST_WD"
        return results
    for t in terms:
        url="https://www.wikidata.org/w/api.php"
        params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
        data=request_with_retries("wikidata",url,params)
        top=""
        if data and "search" in data:
            min_conf=compute_min_conf(t,'wikidata')
            cands=[]
            for e in data["search"]:
                label=e.get("label","")
                if not label: continue
                conf=SequenceMatcher(None,t.lower(),label.lower()).ratio()
                if conf>=min_conf: cands.append((label,conf))
            if cands:
                top=sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t]=top
    return results

# =========================
# Format-dependent marking / status
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill

        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

        col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            log("WARNING","Spalte 'Norm_Treffer' nicht gefunden, keine Markierung möglich")
            return

        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value!="KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
        log("INFO","Excel: Treffer farblich markiert (grün=Treffer, rot=kein Treffer)")

    elif ext==".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")
        log("INFO","ODS: Spalte 'Norm_Status' eingefügt (Treffer / Kein Treffer)")
    else:
        log("WARNING","Unbekanntes Dateiformat, keine Markierung durchgeführt")

# =========================
# Process input files
# =========================
def process_files():
    norm_dict, stem_index = load_normvokabular(NORMVOC_FILE)
    total_terms=0
    total_norm_hits=0

    if not INPUT_DIR.exists():
        log("CRITICAL",f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)

    files=list(INPUT_DIR.glob("*"))
    if not files:
        log("WARNING","Keine Dateien gefunden")

    for file_path in files:
        if not file_path.suffix.lower() in [".ods",".xlsx",".csv",".xls"]:
            continue
        log("INFO",f"Verarbeite Datei: {file_path.name}")

        # create the output file for this input file
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        try:
            if file_path.suffix.lower()==".csv":
                df=pd.read_csv(file_path)
            elif file_path.suffix.lower()==".ods":
                df=pd.read_excel(file_path, engine="odf")
            else:
                df=pd.read_excel(file_path)
        except Exception as e:
            log("ERROR",f"Datei {file_path.name} konnte nicht gelesen werden: {e}")
            continue

        df.columns=[str(c).strip() for c in df.columns]

        row_terms_map=[]
        for _,row in df.iterrows():
            besch=row.get("Objektbeschreibung","")
            if pd.isna(besch) or not str(besch).strip(): continue
            besch=str(besch).strip()
            clauses=[c.strip() for c in re.split(r",",besch) if c.strip()]
            terms=[]
            for clause in clauses:
                parts=[p.strip() for p in re.split(r"\s+",clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS: continue
                    if re.fullmatch(r"\d+",p): continue
                    terms.append(p)
            obj_box=row.get("Objekt/Ebene","")
            urheber=row.get("Urheber","")
            row_terms_map.append((obj_box,urheber,terms))

        all_terms=[]
        for _,_,terms in row_terms_map:
            all_terms.extend(terms)
        all_terms = list(set(all_terms))  # unique
        gnd_results=batch_query_gnd(all_terms)
        wd_results=batch_query_wikidata(all_terms)

        output_rows=[]
        for obj_box,urheber,terms in row_terms_map:
            for term in terms:
                norm_name,norm_id,suggestions = map_to_norm(term,norm_dict, stem_index)
                total_terms+=1
                if norm_name!="KEIN TREFFER":
                    total_norm_hits+=1
                out_row={
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term,""),
                    "WD_Top1": wd_results.get(term,"")
                }
                output_rows.append(out_row)

        out_df=pd.DataFrame(output_rows)
        engine = "odf" if output_file.suffix.lower()==".ods" else None
        out_df.to_excel(output_file,index=False,engine=engine)
        log("INFO",f"Auswertung gespeichert: {output_file}")
        mark_norm_hits(output_file)

    save_cache()
    log("INFO",f"Gesamt: {total_terms} Begriffe, {total_norm_hits} Treffer im Normvokabular")

# =========================
# Main
# =========================
if __name__=="__main__":
    process_files()
    log("INFO","Fertig")
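A compact sketch of the lookup order used by `map_to_norm` in version 1.1: exact normalized hit first, then stemmed hit, otherwise "KEIN TREFFER" plus fuzzy suggestions. The tiny vocabulary below is hypothetical, and the naive stemmer mirrors the fallback used when NLTK is not installed.

```python
import re

def normalize_text(s):
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    return re.sub(r"\s+", " ", s)

def naive_stem(w):
    # Fallback plural reduction, as in NormVokabular_Mapper_1.1.py
    for ending in ("ern", "nen", "en", "er", "e", "n", "s"):
        if w.endswith(ending) and len(w) - len(ending) >= 3:
            return w[:-len(ending)]
    return w

# Hypothetical vocabulary: normalized term -> (Name, ID)
norm_dict = {"adler": ("Adler", "4.2")}
stem_index = {naive_stem(k): v for k, v in norm_dict.items()}

def lookup(term):
    t = normalize_text(term)
    if t in norm_dict:
        return norm_dict[t]                  # exact hit
    if naive_stem(t) in stem_index:
        return stem_index[naive_stem(t)]     # stemmed hit
    return None                              # map_to_norm would return "KEIN TREFFER" + suggestions

print(lookup("Adlern"))   # ('Adler', '4.2') via the stemmed index
```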
471  NormVokabular_Mapper_1.2.py  (Normal file)
@@ -0,0 +1,471 @@
"""
========================================================================
NormVokabular Mapper – Overview
========================================================================

This script matches terms from data holdings against a predefined
controlled vocabulary (Normvokabular). It identifies hits, outputs
suggestions when no hit is found, and optionally cross-checks terms
against external APIs (GND, Wikidata).

Main features:

1. **Process input**
   - Reads CSV, Excel, and ODS files from the folder "Input CSV".
   - Extracts relevant terms from columns such as "Objektbeschreibung",
     filtering out stopwords and numbers.

2. **Load the controlled vocabulary**
   - Reads the master file NV_MASTER.ods.
   - Takes hierarchy IDs into account so that parent terms can be assigned.
   - Builds an index of stemmed terms so that similar spellings are recognized as well.

3. **Mapping onto the controlled vocabulary**
   - Checks whether a term occurs in the vocabulary exactly or in stemmed form.
   - If there is no hit, alternative suggestions are generated.

4. **API matching (optional)**
   - Queries GND and Wikidata to determine the top-1 hit for each term.
   - Uses a cache to avoid repeated requests.
   - Offers a dry-run mode for tests without internet access.

5. **Save results**
   - Stores the evaluation in a dedicated folder "Auswertung Ergebnisse".
   - Marks hits visually: green = hit, red = no hit (for Excel),
     or adds a status column for ODS files.
   - Contains all relevant information per term: original term, normalized term,
     norm ID, suggestions, GND/Wikidata top-1 hits.

6. **Logging**
   - Reports progress, number of terms, hits, and possible errors.

"""

import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher

# RapidFuzz for token-based fuzzy search
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
    print("RapidFuzz verfügbar")
except ImportError:
    RAPIDFUZZ_AVAILABLE = False
    print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")

# Spacy lemmatizer
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
    print("Spacy Lemmatizer aktiviert")
except:
    SPACY_AVAILABLE = False
    nlp = None
    print("Spacy nicht verfügbar – nutze naive Stemmer")

# =========================
# Paths & config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# Cache
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE,"r",encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

def save_cache():
    with open(CACHE_FILE,"w",encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)

# =========================
# Normalization / lemma
# =========================
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+"," ",s)
    return s

lemma_cache = {}

def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join([token.lemma_ for token in doc])
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

# =========================
# Compound splitting (extended)
# =========================
def compound_split(term, norm_dict):
    """
    Splits compounds by checking for substrings that occur in the controlled vocabulary.
    """
    term_norm = normalize_text(term)
    matches = []
    for i in range(len(term_norm)):
        for j in range(i+3, len(term_norm)+1):
            sub = term_norm[i:j]
            if sub in norm_dict and sub not in matches:
                matches.append(sub)
    if not matches:
        matches = [term_norm]
    return matches

# =========================
# Load controlled vocabulary & prepare lemmas
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}

    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
        if not id_col or not word_col:
            continue

        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {
                "Name": row_word,
                "ID": assigned_parent_id,
                "Sheet": sheet_name,
                "Own_ID": row_id or ""
            }
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    return norm_dict, stem_index, lemma_norm_map

# =========================
# Suggestions & fuzzy matching
# =========================
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score_token = fuzz.token_set_ratio(term_lemma, key_lemma)/100
            score_partial = fuzz.partial_ratio(term_lemma, key_lemma)/100
            score = max(score_token, score_partial)
        else:
            score_seq = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
            score = score_seq

        # substring boost
        if term_lemma in key_lemma or key_lemma in term_lemma:
            score = max(score, 0.9)

        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))

    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]

# =========================
# Mapping onto the controlled vocabulary
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)

    # exact hit
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        return e["Name"], e["ID"], []

    # lemma hit
    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        return e["Name"], e["ID"], []

    # no hit: compound split & check the parts
    tokens = compound_split(term, norm_dict)
    token_matches = []
    all_suggestions = []
    for t in tokens:
        t_lemma = lemmatize_term(t)
        if t_lemma in stem_index:
            e = stem_index[t_lemma][0]
            token_matches.append((t, e["Name"], e["ID"]))
        else:
            sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
            all_suggestions.extend(sugg)
            token_matches.append((t, "KEIN TREFFER", "", sugg))

    combined_matches = [m[1] for m in token_matches if m[1] != "KEIN TREFFER"]

    if combined_matches:
        return "KEIN TREFFER", "", combined_matches
    elif all_suggestions:
        return "KEIN TREFFER", "", all_suggestions
    else:
        return "KEIN TREFFER", "", []

# =========================
# API queries
# =========================
def request_with_retries(api_name,url,params=None):
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try: data = r.json()
                except: data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except:
            pass
        retries += 1
        time.sleep(min(BACKOFF_FACTOR**retries,30))
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None

def batch_query_gnd(terms):
    results={}
    if not API_ACTIVE.get("gnd", False):
        for t in terms: results[t] = ""
        return results
    for t in terms:
        url="https://lobid.org/gnd/search"
        params={"q":t,"format":"json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
            cands = [c for c in cands if c[1]>=0.75]
            if cands:
                top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t] = top
    return results

def batch_query_wikidata(terms):
    results={}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms: results[t] = ""
        return results
    for t in terms:
        url="https://www.wikidata.org/w/api.php"
        params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
            cands = [c for c in cands if c[1]>=0.70]
            if cands:
                top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
        results[t] = top
    return results

# =========================
# Marking / export
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            print("Spalte 'Norm_Treffer' nicht gefunden")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
    elif ext==".ods":
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")

# =========================
# Export with a second sheet for terms without a hit or a suggestion
# =========================
def export_results_with_no_hits(out_df, output_file):
    """
    Exports the mapping result plus a second sheet containing all terms
    whose Norm_Treffer == 'KEIN TREFFER' and whose Norm_Vorschlag is empty.
    """
    # terms without a hit and without a suggestion
    no_match_df = out_df[(out_df["Norm_Treffer"]=="KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip()==""))].copy()

    ext = output_file.suffix.lower()

    if ext in [".xlsx", ".xls"]:
        with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
            out_df.to_excel(writer, index=False, sheet_name="Mapping")
            no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")
    elif ext == ".ods":
        # ODS export via the odf engine
        with pd.ExcelWriter(output_file, engine="odf") as writer:
            out_df.to_excel(writer, index=False, sheet_name="Mapping")
            no_match_df.to_excel(writer, index=False, sheet_name="Keine Treffer")


# =========================
# Process input files
# =========================
def process_files():
    norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_hits = 0

    if not INPUT_DIR.exists():
        print(f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        print("Keine Dateien gefunden")
        return

    for file_path in files:
        if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
            continue
        print(f"Verarbeite Datei: {file_path.name}")
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
        except Exception as e:
            print(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue

        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]

        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col: continue

        row_terms_map = []
        for _, row in df.iterrows():
            besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch: continue
            obj_box = row[box_col] if box_col else ""
            urheber = row[urh_col] if urh_col else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS: continue
                    if re.fullmatch(r"\d+", p): continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))

        all_terms = list({t for _,_,terms in row_terms_map for t in terms})
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)

        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term,""),
                    "WD_Top1": wd_results.get(term,"")
                }
                output_rows.append(out_row)

        out_df = pd.DataFrame(output_rows)
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
            version += 1

        export_results_with_no_hits(out_df, output_file)
        mark_norm_hits(output_file)
        print(f"Auswertung gespeichert: {output_file}")

    save_cache()
    print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")

# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    print("Fertig")
509
NormVokabular_Mapper_1.3.py
Normal file
@ -0,0 +1,509 @@
|
||||
"""
|
||||
========================================================================
|
||||
NormVokabular Mapper – Übersicht
|
||||
========================================================================
|
||||
|
||||
Dieses Skript dient dazu, Begriffe aus Datenbeständen mit einem
|
||||
vordefinierten Normvokabular abzugleichen. Es identifiziert Treffer,
|
||||
gibt bei fehlenden Treffern Vorschläge aus und ermöglicht optional
|
||||
einen Abgleich mit externen APIs (GND, Wikidata).
|
||||
|
||||
Hauptfunktionen:
|
||||
|
||||
1. **Input verarbeiten**
|
||||
- Liest CSV-, Excel- und ODS-Dateien aus dem Ordner "Input CSV".
|
||||
- Extrahiert relevante Begriffe aus Spalten wie "Objektbeschreibung",
|
||||
filtert Stopwords und Zahlen.
|
||||
|
||||
2. **Normvokabular laden**
|
||||
- Liest die Masterdatei NV_MASTER.ods ein.
|
||||
- Berücksichtigt Hierarchie-IDs, um übergeordnete Begriffe zuordnen zu können.
|
||||
- Erstellt einen Index für gestemmte Begriffe, um auch ähnliche Schreibweisen zu erkennen.
|
||||
|
||||
3. **Mapping auf Normvokabular**
|
||||
- Prüft, ob ein Begriff exakt oder gestemmt im Normvokabular vorkommt.
|
||||
- Wenn kein Treffer vorliegt, werden alternative Vorschläge generiert.
|
||||
|
||||
4. **API-Abgleich (optional)**
|
||||
- Fragt GND und Wikidata ab, um den Top-1 Treffer für jeden Begriff zu ermitteln.
|
||||
- Nutzt einen Cache, um wiederholte Requests zu vermeiden.
|
||||
- Bietet einen Dry-Run-Modus für Tests ohne Internetzugang.
|
||||
|
||||
5. **Ergebnis speichern**
|
||||
- Speichert die Auswertung in einem eigenen Ordner "Auswertung Ergebnisse".
|
||||
- Markiert Treffer visuell: grün = Treffer, rot = kein Treffer (bei Excel),
|
||||
bzw. fügt Statusspalte bei ODS-Dateien hinzu.
|
||||
- Enthält alle relevanten Informationen pro Begriff: Originalbegriff, Normbegriff,
|
||||
Norm-ID, Vorschläge, GND/Wikidata Top1 Treffer.
|
||||
|
||||
6. **Logging**
|
||||
- Informiert über Fortschritt, Anzahl der Begriffe, Treffer und mögliche Fehler.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import pandas as pd
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# RapidFuzz für Token-basierte Fuzzy-Suche
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
print("RapidFuzz verfügbar")
|
||||
except ImportError:
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")
|
||||
|
||||
# Spacy Lemmatizer
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
print("Spacy Lemmatizer aktiviert")
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
print("Spacy nicht verfügbar – nutze naive Stemmer")
|
||||
|
||||
# =========================
|
||||
# Pfade & Config
|
||||
# =========================
|
||||
INPUT_DIR = Path("Input CSV")
|
||||
OUTPUT_DIR = Path("Auswertung Ergebnisse")
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
|
||||
CACHE_FILE = "api_cache.json"
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
CONF_THRESHOLD = 0.75
|
||||
TIMEOUT = 5
|
||||
MAX_RETRIES = 3
|
||||
BACKOFF_FACTOR = 2
|
||||
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
|
||||
API_ACTIVE = {"gnd": True, "wikidata": True}
|
||||
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
|
||||
|
||||
# Cache
|
||||
if os.path.exists(CACHE_FILE):
|
||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
def save_cache():
|
||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# =========================
|
||||
# Normalisierung / Lemma
|
||||
# =========================
|
||||
def normalize_text(s):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).lower().strip()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
||||
s = re.sub(r"\s+"," ",s)
|
||||
return s
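# Illustrative behaviour sketch (hypothetical inputs, not taken from the data set):
#   normalize_text("Wappen, (gekrönt)!")  ->  "wappen gekrönt"
#   normalize_text("  Eichenlaub   ")     ->  "eichenlaub"
#   normalize_text(None)                  ->  ""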
|
||||
|
||||
# Lemma-Cache
|
||||
lemma_cache = {}
|
||||
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
# =========================
|
||||
# Kompositum-Zerlegung (einfacher Ansatz)
|
||||
# =========================
|
||||
def compound_split(term):
|
||||
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
|
||||
return parts if parts else [term]
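# Illustrative sketch of this naive splitter (hypothetical terms):
#   compound_split("WappenSchild")  ->  ["Wappen", "Schild"]    (split at inner capitals)
#   compound_split("Wappenschild")  ->  ["Wappenschild"]        (no inner capital, no split)
#   compound_split("blume")         ->  ["blume"]               (no match, term returned as-is)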
|
||||
|
||||
# =========================
|
||||
# Normvokabular laden & Lemma vorbereiten
|
||||
# =========================
|
||||
def load_normvokabular(file_path):
|
||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
norm_dict = {}
|
||||
stem_index = defaultdict(list)
|
||||
lemma_norm_map = {} # für RapidFuzz preprocessed
|
||||
|
||||
for sheet_name, df in sheets.items():
|
||||
if sheet_name.lower() in ["master", "übersicht"]:
|
||||
continue
|
||||
df = df.dropna(how="all", axis=1)
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
id_col = next((c for c in df.columns if "ID" in c), None)
|
||||
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
|
||||
if not id_col or not word_col:
|
||||
continue
|
||||
|
||||
current_parent_id = None
|
||||
for _, row in df.iterrows():
|
||||
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
|
||||
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
|
||||
if row_id:
|
||||
current_parent_id = row_id
|
||||
if not row_word:
|
||||
continue
|
||||
assigned_parent_id = current_parent_id
|
||||
entry = {
|
||||
"Name": row_word,
|
||||
"ID": assigned_parent_id, # Parent-ID
|
||||
"Sheet": sheet_name,
|
||||
"Own_ID": row_id or "" # eigene ID, falls vorhanden
|
||||
}
|
||||
key = normalize_text(row_word)
|
||||
norm_dict[key] = entry
|
||||
lemma = lemmatize_term(key)
|
||||
stem_index[lemma].append(entry)
|
||||
if lemma not in lemma_norm_map:
|
||||
lemma_norm_map[lemma] = entry
|
||||
return norm_dict, stem_index, lemma_norm_map
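# Illustrative sketch of the returned structures (the entry values shown are
# hypothetical and depend on the actual content of NV_MASTER.ods):
#
#   norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
#   norm_dict["wappen"]
#   # -> {"Name": "Wappen", "ID": "2.3", "Sheet": "2 Objekt", "Own_ID": "2.3"}
#   stem_index[lemmatize_term("wappen")][0]   # same entry, indexed by lemma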
|
||||
|
||||
# =========================
|
||||
# Mapping & Vorschläge
|
||||
# =========================
|
||||
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
|
||||
term_norm = normalize_text(term)
|
||||
term_lemma = lemmatize_term(term)
|
||||
|
||||
# Exakter Treffer
|
||||
if term_norm in norm_dict:
|
||||
e = norm_dict[term_norm]
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
# Lemma-Treffer
|
||||
if term_lemma in stem_index:
|
||||
e = stem_index[term_lemma][0]
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
# KEIN TREFFER → Kompositum-Split
|
||||
tokens = compound_split(term)
|
||||
if len(tokens) == 1:
|
||||
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
|
||||
return "KEIN TREFFER", "", suggestions
|
||||
else:
|
||||
token_matches = []
|
||||
for t in tokens:
|
||||
t_lemma = lemmatize_term(t)
|
||||
if t_lemma in stem_index:
|
||||
e = stem_index[t_lemma][0]
|
||||
token_matches.append((t, e["Name"], e["ID"]))
|
||||
else:
|
||||
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
|
||||
token_matches.append((t, "KEIN TREFFER", "", sugg))
|
||||
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
|
||||
return "KEIN TREFFER", "", combined_suggestions
|
||||
|
||||
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
|
||||
candidates = []
|
||||
for key_lemma, entry in lemma_norm_map.items():
|
||||
if RAPIDFUZZ_AVAILABLE:
|
||||
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
|
||||
else:
|
||||
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
|
||||
if key_lemma.lower().startswith(term_lemma.lower()):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
candidates.append((score, entry["Name"], entry["ID"]))
|
||||
candidates.sort(reverse=True)
|
||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
||||
|
||||
# =========================
|
||||
# API-Abfragen
|
||||
# =========================
|
||||
def request_with_retries(api_name,url,params=None):
|
||||
cache_key = url + str(params)
|
||||
if cache_key in CACHE:
|
||||
return CACHE[cache_key]
|
||||
retries = 0
|
||||
while retries < MAX_RETRIES:
|
||||
try:
|
||||
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
|
||||
if r.status_code == 200:
|
||||
try: data = r.json()
except ValueError: data = r.text
|
||||
CACHE[cache_key] = data
|
||||
FAIL_COUNTER[api_name] = 0
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
retries += 1
|
||||
time.sleep(min(BACKOFF_FACTOR**retries,30))
|
||||
FAIL_COUNTER[api_name] += 1
|
||||
if FAIL_COUNTER[api_name] >= 10:
|
||||
API_ACTIVE[api_name] = False
|
||||
return None
|
||||
|
||||
def batch_query_gnd(terms):
|
||||
results={}
|
||||
if not API_ACTIVE.get("gnd", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
for t in terms:
|
||||
url="https://lobid.org/gnd/search"
|
||||
params={"q":t,"format":"json"}
|
||||
data = request_with_retries("gnd", url, params)
|
||||
top = ""
|
||||
if data and "member" in data:
|
||||
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
|
||||
cands = [c for c in cands if c[1]>=0.75]
|
||||
if cands:
|
||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
||||
results[t] = top
|
||||
return results
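# Illustrative usage sketch (requires network access; results depend on the live
# lobid.org index and the 0.75 similarity cut-off above):
#
#   hits = batch_query_gnd(["Wappen", "Exlibris"])
#   # -> e.g. {"Wappen": "Wappen", "Exlibris": "Exlibris"}; "" where no candidate scores high enough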
|
||||
|
||||
def batch_query_wikidata(terms):
|
||||
results={}
|
||||
if not API_ACTIVE.get("wikidata", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
for t in terms:
|
||||
url="https://www.wikidata.org/w/api.php"
|
||||
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
|
||||
data = request_with_retries("wikidata", url, params)
|
||||
top = ""
|
||||
if data and "search" in data:
|
||||
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
|
||||
cands = [c for c in cands if c[1]>=0.70]
|
||||
if cands:
|
||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
||||
results[t] = top
|
||||
return results
|
||||
|
||||
# =========================
|
||||
# Markierung / Export
|
||||
# =========================
|
||||
def mark_norm_hits(file_path):
|
||||
ext = file_path.suffix.lower()
|
||||
if ext in [".xlsx", ".xls"]:
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.styles import PatternFill
|
||||
wb = load_workbook(file_path)
|
||||
ws = wb.active
|
||||
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
|
||||
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
|
||||
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
|
||||
norm_col = col_map.get("Norm_Treffer", None)
|
||||
if not norm_col:
|
||||
print("Spalte 'Norm_Treffer' nicht gefunden")
|
||||
return
|
||||
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
|
||||
cell = row[0]
|
||||
if cell.value and cell.value != "KEIN TREFFER":
|
||||
cell.fill = green_fill
|
||||
else:
|
||||
cell.fill = red_fill
|
||||
wb.save(file_path)
|
||||
elif ext==".ods":
|
||||
df = pd.read_excel(file_path, engine="odf")
|
||||
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
|
||||
df.to_excel(file_path, index=False, engine="odf")
|
||||
|
||||
# =========================
# Neue Funktion: fehlende Begriffe in separate Datei exportieren
# =========================
|
||||
def export_missing_terms(out_df, output_file):
|
||||
# Filter: KEIN TREFFER & keine Vorschläge
|
||||
missing_df = out_df[
|
||||
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
|
||||
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
|
||||
][["Begriff"]].drop_duplicates()
|
||||
|
||||
count_missing = len(missing_df)
|
||||
print(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
|
||||
|
||||
if count_missing == 0:
|
||||
return
|
||||
|
||||
# Neue Datei erzeugen
|
||||
ext = output_file.suffix.lower()
|
||||
base_name = output_file.stem
|
||||
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
|
||||
|
||||
# Bei vorhandener Datei: Versionsnummer anhängen
|
||||
version = 1
|
||||
while missing_file.exists():
|
||||
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
|
||||
version += 1
|
||||
|
||||
if ext in [".xlsx", ".xls"]:
|
||||
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
|
||||
elif ext == ".ods":
|
||||
missing_df.to_excel(missing_file, index=False, engine="odf")
|
||||
else:
|
||||
# Für CSV
|
||||
missing_df.to_csv(missing_file, index=False, sep=";")
|
||||
|
||||
print(f"Fehlende Begriffe gespeichert: {missing_file}")
|
||||
|
||||
|
||||
# =========================
|
||||
# Verarbeitung Input-Dateien (final)
|
||||
# =========================
|
||||
def process_files():
|
||||
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
|
||||
total_terms = 0
|
||||
total_hits = 0
|
||||
|
||||
if not INPUT_DIR.exists():
|
||||
print(f"Eingabeordner {INPUT_DIR} fehlt")
|
||||
sys.exit(1)
|
||||
files = list(INPUT_DIR.glob("*"))
|
||||
if not files:
|
||||
print("Keine Dateien gefunden")
|
||||
return
|
||||
|
||||
for file_path in files:
|
||||
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
|
||||
continue
|
||||
print(f"Verarbeite Datei: {file_path.name}")
|
||||
try:
|
||||
if file_path.suffix.lower() == ".csv":
|
||||
df = pd.read_csv(file_path)
|
||||
else:
|
||||
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Lesen von {file_path.name}: {e}")
|
||||
continue
|
||||
|
||||
df = df.dropna(how="all")
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
|
||||
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
|
||||
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
|
||||
urh_col = next((c for c in df.columns if "Urheber" in c), None)
|
||||
if not besch_col: continue
|
||||
|
||||
row_terms_map = []
|
||||
for _, row in df.iterrows():
|
||||
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
|
||||
if not besch: continue
|
||||
obj_box = row[box_col] if box_col else ""
|
||||
urheber = row[urh_col] if urh_col else ""
|
||||
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
|
||||
terms = []
|
||||
for clause in clauses:
|
||||
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
|
||||
for p in parts:
|
||||
if p.lower() in STOPWORDS: continue
|
||||
if re.fullmatch(r"\d+", p): continue
|
||||
terms.append(p)
|
||||
row_terms_map.append((obj_box, urheber, terms))
|
||||
|
||||
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
|
||||
gnd_results = batch_query_gnd(all_terms)
|
||||
wd_results = batch_query_wikidata(all_terms)
|
||||
|
||||
output_rows = []
|
||||
for obj_box, urheber, terms in row_terms_map:
|
||||
for term in terms:
|
||||
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
|
||||
total_terms += 1
|
||||
if norm_name != "KEIN TREFFER":
|
||||
total_hits += 1
|
||||
out_row = {
|
||||
"Box": obj_box,
|
||||
"Objekt/Ebene": obj_box,
|
||||
"Urheber": urheber,
|
||||
"Begriff": term,
|
||||
"Norm_Treffer": norm_name,
|
||||
"Norm_ID": norm_id,
|
||||
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
|
||||
"GND_Top1": gnd_results.get(term,""),
|
||||
"WD_Top1": wd_results.get(term,"")
|
||||
}
|
||||
output_rows.append(out_row)
|
||||
|
||||
out_df = pd.DataFrame(output_rows)
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
|
||||
version = 1
|
||||
while output_file.exists():
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
|
||||
version += 1
|
||||
engine = "odf" if output_file.suffix.lower()==".ods" else None
|
||||
out_df.to_excel(output_file, index=False, engine=engine)
|
||||
|
||||
# --- NEU: fehlende Begriffe in separate Datei ---
|
||||
export_missing_terms(out_df, output_file)
|
||||
|
||||
mark_norm_hits(output_file)
|
||||
print(f"Auswertung gespeichert: {output_file}")
|
||||
|
||||
save_cache()
|
||||
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
|
||||
|
||||
|
||||
# =========================
|
||||
# Main
|
||||
# =========================
|
||||
if __name__ == "__main__":
|
||||
process_files()
|
||||
print("Fertig")
|
||||
747
NormVokabular_Mapper_1.4.py
Normal file
@ -0,0 +1,747 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
NormVokabular Mapper – Version 1.4.1
|
||||
- Detailliertes (DEBUG) Batch-Logging: gepufferte Logs werden periodisch in Konsole + Datei geschrieben
|
||||
- Getty AAT (SPARQL via requests) – API-polite, timeout/retries/backoff
|
||||
- Fehlertoleranz: API-Ausfälle führen nicht zum Totalabsturz
|
||||
- Fehlende Begriffe -> separate Datei (gleiches Format wie Output)
|
||||
- Bestehende Normalisierung/Lemmatisierung/Stemming wird weiterverwendet
|
||||
- Batch-Logging-Modus (konfigurierbar)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import threading
|
||||
import queue
|
||||
import requests
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from difflib import SequenceMatcher
|
||||
from datetime import datetime
|
||||
|
||||
# Optional libs
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
except Exception:
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
|
||||
# =========================
|
||||
# Config & Pfade
|
||||
# =========================
|
||||
INPUT_DIR = Path("Input CSV")
|
||||
OUTPUT_DIR = Path("Auswertung Ergebnisse")
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
|
||||
CACHE_FILE = "api_cache.json"
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
CONF_THRESHOLD = 0.75
|
||||
TIMEOUT_DEFAULT = 5
|
||||
MAX_RETRIES_DEFAULT = 3
|
||||
BACKOFF_FACTOR_DEFAULT = 2
|
||||
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
|
||||
API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True}
|
||||
FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0}
|
||||
|
||||
# Logging file
|
||||
LOG_FILE = OUTPUT_DIR / "mapper_log.txt"
|
||||
|
||||
# Batch logging parameters
|
||||
LOG_BATCH_SIZE = 100 # flush wenn >= Einträge
|
||||
LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes (Batch-Logging)
|
||||
LOG_LEVEL = "DEBUG" # ausführlich gewünscht
|
||||
|
||||
# =========================
|
||||
# Buffered/Batched Logger
|
||||
# =========================
|
||||
class BatchLogger:
|
||||
def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"):
|
||||
self.logfile = logfile
|
||||
self.flush_interval = flush_interval
|
||||
self.batch_size = batch_size
|
||||
self.level = level
|
||||
self.q = queue.Queue()
|
||||
self._stop_event = threading.Event()
|
||||
self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread")
|
||||
# Ensure logfile exists
|
||||
try:
|
||||
logfile.parent.mkdir(parents=True, exist_ok=True)
|
||||
logfile.touch(exist_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
self._thread.start()
|
||||
|
||||
def _format(self, level: str, msg: str) -> str:
|
||||
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
return f"{ts} - {level} - {msg}"
|
||||
|
||||
def log(self, level: str, msg: str):
|
||||
if self._stop_event.is_set():
|
||||
return
|
||||
formatted = self._format(level, msg)
|
||||
self.q.put((level, formatted))
|
||||
# If queue too big, trigger immediate flush by putting a special token
|
||||
if self.q.qsize() >= self.batch_size:
|
||||
self.q.put(("__FLUSH__", "__FLUSH__"))
|
||||
|
||||
def debug(self, msg: str):
|
||||
if LOG_LEVEL in ("DEBUG",):
|
||||
self.log("DEBUG", msg)
|
||||
|
||||
def info(self, msg: str):
|
||||
self.log("INFO", msg)
|
||||
|
||||
def warning(self, msg: str):
|
||||
self.log("WARNING", msg)
|
||||
|
||||
def error(self, msg: str):
|
||||
self.log("ERROR", msg)
|
||||
|
||||
def exception(self, msg: str):
|
||||
self.log("EXCEPTION", msg)
|
||||
|
||||
def _worker(self):
|
||||
buffer = []
|
||||
last_flush = time.time()
|
||||
while not self._stop_event.is_set() or not self.q.empty():
|
||||
try:
|
||||
item = None
|
||||
try:
|
||||
item = self.q.get(timeout=self.flush_interval)
|
||||
except queue.Empty:
|
||||
# time-based flush
|
||||
if buffer:
|
||||
self._flush_buffer(buffer)
|
||||
buffer = []
|
||||
last_flush = time.time()
|
||||
continue
|
||||
|
||||
if item is None:
|
||||
continue
|
||||
level, formatted = item
|
||||
if level == "__FLUSH__":
|
||||
if buffer:
|
||||
self._flush_buffer(buffer)
|
||||
buffer = []
|
||||
last_flush = time.time()
|
||||
continue
|
||||
buffer.append((level, formatted))
|
||||
|
||||
# flush conditions
|
||||
if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval:
|
||||
self._flush_buffer(buffer)
|
||||
buffer = []
|
||||
last_flush = time.time()
|
||||
except Exception as e:
|
||||
# As a last resort, write error immediately to stderr
|
||||
try:
|
||||
sys.stderr.write(f"BatchLogger worker error: {e}\n")
|
||||
except Exception:
|
||||
pass
|
||||
time.sleep(0.5)
|
||||
# final flush
|
||||
if buffer:
|
||||
self._flush_buffer(buffer)
|
||||
|
||||
def _flush_buffer(self, buffer):
|
||||
if not buffer:
|
||||
return
|
||||
# write to console and file
|
||||
try:
|
||||
# console
|
||||
out_lines = [f"{line}\n" for _, line in buffer]
|
||||
# write to stdout
|
||||
try:
|
||||
sys.stdout.writelines(out_lines)
|
||||
sys.stdout.flush()
|
||||
except Exception:
|
||||
pass
|
||||
# append to file
|
||||
try:
|
||||
with open(self.logfile, "a", encoding="utf-8") as f:
|
||||
f.writelines(out_lines)
|
||||
except Exception as e:
|
||||
try:
|
||||
sys.stderr.write(f"BatchLogger file write error: {e}\n")
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def stop(self):
|
||||
self._stop_event.set()
|
||||
# put sentinel to wake worker
|
||||
try:
|
||||
self.q.put(("__FLUSH__", "__FLUSH__"))
|
||||
except Exception:
|
||||
pass
|
||||
self._thread.join(timeout=5.0)
|
||||
|
||||
# Instantiate logger
|
||||
logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL)
|
||||
logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)")
|
||||
|
||||
# =========================
|
||||
# Cache laden/speichern
|
||||
# =========================
|
||||
if os.path.exists(CACHE_FILE):
|
||||
try:
|
||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
logger.debug(f"Cache geladen ({len(CACHE)} Einträge).")
|
||||
except Exception as e:
|
||||
logger.warning(f"Cache konnte nicht geladen werden: {e}")
|
||||
CACHE = {}
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
def save_cache():
|
||||
try:
|
||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
||||
logger.debug("Cache gespeichert.")
|
||||
except Exception as e:
|
||||
logger.error(f"Cache konnte nicht gespeichert werden: {e}")
|
||||
|
||||
# =========================
|
||||
# Normalisierung / Lemma / Tokenization
|
||||
# =========================
|
||||
def normalize_text(s):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).lower().strip()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
||||
s = re.sub(r"\s+"," ",s)
|
||||
return s
|
||||
|
||||
lemma_cache = {}
|
||||
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
try:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
except Exception:
|
||||
lemma = term_norm
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
def compound_split(term):
|
||||
if not term:
|
||||
return []
|
||||
parts = [p for p in re.split(r"[\s\-_/]+", term) if p]
|
||||
return parts if parts else [term]
|
||||
|
||||
# =========================
|
||||
# Normvokabular laden & Index
|
||||
# =========================
|
||||
def load_normvokabular(file_path):
|
||||
try:
|
||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
except Exception as e:
|
||||
logger.error(f"Normvokabular konnte nicht geladen werden: {e}")
|
||||
raise
|
||||
norm_dict = {}
|
||||
stem_index = defaultdict(list)
|
||||
lemma_norm_map = {}
|
||||
|
||||
for sheet_name, df in sheets.items():
|
||||
if sheet_name.lower() in ["master", "übersicht"]:
|
||||
continue
|
||||
df = df.dropna(how="all", axis=1)
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
id_col = next((c for c in df.columns if "ID" in c), None)
|
||||
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None)
|
||||
if not id_col or not word_col:
|
||||
continue
|
||||
current_parent_id = None
|
||||
for _, row in df.iterrows():
|
||||
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
|
||||
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
|
||||
if row_id:
|
||||
current_parent_id = row_id
|
||||
if not row_word:
|
||||
continue
|
||||
assigned_parent_id = current_parent_id
|
||||
entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""}
|
||||
key = normalize_text(row_word)
|
||||
norm_dict[key] = entry
|
||||
lemma = lemmatize_term(key)
|
||||
stem_index[lemma].append(entry)
|
||||
if lemma not in lemma_norm_map:
|
||||
lemma_norm_map[lemma] = entry
|
||||
logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems")
|
||||
return norm_dict, stem_index, lemma_norm_map
|
||||
|
||||
# =========================
|
||||
# Mapping & Vorschläge
|
||||
# =========================
|
||||
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
|
||||
term_norm = normalize_text(term)
|
||||
term_lemma = lemmatize_term(term)
|
||||
|
||||
if term_norm in norm_dict:
|
||||
e = norm_dict[term_norm]
|
||||
logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}")
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
if term_lemma in stem_index:
|
||||
e = stem_index[term_lemma][0]
|
||||
logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}")
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
tokens = compound_split(term_norm)
|
||||
if len(tokens) == 1:
|
||||
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
|
||||
logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}")
|
||||
return "KEIN TREFFER", "", suggestions
|
||||
else:
|
||||
token_matches = []
|
||||
for t in tokens:
|
||||
t_lemma = lemmatize_term(t)
|
||||
if t_lemma in stem_index:
|
||||
e = stem_index[t_lemma][0]
|
||||
token_matches.append((t, e["Name"], e["ID"]))
|
||||
else:
|
||||
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
|
||||
token_matches.append((t, "KEIN TREFFER", "", sugg))
|
||||
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
|
||||
logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}")
|
||||
return "KEIN TREFFER", "", combined_suggestions
|
||||
|
||||
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
|
||||
candidates = []
|
||||
for key_lemma, entry in lemma_norm_map.items():
|
||||
if RAPIDFUZZ_AVAILABLE:
|
||||
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
|
||||
else:
|
||||
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
|
||||
if key_lemma.lower().startswith(term_lemma.lower()):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
candidates.append((score, entry["Name"], entry["ID"]))
|
||||
candidates.sort(reverse=True)
|
||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
||||
|
||||
# =========================
|
||||
# Generic request with retries & caching
|
||||
# =========================
|
||||
def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT):
|
||||
cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "")
|
||||
if cache_key in CACHE:
|
||||
logger.debug(f"[Cache] {api_name}: {cache_key}")
|
||||
return CACHE[cache_key]
|
||||
retries = 0
|
||||
while retries < max_retries:
|
||||
try:
|
||||
r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout)
|
||||
if r.status_code == 200:
|
||||
try:
|
||||
data = r.json()
|
||||
except Exception:
|
||||
data = r.text
|
||||
CACHE[cache_key] = data
|
||||
FAIL_COUNTER[api_name] = 0
|
||||
logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}")
|
||||
return data
|
||||
else:
|
||||
logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}")
|
||||
raise ValueError(f"HTTP {r.status_code}")
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
wait = backoff ** retries
|
||||
logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s")
|
||||
time.sleep(wait)
|
||||
FAIL_COUNTER[api_name] += 1
|
||||
if FAIL_COUNTER[api_name] >= 10:
|
||||
API_ACTIVE[api_name] = False
|
||||
logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.")
|
||||
return None
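# Illustrative call sketch (hypothetical parameters; a real call needs network access).
# A second call with identical url/params is served from CACHE via cache_key above.
#
#   data = request_with_retries_generic(
#       "wikidata",
#       "https://www.wikidata.org/w/api.php",
#       params={"action": "wbsearchentities", "search": "Wappen", "language": "de", "format": "json"},
#   )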
|
||||
|
||||
# =========================
|
||||
# GND / Wikidata (bestehend)
|
||||
# =========================
|
||||
def batch_query_gnd(terms):
|
||||
results = {}
|
||||
if not API_ACTIVE.get("gnd", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme")
|
||||
start = time.time()
|
||||
for idx, t in enumerate(terms, start=1):
|
||||
logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'")
|
||||
url = "https://lobid.org/gnd/search"
|
||||
params = {"q": t, "format": "json"}
|
||||
data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
|
||||
top = ""
|
||||
try:
|
||||
if data and "member" in data:
|
||||
cands = [(doc.get("preferredName","") or doc.get("name",""),
|
||||
SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio())
|
||||
for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
|
||||
cands = [c for c in cands if c[1] >= 0.75]
|
||||
if cands:
|
||||
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
|
||||
except Exception as e:
|
||||
logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}")
|
||||
results[t] = top
|
||||
elapsed = time.time() - start
|
||||
logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s")
|
||||
return results
|
||||
|
||||
def batch_query_wikidata(terms):
|
||||
results = {}
|
||||
if not API_ACTIVE.get("wikidata", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme")
|
||||
start = time.time()
|
||||
for idx, t in enumerate(terms, start=1):
|
||||
logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'")
|
||||
url = "https://www.wikidata.org/w/api.php"
|
||||
params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
|
||||
data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT)
|
||||
top = ""
|
||||
try:
|
||||
if data and "search" in data:
|
||||
cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio())
|
||||
for e in data["search"] if e.get("label","")]
|
||||
cands = [c for c in cands if c[1] >= 0.70]
|
||||
if cands:
|
||||
top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
|
||||
except Exception as e:
|
||||
logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}")
|
||||
results[t] = top
|
||||
elapsed = time.time() - start
|
||||
logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s")
|
||||
return results
|
||||
|
||||
# =========================
|
||||
# Getty AAT Abfrage – robust & API-polite (requests)
|
||||
# =========================
|
||||
def batch_query_getty_aat(terms):
|
||||
results = {}
|
||||
if not API_ACTIVE.get("aat", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
|
||||
endpoint = "https://vocab.getty.edu/sparql"
|
||||
headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")}
|
||||
TIMEOUT = 8
|
||||
MAX_RETRIES = 3
|
||||
BACKOFF_FACTOR = 2
|
||||
FAIL_LIMIT = 5
|
||||
fail_counter_local = 0
|
||||
|
||||
logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme")
|
||||
start_all = time.time()
|
||||
for idx, term in enumerate(terms, start=1):
|
||||
term_norm = lemmatize_term(normalize_text(term))
|
||||
tokens = compound_split(term_norm)
|
||||
logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}")
|
||||
|
||||
query_fragments = []
|
||||
for tkn in tokens:
|
||||
t_escaped = tkn.replace('"', '\\"')
|
||||
qf = f"""
|
||||
?concept skos:prefLabel ?label .
|
||||
FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}")))
|
||||
"""
|
||||
query_fragments.append(f"{{ {qf} }}")
|
||||
query_body = " UNION ".join(query_fragments) if query_fragments else ""
|
||||
query = f"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10"
|
||||
|
||||
retries = 0
|
||||
success = False
|
||||
start_term = time.time()
|
||||
while retries < MAX_RETRIES and not success:
|
||||
try:
|
||||
logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'")
|
||||
r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT)
|
||||
if r.status_code != 200:
|
||||
raise ValueError(f"HTTP {r.status_code}")
|
||||
ret = r.json()
|
||||
candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])]
|
||||
if candidates:
|
||||
scored = [
|
||||
(c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio())
|
||||
for c in candidates
|
||||
]
|
||||
top = max(scored, key=lambda x: x[2])
|
||||
results[term] = top[0]
|
||||
logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})")
|
||||
else:
|
||||
results[term] = ""
|
||||
logger.debug(f"[AAT] Kein Treffer für '{term}'")
|
||||
success = True
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
wait = BACKOFF_FACTOR ** retries
|
||||
logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} – warte {wait}s")
|
||||
time.sleep(wait)
|
||||
if retries == MAX_RETRIES:
|
||||
results[term] = ""
|
||||
fail_counter_local += 1
|
||||
# polite delay
|
||||
time.sleep(1.0)
|
||||
elapsed_term = time.time() - start_term
|
||||
logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s")
|
||||
|
||||
if fail_counter_local >= FAIL_LIMIT:
|
||||
logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.")
|
||||
for t_rem in terms[idx:]:
|
||||
results[t_rem] = ""
|
||||
FAIL_COUNTER["aat"] += fail_counter_local
|
||||
API_ACTIVE["aat"] = False
|
||||
break
|
||||
|
||||
elapsed_all = time.time() - start_all
|
||||
logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s")
|
||||
return results
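# Illustrative sketch: for a single-token term like "wappen" the SPARQL built above
# is roughly the following (multi-token terms combine one such block per token via UNION):
#
#   PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
#   SELECT ?label ?concept WHERE {
#     { ?concept skos:prefLabel ?label .
#       FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("wappen"))) }
#   } LIMIT 10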
|
||||
|
||||
# =========================
|
||||
# Markierung / Export (Excel/ODS)
|
||||
# =========================
|
||||
def mark_norm_hits(file_path):
|
||||
ext = file_path.suffix.lower()
|
||||
try:
|
||||
if ext in [".xlsx", ".xls"]:
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.styles import PatternFill
|
||||
wb = load_workbook(file_path)
|
||||
ws = wb.active
|
||||
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
|
||||
norm_col = col_map.get("Norm_Treffer", None)
|
||||
if not norm_col:
|
||||
logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).")
|
||||
wb.save(file_path)
|
||||
return
|
||||
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
|
||||
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
|
||||
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
|
||||
cell = row[0]
|
||||
if cell.value and cell.value != "KEIN TREFFER":
|
||||
cell.fill = green_fill
|
||||
else:
|
||||
cell.fill = red_fill
|
||||
wb.save(file_path)
|
||||
elif ext==".ods":
|
||||
df = pd.read_excel(file_path, engine="odf")
|
||||
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
|
||||
df.to_excel(file_path, index=False, engine="odf")
|
||||
except Exception as e:
|
||||
logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}")
|
||||
|
||||
# =========================
|
||||
# Fehlende Begriffe -> separate Datei
|
||||
# =========================
|
||||
def export_missing_terms(out_df, output_file):
|
||||
missing_df = out_df[
|
||||
(out_df["Norm_Treffer"] == "KEIN TREFFER") &
|
||||
(out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == ""))
|
||||
][["Begriff"]].drop_duplicates()
|
||||
|
||||
count_missing = len(missing_df)
|
||||
logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}")
|
||||
|
||||
if count_missing == 0:
|
||||
return
|
||||
|
||||
ext = output_file.suffix.lower()
|
||||
base_name = output_file.stem
|
||||
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}"
|
||||
version = 1
|
||||
while missing_file.exists():
|
||||
missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}"
|
||||
version += 1
|
||||
|
||||
try:
|
||||
if ext in [".xlsx", ".xls"]:
|
||||
missing_df.to_excel(missing_file, index=False, engine="openpyxl")
|
||||
elif ext == ".ods":
|
||||
missing_df.to_excel(missing_file, index=False, engine="odf")
|
||||
else:
|
||||
missing_df.to_csv(missing_file, index=False, sep=";")
|
||||
logger.info(f"Fehlende Begriffe gespeichert: {missing_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}")
|
||||
|
||||
# =========================
|
||||
# Haupt-Loop: Verarbeitung Input-Dateien
|
||||
# =========================
|
||||
def process_files():
|
||||
overall_start = time.time()
|
||||
try:
|
||||
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
|
||||
except Exception as e:
|
||||
logger.error("Normvokabular konnte nicht geladen werden. Beende.")
|
||||
raise
|
||||
|
||||
total_terms = 0
|
||||
total_hits = 0
|
||||
|
||||
if not INPUT_DIR.exists():
|
||||
logger.error(f"Eingabeordner {INPUT_DIR} fehlt")
|
||||
raise SystemExit(1)
|
||||
files = list(INPUT_DIR.glob("*"))
|
||||
if not files:
|
||||
logger.info("Keine Dateien gefunden")
|
||||
return
|
||||
|
||||
logger.info(f"Starte Verarbeitung von {len(files)} Dateien")
|
||||
for file_idx, file_path in enumerate(files, start=1):
|
||||
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
|
||||
logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}")
|
||||
continue
|
||||
logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}")
|
||||
file_start = time.time()
|
||||
try:
|
||||
if file_path.suffix.lower() == ".csv":
|
||||
df = pd.read_csv(file_path)
|
||||
else:
|
||||
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
except Exception as e:
|
||||
logger.error(f"Fehler beim Lesen von {file_path.name}: {e}")
|
||||
continue
|
||||
|
||||
df = df.dropna(how="all")
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
|
||||
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
|
||||
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
|
||||
urh_col = next((c for c in df.columns if "Urheber" in c), None)
|
||||
if not besch_col:
|
||||
logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.")
|
||||
continue
|
||||
|
||||
row_terms_map = []
|
||||
for r_idx, row in enumerate(df.itertuples(index=False), start=1):
|
||||
try:
|
||||
besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else ""
|
||||
except Exception:
|
||||
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
|
||||
if not besch:
|
||||
continue
|
||||
obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else ""
|
||||
urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else ""
|
||||
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
|
||||
terms = []
|
||||
for clause in clauses:
|
||||
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
|
||||
for p in parts:
|
||||
if p.lower() in STOPWORDS:
|
||||
continue
|
||||
if re.fullmatch(r"\d+", p):
|
||||
continue
|
||||
terms.append(p)
|
||||
row_terms_map.append((obj_box, urheber, terms))
|
||||
if (r_idx % 200) == 0:
|
||||
logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet")
|
||||
|
||||
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
|
||||
logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}")
|
||||
total_unique_terms = len(all_terms)
|
||||
# API-Abfragen
|
||||
t0 = time.time()
|
||||
gnd_results = batch_query_gnd(all_terms)
|
||||
t1 = time.time()
|
||||
logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s")
|
||||
wd_results = batch_query_wikidata(all_terms)
|
||||
t2 = time.time()
|
||||
logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s")
|
||||
aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t:"" for t in all_terms}
|
||||
t3 = time.time()
|
||||
logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s")
|
||||
|
||||
# Build output rows
|
||||
output_rows = []
|
||||
processed_count = 0
|
||||
for obj_box, urheber, terms in row_terms_map:
|
||||
for term in terms:
|
||||
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
|
||||
total_terms += 1
|
||||
if norm_name != "KEIN TREFFER":
|
||||
total_hits += 1
|
||||
out_row = {
|
||||
"Box": obj_box,
|
||||
"Objekt/Ebene": obj_box,
|
||||
"Urheber": urheber,
|
||||
"Begriff": term,
|
||||
"Norm_Treffer": norm_name,
|
||||
"Norm_ID": norm_id,
|
||||
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
|
||||
"GND_Top1": gnd_results.get(term,""),
|
||||
"WD_Top1": wd_results.get(term,""),
|
||||
"AAT_Top1": aat_results.get(term,"")
|
||||
}
|
||||
output_rows.append(out_row)
|
||||
processed_count += 1
|
||||
if (processed_count % 200) == 0:
|
||||
logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet")
|
||||
|
||||
out_df = pd.DataFrame(output_rows)
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
|
||||
version = 1
|
||||
while output_file.exists():
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
|
||||
version += 1
|
||||
engine = "odf" if output_file.suffix.lower()==".ods" else None
|
||||
|
||||
try:
|
||||
out_df.to_excel(output_file, index=False, engine=engine)
|
||||
logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}")
|
||||
continue
|
||||
|
||||
export_missing_terms(out_df, output_file)
|
||||
mark_norm_hits(output_file)
|
||||
|
||||
file_elapsed = time.time() - file_start
|
||||
logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)")
|
||||
|
||||
overall_elapsed = time.time() - overall_start
|
||||
logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s")
|
||||
|
||||
# =========================
|
||||
# Main
|
||||
# =========================
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
process_files()
|
||||
except KeyboardInterrupt:
|
||||
logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).")
|
||||
except SystemExit:
|
||||
logger.warning("SystemExit aufgetreten.")
|
||||
except Exception as e:
|
||||
logger.exception(f"Ungefangener Fehler: {e}")
|
||||
finally:
|
||||
# Stop logger (flush remaining logs)
|
||||
try:
|
||||
save_cache()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
logger.info("Beende.")
|
||||
logger.stop()
|
||||
except Exception:
|
||||
pass
|
||||
46
NormVokabular_Mapper_Wrapper.py
Normal file
@ -0,0 +1,46 @@
|
||||
import subprocess
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
def run_mapper(term):
|
||||
"""
|
||||
Ruft das bestehende mapper script auf und liefert Vorschläge zurück.
|
||||
Erwartet, dass das mapper script eine JSON-Ausgabe liefert:
|
||||
{
|
||||
"term": "Begriff",
|
||||
"norm_name": "Normierter Treffer oder KEIN TREFFER",
|
||||
"norm_id": "ID",
|
||||
"suggestions": ["Vorschlag1", "Vorschlag2", "Vorschlag3"]
|
||||
}
|
||||
"""
|
||||
mapper_script = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_1.2.py") # dein bestehendes Mapper-Skript
|
||||
if not mapper_script.exists():
|
||||
raise FileNotFoundError(f"{mapper_script} nicht gefunden")
|
||||
|
||||
# Übergabe als JSON-String
|
||||
input_json = json.dumps({"term": term})
|
||||
|
||||
# Aufruf via subprocess
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(mapper_script), input_json],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Mapper Fehler: {result.stderr}")
|
||||
|
||||
try:
|
||||
output = json.loads(result.stdout)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Ungültige Ausgabe vom Mapper: {e}")
|
||||
|
||||
return output
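# Illustrative usage sketch (assumes the mapper script referenced above prints exactly
# one JSON object to stdout, as described in the docstring):
#
#   result = run_mapper("Wappen")
#   print(result["norm_name"], result["suggestions"])
#
# Command line:  python NormVokabular_Mapper_Wrapper.py Wappen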
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) > 1:
|
||||
term = sys.argv[1]
|
||||
output = run_mapper(term)
|
||||
print(json.dumps(output, ensure_ascii=False))
|
||||
BIN
Test 20 Exlibris Alice Glotin.ods
Normal file
Binary file not shown.
BIN
Test API.ods
Normal file
Binary file not shown.
101
Tryout/NVTest.py
Normal file
@ -0,0 +1,101 @@
|
||||
import pandas as pd
|
||||
import requests
|
||||
import time
|
||||
import os
|
||||
|
||||
def match_gnd(token, delay=0.3):
|
||||
"""GND-Abfrage für ein Schlagwort, gibt erstes Ergebnis zurück"""
|
||||
url = f"https://lobid.org/gnd/search?q={token}&format=json"
|
||||
try:
|
||||
resp = requests.get(url, timeout=5)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if 'member' in data and data['member']:
|
||||
first = data['member'][0]
|
||||
return first.get('preferredName'), first.get('gndIdentifier')
|
||||
except Exception as e:
|
||||
print(f"Fehler bei GND-Abfrage für '{token}': {e}")
|
||||
time.sleep(delay)
|
||||
return None, None
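# Illustrative call sketch (live request against lobid.org; the result depends on the index):
#
#   name, gnd_id = match_gnd("Exlibris")
#   # -> (preferredName, gndIdentifier) of the first hit, otherwise (None, None)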
|
||||
|
||||
def load_exlibris_refs(path):
|
||||
"""CSV einlesen, Scan-Zuordnung, Platzhalter-Inventarnummer, GND-Abgleich"""
|
||||
df = pd.read_csv(path, dtype=str, header=0)
|
||||
# erste Spalte leer? → "Kürzel"
|
||||
if df.columns[0].strip() == '':
|
||||
df.rename(columns={df.columns[0]: 'Kürzel'}, inplace=True)
|
||||
df.fillna('', inplace=True)
|
||||
|
||||
# Scan-Level-Spalten
|
||||
level_cols = [c for c in df.columns if c.strip() in ['0','1','2','3','4']]
|
||||
|
||||
obj_list = []
|
||||
current_obj = None
|
||||
placeholder_counter = 1
|
||||
|
||||
for _, row in df.iterrows():
|
||||
has_0 = row['0'].strip() if '0' in df.columns else ''
|
||||
row_refs = []
|
||||
for c in level_cols:
|
||||
val = row[c].strip()
|
||||
if val:
|
||||
row_refs.append({'level': c, 'scan_ref': val})
|
||||
|
||||
if has_0:
|
||||
if current_obj:
|
||||
obj_list.append(current_obj)
|
||||
core_data = {col: row[col] for col in df.columns if col not in level_cols}
|
||||
# Inventarnummer prüfen
|
||||
inv = core_data.get('Inventarnummer','').strip()
|
||||
if not inv:
|
||||
core_data['Inventarnummer'] = f'PL-{placeholder_counter:04d}'
|
||||
placeholder_counter += 1
|
||||
# GND-Abgleich
|
||||
obj_descr = core_data.get('Objektbeschreibung','')
|
||||
gnd_name, gnd_id = None, None
|
||||
if obj_descr:
|
||||
tokens = [t.strip() for t in obj_descr.split(',') if t.strip()]
|
||||
for t in tokens:
|
||||
name, gid = match_gnd(t)
|
||||
if gid:
|
||||
gnd_name = name
|
||||
gnd_id = gid
|
||||
break
|
||||
core_data['GND_Name'] = gnd_name
|
||||
core_data['GND_ID'] = gnd_id
|
||||
current_obj = core_data
|
||||
current_obj['ScanReferenzen'] = row_refs
|
||||
else:
|
||||
if current_obj:
|
||||
current_obj['ScanReferenzen'].extend(row_refs)
|
||||
|
||||
if current_obj:
|
||||
obj_list.append(current_obj)
|
||||
|
||||
out_df = pd.DataFrame(obj_list)
|
||||
core_fields = ['Kürzel','Inventarnummer','Standort','Jahr','Urheber','Eigner',
|
||||
'Objektbeschreibung','Material','Maße (in cm)',
|
||||
'Objekttyp','Inschrift','Anmerkungen','ScanReferenzen',
|
||||
'GND_Name','GND_ID']
|
||||
available = [c for c in core_fields if c in out_df.columns]
|
||||
return out_df[available]
|
||||
|
||||
# ====================
|
||||
# Hauptteil
|
||||
# ====================
|
||||
if __name__ == "__main__":
|
||||
# CSV im gleichen Ordner suchen
|
||||
csv_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
|
||||
if not csv_files:
|
||||
print("Keine CSV-Datei im aktuellen Ordner gefunden.")
|
||||
exit(1)
|
||||
# nimm die erste gefundene CSV
|
||||
input_csv = csv_files[0]
|
||||
print(f"Verwende CSV-Datei: {input_csv}")
|
||||
|
||||
df = load_exlibris_refs(input_csv)
|
||||
|
||||
# Ergebnis als Testergebnis.csv speichern
|
||||
output_file = "Testergebnis.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Aufbereitete Daten gespeichert als {output_file}")
|
||||
190
VLG.py
Normal file
@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
VLG.py – Gruppierung und Auflösung der Spalte "Objektbeschreibung"
|
||||
NOCH OHNE AAT-ABGLEICH
|
||||
|
||||
- Prüft ezodf in aktueller Umgebung
|
||||
- Liest ODS aus "Input CSV/"
|
||||
- Extrahiert Begriffe aus "Objektbeschreibung"
|
||||
- Lemmatisierung (Spacy) + Stopwortfilter
|
||||
- Subtokenisierung komplexer Phrasen
|
||||
- Zählt Häufigkeiten
|
||||
- Ausgabe ODS / CSV-Fallback in "Auswertung Ergebnisse"
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
from collections import Counter
|
||||
import pandas as pd
|
||||
import spacy
|
||||
|
||||
# ---------------------------
|
||||
# Logging
|
||||
# ---------------------------
|
||||
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
||||
|
||||
# ---------------------------
|
||||
# ezodf prüfen
|
||||
# ---------------------------
|
||||
try:
|
||||
import ezodf
|
||||
EZODF_AVAILABLE = True
|
||||
logging.info(f"ezodf erkannt")
|
||||
except ImportError:
|
||||
EZODF_AVAILABLE = False
|
||||
logging.error("ezodf konnte nicht importiert werden!")
|
||||
logging.error("Möglicherweise nutzen Sie nicht die Python-Umgebung, in der ezodf installiert ist.")
|
||||
logging.error(f"Aktuelle Python-Executable: {sys.executable}")
|
||||
logging.error("Bitte prüfen Sie Ihre venv oder installieren Sie ezodf in dieser Umgebung:")
|
||||
logging.error(" python -m pip install ezodf")
|
||||
sys.exit(1)
|
||||
|
||||
# ---------------------------
|
||||
# Spacy laden
|
||||
# ---------------------------
|
||||
try:
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
logging.info("Spacy-Modell geladen.")
|
||||
except Exception as e:
|
||||
logging.error(f"Spacy-Modell konnte nicht geladen werden: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# ---------------------------
|
||||
# Konfiguration
|
||||
# ---------------------------
|
||||
INPUT_FOLDER = "Input CSV"
|
||||
OUTPUT_FOLDER = "Auswertung Ergebnisse"
|
||||
INPUT_FILENAME = None
|
||||
TARGET_COLUMN = "Objektbeschreibung"
|
||||
STOPWORDS = {"mit", "auf", "von", "und", "der", "die", "das"} # erweiterbar
|
||||
MAPPING = { # Projektinterne Sonderfälle
|
||||
"exlibris": "exlibris",
|
||||
"wappen": "wappen"
|
||||
}
|
||||
|
||||
# ---------------------------
|
||||
# Funktionen
|
||||
# ---------------------------
|
||||
def find_input_file(folder: str, filename_hint: str = None):
|
||||
if not os.path.isdir(folder):
|
||||
raise FileNotFoundError(f"Input-Ordner '{folder}' existiert nicht.")
|
||||
files = [f for f in os.listdir(folder) if f.lower().endswith(".ods")]
|
||||
if filename_hint:
|
||||
for f in files:
|
||||
if f == filename_hint or filename_hint in f:
|
||||
return os.path.join(folder, f)
|
||||
if not files:
|
||||
raise FileNotFoundError(f"Keine .ods-Dateien in '{folder}' gefunden.")
|
||||
return os.path.join(folder, files[0])
|
||||
|
||||
def read_ods_first_sheet(path: str) -> pd.DataFrame:
|
||||
"""Lädt ODS, erkennt automatisch Header-Zeile."""
|
||||
try:
|
||||
df = pd.read_excel(path, engine="odf", header=None)
|
||||
logging.info("ODS mit pandas + odfpy geladen.")
|
||||
except Exception as e1:
|
||||
logging.warning(f"pandas + odfpy konnte ODS nicht lesen ({e1}).")
|
||||
if not EZODF_AVAILABLE:
|
||||
raise RuntimeError("ezodf nicht installiert und pandas + odfpy fehlgeschlagen.")
|
||||
doc = ezodf.opendoc(path)
|
||||
sheet = doc.sheets[0]
|
||||
data = []
|
||||
for row in sheet.rows():
|
||||
values = [c.value if hasattr(c, "value") else "" for c in row]
|
||||
data.append(values)
|
||||
df = pd.DataFrame(data)
|
||||
logging.info("ODS mit ezodf geladen.")
|
||||
|
||||
# Header-Zeile automatisch finden
|
||||
header_row_index = None
|
||||
for i, row in df.iterrows():
|
||||
row_str = row.fillna("").astype(str).str.lower()
|
||||
if any("objektbeschreibung" in str(cell) for cell in row_str):
|
||||
header_row_index = i
|
||||
break
|
||||
if header_row_index is None:
|
||||
raise KeyError("Keine Header-Zeile mit 'Objektbeschreibung' gefunden.")
|
||||
|
||||
df.columns = df.iloc[header_row_index]
|
||||
df = df.iloc[header_row_index + 1:].reset_index(drop=True)
|
||||
return df
|
||||
|
||||
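# Grobe Beispielskizze (abhängig vom geladenen Spacy-Modell): aus
# "Wappen mit Helm, Eule" entstehen nach Komma-Split, Lemmatisierung
# und Stopwortfilter in etwa die Begriffe ["wappen", "helm", "eule"].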
def tokenize_and_lemmatize(series: pd.Series) -> list:
|
||||
"""Tokenisiert, entfernt Stopwords, wendet Mapping + Spacy-Lemmatisierung an."""
|
||||
series = series.fillna("").astype(str).str.strip().str.lower()
|
||||
all_terms = []
|
||||
for text in series:
|
||||
if not text:
|
||||
continue
|
||||
# Komma-Split
|
||||
for part in [p.strip() for p in text.split(",") if p.strip()]:
|
||||
# Subtokenisierung via Spacy
|
||||
doc = nlp(part)
|
||||
for token in doc:
|
||||
lemma = token.lemma_.lower()
|
||||
if lemma in STOPWORDS:
|
||||
continue
|
||||
lemma = MAPPING.get(lemma, lemma)
|
||||
if lemma:
|
||||
all_terms.append(lemma)
|
||||
return all_terms
|
||||
|
||||
def write_output(rows: list, outpath: str):
|
||||
if EZODF_AVAILABLE:
|
||||
if not rows:
|
||||
logging.warning("Keine Daten zum Schreiben.")
|
||||
return
|
||||
keys = list(rows[0].keys())
|
||||
doc = ezodf.newdoc(doctype="ods", filename=outpath)
|
||||
sheet = ezodf.Sheet("Auswertung", size=(len(rows)+1, len(keys)))
|
||||
doc.sheets += sheet
|
||||
for ci, k in enumerate(keys):
|
||||
sheet[0, ci].set_value(k)
|
||||
for ri, row in enumerate(rows, start=1):
|
||||
for ci, k in enumerate(keys):
|
||||
sheet[ri, ci].set_value(row.get(k, ""))
|
||||
doc.save()
|
||||
logging.info(f"ODS geschrieben: {outpath}")
|
||||
else:
|
||||
csv_path = os.path.splitext(outpath)[0] + ".csv"
|
||||
df = pd.DataFrame(rows)
|
||||
df.to_csv(csv_path, index=False, sep=";", encoding="utf-8")
|
||||
logging.info(f"CSV-Fallback geschrieben: {csv_path}")
|
||||
|
||||
# ---------------------------
|
||||
# Hauptfunktion
|
||||
# ---------------------------
|
||||
def main(input_folder=INPUT_FOLDER, input_filename=INPUT_FILENAME):
|
||||
input_path = find_input_file(input_folder, filename_hint=input_filename)
|
||||
input_basename = os.path.splitext(os.path.basename(input_path))[0]
|
||||
logging.info(f"Verarbeite Datei: {input_path}")
|
||||
|
||||
df = read_ods_first_sheet(input_path)
|
||||
logging.info(f"Geladene Spalten: {list(df.columns)}")
|
||||
|
||||
if TARGET_COLUMN.lower() not in [str(c).lower() for c in df.columns]:
|
||||
raise KeyError(f"Spalte '{TARGET_COLUMN}' nicht gefunden.")
|
||||
|
||||
terms = tokenize_and_lemmatize(df[TARGET_COLUMN])
|
||||
logging.info(f"Gefundene Begriffe: {len(terms)}")
|
||||
|
||||
counts = Counter(terms)
|
||||
sorted_terms = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
|
||||
rows = [{"Begriff": term, "Anzahl": freq} for term, freq in sorted_terms]
|
||||
|
||||
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
||||
out_name = f"{input_basename} Auswertung.ods"
|
||||
out_path = os.path.join(OUTPUT_FOLDER, out_name)
|
||||
write_output(rows, out_path)
|
||||
logging.info("Fertig.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
argv = sys.argv[1:]
|
||||
folder = INPUT_FOLDER
|
||||
fname = INPUT_FILENAME
|
||||
if len(argv) >= 1:
|
||||
folder = argv[0]
|
||||
if len(argv) >= 2:
|
||||
fname = argv[1]
|
||||
main(input_folder=folder, input_filename=fname)
|
||||
262
VLG_API_multi.py
Normal file
@ -0,0 +1,262 @@
|
||||
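# Aufrufskizze (Annahme): python VLG_API_multi.py [--dry-run]
# Mit --dry-run werden die API-Abfragen nur simuliert (Dummy-Treffer, keine HTTP-Requests).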
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import requests
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from difflib import SequenceMatcher
|
||||
import argparse
|
||||
|
||||
# =========================
|
||||
# Argumente / Dry-Run
|
||||
# =========================
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
|
||||
args = parser.parse_args()
|
||||
DRY_RUN = args.dry_run
|
||||
|
||||
# =========================
|
||||
# Konfiguration
|
||||
# =========================
|
||||
INPUT_DIR = Path("Input CSV")
|
||||
OUTPUT_DIR = Path("Auswertung Ergebnisse")
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
|
||||
TIMEOUT = 5
|
||||
MAX_RETRIES = 3
|
||||
BACKOFF_FACTOR = 2
|
||||
MAX_CONSECUTIVE_FAILURES = 10
|
||||
|
||||
CACHE_FILE = "api_cache.json"
|
||||
if os.path.exists(CACHE_FILE):
|
||||
with open(CACHE_FILE, "r", encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
API_ACTIVE = {"gnd": True, "wikidata": True}
|
||||
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
|
||||
|
||||
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
|
||||
|
||||
# =========================
|
||||
# Logging
|
||||
# =========================
|
||||
def log(level, msg):
|
||||
print(f"[{level}] {msg}")
|
||||
|
||||
# =========================
|
||||
# Cache speichern
|
||||
# =========================
|
||||
def save_cache():
|
||||
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# =========================
|
||||
# Request mit Retry & Backoff
|
||||
# =========================
|
||||
def request_with_retries(api_name, url, params=None):
|
||||
if DRY_RUN:
|
||||
return {"dummy": True}
|
||||
if not API_ACTIVE[api_name]:
|
||||
return None
|
||||
|
||||
cache_key = url + (str(params) if params else "")
|
||||
if cache_key in CACHE:
|
||||
return CACHE[cache_key]
|
||||
|
||||
retries = 0
|
||||
while retries < MAX_RETRIES:
|
||||
try:
|
||||
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
|
||||
if r.status_code == 200:
|
||||
try:
|
||||
data = r.json()
|
||||
except ValueError:
|
||||
data = r.text
|
||||
CACHE[cache_key] = data
|
||||
save_cache()
|
||||
FAIL_COUNTER[api_name] = 0
|
||||
return data
|
||||
elif r.status_code in [403, 429]:
|
||||
log("ERROR", f"{api_name.upper()} HTTP {r.status_code} – Stopschalter aktiviert")
|
||||
API_ACTIVE[api_name] = False
|
||||
return None
|
||||
else:
|
||||
log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
|
||||
except requests.exceptions.Timeout:
|
||||
log("ERROR", f"Timeout bei {api_name.upper()}")
|
||||
except Exception as e:
|
||||
log("ERROR", f"Fehler bei {api_name.upper()}: {e}")
|
||||
|
||||
retries += 1
|
||||
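# Exponentielles Backoff: mit BACKOFF_FACTOR = 2 und MAX_RETRIES = 3 ergeben sich
# Wartezeiten von 2, 4 und 8 Sekunden, gedeckelt bei 30 Sekunden.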
sleep_time = min(BACKOFF_FACTOR ** retries, 30)
|
||||
time.sleep(sleep_time)
|
||||
|
||||
FAIL_COUNTER[api_name] += 1
|
||||
if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
|
||||
log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} – Stopschalter aktiviert")
|
||||
API_ACTIVE[api_name] = False
|
||||
return None
|
||||
|
||||
# =========================
|
||||
# API-Abfragen mit Confidence
|
||||
# =========================
|
||||
def query_gnd(term, min_conf=0.6):
|
||||
if DRY_RUN or not API_ACTIVE["gnd"]:
|
||||
return "TEST_GND", 1.0
|
||||
|
||||
url = f"https://lobid.org/gnd/search?q={term}&format=json"
|
||||
data = request_with_retries("gnd", url)
|
||||
if not data:
|
||||
return "API nicht erreichbar", 0.0
|
||||
|
||||
results = []
|
||||
scores = []
|
||||
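# Annahme zur Antwortstruktur der lobid-GND-Suche: ein JSON-Objekt mit einer Liste
# "member", deren Einträge u.a. "preferredName" enthalten, z.B.
# {"member": [{"preferredName": "Wappen", ...}, ...]}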
for doc in data.get("member", []):
|
||||
name = doc.get("preferredName", "")
|
||||
conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
|
||||
if conf >= min_conf:
|
||||
results.append(name)
|
||||
scores.append(conf)
|
||||
if results:
|
||||
return ", ".join(results), max(scores)
|
||||
return "ohne Ergebnis", 0.0
|
||||
|
||||
def query_wikidata(term, min_conf=0.5):
|
||||
if DRY_RUN or not API_ACTIVE["wikidata"]:
|
||||
return "TEST_WD", 1.0
|
||||
|
||||
url = "https://www.wikidata.org/w/api.php"
|
||||
params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
|
||||
data = request_with_retries("wikidata", url, params)
|
||||
if not data:
|
||||
return "API nicht erreichbar", 0.0
|
||||
|
||||
results = []
|
||||
scores = []
|
||||
for entry in data.get("search", []):
|
||||
label = entry.get("label", "")
# Annahme: die wbsearchentities-Antwort liefert keinen numerischen Score im
# "match"-Objekt; Confidence daher wie beim GND-Abgleich über die
# String-Ähnlichkeit von Suchbegriff und Label bestimmen.
score = SequenceMatcher(None, term.lower(), label.lower()).ratio()
|
||||
if score >= min_conf:
|
||||
results.append(entry["label"])
|
||||
scores.append(score)
|
||||
if results:
|
||||
return ", ".join(results), max(scores)
|
||||
return "ohne Ergebnis", 0.0
|
||||
|
||||
# =========================
|
||||
# Input laden
|
||||
# =========================
|
||||
def load_input_file(file_path):
|
||||
try:
|
||||
if file_path.suffix.lower() == ".ods":
|
||||
df = pd.read_excel(file_path, engine="odf", header=None)
|
||||
elif file_path.suffix.lower() == ".xlsx":
|
||||
df = pd.read_excel(file_path, engine="openpyxl", header=None)
|
||||
elif file_path.suffix.lower() == ".csv":
|
||||
df = pd.read_csv(file_path, header=None)
|
||||
else:
|
||||
log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
|
||||
return None
|
||||
return df
|
||||
except Exception as e:
|
||||
log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
|
||||
return None
|
||||
|
||||
# =========================
|
||||
# Header-Zeile suchen
|
||||
# =========================
|
||||
def find_header_row(df, keywords=["objektbeschreibung", "objekt/ebene"]):
|
||||
for i, row in df.iterrows():
|
||||
row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
|
||||
if any(kw in cell for kw in keywords for cell in row_lower):
|
||||
return i, row_lower
|
||||
return None, None
|
||||
|
||||
# =========================
|
||||
# Verarbeitung
|
||||
# =========================
|
||||
def process_files():
|
||||
all_terms = []
|
||||
output_rows = []
|
||||
|
||||
for file_path in INPUT_DIR.glob("*"):
|
||||
if not file_path.suffix.lower() in [".csv", ".xlsx", ".ods"]:
|
||||
continue
|
||||
log("INFO", f"Verarbeite {file_path.name}")
|
||||
df = load_input_file(file_path)
|
||||
if df is None:
|
||||
continue
|
||||
|
||||
header_idx, header_row = find_header_row(df)
|
||||
if header_idx is None:
|
||||
log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
|
||||
continue
|
||||
df.columns = header_row
|
||||
df = df.iloc[header_idx+1:].reset_index(drop=True)
|
||||
|
||||
col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
|
||||
col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
|
||||
if not col_objdesc:
|
||||
log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
|
||||
continue
|
||||
|
||||
term_list = []
|
||||
obj_level_list = []
|
||||
for _, row in df.iterrows():
|
||||
terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
|
||||
if not terms:
|
||||
continue
|
||||
for term in [t.strip() for t in terms.split(",") if t.strip()]:
|
||||
term_list.append(term)
|
||||
obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")
|
||||
|
||||
# API-Abfragen
|
||||
gnd_results = []
|
||||
gnd_scores = []
|
||||
wikidata_results = []
|
||||
wikidata_scores = []
|
||||
|
||||
for term in term_list:
|
||||
gnd_res, gnd_conf = query_gnd(term)
|
||||
wikidata_res, wd_conf = query_wikidata(term)
|
||||
gnd_results.append(gnd_res)
|
||||
gnd_scores.append(gnd_conf)
|
||||
wikidata_results.append(wikidata_res)
|
||||
wikidata_scores.append(wd_conf)
|
||||
|
||||
for idx, term in enumerate(term_list):
|
||||
output_rows.append({
|
||||
"Begriff": term,
|
||||
"Quelle": file_path.name,
|
||||
"Objekt/Ebene": obj_level_list[idx],
|
||||
"GND": gnd_results[idx],
|
||||
"GND_Confidence": gnd_scores[idx],
|
||||
"Wikidata": wikidata_results[idx],
|
||||
"Wikidata_Confidence": wikidata_scores[idx]
|
||||
})
|
||||
all_terms.extend(term_list)
|
||||
|
||||
# Hauptoutput
|
||||
out_df = pd.DataFrame(output_rows)
|
||||
out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
|
||||
out_df.to_excel(out_file, index=False, engine="odf")
|
||||
log("INFO", f"Hauptauswertung gespeichert: {out_file}")
|
||||
|
||||
# Rohdatei
|
||||
raw_terms = pd.Series(all_terms).value_counts().reset_index()
|
||||
raw_terms.columns = ["Begriff", "Häufigkeit"]
|
||||
raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
|
||||
raw_terms.to_excel(raw_file, index=False, engine="odf")
|
||||
log("INFO", f"Rohbegriffe gespeichert: {raw_file}")
|
||||
|
||||
# =========================
|
||||
# Main
|
||||
# =========================
|
||||
if __name__ == "__main__":
|
||||
if not INPUT_DIR.exists():
|
||||
log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
|
||||
sys.exit(1)
|
||||
process_files()
|
||||
2815369
api_cache.json
Normal file
File diff suppressed because it is too large
Load Diff
9
config.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"normvokabular_path": "/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods",
|
||||
"max_suggestions": 3,
|
||||
"color_hit": "#C6EFCE",
|
||||
"color_miss": "#FFC7CE",
|
||||
"use_rapidfuzz": false,
|
||||
"use_spacy": false,
|
||||
"autosave": false
|
||||
}
|
||||
371
mapper.py
Normal file
@ -0,0 +1,371 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import pandas as pd
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# RapidFuzz für Token-basierte Fuzzy-Suche
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
print("RapidFuzz verfügbar")
|
||||
except ImportError:
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")
|
||||
|
||||
# Spacy Lemmatizer
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
print("Spacy Lemmatizer aktiviert")
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
print("Spacy nicht verfügbar – nutze naive Stemmer")
|
||||
|
||||
# =========================
|
||||
# Pfade & Config
|
||||
# =========================
|
||||
INPUT_DIR = Path("Input CSV")
|
||||
OUTPUT_DIR = Path("Auswertung Ergebnisse")
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
|
||||
CACHE_FILE = "api_cache.json"
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
CONF_THRESHOLD = 0.75
|
||||
TIMEOUT = 5
|
||||
MAX_RETRIES = 3
|
||||
BACKOFF_FACTOR = 2
|
||||
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
|
||||
API_ACTIVE = {"gnd": True, "wikidata": True}
|
||||
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
|
||||
|
||||
# Cache
|
||||
if os.path.exists(CACHE_FILE):
|
||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
def save_cache():
|
||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# =========================
|
||||
# Normalisierung / Lemma
|
||||
# =========================
|
||||
def normalize_text(s):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).lower().strip()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
||||
s = re.sub(r"\s+"," ",s)
|
||||
return s
|
||||
|
||||
# Lemma-Cache
|
||||
lemma_cache = {}
|
||||
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
# =========================
|
||||
# Kompositum-Zerlegung (einfacher Ansatz)
|
||||
# =========================
|
||||
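# Verhaltensskizze der Regex: getrennt wird nur an Großbuchstaben, z.B.
# compound_split("WappenSchild") -> ["Wappen", "Schild"];
# compound_split("Wappenschild") bleibt ["Wappenschild"], Kleingeschriebenes unverändert.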
def compound_split(term):
|
||||
parts = re.findall(r'[A-ZÄÖÜ][a-zäöü]+', term)
|
||||
return parts if parts else [term]
|
||||
|
||||
# =========================
|
||||
# Normvokabular laden & Lemma vorbereiten
|
||||
# =========================
|
||||
def load_normvokabular(file_path):
|
||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
norm_dict = {}
|
||||
stem_index = defaultdict(list)
|
||||
lemma_norm_map = {} # für RapidFuzz preprocessed
|
||||
|
||||
for sheet_name, df in sheets.items():
|
||||
if sheet_name.lower() in ["master", "übersicht"]:
|
||||
continue
|
||||
df = df.dropna(how="all", axis=1)
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
id_col = next((c for c in df.columns if "ID" in c), None)
|
||||
word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
|
||||
if not id_col or not word_col:
|
||||
continue
|
||||
|
||||
current_parent_id = None
|
||||
for _, row in df.iterrows():
|
||||
row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
|
||||
row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
|
||||
if row_id:
|
||||
current_parent_id = row_id
|
||||
if not row_word:
|
||||
continue
|
||||
assigned_parent_id = current_parent_id
|
||||
entry = {
|
||||
"Name": row_word,
|
||||
"ID": assigned_parent_id, # Parent-ID
|
||||
"Sheet": sheet_name,
|
||||
"Own_ID": row_id or "" # eigene ID, falls vorhanden
|
||||
}
|
||||
key = normalize_text(row_word)
|
||||
norm_dict[key] = entry
|
||||
lemma = lemmatize_term(key)
|
||||
stem_index[lemma].append(entry)
|
||||
if lemma not in lemma_norm_map:
|
||||
lemma_norm_map[lemma] = entry
|
||||
return norm_dict, stem_index, lemma_norm_map
|
||||
|
||||
# =========================
|
||||
# Mapping & Vorschläge
|
||||
# =========================
|
||||
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
|
||||
term_norm = normalize_text(term)
|
||||
term_lemma = lemmatize_term(term)
|
||||
|
||||
# Exakter Treffer
|
||||
if term_norm in norm_dict:
|
||||
e = norm_dict[term_norm]
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
# Lemma-Treffer
|
||||
if term_lemma in stem_index:
|
||||
e = stem_index[term_lemma][0]
|
||||
return e["Name"], e["ID"], []
|
||||
|
||||
# KEIN TREFFER → Kompositum-Split
|
||||
tokens = compound_split(term)
|
||||
if len(tokens) == 1:
|
||||
suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
|
||||
return "KEIN TREFFER", "", suggestions
|
||||
else:
|
||||
token_matches = []
|
||||
for t in tokens:
|
||||
t_lemma = lemmatize_term(t)
|
||||
if t_lemma in stem_index:
|
||||
e = stem_index[t_lemma][0]
|
||||
token_matches.append((t, e["Name"], e["ID"]))
|
||||
else:
|
||||
sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
|
||||
token_matches.append((t, "KEIN TREFFER", "", sugg))
|
||||
combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
|
||||
return "KEIN TREFFER", "", combined_suggestions
|
||||
|
||||
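# Scoring-Hinweis: fuzz.token_set_ratio liefert Werte von 0-100 und wird auf 0..1 normiert;
# beginnt der Normbegriff mit dem Suchlemma, gibt es einen Bonus von 0.1 (max. 1.0).
# Vorschläge unterhalb von CONF_THRESHOLD (0.75) werden verworfen.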
def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
|
||||
candidates = []
|
||||
for key_lemma, entry in lemma_norm_map.items():
|
||||
if RAPIDFUZZ_AVAILABLE:
|
||||
score = fuzz.token_set_ratio(term_lemma, key_lemma)/100
|
||||
else:
|
||||
score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
|
||||
if key_lemma.lower().startswith(term_lemma.lower()):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
candidates.append((score, entry["Name"], entry["ID"]))
|
||||
candidates.sort(reverse=True)
|
||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
||||
|
||||
# =========================
|
||||
# API-Abfragen
|
||||
# =========================
|
||||
def request_with_retries(api_name,url,params=None):
|
||||
cache_key = url + str(params)
|
||||
if cache_key in CACHE:
|
||||
return CACHE[cache_key]
|
||||
retries = 0
|
||||
while retries < MAX_RETRIES:
|
||||
try:
|
||||
r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
|
||||
if r.status_code == 200:
|
||||
try: data = r.json()
|
||||
except ValueError: data = r.text
|
||||
CACHE[cache_key] = data
|
||||
FAIL_COUNTER[api_name] = 0
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
retries += 1
|
||||
time.sleep(min(BACKOFF_FACTOR**retries,30))
|
||||
FAIL_COUNTER[api_name] += 1
|
||||
if FAIL_COUNTER[api_name] >= 10:
|
||||
API_ACTIVE[api_name] = False
|
||||
return None
|
||||
|
||||
def batch_query_gnd(terms):
|
||||
results={}
|
||||
if not API_ACTIVE.get("gnd", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
for t in terms:
|
||||
url="https://lobid.org/gnd/search"
|
||||
params={"q":t,"format":"json"}
|
||||
data = request_with_retries("gnd", url, params)
|
||||
top = ""
|
||||
if data and "member" in data:
|
||||
cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None,t.lower(),(doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")]
|
||||
cands = [c for c in cands if c[1]>=0.75]
|
||||
if cands:
|
||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
||||
results[t] = top
|
||||
return results
|
||||
|
||||
def batch_query_wikidata(terms):
|
||||
results={}
|
||||
if not API_ACTIVE.get("wikidata", False):
|
||||
for t in terms: results[t] = ""
|
||||
return results
|
||||
for t in terms:
|
||||
url="https://www.wikidata.org/w/api.php"
|
||||
params={"action":"wbsearchentities","search":t,"language":"de","format":"json"}
|
||||
data = request_with_retries("wikidata", url, params)
|
||||
top = ""
|
||||
if data and "search" in data:
|
||||
cands = [(e.get("label",""), SequenceMatcher(None,t.lower(),e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")]
|
||||
cands = [c for c in cands if c[1]>=0.70]
|
||||
if cands:
|
||||
top = sorted(cands,key=lambda x:x[1],reverse=True)[0][0]
|
||||
results[t] = top
|
||||
return results
|
||||
|
||||
# =========================
|
||||
# Markierung / Export
|
||||
# =========================
|
||||
def mark_norm_hits(file_path):
|
||||
ext = file_path.suffix.lower()
|
||||
if ext in [".xlsx", ".xls"]:
|
||||
from openpyxl import load_workbook
|
||||
from openpyxl.styles import PatternFill
|
||||
wb = load_workbook(file_path)
|
||||
ws = wb.active
|
||||
green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
|
||||
red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
|
||||
col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])}
|
||||
norm_col = col_map.get("Norm_Treffer", None)
|
||||
if not norm_col:
|
||||
print("Spalte 'Norm_Treffer' nicht gefunden")
|
||||
return
|
||||
for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
|
||||
cell = row[0]
|
||||
if cell.value and cell.value != "KEIN TREFFER":
|
||||
cell.fill = green_fill
|
||||
else:
|
||||
cell.fill = red_fill
|
||||
wb.save(file_path)
|
||||
elif ext==".ods":
|
||||
df = pd.read_excel(file_path, engine="odf")
|
||||
df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer")
|
||||
df.to_excel(file_path, index=False, engine="odf")
|
||||
|
||||
# =========================
|
||||
# Verarbeitung Input-Dateien
|
||||
# =========================
|
||||
def process_files():
|
||||
norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
|
||||
total_terms = 0
|
||||
total_hits = 0
|
||||
|
||||
if not INPUT_DIR.exists():
|
||||
print(f"Eingabeordner {INPUT_DIR} fehlt")
|
||||
sys.exit(1)
|
||||
files = list(INPUT_DIR.glob("*"))
|
||||
if not files:
|
||||
print("Keine Dateien gefunden")
|
||||
return
|
||||
|
||||
for file_path in files:
|
||||
if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]:
|
||||
continue
|
||||
print(f"Verarbeite Datei: {file_path.name}")
|
||||
try:
|
||||
if file_path.suffix.lower() == ".csv":
|
||||
df = pd.read_csv(file_path)
|
||||
else:
|
||||
df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None)
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Lesen von {file_path.name}: {e}")
|
||||
continue
|
||||
|
||||
df = df.dropna(how="all")
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
|
||||
besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
|
||||
box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
|
||||
urh_col = next((c for c in df.columns if "Urheber" in c), None)
|
||||
if not besch_col: continue
|
||||
|
||||
row_terms_map = []
|
||||
for _, row in df.iterrows():
|
||||
besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
|
||||
if not besch: continue
|
||||
obj_box = row[box_col] if box_col else ""
|
||||
urheber = row[urh_col] if urh_col else ""
|
||||
clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
|
||||
terms = []
|
||||
for clause in clauses:
|
||||
parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
|
||||
for p in parts:
|
||||
if p.lower() in STOPWORDS: continue
|
||||
if re.fullmatch(r"\d+", p): continue
|
||||
terms.append(p)
|
||||
row_terms_map.append((obj_box, urheber, terms))
|
||||
|
||||
all_terms = list({t for _,_,terms in row_terms_map for t in terms})
|
||||
gnd_results = batch_query_gnd(all_terms)
|
||||
wd_results = batch_query_wikidata(all_terms)
|
||||
|
||||
output_rows = []
|
||||
for obj_box, urheber, terms in row_terms_map:
|
||||
for term in terms:
|
||||
norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
|
||||
total_terms += 1
|
||||
if norm_name != "KEIN TREFFER":
|
||||
total_hits += 1
|
||||
out_row = {
|
||||
"Box": obj_box,
|
||||
"Objekt/Ebene": obj_box,
|
||||
"Urheber": urheber,
|
||||
"Begriff": term,
|
||||
"Norm_Treffer": norm_name,
|
||||
"Norm_ID": norm_id,
|
||||
"Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
|
||||
"GND_Top1": gnd_results.get(term,""),
|
||||
"WD_Top1": wd_results.get(term,"")
|
||||
}
|
||||
output_rows.append(out_row)
|
||||
|
||||
out_df = pd.DataFrame(output_rows)
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}"
|
||||
version = 1
|
||||
while output_file.exists():
|
||||
output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}"
|
||||
version += 1
|
||||
engine = "odf" if output_file.suffix.lower()==".ods" else None
|
||||
out_df.to_excel(output_file, index=False, engine=engine)
|
||||
mark_norm_hits(output_file)
|
||||
print(f"Auswertung gespeichert: {output_file}")
|
||||
|
||||
save_cache()
|
||||
print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")
|
||||
|
||||
# =========================
|
||||
# Main
|
||||
# =========================
|
||||
if __name__ == "__main__":
|
||||
process_files()
|
||||
print("Fertig")
|
||||
237
mapper_macro.py
Normal file
@ -0,0 +1,237 @@
|
||||
import uno
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import json
|
||||
|
||||
# Optional für Lemmatizer
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
|
||||
# Optional für Fuzzy Matching
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
except ImportError:
|
||||
from difflib import SequenceMatcher
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
|
||||
import odf.opendocument
|
||||
import odf.table
|
||||
import odf.text
|
||||
|
||||
# ------------------------
|
||||
# Konfiguration absolute Pfade
|
||||
# ------------------------
|
||||
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
|
||||
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
|
||||
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
|
||||
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
|
||||
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
CONF_THRESHOLD = 0.75
|
||||
|
||||
# ------------------------
|
||||
# Logging
|
||||
# ------------------------
|
||||
def log(msg):
|
||||
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
||||
f.write(msg + "\n")
|
||||
|
||||
# ------------------------
|
||||
# Cache laden
|
||||
# ------------------------
|
||||
if os.path.exists(CACHE_FILE):
|
||||
with open(CACHE_FILE, "r", encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
# ------------------------
|
||||
# Normalisierung / Lemma
|
||||
# ------------------------
|
||||
def normalize_text(s):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).lower().strip()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
||||
s = re.sub(r"\s+"," ",s)
|
||||
return s
|
||||
|
||||
lemma_cache = {}
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
# ------------------------
|
||||
# NV_MASTER einlesen
|
||||
# ------------------------
|
||||
def load_nv_master(path):
|
||||
norm_dict = {}
|
||||
try:
|
||||
doc = odf.opendocument.load(path)
|
||||
except Exception as e:
|
||||
log(f"Fehler beim Laden von NV_MASTER: {e}")
|
||||
return norm_dict
|
||||
|
||||
for sheet in doc.spreadsheet.getElementsByType(odf.table.Table):
|
||||
sheet_name = sheet.getAttribute("name")
|
||||
if sheet_name.lower() == "master":
|
||||
continue
|
||||
|
||||
current_parent_id = None
|
||||
for row in sheet.getElementsByType(odf.table.TableRow):
|
||||
cells = row.getElementsByType(odf.table.TableCell)
|
||||
cell_values = []
|
||||
for cell in cells:
|
||||
texts = cell.getElementsByType(odf.text.P)
|
||||
if texts and texts[0].firstChild:
|
||||
cell_values.append(str(texts[0].firstChild.data).strip())
|
||||
else:
|
||||
cell_values.append("")
|
||||
if not cell_values or len(cell_values)<4:
|
||||
continue
|
||||
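# Annahme zum Blattaufbau: Spalte 1 = ID, Spalte 2 = Unterkategorie,
# Spalte 3 = Unterunterkategorie, Spalte 4 = Wort/Vokabel.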
id_val, unterk, unterunterk, word = cell_values[:4]
|
||||
if id_val:
|
||||
current_parent_id = id_val.strip()
|
||||
if not word:
|
||||
continue
|
||||
key = lemmatize_term(word)
|
||||
norm_dict[key] = {
|
||||
"Name": word.strip(),
|
||||
"ID": current_parent_id,
|
||||
"Sheet": sheet_name,
|
||||
"Unterkategorie": unterk.strip(),
|
||||
"Unterunterkategorie": unterunterk.strip()
|
||||
}
|
||||
log(f"NV_MASTER geladen: {len(norm_dict)} Begriffe")
|
||||
return norm_dict
|
||||
|
||||
# ------------------------
|
||||
# Matching
|
||||
# ------------------------
|
||||
def get_suggestions(term_lemma, norm_dict, top_n=3, threshold=CONF_THRESHOLD):
|
||||
candidates = []
|
||||
for key, entry in norm_dict.items():
|
||||
if RAPIDFUZZ_AVAILABLE:
|
||||
score = fuzz.token_set_ratio(term_lemma, key)/100
|
||||
else:
|
||||
score = SequenceMatcher(None, term_lemma.lower(), key.lower()).ratio()
|
||||
if key.lower().startswith(term_lemma.lower()):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
candidates.append((score, entry["Name"], entry["ID"]))
|
||||
candidates.sort(reverse=True)
|
||||
return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]
|
||||
|
||||
def map_word(word, norm_dict):
|
||||
key = lemmatize_term(word)
|
||||
if key in CACHE:
|
||||
cached = CACHE[key]
|
||||
return cached["Norm"], cached["Suggestion"], cached["ID"]
|
||||
|
||||
if key in norm_dict:
|
||||
entry = norm_dict[key]
|
||||
tr, sug, wid = entry["Name"], "", entry["ID"]
|
||||
else:
|
||||
suggestions = get_suggestions(term_lemma=key, norm_dict=norm_dict)
|
||||
if suggestions:
|
||||
tr, sug, wid = "KEIN TREFFER", ", ".join(suggestions), ""
|
||||
else:
|
||||
tr, sug, wid = "KEIN TREFFER", "", ""
|
||||
|
||||
CACHE[key] = {"Norm": tr, "Suggestion": sug, "ID": wid}
|
||||
return tr, sug, wid
|
||||
|
||||
# ------------------------
|
||||
# Makro-Hauptfunktion
|
||||
# ------------------------
|
||||
def run_mapper_macro():
|
||||
try:
|
||||
doc = XSCRIPTCONTEXT.getDocument()
|
||||
sheets = doc.getSheets()
|
||||
sheet = sheets.getByIndex(0)
|
||||
cursor = sheet.createCursor()
|
||||
cursor.gotoStartOfUsedArea(False)
|
||||
cursor.gotoEndOfUsedArea(True)
|
||||
data_range = cursor.getRangeAddress()
|
||||
|
||||
header_row = 0
|
||||
objekt_col = None
|
||||
|
||||
# Header prüfen
|
||||
for col in range(data_range.EndColumn+1):
|
||||
val = sheet.getCellByPosition(col, header_row).String.strip().lower()
|
||||
if val == "objektbeschreibung":
|
||||
objekt_col = col
|
||||
break
|
||||
|
||||
if objekt_col is None:
|
||||
log("Spalte 'Objektbeschreibung' nicht gefunden")
|
||||
return
|
||||
|
||||
# Neue Spalten am rechten Tabellenende erstellen
|
||||
max_col = data_range.EndColumn
|
||||
norm_tr_col = max_col + 1
|
||||
norm_sug_col = max_col + 2
|
||||
norm_id_col = max_col + 3
|
||||
|
||||
sheet.getCellByPosition(norm_tr_col, header_row).String = "Norm_Treffer"
|
||||
sheet.getCellByPosition(norm_sug_col, header_row).String = "Norm_Vorschlag"
|
||||
sheet.getCellByPosition(norm_id_col, header_row).String = "Norm_ID"
|
||||
|
||||
norm_dict = load_nv_master(NV_MASTER_PATH)
|
||||
|
||||
# Farben
|
||||
GREEN = 0xC6EFCE
|
||||
YELLOW = 0xFFEB9C
|
||||
RED = 0xFFC7CE
|
||||
|
||||
for row in range(1, data_range.EndRow+1):
|
||||
cell = sheet.getCellByPosition(objekt_col, row)
|
||||
val = cell.String.strip()
|
||||
if not val:
|
||||
continue
|
||||
words = [w.strip() for w in re.split(r"\s+", val) if w.strip() and w.lower() not in STOPWORDS]
|
||||
tr_list, sug_list, id_list = [], [], []
|
||||
for w in words:
|
||||
tr, sug, wid = map_word(w, norm_dict)
|
||||
if tr != "KEIN TREFFER":
|
||||
tr_list.append(tr)
|
||||
if sug:
|
||||
sug_list.append(sug)
|
||||
if wid:
|
||||
id_list.append(wid)
|
||||
sheet.getCellByPosition(norm_tr_col, row).String = ", ".join(tr_list)
|
||||
sheet.getCellByPosition(norm_sug_col, row).String = ", ".join(sug_list)
|
||||
sheet.getCellByPosition(norm_id_col, row).String = ", ".join(id_list)
|
||||
# Farbmarkierung
|
||||
if tr_list:
|
||||
cell.CellBackColor = GREEN
|
||||
elif sug_list:
|
||||
cell.CellBackColor = YELLOW
|
||||
else:
|
||||
cell.CellBackColor = RED
|
||||
|
||||
# Cache speichern
|
||||
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, ensure_ascii=False, indent=2)
|
||||
|
||||
log("Makro erfolgreich ausgeführt")
|
||||
|
||||
except Exception as e:
|
||||
log("Fehler in run_mapper_macro:")
|
||||
log(traceback.format_exc())
|
||||
448
mapper_macro_1.1.py
Normal file
@ -0,0 +1,448 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# LibreOffice Calc macro: NV_MASTER-Abgleich, Pandas+odf, Cache, Farben
|
||||
# Pfade: BASE_DIR muss auf das Verzeichnis zeigen, in dem NV_MASTER.ods + Makro liegen.
|
||||
# Speichern: /home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro/mapper_macro.py
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import traceback
|
||||
|
||||
# UNO-Context wird zur Laufzeit zur Verfügung gestellt (XSCRIPTCONTEXT)
|
||||
# Third-party libs: pandas, odfpy, optional: spacy, rapidfuzz
|
||||
try:
|
||||
import pandas as pd
|
||||
PANDAS_AVAILABLE = True
|
||||
except Exception:
|
||||
PANDAS_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
except Exception:
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# ------------------------
|
||||
# Konfiguration
|
||||
# ------------------------
|
||||
BASE_DIR = "/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro"
|
||||
NV_MASTER_PATH = os.path.join(BASE_DIR, "NV_MASTER.ods")
|
||||
LOG_FILE = os.path.join(BASE_DIR, "mapper_macro.log")
|
||||
CACHE_FILE = os.path.join(BASE_DIR, "mapper_cache.json")
|
||||
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
CONF_THRESHOLD = 0.75 # Basis-Schwelle für Vorschläge
|
||||
|
||||
# ------------------------
|
||||
# Utilities: Logging & safe I/O
|
||||
# ------------------------
|
||||
def log(msg):
|
||||
try:
|
||||
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
||||
f.write(msg + "\n")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ------------------------
|
||||
# Cache laden
|
||||
# ------------------------
|
||||
try:
|
||||
if os.path.exists(CACHE_FILE):
|
||||
with open(CACHE_FILE, "r", encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
except Exception:
|
||||
CACHE = {}
|
||||
|
||||
# ------------------------
|
||||
# Text-Normalisierung & Lemma
|
||||
# ------------------------
|
||||
def normalize_text(s):
|
||||
if not s:
|
||||
return ""
|
||||
s = str(s).strip().lower()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\;:\?!,\.]", "", s)
|
||||
s = re.sub(r"\s+", " ", s)
|
||||
return s
|
||||
|
||||
lemma_cache = {}
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
try:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
except Exception:
|
||||
lemma = term_norm
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
# ------------------------
|
||||
# NV_MASTER robust laden (pandas + odf)
|
||||
# ------------------------
|
||||
def build_norm_index(nv_path):
|
||||
norm_dict = {} # normalized_name -> list of entries (Name, ID, Sheet)
|
||||
lemma_index = {} # lemma -> list of entries
|
||||
if not PANDAS_AVAILABLE:
|
||||
log("Pandas nicht verfügbar. NV_MASTER kann nicht zuverlässig gelesen werden.")
|
||||
return norm_dict, lemma_index
|
||||
|
||||
try:
|
||||
sheets = pd.read_excel(nv_path, sheet_name=None, engine="odf")
|
||||
except Exception as e:
|
||||
log(f"Fehler beim Einlesen von NV_MASTER mit pandas: {e}")
|
||||
return norm_dict, lemma_index
|
||||
|
||||
for sheet_name, df in sheets.items():
|
||||
if str(sheet_name).strip().lower() == "master":
|
||||
continue
|
||||
# normalize columns names to find ID and Wort columns
|
||||
df = df.fillna("") # leere Zellen als ""
|
||||
cols = [str(c).strip().lower() for c in df.columns]
|
||||
# try to find columns
|
||||
id_col = None
|
||||
word_col = None
|
||||
for i, c in enumerate(cols):
|
||||
if "id" in c:
|
||||
id_col = df.columns[i]
|
||||
if "wort" in c or "vokabel" in c:
|
||||
word_col = df.columns[i]
|
||||
# fallback: if not found, try first/last
|
||||
if word_col is None and len(df.columns) >= 1:
|
||||
word_col = df.columns[-1]
|
||||
if id_col is None and len(df.columns) >= 1:
|
||||
id_col = df.columns[0]
|
||||
|
||||
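# Inheritance sketch (as implemented below): a row that carries an ID sets the
# current parent ID; following word rows without their own ID inherit it,
# e.g. ID "2.1" followed by "Wappen" and "Helm" -> both entries get ID "2.1".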
current_parent_id = None
|
||||
for _, row in df.iterrows():
|
||||
id_val = str(row[id_col]).strip() if id_col in df.columns else ""
|
||||
word_val = str(row[word_col]).strip() if word_col in df.columns else ""
|
||||
# if row defines an ID, set as current parent
|
||||
if id_val:
|
||||
current_parent_id = id_val
|
||||
# skip empty word cells
|
||||
if not word_val:
|
||||
continue
|
||||
norm_name = normalize_text(word_val)
|
||||
lemma = lemmatize_term(word_val)
|
||||
entry = {"Name": word_val.strip(), "ID": current_parent_id or "", "Sheet": sheet_name}
|
||||
# add to norm_dict by normalized name (exact matching)
|
||||
norm_dict.setdefault(norm_name, []).append(entry)
|
||||
# add to lemma_index
|
||||
lemma_index.setdefault(lemma, []).append(entry)
|
||||
|
||||
log(f"NV_MASTER geladen ({NV_MASTER_PATH}). Begriffe: {sum(len(v) for v in norm_dict.values())}")
|
||||
return norm_dict, lemma_index
|
||||
|
||||
# ------------------------
|
||||
# Matching: exakter Treffer, Lemma-Treffer, Fuzzy-Vorschläge
|
||||
# ------------------------
|
||||
def fuzzy_score(a, b):
|
||||
if RAPIDFUZZ_AVAILABLE:
|
||||
try:
|
||||
return fuzz.token_set_ratio(a, b) / 100.0
|
||||
except Exception:
|
||||
return 0.0
|
||||
else:
|
||||
try:
|
||||
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
||||
except Exception:
|
||||
return 0.0
|
||||
|
||||
def get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD):
|
||||
# collect candidates from lemma_index keys and norm_dict keys
|
||||
candidates = []
|
||||
# iterate over lemma_index keys for candidate names
|
||||
for key_lemma, entries in lemma_index.items():
|
||||
score = fuzzy_score(term_lemma, key_lemma)
|
||||
if key_lemma.startswith(term_lemma):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
for e in entries:
|
||||
candidates.append((score, e["Name"], e["ID"]))
|
||||
# also check norm_dict keys (exact-normalized names) as additional candidates
|
||||
for norm_key, entries in norm_dict.items():
|
||||
score = fuzzy_score(term_lemma, norm_key)
|
||||
if norm_key.startswith(term_lemma):
|
||||
score = min(score + 0.1, 1.0)
|
||||
if score >= threshold:
|
||||
for e in entries:
|
||||
candidates.append((score, e["Name"], e["ID"]))
|
||||
# sort by score descending
|
||||
candidates.sort(key=lambda t: t[0], reverse=True)
|
||||
# unique by (Name, ID) preserve score order
|
||||
seen = set()
|
||||
results = []
|
||||
for score, name, id_ in candidates:
|
||||
key = (name, id_)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
results.append({"score": score, "name": name, "id": id_})
|
||||
# return all candidates (no limit) as "Name (ID)"
|
||||
return [f'{r["name"]} ({r["id"]})' if r["id"] else r["name"] for r in results]
|
||||
|
||||
def map_term_with_indexes(term, norm_dict, lemma_index):
|
||||
term_norm = normalize_text(term)
|
||||
term_lemma = lemmatize_term(term)
|
||||
# cache lookup
|
||||
if term_lemma in CACHE:
|
||||
return CACHE[term_lemma]["hits"], CACHE[term_lemma]["suggestions"], CACHE[term_lemma]["ids"]
|
||||
|
||||
hits = []
|
||||
suggestions = []
|
||||
ids = []
|
||||
|
||||
# 1) exact normalized name match
|
||||
if term_norm in norm_dict:
|
||||
for e in norm_dict[term_norm]:
|
||||
hits.append(e["Name"])
|
||||
if e["ID"]:
|
||||
ids.append(e["ID"])
|
||||
|
||||
# 2) lemma match (if not already hits)
|
||||
if not hits and term_lemma in lemma_index:
|
||||
for e in lemma_index[term_lemma]:
|
||||
hits.append(e["Name"])
|
||||
if e["ID"]:
|
||||
ids.append(e["ID"])
|
||||
|
||||
# 3) suggestions via fuzzy (always compute even if hits exist, but suggestions empty if exact)
|
||||
suggs = get_suggestions_for_term(term_lemma, norm_dict, lemma_index, top_n=None, threshold=CONF_THRESHOLD)
|
||||
# If there are exact hits, we still may present suggestions (user wanted unlimited), but suggestions are secondary
|
||||
suggestions = suggs
|
||||
|
||||
# deduplicate lists preserving order
|
||||
def unique_preserve(seq):
|
||||
seen = set()
|
||||
out = []
|
||||
for x in seq:
|
||||
if x not in seen:
|
||||
seen.add(x)
|
||||
out.append(x)
|
||||
return out
|
||||
|
||||
hits = unique_preserve(hits)
|
||||
suggestions = unique_preserve(suggestions)
|
||||
ids = unique_preserve(ids)
|
||||
|
||||
# cache result
|
||||
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
|
||||
return hits, suggestions, ids
|
||||
|
||||
# ------------------------
|
||||
# Haupt-Makro
|
||||
# ------------------------
|
||||
def run_mapper_macro():
|
||||
try:
|
||||
# UNO doc/sheet
|
||||
doc = XSCRIPTCONTEXT.getDocument()
|
||||
sheet = doc.CurrentController.ActiveSheet
|
||||
cursor = sheet.createCursor()
|
||||
cursor.gotoStartOfUsedArea(False)
|
||||
cursor.gotoEndOfUsedArea(True)
|
||||
data_range = cursor.getRangeAddress()
|
||||
except Exception as e:
|
||||
log("Fehler: konnte Dokument/Sheet nicht öffnen: " + str(e))
|
||||
return
|
||||
|
||||
# find header row and Objektbeschreibung column (search first 5 rows)
|
||||
header_row = None
|
||||
objekt_col = None
|
||||
max_col = data_range.EndColumn
|
||||
for r in range(0, min(5, data_range.EndRow+1)):
|
||||
for c in range(0, max_col+1):
|
||||
try:
|
||||
val = str(sheet.getCellByPosition(c, r).String).strip().lower()
|
||||
except Exception:
|
||||
val = ""
|
||||
if val == "objektbeschreibung":
|
||||
header_row = r
|
||||
objekt_col = c
|
||||
break
|
||||
if objekt_col is not None:
|
||||
break
|
||||
|
||||
if objekt_col is None:
|
||||
log("Spalte 'Objektbeschreibung' nicht gefunden. Abbruch.")
|
||||
return
|
||||
|
||||
# determine or create result columns: search if exist anywhere; otherwise append at right end
|
||||
existing = {}
|
||||
for c in range(0, data_range.EndColumn+1):
|
||||
try:
|
||||
h = str(sheet.getCellByPosition(c, header_row).String).strip()
|
||||
except Exception:
|
||||
h = ""
|
||||
if h == "Norm_Treffer":
|
||||
existing["Norm_Treffer"] = c
|
||||
if h == "Norm_Vorschlag":
|
||||
existing["Norm_Vorschlag"] = c
|
||||
if h == "Norm_ID":
|
||||
existing["Norm_ID"] = c
|
||||
|
||||
# append columns at right end if missing
|
||||
last_col = data_range.EndColumn
|
||||
if "Norm_Treffer" not in existing:
|
||||
last_col += 1
|
||||
existing["Norm_Treffer"] = last_col
|
||||
try:
|
||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_Treffer"
|
||||
except Exception:
|
||||
pass
|
||||
if "Norm_Vorschlag" not in existing:
|
||||
last_col += 1
|
||||
existing["Norm_Vorschlag"] = last_col
|
||||
try:
|
||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_Vorschlag"
|
||||
except Exception:
|
||||
pass
|
||||
if "Norm_ID" not in existing:
|
||||
last_col += 1
|
||||
existing["Norm_ID"] = last_col
|
||||
try:
|
||||
sheet.getCellByPosition(last_col, header_row).String = "Norm_ID"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
norm_tr_col = existing["Norm_Treffer"]
|
||||
norm_sug_col = existing["Norm_Vorschlag"]
|
||||
norm_id_col = existing["Norm_ID"]
|
||||
|
||||
# Build norm indexes
|
||||
norm_dict, lemma_index = build_norm_index(NV_MASTER_PATH)
|
||||
if not norm_dict and not lemma_index:
|
||||
log("NV_MASTER leer oder nicht lesbar. Abbruch.")
|
||||
return
|
||||
|
||||
# colors
|
||||
GREEN = 0xADFF2F
|
||||
YELLOW = 0xFFA500
|
||||
RED = 0xCC0000
|
||||
|
||||
# iterate rows
|
||||
rows_processed = 0
|
||||
for r in range(header_row + 1, data_range.EndRow + 1):
|
||||
try:
|
||||
cell = sheet.getCellByPosition(objekt_col, r)
|
||||
txt = str(cell.String).strip()
|
||||
if not txt:
|
||||
# clear any previous outputs? keep existing per spec; skip empty
|
||||
continue
|
||||
|
||||
# tokenize: split by commas first, then whitespace; filter stopwords and pure numbers
|
||||
clauses = [c.strip() for c in re.split(r",", txt) if c.strip()]
|
||||
terms = []
|
||||
for cl in clauses:
|
||||
parts = [p.strip() for p in re.split(r"\s+", cl) if p.strip()]
|
||||
for p in parts:
|
||||
if p.lower() in STOPWORDS:
|
||||
continue
|
||||
if re.fullmatch(r"\d+", p):
|
||||
continue
|
||||
terms.append(p)
|
||||
|
||||
# for each term, get hits/suggestions/ids
|
||||
row_hits = []
|
||||
row_sugs = []
|
||||
row_ids = []
|
||||
any_unmapped = False # at least one term without hit and without suggestion
|
||||
# We will record for each term
|
||||
for term in terms:
|
||||
hits, sugs, ids = map_term_with_indexes(term, norm_dict, lemma_index)
|
||||
if hits:
|
||||
row_hits.extend(hits)
|
||||
if sugs:
|
||||
row_sugs.extend(sugs)
|
||||
if ids:
|
||||
row_ids.extend(ids)
|
||||
if (not hits) and (not sugs):
|
||||
any_unmapped = True
|
||||
|
||||
# deduplicate preserving order
|
||||
def uniq(seq):
|
||||
seen = set()
|
||||
out = []
|
||||
for x in seq:
|
||||
if x not in seen:
|
||||
seen.add(x)
|
||||
out.append(x)
|
||||
return out
|
||||
|
||||
row_hits = uniq(row_hits)
|
||||
row_sugs = uniq(row_sugs)
|
||||
row_ids = uniq(row_ids)
|
||||
|
||||
# write outputs (unlimited lists, joined with " | ")
|
||||
try:
|
||||
sheet.getCellByPosition(norm_tr_col, r).String = " | ".join(row_hits)
|
||||
sheet.getCellByPosition(norm_sug_col, r).String = " | ".join(row_sugs)
|
||||
sheet.getCellByPosition(norm_id_col, r).String = " | ".join(row_ids)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Coloring rules per new spec:
|
||||
# - Objektbeschreibung cell: RED if any_unmapped else no change (we do not color green/yellow here)
|
||||
# - Norm_Treffer cell: GREEN if all terms matched (i.e., terms non-empty and no term unmapped and at least one hit per term)
|
||||
# - Norm_Vorschlag cell: YELLOW if at least one suggestion exists
|
||||
# Determine "all matched": terms non-empty and every term has at least one hit (we approximated by checking any_unmapped and hits length)
|
||||
all_matched = False
|
||||
if terms:
|
||||
# all_matched if no term without hit and there is at least one hit overall
|
||||
if (not any_unmapped) and row_hits:
|
||||
all_matched = True
|
||||
|
||||
# apply colors
|
||||
try:
|
||||
if any_unmapped:
|
||||
cell.CellBackColor = RED
|
||||
else:
|
||||
# clear red if previously set? We'll leave unchanged if not set. Optionally set to default 16777215 (white)
|
||||
pass
|
||||
# Norm_Treffer coloring
|
||||
tr_cell = sheet.getCellByPosition(norm_tr_col, r)
|
||||
if all_matched:
|
||||
tr_cell.CellBackColor = GREEN
|
||||
else:
|
||||
# clear color if needed -> set to white
|
||||
tr_cell.CellBackColor = 0xFFFFFF
|
||||
# Norm_Vorschlag coloring
|
||||
sug_cell = sheet.getCellByPosition(norm_sug_col, r)
|
||||
if row_sugs:
|
||||
sug_cell.CellBackColor = YELLOW
|
||||
else:
|
||||
sug_cell.CellBackColor = 0xFFFFFF
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
rows_processed += 1
|
||||
|
||||
except Exception as e:
|
||||
# continue processing other rows; log once
|
||||
log(f"Fehler in Zeile {r}: {e}")
|
||||
|
||||
# persist cache
|
||||
try:
|
||||
with open(CACHE_FILE, "w", encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, ensure_ascii=False, indent=2)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
log(f"run_mapper_macro fertig. Zeilen verarbeitet: {rows_processed}")
|
||||
|
||||
# Export for LO
|
||||
g_exportedScripts = (run_mapper_macro,)
|
||||
297
mapper_macro_1.2.py
Normal file
@ -0,0 +1,297 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import uno
|
||||
import unohelper
|
||||
import re
|
||||
import json
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# RapidFuzz für Fuzzy-Suche
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
RAPIDFUZZ_AVAILABLE = True
|
||||
except ImportError:
|
||||
RAPIDFUZZ_AVAILABLE = False
|
||||
|
||||
# Spacy Lemmatizer
|
||||
try:
|
||||
import spacy
|
||||
nlp = spacy.load("de_core_news_sm")
|
||||
SPACY_AVAILABLE = True
|
||||
except Exception:
|
||||
SPACY_AVAILABLE = False
|
||||
nlp = None
|
||||
|
||||
# =========================
|
||||
# Pfade & Config
|
||||
# =========================
|
||||
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
|
||||
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
|
||||
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
|
||||
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
|
||||
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}
|
||||
|
||||
# =========================
|
||||
# Cache & Logging
|
||||
# =========================
|
||||
if CACHE_FILE.exists():
|
||||
with open(CACHE_FILE,"r",encoding="utf-8") as f:
|
||||
CACHE = json.load(f)
|
||||
else:
|
||||
CACHE = {}
|
||||
|
||||
def save_cache():
|
||||
with open(CACHE_FILE,"w",encoding="utf-8") as f:
|
||||
json.dump(CACHE, f, indent=2, ensure_ascii=False)
|
||||
|
||||
def log(msg):
|
||||
with open(LOG_FILE,"a",encoding="utf-8") as f:
|
||||
f.write(msg + "\n")
|
||||
|
||||
# =========================
|
||||
# Textverarbeitung
|
||||
# =========================
|
||||
def normalize_text(s):
|
||||
if not s: return ""
|
||||
s = str(s).lower().strip()
|
||||
s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
|
||||
s = re.sub(r"\s+"," ",s)
|
||||
return s
|
||||
|
||||
lemma_cache = {}
|
||||
def lemmatize_term(term):
|
||||
term_norm = normalize_text(term)
|
||||
if term_norm in lemma_cache:
|
||||
return lemma_cache[term_norm]
|
||||
if SPACY_AVAILABLE and nlp:
|
||||
doc = nlp(term_norm)
|
||||
lemma = " ".join([token.lemma_ for token in doc])
|
||||
else:
|
||||
lemma = term_norm
|
||||
lemma_cache[term_norm] = lemma
|
||||
return lemma
|
||||
|
||||
def compound_split(term):
|
||||
parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
|
||||
return parts if parts else [term]
|
||||
|
||||
# =========================
|
||||
# NV_MASTER laden
|
||||
# =========================
|
||||
def load_normvokabular(file_path):
|
||||
sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
|
||||
norm_dict = {}
|
||||
for sheet_name, df in sheets.items():
|
||||
df = df.dropna(how="all", axis=1)
|
||||
df.columns = [str(c).strip() for c in df.columns]
|
||||
if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
|
||||
continue
|
||||
current_parent_id = None
|
||||
for _, row in df.iterrows():
|
||||
row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
|
||||
row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
|
||||
if row_id: current_parent_id = row_id
|
||||
if not row_word: continue
|
||||
norm_dict[normalize_text(row_word)] = {
|
||||
"ID": current_parent_id,
|
||||
"Wort/Vokabel": row_word
|
||||
}
|
||||
return norm_dict
|
||||
|
||||
# =========================
|
||||
# Mapping
|
||||
# =========================
|
||||
def map_term_with_indexes(term, norm_dict):
|
||||
term_norm = normalize_text(term)
|
||||
term_lemma = lemmatize_term(term_norm)
|
||||
|
||||
# Cache prüfen
|
||||
if term_lemma in CACHE:
|
||||
cached = CACHE[term_lemma]
|
||||
if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
|
||||
return cached["hits"], cached["suggestions"], cached["ids"]
|
||||
else:
|
||||
CACHE.pop(term_lemma, None)
|
||||
|
||||
hits = []
|
||||
suggestions = []
|
||||
ids = []
|
||||
|
||||
# Exakte Treffer
|
||||
if term_norm in norm_dict:
|
||||
e = norm_dict[term_norm]
|
||||
hits.append(e["Wort/Vokabel"])
|
||||
ids.append(e["ID"])
|
||||
elif term_lemma in norm_dict:
|
||||
e = norm_dict[term_lemma]
|
||||
hits.append(e["Wort/Vokabel"])
|
||||
ids.append(e["ID"])
|
||||
else:
|
||||
# Fuzzy Matching
|
||||
for key, e in norm_dict.items():
|
||||
score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
|
||||
if score >= 0.75:
|
||||
suggestions.append(e["Wort/Vokabel"])
|
||||
ids.append(e["ID"])
|
||||
|
||||
CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
|
||||
return hits, suggestions, ids
|
||||
|
||||
# =========================
|
||||
# LibreOffice Dialog (ListBox + Checkbox)
|
||||
# =========================
|
||||
def apply_proposals_dialog():
|
||||
ctx = uno.getComponentContext()
|
||||
smgr = ctx.ServiceManager
|
||||
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
||||
doc = desktop.getCurrentComponent()
|
||||
if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
|
||||
log("Kein Calc-Dokument aktiv")
|
||||
return
|
||||
selection = doc.CurrentSelection
|
||||
sheet = doc.CurrentController.ActiveSheet
|
||||
|
||||
# Prüfen ob eine Zelle ausgewählt ist
|
||||
if selection is None or not hasattr(selection, "getCellAddress"):
|
||||
log("Keine Zelle ausgewählt")
|
||||
return
|
||||
cell = selection
|
||||
|
||||
# Spalte überprüfen
|
||||
header_row = sheet.getCellRangeByPosition(0,0,sheet.Columns.Count-1,0)
|
||||
objekt_col = None
|
||||
norm_vorschlag_col = None
|
||||
for col_idx in range(sheet.Columns.Count):
|
||||
val = sheet.getCellByPosition(col_idx,0).String
|
||||
if val.strip().lower() == "objektbeschreibung":
|
||||
objekt_col = col_idx
|
||||
elif val.strip().lower() == "norm_vorschlag":
|
||||
norm_vorschlag_col = col_idx
|
||||
if norm_vorschlag_col is None or objekt_col is None:
|
||||
log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
|
||||
return
|
||||
|
||||
# Vorschläge auslesen
|
||||
proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
|
||||
if not proposals_str.strip():
|
||||
log("Keine Vorschläge in der ausgewählten Zelle")
|
||||
return
|
||||
proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]
|
||||
|
||||
# Dialog erstellen
|
||||
toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
|
||||
dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
|
||||
dialog_model.Width = 180
|
||||
dialog_model.Height = 150
|
||||
dialog_model.Title = "Vorschläge übernehmen"
|
||||
|
||||
# ListBox
|
||||
lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
|
||||
lb_model.Name = "ProposalList"
|
||||
lb_model.PositionX = 10
|
||||
lb_model.PositionY = 10
|
||||
lb_model.Width = 160
|
||||
lb_model.Height = 80
|
||||
lb_model.StringItemList = tuple(proposals)
|
||||
dialog_model.insertByName("ProposalList", lb_model)
|
||||
|
||||
# Checkbox
|
||||
cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
|
||||
cb_model.Name = "AllCheck"
|
||||
cb_model.PositionX = 10
|
||||
cb_model.PositionY = 95
|
||||
cb_model.Width = 160
|
||||
cb_model.Height = 15
|
||||
cb_model.Label = "Alle Vorschläge übernehmen"
|
||||
dialog_model.insertByName("AllCheck", cb_model)
|
||||
|
||||
# OK-Button
|
||||
btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
|
||||
btn_model.Name = "OKButton"
|
||||
btn_model.PositionX = 10
|
||||
btn_model.PositionY = 115
|
||||
btn_model.Width = 80
|
||||
btn_model.Height = 20
|
||||
btn_model.Label = "OK"
|
||||
dialog_model.insertByName("OKButton", btn_model)
|
||||
|
||||
# Abbrechen-Button
|
||||
cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
|
||||
cancel_model.Name = "CancelButton"
|
||||
cancel_model.PositionX = 100
|
||||
cancel_model.PositionY = 115
|
||||
cancel_model.Width = 80
|
||||
cancel_model.Height = 20
|
||||
cancel_model.Label = "Abbrechen"
|
||||
dialog_model.insertByName("CancelButton", cancel_model)
|
||||
|
||||
# Control Dialog
|
||||
dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
|
||||
dialog.setModel(dialog_model)
|
||||
dialog.setVisible(True)
|
||||
toolkit.createPeer(dialog, None)
|
||||
|
||||
# Warten auf OK
|
||||
while True:
|
||||
import time
|
||||
time.sleep(0.1)
|
||||
# Prüfen auf Klick
|
||||
if dialog.getControl("OKButton").Pressed:
|
||||
all_flag = dialog.getControl("AllCheck").State == 1
|
||||
selected_idx = dialog.getControl("ProposalList").SelectedItems
|
||||
if selected_idx:
|
||||
selected_proposal = proposals[selected_idx[0]]
|
||||
else:
|
||||
selected_proposal = None
|
||||
break
|
||||
elif dialog.getControl("CancelButton").Pressed:
|
||||
dialog.endExecute()
|
||||
return
|
||||
|
||||
# Anwenden
|
||||
obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
|
||||
obj_text = obj_cell.String
|
||||
if all_flag:
|
||||
for prop in proposals:
|
||||
idx = obj_text.lower().find(prop.lower())
|
||||
if idx != -1:
|
||||
obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
|
||||
else:
|
||||
if selected_proposal:
|
||||
idx = obj_text.lower().find(selected_proposal.lower())
|
||||
if idx != -1:
|
||||
obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]
|
||||
|
||||
obj_cell.String = obj_text
|
||||
obj_cell.CellBackColor = 0x00FF00 # grün
|
||||
dialog.endExecute()
|
||||
save_cache()
|
||||
log(f"Vorschlag übernommen: {obj_text}")
|
||||
|
||||
# =========================
|
||||
# Automatische Button-Registrierung
|
||||
# =========================
|
||||
def register_toolbar_button():
|
||||
ctx = uno.getComponentContext()
|
||||
smgr = ctx.ServiceManager
|
||||
desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
||||
doc = desktop.getCurrentComponent()
|
||||
frame = doc.CurrentController.Frame
|
||||
# Button kann manuell über Makro-Menü an Toolbar gebunden werden
|
||||
# Hier wird nur das Makro selbst registriert
|
||||
# Symbolleiste muss in LO einmalig erstellt werden
|
||||
|
||||
# =========================
|
||||
# Hauptmakro
|
||||
# =========================
|
||||
def run_mapper_macro():
|
||||
try:
|
||||
norm_dict = load_normvokabular(NV_MASTER_FILE)
|
||||
log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")
|
||||
|
||||
apply_proposals_dialog()
|
||||
|
||||
except Exception as e:
|
||||
log(f"Fehler in run_mapper_macro: {e}")
|
||||
297
mapper_macro_1.3.py
Normal file
@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-
import os
import uno
import unohelper
import re
import json
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher

# RapidFuzz for fuzzy matching
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
except ImportError:
    RAPIDFUZZ_AVAILABLE = False

# spaCy lemmatizer (optional; fall back to the raw term if the German model is missing)
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    nlp = None

# =========================
# Paths & config
# =========================
SCRIPT_DIR = Path("/home/jarnold/.config/libreoffice/4/user/Scripts/python/NV Abgleich Makro")
NV_MASTER_FILE = SCRIPT_DIR / "NV_MASTER.ods"
CACHE_FILE = SCRIPT_DIR / "mapper_cache.json"
LOG_FILE = SCRIPT_DIR / "mapper_log.txt"
STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"}

# =========================
# Cache & logging
# =========================
if CACHE_FILE.exists():
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)

def log(msg):
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(msg + "\n")

# =========================
# Text processing
# =========================
def normalize_text(s):
    if not s: return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s

lemma_cache = {}
def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join([token.lemma_ for token in doc])
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma

def compound_split(term):
    parts = re.findall(r'[A-ZÄÖÜa-zäöü]+', term)
    return parts if parts else [term]

# =========================
# Load NV_MASTER
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None, engine="odf")
    norm_dict = {}
    for sheet_name, df in sheets.items():
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        if "ID" not in df.columns or "Wort/Vokabel" not in df.columns:
            continue
        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
            row_word = str(row["Wort/Vokabel"]).strip() if pd.notna(row["Wort/Vokabel"]) else None
            if row_id: current_parent_id = row_id
            if not row_word: continue
            norm_dict[normalize_text(row_word)] = {
                "ID": current_parent_id,
                "Wort/Vokabel": row_word
            }
    return norm_dict

# =========================
# Mapping
# =========================
def map_term_with_indexes(term, norm_dict):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term_norm)

    # Check the cache first
    if term_lemma in CACHE:
        cached = CACHE[term_lemma]
        if isinstance(cached, dict) and all(k in cached for k in ("hits","suggestions","ids")):
            return cached["hits"], cached["suggestions"], cached["ids"]
        else:
            CACHE.pop(term_lemma, None)

    hits = []
    suggestions = []
    ids = []

    # Exact matches
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        hits.append(e["Wort/Vokabel"])
        ids.append(e["ID"])
    elif term_lemma in norm_dict:
        e = norm_dict[term_lemma]
        hits.append(e["Wort/Vokabel"])
        ids.append(e["ID"])
    else:
        # Fuzzy matching
        for key, e in norm_dict.items():
            score = fuzz.token_sort_ratio(term_lemma, key)/100.0 if RAPIDFUZZ_AVAILABLE else SequenceMatcher(None, term_lemma, key).ratio()
            if score >= 0.75:
                suggestions.append(e["Wort/Vokabel"])
                ids.append(e["ID"])

    CACHE[term_lemma] = {"hits": hits, "suggestions": suggestions, "ids": ids}
    return hits, suggestions, ids

# =========================
# LibreOffice dialog (list box + checkbox)
# =========================
def apply_proposals_dialog():
    ctx = uno.getComponentContext()
    smgr = ctx.ServiceManager
    desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    doc = desktop.getCurrentComponent()
    if not doc.supportsService("com.sun.star.sheet.SpreadsheetDocument"):
        log("Kein Calc-Dokument aktiv")
        return
    selection = doc.CurrentSelection
    sheet = doc.CurrentController.ActiveSheet

    # Check that a single cell is selected
    if selection is None or not hasattr(selection, "getCellAddress"):
        log("Keine Zelle ausgewählt")
        return
    cell = selection

    # Locate the relevant columns in the header row
    header_row = sheet.getCellRangeByPosition(0, 0, sheet.Columns.Count-1, 0)
    objekt_col = None
    norm_vorschlag_col = None
    for col_idx in range(sheet.Columns.Count):
        val = sheet.getCellByPosition(col_idx, 0).String
        if val.strip().lower() == "objektbeschreibung":
            objekt_col = col_idx
        elif val.strip().lower() == "norm_vorschlag":
            norm_vorschlag_col = col_idx
    if norm_vorschlag_col is None or objekt_col is None:
        log("Spalte 'Norm_Vorschlag' oder 'Objektbeschreibung' nicht gefunden")
        return

    # Read the proposals for the selected row
    proposals_str = sheet.getCellByPosition(norm_vorschlag_col, cell.RangeAddress.StartRow).String
    if not proposals_str.strip():
        log("Keine Vorschläge in der ausgewählten Zelle")
        return
    proposals = [p.strip() for p in proposals_str.split(";") if p.strip()]

    # Build the dialog
    toolkit = smgr.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
    dialog_model = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialogModel", ctx)
    dialog_model.Width = 180
    dialog_model.Height = 150
    dialog_model.Title = "Vorschläge übernehmen"

    # List box
    lb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlListBoxModel")
    lb_model.Name = "ProposalList"
    lb_model.PositionX = 10
    lb_model.PositionY = 10
    lb_model.Width = 160
    lb_model.Height = 80
    lb_model.StringItemList = tuple(proposals)
    dialog_model.insertByName("ProposalList", lb_model)

    # Checkbox
    cb_model = dialog_model.createInstance("com.sun.star.awt.UnoControlCheckBoxModel")
    cb_model.Name = "AllCheck"
    cb_model.PositionX = 10
    cb_model.PositionY = 95
    cb_model.Width = 160
    cb_model.Height = 15
    cb_model.Label = "Alle Vorschläge übernehmen"
    dialog_model.insertByName("AllCheck", cb_model)

    # OK button
    btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
    btn_model.Name = "OKButton"
    btn_model.PositionX = 10
    btn_model.PositionY = 115
    btn_model.Width = 80
    btn_model.Height = 20
    btn_model.Label = "OK"
    dialog_model.insertByName("OKButton", btn_model)

    # Cancel button
    cancel_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
    cancel_model.Name = "CancelButton"
    cancel_model.PositionX = 100
    cancel_model.PositionY = 115
    cancel_model.Width = 80
    cancel_model.Height = 20
    cancel_model.Label = "Abbrechen"
    dialog_model.insertByName("CancelButton", cancel_model)

    # Mark the buttons as OK/Cancel so that execute() returns when one of them
    # is pressed (polling a "Pressed" property does not work with the UNO toolkit).
    from com.sun.star.awt.PushButtonType import OK as PUSH_OK, CANCEL as PUSH_CANCEL
    btn_model.PushButtonType = PUSH_OK
    cancel_model.PushButtonType = PUSH_CANCEL

    # Run the dialog modally and wait for OK/Cancel
    dialog = smgr.createInstanceWithContext("com.sun.star.awt.UnoControlDialog", ctx)
    dialog.setModel(dialog_model)
    dialog.createPeer(toolkit, None)
    if dialog.execute() == 0:  # cancelled
        dialog.dispose()
        return
    all_flag = dialog.getControl("AllCheck").State == 1
    selected_idx = dialog.getControl("ProposalList").getSelectedItemsPos()
    selected_proposal = proposals[selected_idx[0]] if selected_idx else None
    dialog.dispose()

    # Apply the selection to the Objektbeschreibung cell
    obj_cell = sheet.getCellByPosition(objekt_col, cell.RangeAddress.StartRow)
    obj_text = obj_cell.String
    if all_flag:
        for prop in proposals:
            idx = obj_text.lower().find(prop.lower())
            if idx != -1:
                obj_text = obj_text[:idx] + prop + obj_text[idx+len(prop):]
    else:
        if selected_proposal:
            idx = obj_text.lower().find(selected_proposal.lower())
            if idx != -1:
                obj_text = obj_text[:idx] + selected_proposal + obj_text[idx+len(selected_proposal):]

    obj_cell.String = obj_text
    obj_cell.CellBackColor = 0x00FF00  # green
    save_cache()
    log(f"Vorschlag übernommen: {obj_text}")

# =========================
# Toolbar button registration
# =========================
def register_toolbar_button():
    ctx = uno.getComponentContext()
    smgr = ctx.ServiceManager
    desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    doc = desktop.getCurrentComponent()
    frame = doc.CurrentController.Frame
    # The button can be bound to a toolbar manually via the macro menu;
    # only the macro itself is registered here. The toolbar has to be
    # created once in LibreOffice.

# =========================
# Main macro
# =========================
def run_mapper_macro():
    try:
        norm_dict = load_normvokabular(NV_MASTER_FILE)
        log(f"NV_MASTER geladen ({len(norm_dict)} Begriffe)")

        apply_proposals_dialog()

    except Exception as e:
        log(f"Fehler in run_mapper_macro: {e}")
121
normmapper_macro.py
Normal file
@ -0,0 +1,121 @@
import uno
import unohelper
import json
import subprocess
from pathlib import Path
from com.sun.star.awt import XActionListener

# Colour values (BGR)
GREEN = 0xC6EFCE
RED = 0xFFC7CE
YELLOW = 0xFFEB9C

def get_objektbeschreibung_column(sheet):
    """Finds the 'Objektbeschreibung' column."""
    for row in range(sheet.Rows.Count):
        for col in range(sheet.Columns.Count):
            cell = sheet.getCellByPosition(col, row)
            if cell.String.strip().lower() == "objektbeschreibung":
                return col
    return None

def update_cell_color(cell, status):
    """Colours the cell according to the match status."""
    if status == "grün":
        cell.CellBackColor = GREEN
    elif status == "gelb":
        cell.CellBackColor = YELLOW
    else:
        cell.CellBackColor = RED

def call_mapper(term):
    """Calls the local mapper wrapper and returns its JSON result."""
    wrapper = Path("/home/jarnold/projects/GND-Skript Test/NormVokabular_Mapper_Wrapper.py")
    if not wrapper.exists():
        return {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}

    result = subprocess.run(
        ["python3", str(wrapper), term],
        capture_output=True,
        text=True
    )
    try:
        output = json.loads(result.stdout)
    except json.JSONDecodeError:
        output = {"term": term, "norm_name": "KEIN TREFFER", "norm_id": "", "suggestions": []}
    return output

class SuggestionListener(unohelper.Base, XActionListener):
    """Listener for a click on a suggestion button."""
    def __init__(self, cell, suggestion, dialog):
        self.cell = cell
        self.suggestion = suggestion
        self.dialog = dialog

    def actionPerformed(self, event):
        self.cell.String = self.suggestion
        update_cell_color(self.cell, "grün")
        self.dialog.endExecute()  # closes the dialog window

    def disposing(self, event):
        pass

def show_suggestion_dialog(cell, term, suggestions):
    """Shows a dialog window with clickable suggestions."""
    ctx = XSCRIPTCONTEXT.getComponentContext()
    smgr = ctx.getServiceManager()
    toolkit = smgr.createInstance("com.sun.star.awt.Toolkit")
    dialog_model = smgr.createInstance("com.sun.star.awt.UnoControlDialogModel")
    dialog_model.PositionX = 100
    dialog_model.PositionY = 100
    dialog_model.Width = 200
    dialog_model.Height = 30 + 25*len(suggestions)
    dialog_model.Title = f"Vorschläge für '{term}'"

    for i, sugg in enumerate(suggestions[:3]):
        btn_model = dialog_model.createInstance("com.sun.star.awt.UnoControlButtonModel")
        btn_model.Name = f"btn_{i}"
        btn_model.Label = sugg
        btn_model.PositionX = 10
        btn_model.PositionY = 10 + i*25
        btn_model.Width = 180
        btn_model.Height = 20
        dialog_model.insertByName(btn_model.Name, btn_model)

    dialog = smgr.createInstance("com.sun.star.awt.UnoControlDialog")
    dialog.setModel(dialog_model)

    for i, sugg in enumerate(suggestions[:3]):
        btn = dialog.getControl(f"btn_{i}")
        listener = SuggestionListener(cell, sugg, dialog)
        btn.addActionListener(listener)

    # Create the window peer and run the dialog modally; the listener ends it.
    dialog.createPeer(toolkit, None)
    dialog.execute()
    dialog.dispose()

def mapper_process_column():
    """Processes all cells below 'Objektbeschreibung' in the active sheet."""
    doc = XSCRIPTCONTEXT.getDocument()
    sheet = doc.CurrentController.ActiveSheet
    col_index = get_objektbeschreibung_column(sheet)
    if col_index is None:
        return

    for row in range(sheet.Rows.Count):
        cell = sheet.getCellByPosition(col_index, row)
        if not cell.String.strip():
            continue  # skip empty cells
        term = cell.String.strip()
        result = call_mapper(term)

        if result["norm_name"] != "KEIN TREFFER":
            cell.String = result["norm_name"]
            update_cell_color(cell, "grün")
        elif result["suggestions"]:
            update_cell_color(cell, "gelb")
            show_suggestion_dialog(cell, term, result["suggestions"])
        else:
            update_cell_color(cell, "rot")
            show_suggestion_dialog(cell, term, [])

# Export for the LibreOffice macro selector
g_exportedScripts = mapper_process_column,
247
venv/bin/Activate.ps1
Normal file
@ -0,0 +1,247 @@
|
||||
<#
|
||||
.Synopsis
|
||||
Activate a Python virtual environment for the current PowerShell session.
|
||||
|
||||
.Description
|
||||
Pushes the python executable for a virtual environment to the front of the
|
||||
$Env:PATH environment variable and sets the prompt to signify that you are
|
||||
in a Python virtual environment. Makes use of the command line switches as
|
||||
well as the `pyvenv.cfg` file values present in the virtual environment.
|
||||
|
||||
.Parameter VenvDir
|
||||
Path to the directory that contains the virtual environment to activate. The
|
||||
default value for this is the parent of the directory that the Activate.ps1
|
||||
script is located within.
|
||||
|
||||
.Parameter Prompt
|
||||
The prompt prefix to display when this virtual environment is activated. By
|
||||
default, this prompt is the name of the virtual environment folder (VenvDir)
|
||||
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
||||
|
||||
.Example
|
||||
Activate.ps1
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Verbose
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and shows extra information about the activation as it executes.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
||||
Activates the Python virtual environment located in the specified location.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Prompt "MyPython"
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and prefixes the current prompt with the specified string (surrounded in
|
||||
parentheses) while the virtual environment is active.
|
||||
|
||||
.Notes
|
||||
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
||||
execution policy for the user. You can do this by issuing the following PowerShell
|
||||
command:
|
||||
|
||||
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
|
||||
For more information on Execution Policies:
|
||||
https://go.microsoft.com/fwlink/?LinkID=135170
|
||||
|
||||
#>
|
||||
Param(
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$VenvDir,
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$Prompt
|
||||
)
|
||||
|
||||
<# Function declarations --------------------------------------------------- #>
|
||||
|
||||
<#
|
||||
.Synopsis
|
||||
Remove all shell session elements added by the Activate script, including the
|
||||
addition of the virtual environment's Python executable from the beginning of
|
||||
the PATH variable.
|
||||
|
||||
.Parameter NonDestructive
|
||||
If present, do not remove this function from the global namespace for the
|
||||
session.
|
||||
|
||||
#>
|
||||
function global:deactivate ([switch]$NonDestructive) {
|
||||
# Revert to original values
|
||||
|
||||
# The prior prompt:
|
||||
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
||||
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
||||
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
|
||||
# The prior PYTHONHOME:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
}
|
||||
|
||||
# The prior PATH:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
||||
}
|
||||
|
||||
# Just remove the VIRTUAL_ENV altogether:
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV
|
||||
}
|
||||
|
||||
# Just remove VIRTUAL_ENV_PROMPT altogether.
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
|
||||
}
|
||||
|
||||
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
||||
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
||||
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
||||
}
|
||||
|
||||
# Leave deactivate function in the global namespace if requested:
|
||||
if (-not $NonDestructive) {
|
||||
Remove-Item -Path function:deactivate
|
||||
}
|
||||
}
|
||||
|
||||
<#
|
||||
.Description
|
||||
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
||||
given folder, and returns them in a map.
|
||||
|
||||
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
||||
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
||||
then it is considered a `key = value` line. The left hand string is the key,
|
||||
the right hand is the value.
|
||||
|
||||
If the value starts with a `'` or a `"` then the first and last character is
|
||||
stripped from the value before being captured.
|
||||
|
||||
.Parameter ConfigDir
|
||||
Path to the directory that contains the `pyvenv.cfg` file.
|
||||
#>
|
||||
function Get-PyVenvConfig(
|
||||
[String]
|
||||
$ConfigDir
|
||||
) {
|
||||
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
||||
|
||||
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
||||
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
||||
|
||||
# An empty map will be returned if no config file is found.
|
||||
$pyvenvConfig = @{ }
|
||||
|
||||
if ($pyvenvConfigPath) {
|
||||
|
||||
Write-Verbose "File exists, parse `key = value` lines"
|
||||
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
||||
|
||||
$pyvenvConfigContent | ForEach-Object {
|
||||
$keyval = $PSItem -split "\s*=\s*", 2
|
||||
if ($keyval[0] -and $keyval[1]) {
|
||||
$val = $keyval[1]
|
||||
|
||||
# Remove extraneous quotations around a string value.
|
||||
if ("'""".Contains($val.Substring(0, 1))) {
|
||||
$val = $val.Substring(1, $val.Length - 2)
|
||||
}
|
||||
|
||||
$pyvenvConfig[$keyval[0]] = $val
|
||||
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
||||
}
|
||||
}
|
||||
}
|
||||
return $pyvenvConfig
|
||||
}
|
||||
|
||||
|
||||
<# Begin Activate script --------------------------------------------------- #>
|
||||
|
||||
# Determine the containing directory of this script
|
||||
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$VenvExecDir = Get-Item -Path $VenvExecPath
|
||||
|
||||
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
||||
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
||||
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
||||
|
||||
# Set values required in priority: CmdLine, ConfigFile, Default
|
||||
# First, get the location of the virtual environment, it might not be
|
||||
# VenvExecDir if specified on the command line.
|
||||
if ($VenvDir) {
|
||||
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
||||
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
||||
Write-Verbose "VenvDir=$VenvDir"
|
||||
}
|
||||
|
||||
# Next, read the `pyvenv.cfg` file to determine any required value such
|
||||
# as `prompt`.
|
||||
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
||||
|
||||
# Next, set the prompt from the command line, or the config file, or
|
||||
# just use the name of the virtual environment folder.
|
||||
if ($Prompt) {
|
||||
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
||||
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
||||
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
||||
$Prompt = $pyvenvCfg['prompt'];
|
||||
}
|
||||
else {
|
||||
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
||||
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
||||
$Prompt = Split-Path -Path $venvDir -Leaf
|
||||
}
|
||||
}
|
||||
|
||||
Write-Verbose "Prompt = '$Prompt'"
|
||||
Write-Verbose "VenvDir='$VenvDir'"
|
||||
|
||||
# Deactivate any currently active virtual environment, but leave the
|
||||
# deactivate function in place.
|
||||
deactivate -nondestructive
|
||||
|
||||
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
||||
# that there is an activated venv.
|
||||
$env:VIRTUAL_ENV = $VenvDir
|
||||
|
||||
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
||||
|
||||
Write-Verbose "Setting prompt to '$Prompt'"
|
||||
|
||||
# Set the prompt to include the env name
|
||||
# Make sure _OLD_VIRTUAL_PROMPT is global
|
||||
function global:_OLD_VIRTUAL_PROMPT { "" }
|
||||
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
||||
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
||||
|
||||
function global:prompt {
|
||||
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
||||
_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
$env:VIRTUAL_ENV_PROMPT = $Prompt
|
||||
}
|
||||
|
||||
# Clear PYTHONHOME
|
||||
if (Test-Path -Path Env:PYTHONHOME) {
|
||||
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
Remove-Item -Path Env:PYTHONHOME
|
||||
}
|
||||
|
||||
# Add the venv to the PATH
|
||||
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
||||
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
||||
69
venv/bin/activate
Normal file
@ -0,0 +1,69 @@
|
||||
# This file must be used with "source bin/activate" *from bash*
|
||||
# you cannot run it directly
|
||||
|
||||
deactivate () {
|
||||
# reset old environment variables
|
||||
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
||||
PATH="${_OLD_VIRTUAL_PATH:-}"
|
||||
export PATH
|
||||
unset _OLD_VIRTUAL_PATH
|
||||
fi
|
||||
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
||||
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
||||
export PYTHONHOME
|
||||
unset _OLD_VIRTUAL_PYTHONHOME
|
||||
fi
|
||||
|
||||
# This should detect bash and zsh, which have a hash command that must
|
||||
# be called to get it to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||
hash -r 2> /dev/null
|
||||
fi
|
||||
|
||||
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
||||
PS1="${_OLD_VIRTUAL_PS1:-}"
|
||||
export PS1
|
||||
unset _OLD_VIRTUAL_PS1
|
||||
fi
|
||||
|
||||
unset VIRTUAL_ENV
|
||||
unset VIRTUAL_ENV_PROMPT
|
||||
if [ ! "${1:-}" = "nondestructive" ] ; then
|
||||
# Self destruct!
|
||||
unset -f deactivate
|
||||
fi
|
||||
}
|
||||
|
||||
# unset irrelevant variables
|
||||
deactivate nondestructive
|
||||
|
||||
VIRTUAL_ENV='/home/jarnold/projects/GND-Skript Test/venv'
|
||||
export VIRTUAL_ENV
|
||||
|
||||
_OLD_VIRTUAL_PATH="$PATH"
|
||||
PATH="$VIRTUAL_ENV/"bin":$PATH"
|
||||
export PATH
|
||||
|
||||
# unset PYTHONHOME if set
|
||||
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
||||
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
||||
if [ -n "${PYTHONHOME:-}" ] ; then
|
||||
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
||||
unset PYTHONHOME
|
||||
fi
|
||||
|
||||
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
||||
_OLD_VIRTUAL_PS1="${PS1:-}"
|
||||
PS1='(venv) '"${PS1:-}"
|
||||
export PS1
|
||||
VIRTUAL_ENV_PROMPT='(venv) '
|
||||
export VIRTUAL_ENV_PROMPT
|
||||
fi
|
||||
|
||||
# This should detect bash and zsh, which have a hash command that must
|
||||
# be called to get it to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||
hash -r 2> /dev/null
|
||||
fi
|
||||
26
venv/bin/activate.csh
Normal file
@ -0,0 +1,26 @@
|
||||
# This file must be used with "source bin/activate.csh" *from csh*.
|
||||
# You cannot run it directly.
|
||||
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
||||
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
||||
|
||||
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
setenv VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
|
||||
|
||||
set _OLD_VIRTUAL_PATH="$PATH"
|
||||
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
|
||||
|
||||
|
||||
set _OLD_VIRTUAL_PROMPT="$prompt"
|
||||
|
||||
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
||||
set prompt = '(venv) '"$prompt"
|
||||
setenv VIRTUAL_ENV_PROMPT '(venv) '
|
||||
endif
|
||||
|
||||
alias pydoc python -m pydoc
|
||||
|
||||
rehash
|
||||
69
venv/bin/activate.fish
Normal file
@ -0,0 +1,69 @@
|
||||
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
||||
# (https://fishshell.com/); you cannot run it directly.
|
||||
|
||||
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
||||
# reset old environment variables
|
||||
if test -n "$_OLD_VIRTUAL_PATH"
|
||||
set -gx PATH $_OLD_VIRTUAL_PATH
|
||||
set -e _OLD_VIRTUAL_PATH
|
||||
end
|
||||
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
||||
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
||||
set -e _OLD_VIRTUAL_PYTHONHOME
|
||||
end
|
||||
|
||||
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
||||
set -e _OLD_FISH_PROMPT_OVERRIDE
|
||||
# prevents error when using nested fish instances (Issue #93858)
|
||||
if functions -q _old_fish_prompt
|
||||
functions -e fish_prompt
|
||||
functions -c _old_fish_prompt fish_prompt
|
||||
functions -e _old_fish_prompt
|
||||
end
|
||||
end
|
||||
|
||||
set -e VIRTUAL_ENV
|
||||
set -e VIRTUAL_ENV_PROMPT
|
||||
if test "$argv[1]" != "nondestructive"
|
||||
# Self-destruct!
|
||||
functions -e deactivate
|
||||
end
|
||||
end
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
set -gx VIRTUAL_ENV '/home/jarnold/projects/GND-Skript Test/venv'
|
||||
|
||||
set -gx _OLD_VIRTUAL_PATH $PATH
|
||||
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
|
||||
|
||||
# Unset PYTHONHOME if set.
|
||||
if set -q PYTHONHOME
|
||||
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
||||
set -e PYTHONHOME
|
||||
end
|
||||
|
||||
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
||||
# fish uses a function instead of an env var to generate the prompt.
|
||||
|
||||
# Save the current fish_prompt function as the function _old_fish_prompt.
|
||||
functions -c fish_prompt _old_fish_prompt
|
||||
|
||||
# With the original prompt function renamed, we can override with our own.
|
||||
function fish_prompt
|
||||
# Save the return status of the last command.
|
||||
set -l old_status $status
|
||||
|
||||
# Output the venv prompt; color taken from the blue of the Python logo.
|
||||
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
|
||||
|
||||
# Restore the return status of the previous command.
|
||||
echo "exit $old_status" | .
|
||||
# Output the original/"old" prompt.
|
||||
_old_fish_prompt
|
||||
end
|
||||
|
||||
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
||||
set -gx VIRTUAL_ENV_PROMPT '(venv) '
|
||||
end
|
||||
229
venv/bin/csv2ods
Executable file
@ -0,0 +1,229 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2008 Agustin Henze -> agustinhenze at gmail.com
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
# Søren Roug
|
||||
#
|
||||
# Oct 2014: Georges Khaznadar <georgesk@debian.org>
|
||||
# - ported to Python3
|
||||
# - imlemented the missing switch -c / --encoding, with an extra
|
||||
# feature for POSIX platforms which can guess encoding.
|
||||
|
||||
from odf.opendocument import OpenDocumentSpreadsheet
|
||||
from odf.style import Style, TextProperties, ParagraphProperties, TableColumnProperties
|
||||
from odf.text import P
|
||||
from odf.table import Table, TableColumn, TableRow, TableCell
|
||||
from optparse import OptionParser
|
||||
import sys,csv,re, os, codecs
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
if sys.version_info[0]==2:
|
||||
class UTF8Recoder:
|
||||
"""
|
||||
Iterator that reads an encoded stream and reencodes the input to UTF-8
|
||||
"""
|
||||
def __init__(self, f, encoding):
|
||||
self.reader = codecs.getreader(encoding)(f)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
return self.reader.next().encode("utf-8")
|
||||
|
||||
class UnicodeReader:
|
||||
"""
|
||||
A CSV reader which will iterate over lines in the CSV file "f",
|
||||
which is encoded in the given encoding.
|
||||
"""
|
||||
|
||||
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
|
||||
f = UTF8Recoder(f, encoding)
|
||||
self.reader = csv.reader(f, dialect=dialect, **kwds)
|
||||
|
||||
def next(self):
|
||||
row = self.reader.next()
|
||||
return [unicode(s, "utf-8") for s in row]
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
|
||||
def csvToOds( pathFileCSV, pathFileODS, tableName='table',
|
||||
delimiter=',', quoting=csv.QUOTE_MINIMAL,
|
||||
quotechar = '"', escapechar = None,
|
||||
skipinitialspace = False, lineterminator = '\r\n',
|
||||
encoding="utf-8"):
|
||||
textdoc = OpenDocumentSpreadsheet()
|
||||
# Create a style for the table content. One we can modify
|
||||
# later in the word processor.
|
||||
tablecontents = Style(name="Table Contents", family="paragraph")
|
||||
tablecontents.addElement(ParagraphProperties(numberlines="false", linenumber="0"))
|
||||
tablecontents.addElement(TextProperties(fontweight="bold"))
|
||||
textdoc.styles.addElement(tablecontents)
|
||||
|
||||
# Start the table
|
||||
table = Table( name=tableName )
|
||||
|
||||
if sys.version_info[0]==3:
|
||||
reader = csv.reader(open(pathFileCSV, encoding=encoding),
|
||||
delimiter=delimiter,
|
||||
quoting=quoting,
|
||||
quotechar=quotechar,
|
||||
escapechar=escapechar,
|
||||
skipinitialspace=skipinitialspace,
|
||||
lineterminator=lineterminator)
|
||||
else:
|
||||
reader = UnicodeReader(open(pathFileCSV),
|
||||
encoding=encoding,
|
||||
delimiter=delimiter,
|
||||
quoting=quoting,
|
||||
quotechar=quotechar,
|
||||
escapechar=escapechar,
|
||||
skipinitialspace=skipinitialspace,
|
||||
lineterminator=lineterminator)
|
||||
fltExp = re.compile('^\s*[-+]?\d+(\.\d+)?\s*$')
|
||||
|
||||
for row in reader:
|
||||
tr = TableRow()
|
||||
table.addElement(tr)
|
||||
for val in row:
|
||||
if fltExp.match(val):
|
||||
tc = TableCell(valuetype="float", value=val.strip())
|
||||
else:
|
||||
tc = TableCell(valuetype="string")
|
||||
tr.addElement(tc)
|
||||
p = P(stylename=tablecontents,text=val)
|
||||
tc.addElement(p)
|
||||
|
||||
textdoc.spreadsheet.addElement(table)
|
||||
textdoc.save( pathFileODS )
|
||||
|
||||
if __name__ == "__main__":
|
||||
usage = "%prog -i file.csv -o file.ods -d"
|
||||
parser = OptionParser(usage=usage, version="%prog 0.1")
|
||||
parser.add_option('-i','--input', action='store',
|
||||
dest='input', help='File input in csv')
|
||||
parser.add_option('-o','--output', action='store',
|
||||
dest='output', help='File output in ods')
|
||||
parser.add_option('-d','--delimiter', action='store',
|
||||
dest='delimiter', help='specifies a one-character string to use as the field separator. It defaults to ",".')
|
||||
|
||||
parser.add_option('-c','--encoding', action='store',
|
||||
dest='encoding', help='specifies the encoding the file csv. It defaults to utf-8')
|
||||
|
||||
parser.add_option('-t','--table', action='store',
|
||||
dest='tableName', help='The table name in the output file')
|
||||
|
||||
parser.add_option('-s','--skipinitialspace',
|
||||
dest='skipinitialspace', help='''specifies how to interpret whitespace which
|
||||
immediately follows a delimiter. It defaults to False, which
|
||||
means that whitespace immediately following a delimiter is part
|
||||
of the following field.''')
|
||||
|
||||
parser.add_option('-l','--lineterminator', action='store',
|
||||
dest='lineterminator', help='''specifies the character sequence which should
|
||||
terminate rows.''')
|
||||
|
||||
parser.add_option('-q','--quoting', action='store',
|
||||
dest='quoting', help='''It can take on any of the following module constants:
|
||||
0 = QUOTE_MINIMAL means only when required, for example, when a field contains either the quotechar or the delimiter
|
||||
1 = QUOTE_ALL means that quotes are always placed around fields.
|
||||
2 = QUOTE_NONNUMERIC means that quotes are always placed around fields which do not parse as integers or floating point numbers.
|
||||
3 = QUOTE_NONE means that quotes are never placed around fields.
|
||||
It defaults is QUOTE_MINIMAL''')
|
||||
|
||||
parser.add_option('-e','--escapechar', action='store',
|
||||
dest='escapechar', help='''specifies a one-character string used to escape the delimiter when quoting is set to QUOTE_NONE.''')
|
||||
|
||||
parser.add_option('-r','--quotechar', action='store',
|
||||
dest='quotechar', help='''specifies a one-character string to use as the quoting character. It defaults to ".''')
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
if options.input:
|
||||
pathFileCSV = options.input
|
||||
else:
|
||||
parser.print_help()
|
||||
exit( 0 )
|
||||
|
||||
if options.output:
|
||||
pathFileODS = options.output
|
||||
else:
|
||||
parser.print_help()
|
||||
exit( 0 )
|
||||
|
||||
if options.delimiter:
|
||||
delimiter = options.delimiter
|
||||
else:
|
||||
delimiter = ","
|
||||
|
||||
if options.skipinitialspace:
|
||||
skipinitialspace = True
|
||||
else:
|
||||
skipinitialspace=False
|
||||
|
||||
if options.lineterminator:
|
||||
lineterminator = options.lineterminator
|
||||
else:
|
||||
lineterminator ="\r\n"
|
||||
|
||||
if options.escapechar:
|
||||
escapechar = options.escapechar
|
||||
else:
|
||||
escapechar=None
|
||||
|
||||
if options.tableName:
|
||||
tableName = options.tableName
|
||||
else:
|
||||
tableName = "table"
|
||||
|
||||
if options.quotechar:
|
||||
quotechar = options.quotechar
|
||||
else:
|
||||
quotechar = "\""
|
||||
|
||||
encoding = "utf-8" # default setting
|
||||
###########################################################
|
||||
## try to guess the encoding; this is implemented only with
|
||||
## POSIX platforms. Can it be improved?
|
||||
output = os.popen('/usr/bin/file ' + pathFileCSV).read()
|
||||
m=re.match(r'^.*: ([-a-zA-Z0-9]+) text$', output)
|
||||
if m:
|
||||
encoding=m.group(1)
|
||||
if 'ISO-8859' in encoding:
|
||||
encoding="latin-1"
|
||||
else:
|
||||
encoding="utf-8"
|
||||
############################################################
|
||||
# when the -c or --coding switch is used, it takes precedence
|
||||
if options.encoding:
|
||||
encoding = options.encoding
|
||||
|
||||
csvToOds( pathFileCSV=unicode(pathFileCSV),
|
||||
pathFileODS=unicode(pathFileODS),
|
||||
delimiter=delimiter, skipinitialspace=skipinitialspace,
|
||||
escapechar=escapechar,
|
||||
lineterminator=unicode(lineterminator),
|
||||
tableName=tableName, quotechar=quotechar,
|
||||
encoding=encoding)
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
10
venv/bin/csv2rdf
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from rdflib.tools.csv2rdf import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
10
venv/bin/f2py
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from numpy.f2py.f2py2e import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
95
venv/bin/mailodf
Executable file
@ -0,0 +1,95 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from odf.odf2xhtml import ODF2XHTML
|
||||
import zipfile
|
||||
import sys, os, smtplib, getopt
|
||||
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.nonmultipart import MIMENonMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from email.encoders import encode_base64
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
def usage():
|
||||
sys.stderr.write("Usage: %s [-f from] [-s subject] inputfile recipients...\n" % sys.argv[0])
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "f:s:", ["from=", "subject="])
|
||||
except getopt.GetoptError:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
fromaddr = os.getlogin() + "@" + os.getenv('HOSTNAME','localhost')
|
||||
subject = None
|
||||
for o, a in opts:
|
||||
if o in ("-f", "--from"):
|
||||
fromaddr = a
|
||||
if o in ("-s", "--subject"):
|
||||
subject = a
|
||||
|
||||
if len(args) < 2:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
suffices = {
|
||||
'wmf':('image','x-wmf'),
|
||||
'png':('image','png'),
|
||||
'gif':('image','gif'),
|
||||
'jpg':('image','jpeg'),
|
||||
'jpeg':('image','jpeg')
|
||||
}
|
||||
|
||||
msg = MIMEMultipart('related',type="text/html")
|
||||
msg['From'] = fromaddr
|
||||
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
|
||||
msg['To'] = ','.join(args[1:])
|
||||
msg.preamble = 'This is a multi-part message in MIME format.'
|
||||
msg.epilogue = ''
|
||||
odhandler = ODF2XHTML()
|
||||
result = odhandler.odf2xhtml(unicode(args[0]))
|
||||
if subject:
|
||||
msg['Subject'] = subject
|
||||
else:
|
||||
msg['Subject'] = odhandler.title
|
||||
htmlpart = MIMEText(result,'html','us-ascii')
|
||||
htmlpart['Content-Location'] = 'index.html'
|
||||
msg.attach(htmlpart)
|
||||
z = zipfile.ZipFile(unicode(args[0]))
|
||||
for file in z.namelist():
|
||||
if file[0:9] == 'Pictures/':
|
||||
suffix = file[file.rfind(".")+1:]
|
||||
main,sub = suffices.get(suffix,('application','octet-stream'))
|
||||
img = MIMENonMultipart(main,sub)
|
||||
img.set_payload(z.read(file))
|
||||
img['Content-Location'] = "" + file
|
||||
encode_base64(img)
|
||||
msg.attach(img)
|
||||
z.close()
|
||||
|
||||
server = smtplib.SMTP('localhost')
|
||||
#server.set_debuglevel(1)
|
||||
server.sendmail(fromaddr, args[1:], msg.as_string())
|
||||
server.quit()
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
10
venv/bin/markdown-it
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from markdown_it.cli.parse import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
10
venv/bin/nltk
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from nltk.cli import cli
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli())
|
||||
10
venv/bin/normalizer
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer.cli import cli_detect
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli_detect())
|
||||
10
venv/bin/numpy-config
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/sh
|
||||
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
|
||||
' '''
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from numpy._configtool import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
72
venv/bin/odf2mht
Executable file
@ -0,0 +1,72 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import print_function
|
||||
from odf.odf2xhtml import ODF2XHTML
|
||||
import zipfile
|
||||
import sys
|
||||
#from time import gmtime, strftime
|
||||
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.nonmultipart import MIMENonMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from email import encoders
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
|
||||
sys.exit(1)
|
||||
|
||||
suffices = {
|
||||
'wmf':('image','x-wmf'),
|
||||
'png':('image','png'),
|
||||
'gif':('image','gif'),
|
||||
'jpg':('image','jpeg'),
|
||||
'jpeg':('image','jpeg')
|
||||
}
|
||||
|
||||
msg = MIMEMultipart('related',type="text/html")
|
||||
# msg['Subject'] = 'Subject here'
|
||||
# msg['From'] = '<Saved by ODT2MHT>'
|
||||
# msg['Date'] = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
|
||||
msg.preamble = 'This is a multi-part message in MIME format.'
|
||||
msg.epilogue = ''
|
||||
odhandler = ODF2XHTML()
|
||||
result = odhandler.odf2xhtml(unicode(sys.argv[1]))
|
||||
htmlpart = MIMEText(result,'html','us-ascii')
|
||||
htmlpart['Content-Location'] = 'index.html'
|
||||
msg.attach(htmlpart)
|
||||
z = zipfile.ZipFile(sys.argv[1])
|
||||
for file in z.namelist():
|
||||
if file[0:9] == 'Pictures/':
|
||||
suffix = file[file.rfind(".")+1:]
|
||||
main,sub = suffices.get(suffix,('application','octet-stream'))
|
||||
img = MIMENonMultipart(main,sub)
|
||||
img.set_payload(z.read(file))
|
||||
img['Content-Location'] = "" + file
|
||||
encoders.encode_base64(img)
|
||||
msg.attach(img)
|
||||
z.close()
|
||||
print (msg.as_string())
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
59
venv/bin/odf2xhtml
Executable file
@ -0,0 +1,59 @@
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2007 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from odf.odf2xhtml import ODF2XHTML
import sys, getopt

if sys.version_info[0]==3: unicode=str

from io import StringIO

def usage():
    sys.stderr.write("Usage: %s [-p] inputfile\n" % sys.argv[0])

try:
    opts, args = getopt.getopt(sys.argv[1:], "ep", ["plain","embedable"])
except getopt.GetoptError:
    usage()
    sys.exit(2)

generatecss = True
embedable = False
for o, a in opts:
    if o in ("-p", "--plain"):
        generatecss = False
    if o in ("-e", "--embedable"):
        embedable = True

if len(args) != 1:
    usage()
    sys.exit(2)

odhandler = ODF2XHTML(generatecss, embedable)
try:
    result = odhandler.odf2xhtml(unicode(args[0]))
except:
    sys.stderr.write("Unable to open file %s or file is not OpenDocument\n" % args[0])
    sys.exit(1)
sys.stdout.write(result)


# Local Variables: ***
# mode: python ***
# End: ***
81
venv/bin/odf2xml
Executable file
@ -0,0 +1,81 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2008 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
#
|
||||
|
||||
# OpenDocument can be a complete office document in a single
|
||||
# XML document. This script will create such a document.
|
||||
import sys, getopt, base64
|
||||
from odf.opendocument import load
|
||||
from odf.draw import Image, ObjectOle
|
||||
from odf.style import BackgroundImage
|
||||
from odf.text import ListLevelStyleImage
|
||||
from odf.office import BinaryData
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
def usage():
|
||||
sys.stderr.write("Usage: %s [-e] [-o outputfile] [inputfile]\n" % sys.argv[0])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
embedimage = False
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "o:e", ["output="])
|
||||
except getopt.GetoptError:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
outputfile = '-'
|
||||
|
||||
for o, a in opts:
|
||||
if o in ("-o", "--output"):
|
||||
outputfile = a
|
||||
if o == '-e':
|
||||
embedimage = True
|
||||
|
||||
if len(args) > 1:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
if len(args) == 0:
|
||||
d = load(sys.stdin)
|
||||
else:
|
||||
d = load(unicode(args[0]))
|
||||
if embedimage:
|
||||
images = d.getElementsByType(Image) + \
|
||||
d.getElementsByType(BackgroundImage) + \
|
||||
d.getElementsByType(ObjectOle) + \
|
||||
d.getElementsByType(ListLevelStyleImage)
|
||||
for image in images:
|
||||
href = image.getAttribute('href')
|
||||
if href and href[:9] == "Pictures/":
|
||||
p = d.Pictures[href]
|
||||
bp = base64.encodebytes(p[1])
|
||||
image.addElement(BinaryData(text=bp))
|
||||
image.removeAttribute('href')
|
||||
xml = d.xml()
|
||||
if outputfile == '-':
|
||||
print (xml)
|
||||
else:
|
||||
open(outputfile,"wb").write(xml)
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
190
venv/bin/odfimgimport
Executable file
@ -0,0 +1,190 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2007-2009 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import print_function
|
||||
|
||||
import zipfile, sys, getopt, mimetypes
|
||||
try:
|
||||
from urllib2 import urlopen, quote, unquote
|
||||
except ImportError:
|
||||
from urllib.request import urlopen, quote, unquote
|
||||
try:
|
||||
from urlparse import urlunsplit, urlsplit
|
||||
except ImportError:
|
||||
from urllib.parse import urlunsplit, urlsplit
|
||||
from odf.opendocument import load
|
||||
from odf.draw import Image
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
#sys.tracebacklimit = 0
|
||||
|
||||
# Variable to count the number of retrieval failures
|
||||
failures = 0
|
||||
|
||||
# Set to one if quiet behaviour is wanted
|
||||
quiet = 0
|
||||
|
||||
# If set will write every url to import
|
||||
verbose = 0
|
||||
|
||||
# Dictionary with new pictures. Key is original file path
|
||||
# Item is newfilename
|
||||
newpictures = {}
|
||||
doc = None
|
||||
|
||||
def importpicture(href):
|
||||
""" Add the picture to the ZIP file
|
||||
Returns the new path name to the file in the zip archive
|
||||
If it is unable to import, then it returns the original href
|
||||
Sideeffect: add line to manifest
|
||||
"""
|
||||
global doc, newpictures, failures, verbose
|
||||
|
||||
# Check that it is not already in the manifest
|
||||
if href in doc.Pictures: return href
|
||||
|
||||
image = None
|
||||
if verbose: print ("Importing", href, file=sys.stderr)
|
||||
if href[:7] == "http://" or href[:8] == "https://" or href[:6] == "ftp://":
|
||||
# There is a bug in urlopen: It can't open urls with non-ascii unicode
|
||||
# characters. Convert to UTF-8 and then use percent encoding
|
||||
try:
|
||||
goodhref = href.encode('ascii')
|
||||
except:
|
||||
o = list(urlsplit(href))
|
||||
o[2] = quote(o[2].encode('utf-8'))
|
||||
goodhref = urlunsplit(o)
|
||||
if goodhref in newpictures:
|
||||
if verbose: print ("already imported", file=sys.stderr)
|
||||
return newpictures[goodhref] # Already imported
|
||||
try:
|
||||
f = urlopen(goodhref.decode("utf-8"))
|
||||
image = f.read()
|
||||
headers = f.info()
|
||||
f.close()
|
||||
# Get the mimetype from the headerlines
|
||||
c_t = headers['Content-Type'].split(';')[0].strip()
|
||||
if c_t: mediatype = c_t.split(';')[0].strip()
|
||||
if verbose: print ("OK", file=sys.stderr)
|
||||
except:
|
||||
failures += 1
|
||||
if verbose: print ("failed", file=sys.stderr)
|
||||
return href
|
||||
# Remove query string
|
||||
try: href= href[:href.rindex('?')]
|
||||
except: pass
|
||||
try:
|
||||
lastslash = href[href.rindex('/'):]
|
||||
ext = lastslash[lastslash.rindex('.'):]
|
||||
except: ext = mimetypes.guess_extension(mediatype)
|
||||
# Everything is a simple path.
|
||||
else:
|
||||
goodhref = href
|
||||
if href[:3] == '../':
|
||||
if directory is None:
|
||||
goodhref = unquote(href[3:])
|
||||
else:
|
||||
goodhref = unquote(directory + href[2:])
|
||||
if goodhref in newpictures:
|
||||
if verbose: print ("already imported", file=sys.stderr)
|
||||
return newpictures[goodhref] # Already imported
|
||||
mediatype, encoding = mimetypes.guess_type(goodhref)
|
||||
if mediatype is None:
|
||||
mediatype = ''
|
||||
try: ext = goodhref[goodhref.rindex('.'):]
|
||||
except: ext=''
|
||||
else:
|
||||
ext = mimetypes.guess_extension(mediatype)
|
||||
try:
|
||||
image = open(goodhref, 'rb').read()
|
||||
if verbose: print ("OK", file=sys.stderr)
|
||||
except:
|
||||
failures += 1
|
||||
if verbose: print ("failed", file=sys.stderr)
|
||||
return href
|
||||
# If we have a picture to import, the image variable contains it
|
||||
# and manifestfn, ext and mediatype has a value
|
||||
if image:
|
||||
manifestfn = doc.addPictureFromString(image, unicode(mediatype))
|
||||
newpictures[goodhref] = manifestfn
|
||||
return manifestfn
|
||||
|
||||
if verbose: print ("not imported", file=sys.stderr)
|
||||
return href
|
||||
|
||||
def exitwithusage(exitcode=2):
|
||||
""" Print out usage information and exit """
|
||||
print ("Usage: %s [-q] [-v] [-o output] [inputfile]" % sys.argv[0], file=sys.stderr)
|
||||
print ("\tInputfile must be OpenDocument format", file=sys.stderr)
|
||||
sys.exit(exitcode)
|
||||
|
||||
outputfile = None
|
||||
writefile = True
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "qvo:")
|
||||
except getopt.GetoptError:
|
||||
exitwithusage()
|
||||
|
||||
for o, a in opts:
|
||||
if o == "-o":
|
||||
outputfile = a
|
||||
writefile = True
|
||||
if o == "-q":
|
||||
quiet = 1
|
||||
if o == "-v":
|
||||
verbose = 1
|
||||
|
||||
if len(args) == 0:
|
||||
try:
|
||||
doc = load(sys.stdin)
|
||||
directory = None
|
||||
except:
|
||||
print ("Couldn't open OpenDocument file", file=sys.stderr)
|
||||
exitwithusage()
|
||||
else:
|
||||
fn = unicode(args[0])
|
||||
if not zipfile.is_zipfile(fn):
|
||||
exitwithusage()
|
||||
dirinx = max(fn.rfind('\\'), fn.rfind('/'))
|
||||
if dirinx >= 0: directory = fn[:dirinx]
|
||||
else: directory = "."
|
||||
doc = load(fn)
|
||||
|
||||
for image in doc.getElementsByType(Image):
|
||||
href = image.getAttribute('href')
|
||||
newhref = importpicture(href)
|
||||
image.setAttribute('href',newhref)
|
||||
|
||||
if writefile:
|
||||
if outputfile is None:
|
||||
doc.save(fn)
|
||||
else:
|
||||
doc.save(unicode(outputfile))
|
||||
|
||||
|
||||
if quiet == 0 and failures > 0:
|
||||
print ("Couldn't import %d image(s)" % failures, file=sys.stderr)
|
||||
sys.exit( int(failures > 0) )
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
216
venv/bin/odflint
Executable file
@ -0,0 +1,216 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2009 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
import zipfile
|
||||
from xml.sax import make_parser,handler
|
||||
from xml.sax.xmlreader import InputSource
|
||||
import xml.sax.saxutils
|
||||
import sys
|
||||
from odf.opendocument import OpenDocument
|
||||
from odf import element, grammar
|
||||
from odf.namespaces import *
|
||||
from odf.attrconverters import attrconverters, cnv_string
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
extension_attributes = {
|
||||
"OpenOffice.org" : {
|
||||
(METANS,u'template'): (
|
||||
(XLINKNS,u'role'),
|
||||
),
|
||||
(STYLENS,u'graphic-properties'): (
|
||||
(STYLENS,u'background-transparency'),
|
||||
),
|
||||
(STYLENS,u'paragraph-properties'): (
|
||||
(TEXTNS,u'enable-numbering'),
|
||||
(STYLENS,u'join-border'),
|
||||
),
|
||||
(STYLENS,u'table-cell-properties'): (
|
||||
(STYLENS,u'writing-mode'),
|
||||
),
|
||||
(STYLENS,u'table-row-properties'): (
|
||||
(STYLENS,u'keep-together'),
|
||||
),
|
||||
},
|
||||
"KOffice" : {
|
||||
(STYLENS,u'graphic-properties'): (
|
||||
(KOFFICENS,u'frame-behavior-on-new-page'),
|
||||
),
|
||||
(DRAWNS,u'page'): (
|
||||
(KOFFICENS,u'name'),
|
||||
),
|
||||
(PRESENTATIONNS,u'show-shape'): (
|
||||
(KOFFICENS,u'order-id'),
|
||||
),
|
||||
(PRESENTATIONNS,u'hide-shape'): (
|
||||
(KOFFICENS,u'order-id'),
|
||||
),
|
||||
(CHARTNS,u'legend'): (
|
||||
(KOFFICENS,u'title'),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
printed_errors = []
|
||||
|
||||
def print_error(str):
|
||||
if str not in printed_errors:
|
||||
printed_errors.append(str)
|
||||
print (str)
|
||||
|
||||
def chop_arg(arg):
|
||||
if len(arg) > 20:
|
||||
return "%s..." % arg[0:20]
|
||||
return arg
|
||||
|
||||
def make_qname(tag):
|
||||
return "%s:%s" % (nsdict.get(tag[0],tag[0]), tag[1])
|
||||
|
||||
def allowed_attributes(tag):
|
||||
return grammar.allowed_attributes.get(tag)
|
||||
|
||||
|
||||
class ODFElementHandler(handler.ContentHandler):
|
||||
""" Extract headings from content.xml of an ODT file """
|
||||
def __init__(self, document):
|
||||
self.doc = document
|
||||
self.tagstack = []
|
||||
self.data = []
|
||||
self.currtag = None
|
||||
|
||||
def characters(self, data):
|
||||
self.data.append(data)
|
||||
|
||||
def startElementNS(self, tag, qname, attrs):
|
||||
""" Pseudo-create an element
|
||||
"""
|
||||
allowed_attrs = grammar.allowed_attributes.get(tag)
|
||||
attrdict = {}
|
||||
for (att,value) in attrs.items():
|
||||
prefix = nsdict.get(att[0],att[0])
|
||||
# Check if it is a known extension
|
||||
notan_extension = True
|
||||
for product, ext_attrs in extension_attributes.items():
|
||||
allowed_ext_attrs = ext_attrs.get(tag)
|
||||
if allowed_ext_attrs and att in allowed_ext_attrs:
|
||||
print_error("Warning: Attribute %s in element <%s> is illegal - %s extension" % ( make_qname(att), make_qname(tag), product))
|
||||
notan_extension = False
|
||||
# Check if it is an allowed attribute
|
||||
if notan_extension and allowed_attrs and att not in allowed_attrs:
|
||||
print_error("Error: Attribute %s:%s is not allowed in element <%s>" % ( prefix, att[1], make_qname(tag)))
|
||||
# Check the value
|
||||
try:
|
||||
convert = attrconverters.get(att, cnv_string)
|
||||
convert(att, value, tag)
|
||||
except ValueError as res:
|
||||
print_error("Error: Bad value '%s' for attribute %s:%s in tag: <%s> - %s" %
|
||||
(chop_arg(value), prefix, att[1], make_qname(tag), res))
|
||||
|
||||
self.tagstack.append(tag)
|
||||
self.data = []
|
||||
# Check that the parent allows this child element
|
||||
if tag not in ( (OFFICENS, 'document'), (OFFICENS, 'document-content'), (OFFICENS, 'document-styles'),
|
||||
(OFFICENS, 'document-meta'), (OFFICENS, 'document-settings'),
|
||||
(MANIFESTNS,'manifest')):
|
||||
try:
|
||||
parent = self.tagstack[-2]
|
||||
allowed_children = grammar.allowed_children.get(parent)
|
||||
except:
|
||||
print_error("Error: This document starts with the wrong tag: <%s>" % make_qname(tag))
|
||||
allowed_children = None
|
||||
if allowed_children and tag not in allowed_children:
|
||||
print_error("Error: Element %s is not allowed in element %s" % ( make_qname(tag), make_qname(parent)))
|
||||
# Test that all mandatory attributes have been added.
|
||||
required = grammar.required_attributes.get(tag)
|
||||
if required:
|
||||
for r in required:
|
||||
if attrs.get(r) is None:
|
||||
print_error("Error: Required attribute missing: %s in <%s>" % (make_qname(r), make_qname(tag)))
|
||||
|
||||
|
||||
def endElementNS(self, tag, qname):
|
||||
self.currtag = self.tagstack.pop()
|
||||
str = ''.join(self.data).strip()
|
||||
# Check that only elements that can take text have text
|
||||
# But only elements we know exist in grammar
|
||||
if tag in grammar.allowed_children:
|
||||
if str != '' and tag not in grammar.allows_text:
|
||||
print_error("Error: %s does not allow text data" % make_qname(tag))
|
||||
self.data = []
|
||||
|
||||
class ODFDTDHandler(handler.DTDHandler):
|
||||
def notationDecl(self, name, public_id, system_id):
|
||||
""" Ignore DTDs """
|
||||
print_error("Warning: ODF doesn't use DOCTYPEs")
|
||||
|
||||
def exitwithusage(exitcode=2):
|
||||
""" print out usage information """
|
||||
sys.stderr.write("Usage: %s inputfile\n" % sys.argv[0])
|
||||
sys.stderr.write("\tInputfile must be OpenDocument format\n")
|
||||
sys.exit(exitcode)
|
||||
|
||||
def lint(odffile):
|
||||
if not zipfile.is_zipfile(odffile):
|
||||
print_error("Error: This is not a zipped file")
|
||||
return
|
||||
zfd = zipfile.ZipFile(odffile)
|
||||
try:
|
||||
mimetype = zfd.read('mimetype')
|
||||
except:
|
||||
mimetype=''
|
||||
d = OpenDocument(unicode(mimetype))
|
||||
first = True
|
||||
for zi in zfd.infolist():
|
||||
if first:
|
||||
if zi.filename == 'mimetype':
|
||||
if zi.compress_type != zipfile.ZIP_STORED:
|
||||
print_error("Error: The 'mimetype' member must be stored - not deflated")
|
||||
if zi.comment != "":
|
||||
print_error("Error: The 'mimetype' member must not have extra header info")
|
||||
else:
|
||||
print_error("Warning: The first member in the archive should be the mimetype")
|
||||
first = False
|
||||
if zi.filename in ('META-INF/manifest.xml', 'content.xml', 'meta.xml', 'styles.xml', 'settings.xml'):
|
||||
content = zfd.read(zi.filename)
|
||||
parser = make_parser()
|
||||
parser.setFeature(handler.feature_namespaces, True)
|
||||
parser.setFeature(handler.feature_external_ges, False)
|
||||
parser.setContentHandler(ODFElementHandler(d))
|
||||
dtdh = ODFDTDHandler()
|
||||
parser.setDTDHandler(dtdh)
|
||||
parser.setErrorHandler(handler.ErrorHandler())
|
||||
|
||||
inpsrc = InputSource()
|
||||
if not isinstance(content, str):
|
||||
content=content
|
||||
inpsrc.setByteStream(BytesIO(content))
|
||||
parser.parse(inpsrc)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
exitwithusage()
|
||||
lint(unicode(sys.argv[1]))
|
||||
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
266
venv/bin/odfmeta
Executable file
@ -0,0 +1,266 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006-2009 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
import zipfile, time, sys, getopt, re
|
||||
import xml.sax, xml.sax.saxutils
|
||||
from odf.namespaces import TOOLSVERSION, OFFICENS, XLINKNS, DCNS, METANS
|
||||
from io import BytesIO
|
||||
|
||||
OUTENCODING="utf-8"
|
||||
|
||||
whitespace = re.compile(r'\s+')
|
||||
|
||||
fields = {
|
||||
'title': (DCNS,u'title'),
|
||||
'description': (DCNS,u'description'),
|
||||
'subject': (DCNS,u'subject'),
|
||||
'creator': (DCNS,u'creator'),
|
||||
'date': (DCNS,u'date'),
|
||||
'language': (DCNS,u'language'),
|
||||
'generator': (METANS,u'generator'),
|
||||
'initial-creator': (METANS,u'initial-creator'),
|
||||
'keyword': (METANS,u'keyword'),
|
||||
'editing-duration': (METANS,u'editing-duration'),
|
||||
'editing-cycles': (METANS,u'editing-cycles'),
|
||||
'printed-by': (METANS,u'printed-by'),
|
||||
'print-date': (METANS,u'print-date'),
|
||||
'creation-date': (METANS,u'creation-date'),
|
||||
'user-defined': (METANS,u'user-defined'),
|
||||
#'template': (METANS,u'template'),
|
||||
}
|
||||
|
||||
xfields = []
|
||||
Xfields = []
|
||||
addfields = {}
|
||||
deletefields = {}
|
||||
yieldfields = {}
|
||||
showversion = None
|
||||
|
||||
def exitwithusage(exitcode=2):
|
||||
""" print out usage information """
|
||||
sys.stderr.write("Usage: %s [-cdlvV] [-xXaAI metafield]... [-o output] [inputfile]\n" % sys.argv[0])
|
||||
sys.stderr.write("\tInputfile must be OpenDocument format\n")
|
||||
sys.exit(exitcode)
|
||||
|
||||
def normalize(str):
|
||||
"""
|
||||
The normalize-space function returns the argument string with whitespace
|
||||
normalized by stripping leading and trailing whitespace and replacing
|
||||
sequences of whitespace characters by a single space.
|
||||
"""
|
||||
return whitespace.sub(' ', str).strip()
|
||||
|
||||
class MetaCollector:
|
||||
"""
|
||||
The MetaCollector is a pseudo file object, that can temporarily ignore write-calls
|
||||
It could probably be replaced with a StringIO object.
|
||||
"""
|
||||
def __init__(self):
|
||||
self._content = []
|
||||
self.dowrite = True
|
||||
|
||||
def write(self, str):
|
||||
if self.dowrite:
|
||||
self._content.append(str)
|
||||
|
||||
def content(self):
|
||||
return ''.join(self._content)
|
||||
|
||||
|
||||
base = xml.sax.saxutils.XMLGenerator
|
||||
|
||||
class odfmetaparser(base):
|
||||
""" Parse a meta.xml file with an event-driven parser and replace elements.
|
||||
It would probably be a cleaner approach to use a DOM based parser and
|
||||
then manipulate in memory.
|
||||
Small issue: Reorders elements
|
||||
"""
|
||||
version = 'Unknown'
|
||||
|
||||
def __init__(self):
|
||||
self._mimetype = ''
|
||||
self.output = MetaCollector()
|
||||
self._data = []
|
||||
self.seenfields = {}
|
||||
base.__init__(self, self.output, OUTENCODING)
|
||||
|
||||
def startElementNS(self, name, qname, attrs):
|
||||
self._data = []
|
||||
field = name
|
||||
# I can't modify the template until the tool replaces elements at the same
|
||||
# location and not at the end
|
||||
# if name == (METANS,u'template'):
|
||||
# self._data = [attrs.get((XLINKNS,u'title'),'')]
|
||||
if showversion and name == (OFFICENS,u'document-meta'):
|
||||
if showversion == '-V':
|
||||
print ("version:%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
|
||||
else:
|
||||
print ("%s" % attrs.get((OFFICENS,u'version'),'Unknown').decode('utf-8'))
|
||||
if name == (METANS,u'user-defined'):
|
||||
field = attrs.get((METANS,u'name'))
|
||||
if field in deletefields:
|
||||
self.output.dowrite = False
|
||||
elif field in yieldfields:
|
||||
del addfields[field]
|
||||
base.startElementNS(self, name, qname, attrs)
|
||||
else:
|
||||
base.startElementNS(self, name, qname, attrs)
|
||||
self._tag = field
|
||||
|
||||
def endElementNS(self, name, qname):
|
||||
field = name
|
||||
if name == (METANS,u'user-defined'):
|
||||
field = self._tag
|
||||
if name == (OFFICENS,u'meta'):
|
||||
for k,v in addfields.items():
|
||||
if len(v) > 0:
|
||||
if type(k) == type(''):
|
||||
base.startElementNS(self,(METANS,u'user-defined'),None,{(METANS,u'name'):k})
|
||||
base.characters(self, v)
|
||||
base.endElementNS(self, (METANS,u'user-defined'),None)
|
||||
else:
|
||||
base.startElementNS(self, k, None, {})
|
||||
base.characters(self, v)
|
||||
base.endElementNS(self, k, None)
|
||||
if name in xfields:
|
||||
print ("%s" % self.data())
|
||||
if name in Xfields:
|
||||
if isinstance(self._tag, tuple):
|
||||
texttag = self._tag[1]
|
||||
else:
|
||||
texttag = self._tag
|
||||
print ("%s:%s" % (texttag, self.data()))
|
||||
if field in deletefields:
|
||||
self.output.dowrite = True
|
||||
else:
|
||||
base.endElementNS(self, name, qname)
|
||||
|
||||
def characters(self, content):
|
||||
base.characters(self, content)
|
||||
self._data.append(content)
|
||||
|
||||
def meta(self):
|
||||
return self.output.content()
|
||||
|
||||
def data(self):
|
||||
if usenormalize:
|
||||
return normalize(''.join(self._data))
|
||||
else:
|
||||
return ''.join(self._data)
|
||||
|
||||
now = time.localtime()[:6]
|
||||
outputfile = "-"
|
||||
writemeta = False # Do we change any meta data?
|
||||
usenormalize = False
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "cdlvVI:A:a:o:x:X:")
|
||||
except getopt.GetoptError:
|
||||
exitwithusage()
|
||||
|
||||
if len(opts) == 0:
|
||||
opts = [ ('-l','') ]
|
||||
|
||||
for o, a in opts:
|
||||
if o in ('-a','-A','-I'):
|
||||
writemeta = True
|
||||
if a.find(":") >= 0:
|
||||
k,v = a.split(":",1)
|
||||
else:
|
||||
k,v = (a, "")
|
||||
if len(k) == 0:
|
||||
exitwithusage()
|
||||
k = fields.get(k,k)
|
||||
addfields[k] = unicode(v,'utf-8')
|
||||
if o == '-a':
|
||||
yieldfields[k] = True
|
||||
if o == '-I':
|
||||
deletefields[k] = True
|
||||
if o == '-d':
|
||||
writemeta = True
|
||||
addfields[(DCNS,u'date')] = "%04d-%02d-%02dT%02d:%02d:%02d" % now
|
||||
deletefields[(DCNS,u'date')] = True
|
||||
if o == '-c':
|
||||
usenormalize = True
|
||||
if o in ('-v', '-V'):
|
||||
showversion = o
|
||||
if o == '-l':
|
||||
Xfields = fields.values()
|
||||
if o == "-x":
|
||||
xfields.append(fields.get(a,a))
|
||||
if o == "-X":
|
||||
Xfields.append(fields.get(a,a))
|
||||
if o == "-o":
|
||||
outputfile = a
|
||||
|
||||
# The specification says we should change the element to our own,
|
||||
# and must not export the original identifier.
|
||||
if writemeta:
|
||||
addfields[(METANS,u'generator')] = TOOLSVERSION
|
||||
deletefields[(METANS,u'generator')] = True
|
||||
|
||||
odfs = odfmetaparser()
|
||||
parser = xml.sax.make_parser()
|
||||
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||
parser.setContentHandler(odfs)
|
||||
|
||||
if len(args) == 0:
|
||||
zin = zipfile.ZipFile(sys.stdin,'r')
|
||||
else:
|
||||
if not zipfile.is_zipfile(args[0]):
|
||||
exitwithusage()
|
||||
zin = zipfile.ZipFile(args[0], 'r')
|
||||
|
||||
try:
|
||||
content = zin.read('meta.xml').decode('utf-8')
|
||||
except:
|
||||
sys.stderr.write("File has no meta data\n")
|
||||
sys.exit(1)
|
||||
parser.parse(BytesIO(content.encode('utf-8')))
|
||||
|
||||
if writemeta:
|
||||
if outputfile == '-':
|
||||
if sys.stdout.isatty():
|
||||
sys.stderr.write("Won't write ODF file to terminal\n")
|
||||
sys.exit(1)
|
||||
zout = zipfile.ZipFile(sys.stdout,"w")
|
||||
else:
|
||||
zout = zipfile.ZipFile(outputfile,"w")
|
||||
|
||||
|
||||
|
||||
# Loop through the input zipfile and copy the content to the output until we
|
||||
# get to the meta.xml. Then substitute.
|
||||
for zinfo in zin.infolist():
|
||||
if zinfo.filename == "meta.xml":
|
||||
# Write meta
|
||||
zi = zipfile.ZipInfo("meta.xml", now)
|
||||
zi.compress_type = zipfile.ZIP_DEFLATED
|
||||
zout.writestr(zi,odfs.meta() )
|
||||
else:
|
||||
payload = zin.read(zinfo.filename)
|
||||
zout.writestr(zinfo, payload)
|
||||
|
||||
zout.close()
|
||||
zin.close()
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
144
venv/bin/odfoutline
Executable file
@ -0,0 +1,144 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import print_function
|
||||
import zipfile
|
||||
from xml.sax import make_parser,handler
|
||||
from xml.sax.xmlreader import InputSource
|
||||
import xml.sax.saxutils
|
||||
import sys
|
||||
from odf.namespaces import TEXTNS, TABLENS, DRAWNS
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from io import StringIO
|
||||
|
||||
|
||||
def getxmlpart(odffile, xmlfile):
|
||||
""" Get the content out of the ODT file"""
|
||||
z = zipfile.ZipFile(odffile)
|
||||
content = z.read(xmlfile)
|
||||
z.close()
|
||||
return content
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Extract headings from content.xml
|
||||
#
|
||||
class ODTHeadingHandler(handler.ContentHandler):
|
||||
""" Extract headings from content.xml of an ODT file """
|
||||
def __init__(self, eater):
|
||||
self.r = eater
|
||||
self.data = []
|
||||
self.level = 0
|
||||
|
||||
def characters(self, data):
|
||||
self.data.append(data)
|
||||
|
||||
def startElementNS(self, tag, qname, attrs):
|
||||
if tag == (TEXTNS, 'h'):
|
||||
self.level = 0
|
||||
for (att,value) in attrs.items():
|
||||
if att == (TEXTNS, 'outline-level'):
|
||||
self.level = int(value)
|
||||
self.data = []
|
||||
|
||||
def endElementNS(self, tag, qname):
|
||||
if tag == (TEXTNS, 'h'):
|
||||
str = ''.join(self.data)
|
||||
self.data = []
|
||||
self.r.append("%d%*s%s" % (self.level, self.level, '', str))
|
||||
|
||||
class ODTSheetHandler(handler.ContentHandler):
|
||||
""" Extract sheet names from content.xml of an ODS file """
|
||||
def __init__(self, eater):
|
||||
self.r = eater
|
||||
|
||||
def startElementNS(self, tag, qname, attrs):
|
||||
if tag == (TABLENS, 'table'):
|
||||
sheetname = attrs.get((TABLENS, 'name'))
|
||||
if sheetname:
|
||||
self.r.append(sheetname)
|
||||
|
||||
class ODTSlideHandler(handler.ContentHandler):
|
||||
""" Extract headings from content.xml of an ODT file """
|
||||
def __init__(self, eater):
|
||||
self.r = eater
|
||||
self.data = []
|
||||
self.pagenum = 0
|
||||
|
||||
def characters(self, data):
|
||||
self.data.append(data)
|
||||
|
||||
def startElementNS(self, tag, qname, attrs):
|
||||
if tag == (DRAWNS, 'page'):
|
||||
self.pagenum = self.pagenum + 1
|
||||
self.r.append("SLIDE %d: %s" % ( self.pagenum, attrs.get((DRAWNS, 'name'),'')))
|
||||
if tag == (TEXTNS, 'p'):
|
||||
self.data = []
|
||||
|
||||
def endElementNS(self, tag, qname):
|
||||
if tag == (TEXTNS, 'p'):
|
||||
str = ''.join(self.data)
|
||||
self.data = []
|
||||
if len(str) > 0:
|
||||
self.r.append(" " + str)
|
||||
|
||||
def odtheadings(odtfile):
|
||||
mimetype = getxmlpart(odtfile,'mimetype')
|
||||
content = getxmlpart(odtfile,'content.xml')
|
||||
lines = []
|
||||
parser = make_parser()
|
||||
parser.setFeature(handler.feature_namespaces, 1)
|
||||
if not isinstance(mimetype, str):
|
||||
mimetype=mimetype.decode("utf-8")
|
||||
if mimetype in ('application/vnd.oasis.opendocument.text',
|
||||
'application/vnd.oasis.opendocument.text-template'):
|
||||
parser.setContentHandler(ODTHeadingHandler(lines))
|
||||
elif mimetype in ('application/vnd.oasis.opendocument.spreadsheet',
|
||||
'application/vnd.oasis.opendocument.spreadsheet-template'):
|
||||
parser.setContentHandler(ODTSheetHandler(lines))
|
||||
elif mimetype in ('application/vnd.oasis.opendocument.presentation',
|
||||
'application/vnd.oasis.opendocument.presentation-template'):
|
||||
parser.setContentHandler(ODTSlideHandler(lines))
|
||||
else:
|
||||
print ("Unsupported fileformat")
|
||||
sys.exit(2)
|
||||
parser.setErrorHandler(handler.ErrorHandler())
|
||||
|
||||
inpsrc = InputSource()
|
||||
if not isinstance(content, str):
|
||||
content=content.decode("utf-8")
|
||||
inpsrc.setByteStream(StringIO(content))
|
||||
parser.parse(inpsrc)
|
||||
return lines
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
filler = " "
|
||||
for heading in odtheadings(sys.argv[1]):
|
||||
print (heading)
|
||||
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
101
venv/bin/odfuserfield
Executable file
@ -0,0 +1,101 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006-2007 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s): Michael Howitz, gocept gmbh & co. kg
|
||||
|
||||
import sys
|
||||
import getopt
|
||||
|
||||
import odf.userfield
|
||||
|
||||
if sys.version_info[0]==3: unicode=str
|
||||
|
||||
listfields = False
|
||||
Listfields = False
|
||||
xfields = []
|
||||
Xfields = []
|
||||
setfields = {}
|
||||
outputfile = None
|
||||
inputfile = None
|
||||
|
||||
|
||||
def exitwithusage(exitcode=2):
|
||||
""" print out usage information """
|
||||
sys.stderr.write("Usage: %s [-lL] [-xX metafield] [-s metafield:value]... "
|
||||
"[-o output] [inputfile]\n" % sys.argv[0])
|
||||
sys.stderr.write("\tInputfile must be OpenDocument format\n")
|
||||
sys.exit(exitcode)
|
||||
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "lLs:o:x:X:")
|
||||
except getopt.GetoptError:
|
||||
exitwithusage()
|
||||
|
||||
if len(opts) == 0:
|
||||
exitwithusage()
|
||||
|
||||
for o, a in opts:
|
||||
if o == '-s':
|
||||
if a.find(":") >= 0:
|
||||
k,v = a.split(":",1)
|
||||
else:
|
||||
k,v = (a, "")
|
||||
if len(k) == 0:
|
||||
exitwithusage()
|
||||
setfields[unicode(k)] = unicode(v)
|
||||
if o == '-l':
|
||||
listfields = True
|
||||
Listfields = False
|
||||
if o == '-L':
|
||||
Listfields = True
|
||||
listfields = False
|
||||
if o == "-x":
|
||||
xfields.append(unicode(a))
|
||||
if o == "-X":
|
||||
Xfields.append(unicode(a))
|
||||
if o == "-o":
|
||||
outputfile = unicode(a)
|
||||
|
||||
if len(args) != 0:
|
||||
inputfile = unicode(args[0])
|
||||
|
||||
user_fields = odf.userfield.UserFields(inputfile, outputfile)
|
||||
|
||||
if xfields:
|
||||
for value in user_fields.list_values(xfields):
|
||||
print (value)
|
||||
|
||||
if Listfields or Xfields:
|
||||
if Listfields:
|
||||
Xfields = None
|
||||
for field_name, value_type, value in user_fields.list_fields_and_values(
|
||||
Xfields):
|
||||
print ("%s#%s:%s" % (field_name, value_type, value))
|
||||
|
||||
if listfields:
|
||||
for value in user_fields.list_fields():
|
||||
print (value)
|
||||
|
||||
if setfields:
|
||||
user_fields.update(setfields)
|
||||
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
10
venv/bin/pip
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/pip3
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/pip3.10
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/pygmentize
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from pygments.cmdline import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
1
venv/bin/python
Symbolic link
@ -0,0 +1 @@
python3
1
venv/bin/python3
Symbolic link
@ -0,0 +1 @@
/usr/bin/python3
1
venv/bin/python3.10
Symbolic link
@ -0,0 +1 @@
python3
10
venv/bin/rdf2dot
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdf2dot import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/rdfgraphisomorphism
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.graphisomorphism import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/rdfpipe
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfpipe import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/rdfs2dot
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from rdflib.tools.rdfs2dot import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/rqw
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from SPARQLWrapper.main import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/spacy
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from spacy.cli import setup_cli
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(setup_cli())
10
venv/bin/tqdm
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from tqdm.cli import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/typer
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from typer.cli import main
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(main())
10
venv/bin/weasel
Executable file
@ -0,0 +1,10 @@
#!/bin/sh
'''exec' "/home/jarnold/projects/GND-Skript Test/venv/bin/python3" "$0" "$@"
' '''
# -*- coding: utf-8 -*-
import re
import sys
from weasel.cli import app
if __name__ == '__main__':
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(app())
241
venv/bin/xml2odf
Executable file
@ -0,0 +1,241 @@
|
||||
#!/home/jarnold/projects/GND-Skript Test/venv/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
#
|
||||
|
||||
# OpenDocument can be a complete office document in a single
|
||||
# XML document. This script will take such a document and create
|
||||
# a package
|
||||
import io
|
||||
import zipfile,time, sys, getopt
|
||||
import xml.sax, xml.sax.saxutils
|
||||
from odf import manifest
|
||||
|
||||
class SplitWriter:
|
||||
def __init__(self):
|
||||
self.activefiles = []
|
||||
self._content = []
|
||||
self._meta = []
|
||||
self._styles = []
|
||||
self._settings = []
|
||||
|
||||
self.files = {'content': self._content, 'meta': self._meta,
|
||||
'styles':self._styles, 'settings': self._settings }
|
||||
|
||||
def write(self, str):
|
||||
for f in self.activefiles:
|
||||
f.append(str)
|
||||
|
||||
def activate(self, filename):
|
||||
file = self.files[filename]
|
||||
if file not in self.activefiles:
|
||||
self.activefiles.append(file)
|
||||
|
||||
def deactivate(self, filename):
|
||||
file = self.files[filename]
|
||||
if file in self.activefiles:
|
||||
self.activefiles.remove(file)
|
||||
|
||||
odmimetypes = {
|
||||
'application/vnd.oasis.opendocument.text': '.odt',
|
||||
'application/vnd.oasis.opendocument.text-template': '.ott',
|
||||
'application/vnd.oasis.opendocument.graphics': '.odg',
|
||||
'application/vnd.oasis.opendocument.graphics-template': '.otg',
|
||||
'application/vnd.oasis.opendocument.presentation': '.odp',
|
||||
'application/vnd.oasis.opendocument.presentation-template': '.otp',
|
||||
'application/vnd.oasis.opendocument.spreadsheet': '.ods',
|
||||
'application/vnd.oasis.opendocument.spreadsheet-template': '.ots',
|
||||
'application/vnd.oasis.opendocument.chart': '.odc',
|
||||
'application/vnd.oasis.opendocument.chart-template': '.otc',
|
||||
'application/vnd.oasis.opendocument.image': '.odi',
|
||||
'application/vnd.oasis.opendocument.image-template': '.oti',
|
||||
'application/vnd.oasis.opendocument.formula': '.odf',
|
||||
'application/vnd.oasis.opendocument.formula-template': '.otf',
|
||||
'application/vnd.oasis.opendocument.text-master': '.odm',
|
||||
'application/vnd.oasis.opendocument.text-web': '.oth',
|
||||
}
|
||||
|
||||
OFFICENS = u"urn:oasis:names:tc:opendocument:xmlns:office:1.0"
|
||||
base = xml.sax.saxutils.XMLGenerator
|
||||
|
||||
class odfsplitter(base):
|
||||
|
||||
def __init__(self):
|
||||
self._mimetype = ''
|
||||
self.output = SplitWriter()
|
||||
self._prefixes = []
|
||||
base.__init__(self, self.output, 'utf-8')
|
||||
|
||||
def startPrefixMapping(self, prefix, uri):
|
||||
base.startPrefixMapping(self, prefix, uri)
|
||||
self._prefixes.append('xmlns:%s="%s"' % (prefix, uri))
|
||||
|
||||
def startElementNS(self, name, qname, attrs):
|
||||
if name == (OFFICENS, u"document"):
|
||||
self._mimetype = attrs.get((OFFICENS, "mimetype"))
|
||||
elif name == (OFFICENS, u"meta"):
|
||||
self.output.activate('meta')
|
||||
|
||||
elif name == (OFFICENS, u"settings"):
|
||||
self.output.activate('settings')
|
||||
elif name == (OFFICENS, u"scripts"):
|
||||
self.output.activate('content')
|
||||
elif name == (OFFICENS, u"font-face-decls"):
|
||||
self.output.activate('content')
|
||||
self.output.activate('styles')
|
||||
elif name == (OFFICENS, u"styles"):
|
||||
self.output.activate('styles')
|
||||
elif name == (OFFICENS, u"automatic-styles"):
|
||||
self.output.activate('content')
|
||||
self.output.activate('styles')
|
||||
elif name == (OFFICENS, u"master-styles"):
|
||||
self.output.activate('styles')
|
||||
elif name == (OFFICENS, u"body"):
|
||||
self.output.activate('content')
|
||||
base.startElementNS(self, name, qname, attrs)
|
||||
|
||||
def endElementNS(self, name, qname):
|
||||
base.endElementNS(self, name, qname)
|
||||
if name == (OFFICENS, u"meta"):
|
||||
self.output.deactivate('meta')
|
||||
elif name == (OFFICENS, u"settings"):
|
||||
self.output.deactivate('settings')
|
||||
elif name == (OFFICENS, u"scripts"):
|
||||
self.output.deactivate('content')
|
||||
elif name == (OFFICENS, u"font-face-decls"):
|
||||
self.output.deactivate('content')
|
||||
self.output.deactivate('styles')
|
||||
elif name == (OFFICENS, u"styles"):
|
||||
self.output.deactivate('styles')
|
||||
elif name == (OFFICENS, u"automatic-styles"):
|
||||
self.output.deactivate('content')
|
||||
self.output.deactivate('styles')
|
||||
elif name == (OFFICENS, u"master-styles"):
|
||||
self.output.deactivate('styles')
|
||||
elif name == (OFFICENS, u"body"):
|
||||
self.output.deactivate('content')
|
||||
|
||||
|
||||
def content(self):
|
||||
""" Return the content inside a wrapper called <office:document-content>
|
||||
"""
|
||||
prefixes = ' '.join(self._prefixes)
|
||||
return ''.join(['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-content %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._content)) + ['</office:document-content>'])
|
||||
|
||||
def settings(self):
|
||||
prefixes = ' '.join(self._prefixes).encode('utf-8')
|
||||
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-settings %s office:version="1.0">' % prefixes] + self.output._settings + ['''</office:document-settings>'''])
|
||||
|
||||
def styles(self):
|
||||
prefixes = ' '.join(self._prefixes)
|
||||
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-styles %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._styles)) + ['''</office:document-styles>'''])
|
||||
|
||||
def meta(self):
|
||||
prefixes = ' '.join(self._prefixes)
|
||||
return ''.join( ['<?xml version="1.0" encoding="UTF-8"?>\n<office:document-meta %s office:version="1.0">' % prefixes] + list(map(lambda x: x.decode("utf-8"), self.output._meta)) + ['''</office:document-meta>'''])
|
||||
|
||||
def usage():
|
||||
sys.stderr.write("Usage: %s [-o outputfile] [-s] inputfile\n" % sys.argv[0])
|
||||
|
||||
def manifestxml(m):
|
||||
""" Generates the content of the manifest.xml file """
|
||||
xml=io.StringIO()
|
||||
xml.write(u"<?xml version='1.0' encoding='UTF-8'?>\n")
|
||||
m.toXml(0,xml)
|
||||
return xml.getvalue()
|
||||
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "o:s", ["output=","suffix"])
|
||||
except getopt.GetoptError:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
outputfile = '-'
|
||||
addsuffix = False
|
||||
|
||||
for o, a in opts:
|
||||
if o in ("-o", "--output"):
|
||||
outputfile = a
|
||||
if o in ("-s", "--suffix"):
|
||||
addsuffix = True
|
||||
|
||||
if len(args) > 1:
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
odfs = odfsplitter()
|
||||
parser = xml.sax.make_parser()
|
||||
parser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||
parser.setContentHandler(odfs)
|
||||
if len(args) == 0:
|
||||
parser.parse(sys.stdin)
|
||||
else:
|
||||
parser.parse(open(args[0],"r"))
|
||||
|
||||
mimetype = odfs._mimetype
|
||||
suffix = odmimetypes.get(mimetype,'.xxx')
|
||||
|
||||
if outputfile == '-':
|
||||
if sys.stdout.isatty():
|
||||
sys.stderr.write("Won't write ODF file to terminal\n")
|
||||
sys.exit(1)
|
||||
z = zipfile.ZipFile(sys.stdout,"w")
|
||||
else:
|
||||
if addsuffix:
|
||||
outputfile = outputfile + suffix
|
||||
z = zipfile.ZipFile(outputfile,"w")
|
||||
|
||||
now = time.localtime()[:6]
|
||||
|
||||
# Write mimetype
|
||||
zi = zipfile.ZipInfo('mimetype', now)
|
||||
zi.compress_type = zipfile.ZIP_STORED
|
||||
z.writestr(zi,mimetype)
|
||||
|
||||
# Write content
|
||||
zi = zipfile.ZipInfo("content.xml", now)
|
||||
zi.compress_type = zipfile.ZIP_DEFLATED
|
||||
z.writestr(zi,odfs.content() )
|
||||
# Write styles
|
||||
zi = zipfile.ZipInfo("styles.xml", now)
|
||||
zi.compress_type = zipfile.ZIP_DEFLATED
|
||||
z.writestr(zi,odfs.styles() )
|
||||
|
||||
# Write meta
|
||||
zi = zipfile.ZipInfo("meta.xml", now)
|
||||
zi.compress_type = zipfile.ZIP_DEFLATED
|
||||
z.writestr(zi,odfs.meta() )
|
||||
|
||||
m = manifest.Manifest()
|
||||
m.addElement(manifest.FileEntry(fullpath="/", mediatype=mimetype))
|
||||
m.addElement(manifest.FileEntry(fullpath="content.xml",mediatype="text/xml"))
|
||||
m.addElement(manifest.FileEntry(fullpath="styles.xml", mediatype="text/xml"))
|
||||
m.addElement(manifest.FileEntry(fullpath="meta.xml", mediatype="text/xml"))
|
||||
|
||||
# Write manifest
|
||||
zi = zipfile.ZipInfo("META-INF/manifest.xml", now)
|
||||
zi.compress_type = zipfile.ZIP_DEFLATED
|
||||
z.writestr(zi, manifestxml(m).encode("utf-8") )
|
||||
z.close()
|
||||
|
||||
|
||||
|
||||
# Local Variables: ***
|
||||
# mode: python ***
|
||||
# End: ***
|
||||
@ -0,0 +1 @@
pip
@ -0,0 +1,28 @@
Copyright 2010 Pallets

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1.  Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

2.  Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

3.  Neither the name of the copyright holder nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@ -0,0 +1,92 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: MarkupSafe
|
||||
Version: 3.0.2
|
||||
Summary: Safely add untrusted strings to HTML/XML markup.
|
||||
Maintainer-email: Pallets <contact@palletsprojects.com>
|
||||
License: Copyright 2010 Pallets
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
Project-URL: Donate, https://palletsprojects.com/donate
|
||||
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
|
||||
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
|
||||
Project-URL: Source, https://github.com/pallets/markupsafe/
|
||||
Project-URL: Chat, https://discord.gg/pallets
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Environment :: Web Environment
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: BSD License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
|
||||
Classifier: Topic :: Text Processing :: Markup :: HTML
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.9
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE.txt
|
||||
|
||||
# MarkupSafe

MarkupSafe implements a text object that escapes characters so it is
safe to use in HTML and XML. Characters that have special meanings are
replaced so that they display as the actual characters. This mitigates
injection attacks, meaning untrusted user input can safely be displayed
on a page.


## Examples

```pycon
>>> from markupsafe import Markup, escape

>>> # escape replaces special characters and wraps in Markup
>>> escape("<script>alert(document.cookie);</script>")
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')

>>> # wrap in Markup to mark text "safe" and prevent escaping
>>> Markup("<strong>Hello</strong>")
Markup('<strong>Hello</strong>')

>>> escape(Markup("<strong>Hello</strong>"))
Markup('<strong>Hello</strong>')

>>> # Markup is a str subclass
>>> # methods and operators escape their arguments
>>> template = Markup("Hello <em>{name}</em>")
>>> template.format(name='"World"')
Markup('Hello <em>&#34;World&#34;</em>')
```
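
One more hedged illustration (an addition here, not taken from the package text): the `%` operator, like `format`, escapes its argument before insertion.

```pycon
>>> Markup("<em>%s</em>") % '"World"'
Markup('<em>&#34;World&#34;</em>')
```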

## Donate

The Pallets organization develops and supports MarkupSafe and other
popular packages. In order to grow the community of contributors and
users, and allow the maintainers to devote more time to the projects,
[please donate today][].

[please donate today]: https://palletsprojects.com/donate
@ -0,0 +1,14 @@
|
||||
MarkupSafe-3.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
MarkupSafe-3.0.2.dist-info/LICENSE.txt,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
|
||||
MarkupSafe-3.0.2.dist-info/METADATA,sha256=aAwbZhSmXdfFuMM-rEHpeiHRkBOGESyVLJIuwzHP-nw,3975
|
||||
MarkupSafe-3.0.2.dist-info/RECORD,,
|
||||
MarkupSafe-3.0.2.dist-info/WHEEL,sha256=_kVlewavvOSnwZE_whBk3jlE_Ob-nL5GvlVcLkpXSD8,151
|
||||
MarkupSafe-3.0.2.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
|
||||
markupsafe/__init__.py,sha256=sr-U6_27DfaSrj5jnHYxWN-pvhM27sjlDplMDPZKm7k,13214
|
||||
markupsafe/__pycache__/__init__.cpython-310.pyc,,
|
||||
markupsafe/__pycache__/_native.cpython-310.pyc,,
|
||||
markupsafe/_native.py,sha256=hSLs8Jmz5aqayuengJJ3kdT5PwNpBWpKrmQSdipndC8,210
|
||||
markupsafe/_speedups.c,sha256=O7XulmTo-epI6n2FtMVOrJXl8EAaIwD2iNYmBI5SEoQ,4149
|
||||
markupsafe/_speedups.cpython-310-x86_64-linux-gnu.so,sha256=x4RoxWgyqAEokk-AZrWvrLDxLE-dm-zZSZYV_gOiLJA,34976
|
||||
markupsafe/_speedups.pyi,sha256=ENd1bYe7gbBUf2ywyYWOGUpnXOHNJ-cgTNqetlW8h5k,41
|
||||
markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
@ -0,0 +1,6 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (75.2.0)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp310-cp310-manylinux_2_17_x86_64
|
||||
Tag: cp310-cp310-manylinux2014_x86_64
|
||||
|
||||
@ -0,0 +1 @@
|
||||
markupsafe
|
||||
@ -0,0 +1,37 @@
|
||||
# Authors

* Ivan Herman ([@iherman](http://github.com/iherman))
* Sergio Fernández ([@wikier](http://github.com/wikier))
* Carlos Tejo ([@dayures](http://github.com/dayures))
* Alexey Zakhlestin ([@indeyets](http://github.com/indeyets))

# Contributors

See https://github.com/RDFLib/sparqlwrapper/graphs/contributors

* [@eggplants](https://github.com/eggplants): most things to make 2.0.0 happen
* Obey Arthur Liu ([@ArthurLiu](http://github.com/ArthurLiu)): different patches
* Christopher Lenz ([@cmlenz](http://github.com/cmlenz)): feature to allow developers to choose the json module
* Pēteris Caune ([@cuu508](http://github.com/cuu508)): great feedback and patches
* Bogdan Benea ([bugdone@users.sourceforge.net](mailto:bugdone@users.sourceforge.net)): patch for the query regular expression
* William Waites ([@wwaites](http://github.com/wwaites)): patches for RDFLib3
* Christoph Burgmer ([@cburgmer](http://github.com/cburgmer)): patches for RDFLib3
* Thomas Kluyver ([@takluyver](http://github.com/takluyver)): patches for Python 3.x
* Diego Berrueta ([@berrueta](http://github.com/berrueta)): new function for printing results as table
* Olivier Berger ([@olberger](http://github.com/olberger)): patch regarding raw response for unknown formats
* Benjamin Cogrel ([@bcogrel](http://github.com/bcogrel)): standard query types
* Urs Holzer ([@uholzer](http://github.com/uholzer)): features, patches and testing
* Alf Lervåg ([@alf](http://github.com/alf)): setup patch
* Nolan Nichols ([@nicholsn](http://github.com/nicholsn)): HTTP digest auth support
* Kevin Turner ([@keturn](https://github.com/keturn)): `SmartWrapper.Value.__repr__()` implementation
* Marcelo Jorge Vieira ([@marcelometal](https://github.com/marcelometal)): typos
* Trevor Andersen ([@trevorandersen](https://github.com/trevorandersen)): patches for Python 3.x
* Carlos Martinez-Ortiz ([@cmartinez](https://github.com/cmartinez)): improved support for return format HTTP parameter
* Christian Amsüss ([@chrysn](https://github.com/chrysn)): dependency fixes
* Chris Lamb ([@lamby](https://github.com/lamby)): typo
* Hugo van Kemenade ([@hugovk](https://github.com/hugovk)): update classifiers (Python 3.6)
* Edward Betts ([@EdwardBetts](https://github.com/EdwardBetts)): correct spelling mistakes
* Carlos Martínez ([@c-martinez](https://github.com/c-martinez)): mainly support for CSV and TSV results in SPARQL SELECT queries
* Dan Michael O. Heggø ([@danmichaelo](https://github.com/danmichaelo)): update README with SPARQLWrapper2 example
* Sam Clements ([@borntyping](https://github.com/borntyping)): provide hints about properly setting the timeout
* Marc Feger ([@MaFeg100](https://github.com/MaFeg100)): improvements/tests for development
@ -0,0 +1 @@
|
||||
pip
|
||||
@ -0,0 +1,18 @@
|
||||
SPARQL Python Wrapper is released under the W3C® SOFTWARE NOTICE AND LICENSE.
|
||||
|
||||
This work (and included software, documentation such as READMEs, or other related items) is being provided by the copyright holders under the following license. By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
|
||||
|
||||
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
|
||||
|
||||
1. The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
|
||||
2. Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
|
||||
3. Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
|
||||
|
||||
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
|
||||
|
||||
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
|
||||
|
||||
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
|
||||
|
||||
See also http://www.w3.org/Consortium/Legal/copyright-software for further details
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
Metadata-Version: 2.1
|
||||
Name: SPARQLWrapper
|
||||
Version: 2.0.0
|
||||
Summary: SPARQL Endpoint interface to Python
|
||||
Home-page: http://rdflib.github.io/sparqlwrapper
|
||||
Download-URL: https://github.com/RDFLib/sparqlwrapper/releases
|
||||
Author: Ivan Herman, Sergio Fernández, Carlos Tejo Alonso, Alexey Zakhlestin
|
||||
Author-email: rdflib-dev@googlegroups.com
|
||||
License: W3C SOFTWARE NOTICE AND LICENSE
|
||||
Project-URL: Home, https://rdflib.github.io/sparqlwrapper
|
||||
Project-URL: Documentation, https://sparqlwrapper.readthedocs.io
|
||||
Project-URL: Source, https://github.com/RDFLib/sparqlwrapper
|
||||
Project-URL: Tracker, https://github.com/RDFLib/sparqlwrapper/issues
|
||||
Keywords: python,sparql,rdf,rdflib
|
||||
Platform: any
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: W3C License
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Requires-Python: >=3.7
|
||||
License-File: LICENSE.txt
|
||||
License-File: AUTHORS.md
|
||||
Requires-Dist: rdflib (>=6.1.1)
|
||||
Provides-Extra: dev
|
||||
Requires-Dist: setuptools (>=3.7.1) ; extra == 'dev'
|
||||
Requires-Dist: mypy (>=0.931) ; extra == 'dev'
|
||||
Requires-Dist: pandas (>=1.3.5) ; extra == 'dev'
|
||||
Requires-Dist: pandas-stubs (>=1.2.0.48) ; extra == 'dev'
|
||||
Provides-Extra: docs
|
||||
Requires-Dist: sphinx (<5) ; extra == 'docs'
|
||||
Requires-Dist: sphinx-rtd-theme ; extra == 'docs'
|
||||
Provides-Extra: keepalive
|
||||
Requires-Dist: keepalive (>=0.5) ; extra == 'keepalive'
|
||||
Provides-Extra: pandas
|
||||
Requires-Dist: pandas (>=1.3.5) ; extra == 'pandas'
|
||||
|
||||
This is a wrapper around a SPARQL service. It helps in creating the query URI and, possibly, converting the result into a more manageable format.
|
||||
|
||||
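
As a hedged sketch of the one-line description above (the endpoint and query are illustrative examples, not part of this metadata), a typical call builds the query, requests JSON, and converts the response into a Python dict:

```python
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://dbpedia.org/sparql")   # example endpoint
sparql.setQuery("SELECT ?s WHERE { ?s ?p ?o } LIMIT 3")
sparql.setReturnFormat(JSON)                          # ask the endpoint for JSON results

results = sparql.query().convert()                    # dict parsed from the JSON response
for binding in results["results"]["bindings"]:
    print(binding["s"]["value"])
```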
@ -0,0 +1,25 @@
|
||||
../../../bin/rqw,sha256=qf6Nvwhjovp_uPIPeeMNocB3j7iZ_YnskuMQcUK6DYY,291
|
||||
SPARQLWrapper-2.0.0.dist-info/AUTHORS.md,sha256=7oV4hamlTbjfsaWy15f3BVH2h90Nf5mJ-rR0Z1azy9s,2725
|
||||
SPARQLWrapper-2.0.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
SPARQLWrapper-2.0.0.dist-info/LICENSE.txt,sha256=Z1IX12CEodcefDAOAMJ7irELJAX-huUCOiuzio5G8Ik,2134
|
||||
SPARQLWrapper-2.0.0.dist-info/METADATA,sha256=kU92L4KNVjo9aP6-jm4FXVAUpNScd5mIWWbIGHu_D_I,2020
|
||||
SPARQLWrapper-2.0.0.dist-info/RECORD,,
|
||||
SPARQLWrapper-2.0.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
SPARQLWrapper-2.0.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
||||
SPARQLWrapper-2.0.0.dist-info/entry_points.txt,sha256=aIYAzonEA7winfiw8NydOLNu406HC6aRBlKLI2H5kEQ,48
|
||||
SPARQLWrapper-2.0.0.dist-info/top_level.txt,sha256=3KluNiTwOkX16hLJwC3UEYzKdEscknK--UV5q56mYWY,14
|
||||
SPARQLWrapper/KeyCaseInsensitiveDict.py,sha256=JF83-6EPbcm9F4gg0GQ11vTVuLzdJ7sDsubEP9j-3zw,1377
|
||||
SPARQLWrapper/SPARQLExceptions.py,sha256=qFlU175hp61gO6bvgQsCdSTEGOFnJwJNBQlIGS5W7-o,2595
|
||||
SPARQLWrapper/SmartWrapper.py,sha256=GxZiMGZpGppPZX54W-YdUtcdAAa83GJjPLdyfLWPK-4,15557
|
||||
SPARQLWrapper/Wrapper.py,sha256=M9lTPkpvRU2xAUbrHiKYK0mEV8pkycNS3lPoO__0gSE,58238
|
||||
SPARQLWrapper/__init__.py,sha256=6kU9hD9FnlFbk2c8uFkpGb1arB3268nN74RUh91e60s,1213
|
||||
SPARQLWrapper/__pycache__/KeyCaseInsensitiveDict.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/SPARQLExceptions.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/SmartWrapper.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/Wrapper.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/__init__.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/main.cpython-310.pyc,,
|
||||
SPARQLWrapper/__pycache__/sparql_dataframe.cpython-310.pyc,,
|
||||
SPARQLWrapper/main.py,sha256=MKNPMrFxIGN_A7-UwyMS_AycjswscgKsP37h2K2df8k,4330
|
||||
SPARQLWrapper/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
SPARQLWrapper/sparql_dataframe.py,sha256=-oM7_eXbwGgeNkFv9mSxe3JWHM3xQQk90nNrbhthnrI,2429
|
||||
@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: bdist_wheel (0.37.1)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
rqw = SPARQLWrapper.main:main
|
||||
@ -0,0 +1 @@
|
||||
SPARQLWrapper
|
||||
@ -0,0 +1,46 @@
|
||||
# -*- coding: utf-8 -*-

"""
A simple implementation of a key case-insensitive dictionary.
..
Developers involved:
* Ivan Herman <http://www.ivan-herman.net>
* Sergio Fernández <http://www.wikier.org>
* Carlos Tejo Alonso <http://www.dayures.net>
* Alexey Zakhlestin <https://indeyets.ru/>
Organizations involved:
* `World Wide Web Consortium <http://www.w3.org>`_
* `Foundation CTIC <http://www.fundacionctic.org/>`_
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
"""

from typing import Dict, Mapping, TypeVar

_V = TypeVar("_V")

class KeyCaseInsensitiveDict(Dict[str, _V]):
    """
    A simple implementation of a key case-insensitive dictionary
    """

    def __init__(self, d: Mapping[str, _V] = {}) -> None:
        """
        :param dict d: The source dictionary.
        """
        for k, v in d.items():
            self[k] = v

    def __setitem__(self, key: str, value: _V) -> None:
        if hasattr(key, "lower"):
            key = key.lower()
        dict.__setitem__(self, key, value)

    def __getitem__(self, key: str) -> _V:
        if hasattr(key, "lower"):
            key = key.lower()
        return dict.__getitem__(self, key)

    def __delitem__(self, key: str) -> None:
        if hasattr(key, "lower"):
            key = key.lower()
        dict.__delitem__(self, key)
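
A hedged usage sketch for the class above (the header name and value are arbitrary examples): every read, write, and delete lowercases the key first, so item access is case-insensitive.

```python
from SPARQLWrapper.KeyCaseInsensitiveDict import KeyCaseInsensitiveDict

headers = KeyCaseInsensitiveDict({"Content-Type": "application/sparql-results+json"})
print(headers["content-type"])   # found: __getitem__ lowercases the key
print(headers["CONTENT-TYPE"])   # same entry, regardless of the caller's casing
del headers["Content-type"]      # __delitem__ lowercases the key as well
```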
@ -0,0 +1,94 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
SPARQL Wrapper exceptions
|
||||
|
||||
..
|
||||
Developers involved:
|
||||
|
||||
* Ivan Herman <http://www.ivan-herman.net>
|
||||
* Sergio Fernández <http://www.wikier.org>
|
||||
* Carlos Tejo Alonso <http://www.dayures.net>
|
||||
* Alexey Zakhlestin <https://indeyets.ru/>
|
||||
|
||||
Organizations involved:
|
||||
|
||||
* `World Wide Web Consortium <http://www.w3.org>`_
|
||||
* `Foundation CTIC <http://www.fundacionctic.org/>`_
|
||||
|
||||
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
|
||||
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class SPARQLWrapperException(Exception):
|
||||
"""
|
||||
Base class for SPARQL Wrapper exceptions
|
||||
"""
|
||||
|
||||
msg = "an exception has occurred"
|
||||
|
||||
def __init__(self, response: Optional[bytes] = None):
|
||||
"""
|
||||
:param bytes response: The server response
|
||||
"""
|
||||
if response:
|
||||
formatted_msg = "%s: %s. \n\nResponse:\n%r" % (
|
||||
self.__class__.__name__,
|
||||
self.msg,
|
||||
response,
|
||||
)
|
||||
else:
|
||||
formatted_msg = "%s: %s." % (self.__class__.__name__, self.msg)
|
||||
|
||||
super(SPARQLWrapperException, self).__init__(formatted_msg)
|
||||
|
||||
|
||||
class EndPointInternalError(SPARQLWrapperException):
|
||||
"""
|
||||
Exception type for Internal Server Error responses. Usually HTTP response status code ``500``.
|
||||
"""
|
||||
|
||||
msg = "The endpoint returned the HTTP status code 500"
|
||||
|
||||
|
||||
class QueryBadFormed(SPARQLWrapperException):
|
||||
"""
|
||||
Query Bad Formed exception. Usually HTTP response status code ``400``.
|
||||
"""
|
||||
|
||||
msg = "A bad request has been sent to the endpoint: probably the SPARQL query is badly formed"
|
||||
|
||||
|
||||
class EndPointNotFound(SPARQLWrapperException):
|
||||
"""
|
||||
End Point Not Found exception. Usually HTTP response status code ``404``.
|
||||
"""
|
||||
|
||||
msg = "It was not possible to connect to the given endpoint: check it is correct"
|
||||
|
||||
|
||||
class Unauthorized(SPARQLWrapperException):
|
||||
"""
|
||||
Access is denied due to invalid credentials (unauthorized). Usually HTTP response status code ``401``.
|
||||
|
||||
.. versionadded:: 1.8.2
|
||||
"""
|
||||
|
||||
msg = "Access to that endpoint is denied due to invalid credentials (unauthorized). Check the credentials"
|
||||
|
||||
|
||||
class URITooLong(SPARQLWrapperException):
|
||||
"""
|
||||
The URI requested by the client is longer than the server is willing to interpret. Usually HTTP response
|
||||
status code ``414``.
|
||||
|
||||
.. versionadded:: 1.8.3
|
||||
"""
|
||||
|
||||
msg = (
|
||||
"The URI requested by the client is longer than the server is willing to interpret. "
|
||||
"Check if the request was sent using GET method instead of POST method."
|
||||
)
|
||||
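
A hedged sketch of how these exception classes are typically caught around a query (the endpoint and query are examples; the raising itself happens in `Wrapper.py`, whose diff is suppressed further below):

```python
from SPARQLWrapper import SPARQLWrapper, JSON
from SPARQLWrapper.SPARQLExceptions import EndPointNotFound, QueryBadFormed

sparql = SPARQLWrapper("http://dbpedia.org/sparql")   # example endpoint
sparql.setQuery("SELECT ?s WHERE { ?s ?p ?o } LIMIT 1")
sparql.setReturnFormat(JSON)

try:
    results = sparql.query().convert()
except QueryBadFormed as err:       # HTTP 400: the query text is invalid
    print(err)
except EndPointNotFound as err:     # HTTP 404: wrong endpoint URL
    print(err)
```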
366
venv/lib/python3.10/site-packages/SPARQLWrapper/SmartWrapper.py
Normal file
366
venv/lib/python3.10/site-packages/SPARQLWrapper/SmartWrapper.py
Normal file
@ -0,0 +1,366 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
|
||||
..
|
||||
Developers involved:
|
||||
|
||||
* Ivan Herman <http://www.ivan-herman.net>
|
||||
* Sergio Fernández <http://www.wikier.org>
|
||||
* Carlos Tejo Alonso <http://www.dayures.net>
|
||||
* Alexey Zakhlestin <https://indeyets.ru/>
|
||||
|
||||
Organizations involved:
|
||||
|
||||
* `World Wide Web Consortium <http://www.w3.org>`_
|
||||
* `Foundation CTIC <http://www.fundacionctic.org/>`_
|
||||
|
||||
:license: `W3C® Software notice and license <http://www.w3.org/Consortium/Legal/copyright-software>`_
|
||||
:requires: `RDFLib <https://rdflib.readthedocs.io>`_ package.
|
||||
"""
|
||||
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from SPARQLWrapper.Wrapper import JSON, SELECT, QueryResult
|
||||
from SPARQLWrapper.Wrapper import SPARQLWrapper as SW
|
||||
|
||||
######################################################################################
|
||||
|
||||
|
||||
class Value(object):
|
||||
"""
|
||||
Class encapsulating a single binding for a variable.
|
||||
|
||||
:ivar variable: The original variable, stored for an easier reference.
|
||||
:vartype variable: string
|
||||
:ivar value: Value of the binding.
|
||||
:vartype value: string
|
||||
:ivar type: Type of the binding. One of :attr:`Value.URI`, :attr:`Value.Literal`, :attr:`Value.TypedLiteral`, or
|
||||
:attr:`Value.BNODE`.
|
||||
:vartype type: string
|
||||
:ivar lang: Language tag of the binding, or ``None`` if not set.
|
||||
:vartype lang: string
|
||||
:ivar datatype: Datatype of the binding, or ``None`` if not set. It is a URI.
|
||||
:vartype datatype: string
|
||||
"""
|
||||
|
||||
URI = "uri"
|
||||
"""the string denoting a URI variable."""
|
||||
Literal = "literal"
|
||||
"""the string denoting a Literal variable."""
|
||||
TypedLiteral = "typed-literal"
|
||||
"""the string denoting a typed literal variable."""
|
||||
BNODE = "bnode"
|
||||
"""the string denoting a blank node variable."""
|
||||
|
||||
def __init__(self, variable: str, binding: Dict[str, str]) -> None:
|
||||
"""
|
||||
:param variable: the variable for that binding. Stored for an easier reference.
|
||||
:type variable: string
|
||||
:param binding: the binding dictionary part of the return result for a specific binding.
|
||||
:type binding: dict
|
||||
"""
|
||||
self.variable = variable
|
||||
self.value = binding["value"]
|
||||
self.type = binding["type"]
|
||||
self.lang = None
|
||||
self.datatype = None
|
||||
try:
|
||||
self.lang = binding["xml:lang"]
|
||||
except:
|
||||
# no lang is set
|
||||
pass
|
||||
try:
|
||||
self.datatype = binding["datatype"]
|
||||
except:
|
||||
pass
|
||||
|
||||
def __repr__(self) -> str:
|
||||
cls = self.__class__.__name__
|
||||
return "%s(%s:%r)" % (cls, self.type, self.value)
|
||||
|
||||
|
||||
######################################################################################
|
||||
|
||||
|
||||
class Bindings(object):
|
||||
"""
|
||||
Class encapsulating one query result, based on the JSON return format. It decodes the
|
||||
return values to make it a bit more usable for a standard usage. The class consumes the
|
||||
return value and instantiates a number of attributes that can be consulted directly. See
|
||||
the list of variables.
|
||||
|
||||
The `Serializing SPARQL Query Results in JSON <http://www.w3.org/TR/rdf-sparql-json-res/>`_ explains the details of
|
||||
the JSON return structures. Very succinctly: the return data has "bindings", which means a list of dictionaries.
|
||||
Each dictionary is a possible binding of the SELECT variables to :class:`Value` instances. This structure is made a
|
||||
bit more usable by this class.
|
||||
|
||||
:ivar fullResult: The original dictionary of the results, stored for an easier reference.
|
||||
:vartype fullResult: dict
|
||||
:ivar head: Header part of the return, see the JSON return format document for details.
|
||||
:vartype head: dict
|
||||
:ivar variables: List of unbound variables of the original query. It is a list of strings. ``None`` in the case
|
||||
of an ASK query.
|
||||
:vartype variables: list
|
||||
:ivar bindings: The final bindings: list of dictionaries, mapping variables to :class:`Value` instances. \
|
||||
If unbound, then no value is set in the dictionary; that can be easily checked with \
|
||||
``var in res.bindings[..]``, for example.
|
||||
:vartype bindings: list
|
||||
:ivar askResult: by default, set to **False**; in case of an ASK query, the result of the query.
|
||||
:vartype askResult: bool
|
||||
"""
|
||||
|
||||
def __init__(self, retval: QueryResult):
|
||||
"""
|
||||
:param retval: the query result.
|
||||
:type retval: :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>`
|
||||
"""
|
||||
self.fullResult = retval._convertJSON()
|
||||
self.head = self.fullResult["head"]
|
||||
self.variables: Optional[List[str]] = None
|
||||
try:
|
||||
self.variables = self.fullResult["head"]["vars"]
|
||||
except:
|
||||
pass
|
||||
|
||||
self.bindings: List[Dict[str, Value]] = []
|
||||
try:
|
||||
for b in self.fullResult["results"]["bindings"]:
|
||||
# This is a single binding. It is a dictionary per variable; each value is a dictionary again
|
||||
# that has to be converted into a Value instance
|
||||
newBind = {}
|
||||
# type error: Item "None" of "Union[List[str], Any, None]" has no attribute "__iter__" (not iterable)
|
||||
for key in self.variables: # type: ignore [union-attr]
|
||||
if key in b:
|
||||
# there is a real binding for this key
|
||||
newBind[key] = Value(key, b[key])
|
||||
self.bindings.append(newBind)
|
||||
except:
|
||||
pass
|
||||
|
||||
self.askResult = False
|
||||
try:
|
||||
self.askResult = self.fullResult["boolean"]
|
||||
except:
|
||||
pass
|
||||
|
||||
def getValues(self, key: str) -> Optional[List[Value]]:
|
||||
"""A shorthand for the retrieval of all bindings for a single key. It is
|
||||
equivalent to ``[b[key] for b in self[key]]``
|
||||
|
||||
:param key: possible variable name.
|
||||
:type key: string
|
||||
:return: list of :class:`Value` instances.
|
||||
:rtype: list
|
||||
"""
|
||||
try:
|
||||
return [b[key] for b in self[key]]
|
||||
except:
|
||||
return []
|
||||
|
||||
def __contains__(self, key: Union[str, List[str], Tuple[str]]) -> bool:
|
||||
"""Emulation of the "``key in obj``" operator. Key can be a string for a variable or an array/tuple
|
||||
of strings.
|
||||
|
||||
If ``key`` is a variable, the return value is ``True`` if there is at least one binding where ``key`` is
|
||||
bound. If ``key`` is an array or tuple, the return value is ``True`` if there is at least one binding
|
||||
where *all* variables in ``key`` are bound.
|
||||
|
||||
:param key: possible variable, or array/tuple of variables
|
||||
:return: whether there is a binding of the variable in the return
|
||||
:rtype: Boolean
|
||||
"""
|
||||
if len(self.bindings) == 0:
|
||||
return False
|
||||
if type(key) is list or type(key) is tuple:
|
||||
# check first whether they are all really variables
|
||||
# type error: Unsupported right operand type for in ("Optional[List[str]]")
|
||||
if False in [k in self.variables for k in key]: # type: ignore [operator]
|
||||
return False
|
||||
for b in self.bindings:
|
||||
# try to find a binding where all key elements are present
|
||||
if False in [k in b for k in key]:
|
||||
# this is not a binding for the key combination, move on...
|
||||
continue
|
||||
else:
|
||||
# yep, this one is good!
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
# type error: Unsupported right operand type for in ("Optional[List[str]]")
|
||||
if key not in self.variables: # type: ignore [operator]
|
||||
return False
|
||||
for b in self.bindings:
|
||||
if key in b:
|
||||
return True
|
||||
return False
|
||||
|
||||
def __getitem__(self, key: Union[slice, str, List[str]]) -> List[Dict[str, Value]]:
|
||||
"""Emulation of the ``obj[key]`` operator. Slice notation is also available.
|
||||
The goal is to choose the right bindings among the available ones. The return values are always
|
||||
arrays of bindings, ie, arrays of dictionaries mapping variable keys to :class:`Value` instances.
|
||||
The different value settings mean the following:
|
||||
|
||||
- ``obj[key]`` returns the bindings where ``key`` has a valid value
|
||||
- ``obj[key1,key2,...]`` returns the bindings where *all* ``key1,key2,...`` have valid values
|
||||
- ``obj[(key1,key2,...):(nkey1,nkey2,...)]`` returns the bindings where all ``key1,key2,...`` have
|
||||
valid values and *none* of the ``nkey1,nkey2,...`` have valid values
|
||||
- ``obj[:(nkey1,nkey2,...)]`` returns the bindings where *none* of the ``nkey1,nkey2,...`` have valid values
|
||||
|
||||
In all cases complete bindings are returned, ie, the values for other variables, not present among
|
||||
the keys in the call, may or may not be present depending on the query results.
|
||||
|
||||
:param key: possible variable or array/tuple of keys with possible slice notation
|
||||
:return: list of bindings
|
||||
:rtype: array of variable -> :class:`Value` dictionaries
|
||||
"""
|
||||
|
||||
def _checkKeys(keys: Union[List[Any], Tuple[Any, ...]]) -> bool:
|
||||
if len(keys) == 0:
|
||||
return False
|
||||
for k in keys:
|
||||
# type error: Unsupported right operand type for in ("Optional[List[str]]")
|
||||
if (
|
||||
not isinstance(k, str)
|
||||
or k not in self.variables # type: ignore [operator]
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _nonSliceCase(
|
||||
key: Union[
|
||||
str,
|
||||
List[Any],
|
||||
Tuple[Any],
|
||||
]
|
||||
) -> Union[List[Any], bool, Tuple[Any]]:
|
||||
# type error: Unsupported right operand type for in ("Optional[List[str]]")
|
||||
if isinstance(key, str) and key != "" and key in self.variables: # type: ignore[operator]
|
||||
# unicode or string:
|
||||
return [key]
|
||||
elif type(key) is list or type(key) is tuple:
|
||||
if _checkKeys(key):
|
||||
return key
|
||||
return False
|
||||
|
||||
# The arguments should be reduced to arrays of variables, ie, unicode strings
|
||||
yes_keys: Union[List[Any], bool, Tuple[Any]] = []
|
||||
no_keys: Union[List[Any], bool, Tuple[Any]] = []
|
||||
if type(key) is slice:
|
||||
# Note: None for start or stop is all right
|
||||
if key.start:
|
||||
yes_keys = _nonSliceCase(key.start)
|
||||
if not yes_keys:
|
||||
raise TypeError
|
||||
if key.stop:
|
||||
no_keys = _nonSliceCase(key.stop)
|
||||
if not no_keys:
|
||||
raise TypeError
|
||||
else:
|
||||
yes_keys = _nonSliceCase(key)
|
||||
|
||||
# got it right, now get the right binding line with the constraints
|
||||
retval: List[Dict[str, Value]] = []
|
||||
for b in self.bindings:
|
||||
# first check whether the 'yes' part is all there:
|
||||
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
|
||||
if False in [k in b for k in yes_keys]: # type: ignore[union-attr]
|
||||
continue
|
||||
# type error: Item "bool" of "Union[List[Any], bool, Tuple[Any]]" has no attribute "__iter__" (not iterable)
|
||||
if True in [k in b for k in no_keys]: # type: ignore[union-attr]
|
||||
continue
|
||||
# if we got that far, we should be all right!
|
||||
retval.append(b)
|
||||
# if retval is of zero length, no hit; an exception should be raised to stay within the python style
|
||||
if len(retval) == 0:
|
||||
raise IndexError
|
||||
return retval
|
||||
|
||||
def convert(self) -> "Bindings":
|
||||
"""This is just a convenience method, returns ``self``.
|
||||
|
||||
Although :class:`SPARQLWrapper2.Bindings` is not a subclass of
|
||||
:class:`SPARQLWrapper.QueryResult<SPARQLWrapper.Wrapper.QueryResult>`, it is returned as a result by
|
||||
:func:`SPARQLWrapper2.query`, just like :class:`QueryResult<SPARQLWrapper.Wrapper.QueryResult>` is returned by
|
||||
:func:`SPARQLWrapper.query()<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`. Consequently,
|
||||
having an empty :func:`convert` method to imitate
|
||||
:class:`QueryResult's convert() method<SPARQLWrapper.Wrapper.QueryResult.convert>`
|
||||
may avoid unnecessary problems.
|
||||
"""
|
||||
return self
|
||||
|
||||
|
||||
##############################################################################################################
|
||||
|
||||
|
||||
class SPARQLWrapper2(SW):
|
||||
"""Subclass of :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper` that works with a JSON SELECT return result only. The
|
||||
query result is automatically set to a :class:`Bindings` instance. Makes the average query processing a bit
|
||||
simpler..."""
|
||||
|
||||
def __init__(self, baseURI: str, defaultGraph: Optional[str] = None):
|
||||
"""
|
||||
Class encapsulating a full SPARQL call. In contrast to the :class:`~SPARQLWrapper.Wrapper.SPARQLWrapper`
|
||||
superclass, the return format cannot be set (it is defaulted to
|
||||
:attr:`~SPARQLWrapper.Wrapper.SPARQLWrapper.JSON`).
|
||||
|
||||
:param baseURI: string of the SPARQL endpoint's URI.
|
||||
:type baseURI: string
|
||||
:param defaultGraph: URI for the default graph. Default is ``None``, can be set via an explicit call, too.
|
||||
:type defaultGraph: string
|
||||
"""
|
||||
super(SPARQLWrapper2, self).__init__(
|
||||
baseURI, returnFormat=JSON, defaultGraph=defaultGraph
|
||||
)
|
||||
|
||||
def setReturnFormat(self, format: Optional[str]) -> None:
|
||||
"""
|
||||
Set the return format (:meth:`overriding the inherited method
|
||||
<SPARQLWrapper.Wrapper.SPARQLWrapper.setReturnFormat>`).
|
||||
|
||||
.. warning::
|
||||
|
||||
This method does nothing; this class instance should work with JSON only. The method is defined \
|
||||
just to avoid possible errors by erroneously setting the return format. \
|
||||
When using this class, the user can safely ignore this call.
|
||||
|
||||
:param format: return format
|
||||
:type format: string
|
||||
"""
|
||||
pass
|
||||
|
||||
def query(self) -> Union[Bindings, QueryResult]: # type: ignore[override]
|
||||
"""
|
||||
Execute the query and do an automatic conversion.
|
||||
|
||||
Exceptions can be raised if either the URI is wrong or the HTTP sends back an error.
|
||||
The usual urllib2 exceptions are raised, which cover possible SPARQL errors, too.
|
||||
|
||||
If the query type is *not* SELECT, the method falls back to the
|
||||
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.query>`.
|
||||
|
||||
:return: query result
|
||||
:rtype: :class:`Bindings` instance
|
||||
"""
|
||||
res = super(SPARQLWrapper2, self).query()
|
||||
|
||||
if self.queryType == SELECT:
|
||||
return Bindings(res)
|
||||
else:
|
||||
return res
|
||||
|
||||
def queryAndConvert( # type: ignore[override]
|
||||
self,
|
||||
) -> Union[Union[Bindings, QueryResult], QueryResult.ConvertResult]:
|
||||
"""This is here to override the inherited method; it is equivalent to :class:`query`.
|
||||
|
||||
If the query type is *not* SELECT, the method falls back to the
|
||||
:meth:`corresponding method in the superclass<SPARQLWrapper.Wrapper.SPARQLWrapper.queryAndConvert>`.
|
||||
|
||||
:return: the converted query result.
|
||||
"""
|
||||
if self.queryType == SELECT:
|
||||
return self.query()
|
||||
else:
|
||||
return super(SPARQLWrapper2, self).queryAndConvert()
|
||||
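
A hedged sketch tying the classes above together (endpoint and query are examples): for SELECT queries `SPARQLWrapper2` returns a `Bindings` instance whose rows map variable names to `Value` objects, and which supports the `in` and `[]` access described in the docstrings.

```python
from SPARQLWrapper import SPARQLWrapper2

sparql = SPARQLWrapper2("http://dbpedia.org/sparql")   # JSON return format is forced by the class
sparql.setQuery("SELECT ?s ?o WHERE { ?s ?p ?o } LIMIT 5")

result = sparql.query()             # Bindings instance for a SELECT query
print(result.variables)             # e.g. ['s', 'o']

for row in result["o"]:             # only the bindings where ?o is bound
    print(row["o"].type, row["o"].value)

subjects = result.getValues("s")    # shorthand: list of Value instances for ?s
if ("s", "o") in result:            # True if some binding binds both variables
    print(len(result.bindings), "rows in total")
```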
1369
venv/lib/python3.10/site-packages/SPARQLWrapper/Wrapper.py
Normal file
1369
venv/lib/python3.10/site-packages/SPARQLWrapper/Wrapper.py
Normal file
File diff suppressed because it is too large
Load Diff
72
venv/lib/python3.10/site-packages/SPARQLWrapper/__init__.py
Normal file
72
venv/lib/python3.10/site-packages/SPARQLWrapper/__init__.py
Normal file
@ -0,0 +1,72 @@
|
||||
# -*- coding: utf8 -*-
|
||||
|
||||
"""
|
||||
|
||||
**SPARQLWrapper** is a simple Python wrapper around a `SPARQL <https://www.w3.org/TR/sparql11-overview/>`_ service to
remotely execute your queries. It helps in creating the query
invocation and, possibly, converting the result into a more manageable
format.
|
||||
|
||||
"""
|
||||
|
||||
__version__ = "2.0.0"
|
||||
"""The version of SPARQLWrapper"""
|
||||
|
||||
__agent__: str = f"sparqlwrapper {__version__} (rdflib.github.io/sparqlwrapper)"
|
||||
|
||||
|
||||
from .SmartWrapper import SPARQLWrapper2
|
||||
from .sparql_dataframe import get_sparql_dataframe
|
||||
from .Wrapper import (
|
||||
ASK,
|
||||
BASIC,
|
||||
CONSTRUCT,
|
||||
CSV,
|
||||
DELETE,
|
||||
DESCRIBE,
|
||||
DIGEST,
|
||||
GET,
|
||||
INSERT,
|
||||
JSON,
|
||||
JSONLD,
|
||||
N3,
|
||||
POST,
|
||||
POSTDIRECTLY,
|
||||
RDF,
|
||||
RDFXML,
|
||||
SELECT,
|
||||
TSV,
|
||||
TURTLE,
|
||||
URLENCODED,
|
||||
XML,
|
||||
QueryResult,
|
||||
SPARQLWrapper,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"SPARQLWrapper2",
|
||||
"get_sparql_dataframe",
|
||||
"ASK",
|
||||
"BASIC",
|
||||
"CONSTRUCT",
|
||||
"CSV",
|
||||
"DELETE",
|
||||
"DESCRIBE",
|
||||
"DIGEST",
|
||||
"GET",
|
||||
"INSERT",
|
||||
"JSON",
|
||||
"JSONLD",
|
||||
"N3",
|
||||
"POST",
|
||||
"POSTDIRECTLY",
|
||||
"RDF",
|
||||
"RDFXML",
|
||||
"SELECT",
|
||||
"TSV",
|
||||
"TURTLE",
|
||||
"URLENCODED",
|
||||
"XML",
|
||||
"QueryResult",
|
||||
"SPARQLWrapper",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
157
venv/lib/python3.10/site-packages/SPARQLWrapper/main.py
Normal file
157
venv/lib/python3.10/site-packages/SPARQLWrapper/main.py
Normal file
@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import xml
|
||||
from typing import List, Optional
|
||||
|
||||
import rdflib
|
||||
|
||||
from . import __version__
|
||||
from .Wrapper import SPARQLWrapper, _allowedAuth, _allowedFormats, _allowedRequests
|
||||
|
||||
|
||||
class SPARQLWrapperFormatter(
|
||||
argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
|
||||
):
|
||||
pass
|
||||
|
||||
|
||||
def check_file(v: str) -> str:
|
||||
if os.path.isfile(v):
|
||||
return v
|
||||
elif v == "-":
|
||||
return "-" # stdin
|
||||
else:
|
||||
raise argparse.ArgumentTypeError("file '%s' is not found" % v)
|
||||
|
||||
|
||||
def choicesDescriptions() -> str:
|
||||
d = "\n - ".join(["allowed FORMAT:"] + _allowedFormats)
|
||||
d += "\n - ".join(["\n\nallowed METHOD:"] + _allowedRequests)
|
||||
d += "\n - ".join(["\n\nallowed AUTH:"] + _allowedAuth)
|
||||
return d
|
||||
|
||||
|
||||
def parse_args(test: Optional[List[str]] = None) -> argparse.Namespace:
|
||||
"""Parse arguments."""
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="rqw",
|
||||
formatter_class=(
|
||||
lambda prog: SPARQLWrapperFormatter(
|
||||
prog,
|
||||
**{
|
||||
"width": shutil.get_terminal_size(fallback=(120, 50)).columns,
|
||||
"max_help_position": 30,
|
||||
},
|
||||
)
|
||||
),
|
||||
description="sparqlwrapper CLI",
|
||||
epilog=choicesDescriptions(),
|
||||
)
|
||||
input_group = parser.add_mutually_exclusive_group(required=True)
|
||||
input_group.add_argument(
|
||||
"-f",
|
||||
"--file",
|
||||
metavar="FILE",
|
||||
type=check_file,
|
||||
help="query with sparql file (stdin: -)",
|
||||
)
|
||||
input_group.add_argument("-Q", "--query", metavar="QUERY", help="query with string")
|
||||
parser.add_argument(
|
||||
"-F",
|
||||
"--format",
|
||||
default="json",
|
||||
metavar="FORMAT",
|
||||
choices=_allowedFormats,
|
||||
help="response format",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--endpoint",
|
||||
metavar="URI",
|
||||
help="sparql endpoint",
|
||||
default="http://dbpedia.org/sparql",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--method",
|
||||
metavar="METHOD",
|
||||
choices=_allowedRequests,
|
||||
help="request method",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a", "--auth", metavar="AUTH", choices=_allowedAuth, help="HTTP auth"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-u", "--username", metavar="ID", default="guest", help="username for auth"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p", "--password", metavar="PW", default="", help="password for auth"
|
||||
)
|
||||
parser.add_argument("-q", "--quiet", action="store_true", help="supress warnings")
|
||||
parser.add_argument(
|
||||
"-V", "--version", action="version", version="%(prog)s {}".format(__version__)
|
||||
)
|
||||
if test is None:
|
||||
return parser.parse_args()
|
||||
else:
|
||||
return parser.parse_args(test)
|
||||
|
||||
|
||||
def main(test: Optional[List[str]] = None) -> None:
|
||||
args = parse_args(test)
|
||||
if args.quiet:
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
q = ""
|
||||
if args.query is not None:
|
||||
q = args.query
|
||||
elif args.file == "-":
|
||||
q = sys.stdin.read()
|
||||
else:
|
||||
q = open(args.file, "r").read()
|
||||
|
||||
sparql = SPARQLWrapper(
|
||||
args.endpoint,
|
||||
agent=(
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/96.0.4664.110 Safari/537.36"
|
||||
),
|
||||
)
|
||||
if args.auth is not None:
|
||||
sparql.setHTTPAuth(args.auth)
|
||||
sparql.setCredentials(args.username, args.password)
|
||||
if args.method is not None:
|
||||
sparql.setMethod(args.method)
|
||||
sparql.setQuery(q)
|
||||
sparql.setReturnFormat(args.format)
|
||||
results = sparql.query().convert()
|
||||
|
||||
if isinstance(results, dict):
|
||||
# "json"
|
||||
print(json.dumps(results, indent=4))
|
||||
elif isinstance(results, xml.dom.minidom.Document):
|
||||
# "xml"
|
||||
print(results.toxml())
|
||||
elif isinstance(results, bytes):
|
||||
# "csv", "tsv", "turtle", "n3"
|
||||
print(results.decode("utf-8"))
|
||||
elif isinstance(results, rdflib.graph.ConjunctiveGraph):
|
||||
# "rdf"
|
||||
print(results.serialize())
|
||||
else:
|
||||
# unknown type
|
||||
raise TypeError(f"Unsupported result of type {type(results)}: {results!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
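
A hedged sketch of driving the CLI above programmatically via the `test` parameter (equivalent to the `rqw` console script registered in `entry_points.txt`; endpoint and query are examples):

```python
from SPARQLWrapper.main import main

# Same as: rqw -e http://dbpedia.org/sparql -Q "SELECT ?s WHERE { ?s ?p ?o } LIMIT 3" -F json
main(test=[
    "-e", "http://dbpedia.org/sparql",
    "-Q", "SELECT ?s WHERE { ?s ?p ?o } LIMIT 3",
    "-F", "json",
])
```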
@ -0,0 +1,74 @@
|
||||
"""
|
||||
Query a SPARQL endpoint and return results as a Pandas dataframe.
|
||||
"""
|
||||
import io
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Union
|
||||
|
||||
from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value
|
||||
from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class QueryException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def get_sparql_dataframe_orig(
|
||||
endpoint: str, query: Union[str, bytes]
|
||||
) -> "pd.DataFrame":
|
||||
"""copy paste from: https://github.com/lawlesst/sparql-dataframe"""
|
||||
# pandas inside to avoid requiring it
|
||||
import pandas as pd
|
||||
|
||||
sparql = SPARQLWrapper(endpoint)
|
||||
sparql.setQuery(query)
|
||||
if sparql.queryType != SELECT:
|
||||
raise QueryException("Only SPARQL SELECT queries are supported.")
|
||||
sparql.setReturnFormat(CSV)
|
||||
results = sparql.query().convert()
|
||||
if isinstance(results, bytes):
|
||||
_csv = io.StringIO(results.decode("utf-8"))
|
||||
return pd.read_csv(_csv, sep=",")
|
||||
else:
|
||||
raise TypeError(type(results))
|
||||
|
||||
|
||||
def get_sparql_typed_dict(
|
||||
endpoint: str, query: Union[str, bytes]
|
||||
) -> List[Dict[str, Value]]:
|
||||
"""modified from: https://github.com/lawlesst/sparql-dataframe"""
|
||||
# pandas inside to avoid requiring it
|
||||
import pandas as pd
|
||||
# rdflib in here because there is some meta stuff in the setup.py and Travis fails because rdflib is installed later
|
||||
import rdflib.term
|
||||
sparql = SPARQLWrapper2(endpoint)
|
||||
sparql.setQuery(query)
|
||||
if sparql.queryType != SELECT:
|
||||
raise QueryException("Only SPARQL SELECT queries are supported.")
|
||||
# sparql.setReturnFormat(JSON)
|
||||
results = sparql.query()
|
||||
if not isinstance(results, Bindings):
|
||||
raise TypeError(type(results))
|
||||
# consider perf hacking later, probably slow
|
||||
# convert list of dicts to python types
|
||||
d = []
|
||||
for x in results.bindings:
|
||||
row = {}
|
||||
for k in x:
|
||||
v = x[k]
|
||||
vv = rdflib.term.Literal(v.value, datatype=v.datatype).toPython() # type: ignore[no-untyped-call]
|
||||
row[k] = vv
|
||||
d.append(row)
|
||||
return d
|
||||
|
||||
|
||||
def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame":
|
||||
# pandas inside to avoid requiring it
|
||||
import pandas as pd
|
||||
|
||||
d = get_sparql_typed_dict(endpoint, query)
|
||||
# TODO: will nan fill somehow, make more strict if there is way of getting the nan types from rdflib
|
||||
df = pd.DataFrame(d)
|
||||
return df
|
||||
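
A hedged sketch of the convenience function above (endpoint and query are examples; pandas must be installed, since it is imported lazily inside the function):

```python
from SPARQLWrapper import get_sparql_dataframe

df = get_sparql_dataframe(
    "http://dbpedia.org/sparql",
    "SELECT ?s ?o WHERE { ?s ?p ?o } LIMIT 10",
)
print(df.head())   # one column per SELECT variable, values converted to Python types via rdflib
```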
Binary file not shown.
Some files were not shown because too many files have changed in this diff