import os
import re
import logging

import pandas as pd
import ezodf
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"

MASTER_SHEET_NAME = "Masterstruktur"
SHEET_ORDER = [
    "Masterstruktur",
    "1 Figur",
    "2 Objekt",
    "3 Flora",
    "4 Fauna",
    "5 Landschaft",
    "6 Phänomene, Erscheinungen",
    "7 Architektur",
    "8 Verzierungen, Ornamentik",
    "9 Aktivität, Handlung, Pose"
]

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

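# Assumed layout of the input workbook (inferred from the code below, not verified
# against the actual file): every category sheet carries an "ID" column plus a name
# column ("Name", "Wort" or "Wort/Vokabel"), and IDs use a dotted scheme such as
# "2", "2.1" or "2.1.3" for the three hierarchy levels.
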
# -------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------
def detect_id_and_name(df):
    """Return the ID column and the name/word column of a sheet (either may be None)."""
    df_cols = [str(c).strip().lower() for c in df.columns]
    id_col, name_col = None, None
    for idx, col in enumerate(df_cols):
        if col == "id":
            id_col = df.columns[idx]
        elif col in ["name", "wort", "wort/vokabel"]:
            name_col = df.columns[idx]
    if id_col is None or name_col is None:
        logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
    return id_col, name_col

def parse_id_level(id_val):
    """Return the hierarchy depth (1-3) of a dotted ID, or None for anything else."""
    if pd.isna(id_val):
        return None
    id_str = str(id_val).strip()
    if re.match(r'^\d+(\.\d+){0,2}$', id_str):
        return len(id_str.split("."))
    return None

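# For illustration: parse_id_level("3") -> 1, parse_id_level("3.2") -> 2,
# parse_id_level("3.2.1") -> 3; free text or an empty cell -> None.
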
def process_category_df(df, sheet_name):
    """Normalise a category sheet into the columns ID, Unterkategorie, Unterunterkategorie, Wort/Vokabel."""
    id_col, name_col = detect_id_and_name(df)
    if id_col is None or name_col is None:
        return None

    current_level = {2: None, 3: None}
    new_rows = []

    for _, row in df.iterrows():
        id_val = row[id_col] if pd.notna(row[id_col]) else ""
        name_val = row[name_col] if pd.notna(row[name_col]) else ""
        if not id_val and not name_val:
            continue

        level = parse_id_level(id_val)
        if level:
            if level >= 2:
                current_level[level] = name_val
                for deeper in range(level + 1, 4):
                    current_level[deeper] = None
            new_rows.append({
                "ID": id_val,
                "Unterkategorie": current_level[2] if level >= 2 else "",
                "Unterunterkategorie": current_level[3] if level >= 3 else "",
                "Wort/Vokabel": name_val
            })
        else:
            new_rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": name_val
            })

    df_new = pd.DataFrame(new_rows, columns=["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"])
    logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
    return df_new

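# For illustration of the hierarchy tracking above: a "2.1" row sets the current
# Unterkategorie, a later "2.1.3" row inherits that Unterkategorie and supplies its
# own name as Unterunterkategorie, and rows without a valid dotted ID are kept as
# plain entries in "Wort/Vokabel" with all other columns left empty.
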
def merge_new_terms(original_df, processed_df):
    """Add words from original_df that have no ID to processed_df if they are not already present there."""
    _, orig_name_col = detect_id_and_name(original_df)
    if orig_name_col is None or orig_name_col not in original_df.columns:
        return processed_df

    existing_words = set(str(x).strip().lower() for x in processed_df["Wort/Vokabel"].dropna())
    new_rows = []

    for _, row in original_df.iterrows():
        name = str(row.get(orig_name_col, "")).strip()
        id_val = str(row.get("ID", "")).strip() if "ID" in row else ""
        if not name:
            continue
        if not id_val and name.lower() not in existing_words:
            new_rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": name})

    if new_rows:
        df_new = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
        logging.info(f"{len(new_rows)} neue Wörter übernommen.")
        return df_new
    return processed_df

def build_master_df(category_dfs):
    """Collect every unique ID with its name across all category sheets."""
    seen_ids = set()
    master_rows = []
    for df in category_dfs:
        for _, row in df.iterrows():
            id_val = row["ID"]
            name_val = row["Wort/Vokabel"]
            if id_val and id_val not in seen_ids:
                seen_ids.add(id_val)
                master_rows.append({"ID": id_val, "Name": name_val})
    # Explicit columns keep the master sheet's header even when no IDs were found.
    master_df = pd.DataFrame(master_rows, columns=["ID", "Name"])
    logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
    return master_df

# -------------------------------------------------
# FORMATTING AND SAVING
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
    """Write df to an xlsx sheet, auto-size the columns and left-align all cells."""
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    worksheet = writer.sheets[sheet_name]

    for col_idx, col in enumerate(df.columns, 1):
        # default=0 avoids a ValueError on sheets without data rows.
        max_len = max((len(str(cell)) if cell is not None else 0 for cell in df[col]), default=0)
        max_len = max(max_len, len(col)) + 2
        worksheet.column_dimensions[get_column_letter(col_idx)].width = max_len
        for row_idx in range(1, len(df) + 2):
            worksheet.cell(row=row_idx, column=col_idx).alignment = Alignment(horizontal='left')

def save_ods(processed_sheets, output_file):
    """Write all processed sheets into a new ODS document via ezodf."""
    doc = ezodf.newdoc(doctype="ods")
    for name, df in processed_sheets.items():
        df = df.fillna("")
        sheet = ezodf.Sheet(name, size=(len(df) + 1, len(df.columns)))
        doc.sheets += sheet

        for col_idx, col_name in enumerate(df.columns):
            sheet[0, col_idx].set_value(str(col_name))

        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                if value is None or str(value).lower() == "nan":
                    value = ""
                sheet[row_idx, col_idx].set_value(str(value))

    doc.saveas(output_file)
    logging.info(f"ODS-Datei gespeichert: {output_file}")

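# Note on the ODS writer above: every cell is written with set_value(str(value)),
# so IDs and numbers end up as text in the output file; if typed numeric cells were
# wanted, the str() cast would have to be dropped for numeric values.
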
# -------------------------------------------------
# MAIN PROGRAM
# -------------------------------------------------
def main():
    """Read the input workbook, rebuild every category sheet and the master sheet, and save the result."""
    if not os.path.exists(INPUT_FILE):
        logging.error(f"Datei {INPUT_FILE} existiert nicht.")
        return

    ext = os.path.splitext(INPUT_FILE)[1].lower()
    engine = None
    if ext in [".xlsx", ".xls"]:
        engine = "openpyxl"
    elif ext == ".ods":
        engine = "odf"
    else:
        logging.error("Nicht unterstütztes Dateiformat")
        return

    logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
    xls = pd.ExcelFile(INPUT_FILE, engine=engine)

    processed_sheets = {}
    category_dfs = []

    for sheet_name in xls.sheet_names:
        if sheet_name == MASTER_SHEET_NAME:
            continue
        df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
        df_new = process_category_df(df, sheet_name)
        if df_new is not None:
            df_merged = merge_new_terms(df, df_new)
            processed_sheets[sheet_name] = df_merged
            category_dfs.append(df_merged)
        else:
            processed_sheets[sheet_name] = df

    master_df = build_master_df(category_dfs)
    processed_sheets[MASTER_SHEET_NAME] = master_df

    ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx", ".xls"]:
        with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
            for name, df in ordered_sheets.items():
                format_excel_sheet(df, name, writer)
        logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
    elif ext_out == ".ods":
        save_ods(ordered_sheets, OUTPUT_FILE)
    else:
        # Previously an unsupported output extension ended the run silently.
        logging.error(f"Nicht unterstütztes Ausgabeformat: {ext_out}")


if __name__ == "__main__":
    main()