"""Restructure the category sheets of a vocabulary workbook.

Every category sheet is rewritten into the fixed column layout
``ID / Unterkategorie / Unterunterkategorie / Wort/Vokabel`` and a master
sheet listing every unique ID is rebuilt from the category sheets.  The
result is written as ``.ods`` (via ezodf) or ``.xlsx`` (via openpyxl).
"""
import logging
import os
import re

import pandas as pd

# ezodf is only required for ODS output and openpyxl only for Excel output,
# so a missing package must not prevent the module from being imported.
try:
    import ezodf
except ImportError:
    ezodf = None

try:
    from openpyxl.utils import get_column_letter
    from openpyxl.styles import Alignment
except ImportError:
    get_column_letter = None
    Alignment = None

# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"

MASTER_SHEET_NAME = "Masterstruktur"
SHEET_ORDER = [
    "Masterstruktur",
    "1 Figur",
    "2 Objekt",
    "3 Flora",
    "4 Fauna",
    "5 Landschaft",
    "6 Phänomene, Erscheinungen",
    "7 Architektur",
    "8 Verzierungen, Ornamentik",
    "9 Aktivität, Handlung, Pose"
]

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Column layout shared by every processed category sheet.
_CATEGORY_COLUMNS = ["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"]
# Header spellings (stripped, lower-cased) accepted as the term column.
_NAME_COLUMN_ALIASES = ("name", "wort", "wort/vokabel")
# One to three dot-separated integer groups, e.g. "1", "1.2", "1.2.3".
_ID_PATTERN = re.compile(r'^\d+(\.\d+){0,2}$')


# -------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------
def _normalize_id(id_val):
    """Return *id_val* as stripped text, or ``""`` when it is missing.

    Integral floats are rendered without the trailing ``.0``: pandas upcasts
    numeric columns containing blanks to float, so a top-level ID ``2``
    arrives as ``2.0`` and would otherwise be mistaken for a two-level ID.
    """
    if pd.isna(id_val):
        return ""
    if isinstance(id_val, float) and id_val.is_integer():
        return str(int(id_val))
    return str(id_val).strip()


def detect_id_and_name(df):
    """Find the ID column and the term column of *df*.

    Headers are compared stripped and case-insensitively.  The ID column must
    be called ``id``; the term column may be ``name``, ``wort`` or
    ``wort/vokabel``.  If several headers match, the last one wins.

    Returns:
        A tuple ``(id_col, name_col)`` holding the original column labels;
        an entry is ``None`` when no matching column exists (a warning is
        logged in that case).
    """
    id_col, name_col = None, None
    for label in df.columns:
        normalized = str(label).strip().lower()
        if normalized == "id":
            id_col = label
        elif normalized in _NAME_COLUMN_ALIASES:
            name_col = label
    if id_col is None or name_col is None:
        logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
    return id_col, name_col


def parse_id_level(id_val):
    """Return the hierarchy depth (1-3) encoded in *id_val*.

    ``"1"`` is level 1, ``"1.2"`` level 2 and ``"1.2.3"`` level 3.  Missing
    values and anything that is not one to three dot-separated integers
    yield ``None``.
    """
    id_str = _normalize_id(id_val)
    if _ID_PATTERN.match(id_str):
        return id_str.count(".") + 1
    return None


def process_category_df(df, sheet_name):
    """Convert one category sheet into the fixed four-column layout.

    Rows are read top to bottom.  A row with a valid ID of level 2 or 3
    registers its term as the current (sub-)subcategory; any ID row clears
    the remembered entries of all deeper levels so that a subcategory never
    leaks into the next section.  Rows without a valid ID are kept as plain
    terms with empty ID and category columns.  Rows with neither an ID nor a
    term are skipped.

    Args:
        df: Raw sheet as read by pandas.
        sheet_name: Sheet name, used for logging only.

    Returns:
        A new DataFrame with the columns ``ID``, ``Unterkategorie``,
        ``Unterunterkategorie`` and ``Wort/Vokabel`` (IDs as text), or
        ``None`` when the sheet has no ID or term column.
    """
    id_col, name_col = detect_id_and_name(df)
    if id_col is None or name_col is None:
        return None

    current_level = {2: None, 3: None}
    new_rows = []
    for _, row in df.iterrows():
        id_val = _normalize_id(row[id_col])
        name_val = row[name_col] if pd.notna(row[name_col]) else ""
        if not id_val and not name_val:
            continue

        level = parse_id_level(id_val)
        if level is None:
            if id_val:
                logging.warning(
                    f"Sheet '{sheet_name}': ID '{id_val}' hat kein gültiges Format und wird verworfen"
                )
            new_rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": name_val
            })
            continue

        if level >= 2:
            current_level[level] = name_val
        # Entering a new node invalidates everything remembered below it.
        for deeper in range(level + 1, 4):
            current_level[deeper] = None

        new_rows.append({
            "ID": id_val,
            # "or ''" covers a level-3 row that has no level-2 parent above it.
            "Unterkategorie": (current_level[2] or "") if level >= 2 else "",
            "Unterunterkategorie": (current_level[3] or "") if level >= 3 else "",
            "Wort/Vokabel": name_val
        })

    df_new = pd.DataFrame(new_rows, columns=_CATEGORY_COLUMNS)
    logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
    return df_new


def merge_new_terms(original_df, processed_df):
    """Append terms from *original_df* that have no ID and are not yet present.

    Terms are compared stripped and case-insensitively against the
    ``Wort/Vokabel`` column of *processed_df*; each new term is added once,
    with empty ID and category columns.

    Args:
        original_df: Raw sheet as read by pandas.
        processed_df: Output of :func:`process_category_df` for that sheet.

    Returns:
        *processed_df* itself when nothing was added (or the raw sheet has no
        term column), otherwise a new DataFrame with the rows appended.
    """
    orig_id_col, orig_name_col = detect_id_and_name(original_df)
    if orig_name_col is None or orig_name_col not in original_df.columns:
        return processed_df

    existing_words = {str(x).strip().lower() for x in processed_df["Wort/Vokabel"].dropna()}
    new_rows = []
    for _, row in original_df.iterrows():
        raw_name = row[orig_name_col]
        # Test for missing values before str(): str(NaN) is the text "nan".
        if pd.isna(raw_name):
            continue
        name = str(raw_name).strip()
        if not name:
            continue
        id_val = _normalize_id(row[orig_id_col]) if orig_id_col is not None else ""
        key = name.lower()
        if id_val or key in existing_words:
            continue
        existing_words.add(key)  # also de-duplicates within the new terms
        new_rows.append({
            "ID": "",
            "Unterkategorie": "",
            "Unterunterkategorie": "",
            "Wort/Vokabel": name
        })

    if not new_rows:
        return processed_df
    df_new = pd.concat(
        [processed_df, pd.DataFrame(new_rows, columns=_CATEGORY_COLUMNS)],
        ignore_index=True,
    )
    logging.info(f"{len(new_rows)} neue Wörter übernommen.")
    return df_new


def build_master_df(category_dfs):
    """Collect every unique ID of the category sheets into one table.

    Args:
        category_dfs: Iterable of DataFrames with ``ID`` and ``Wort/Vokabel``
            columns.

    Returns:
        A DataFrame with the columns ``ID`` and ``Name`` holding the first
        occurrence of each ID in iteration order.  The columns exist even
        when no ID was found.
    """
    seen_ids = set()
    master_rows = []
    for df in category_dfs:
        for id_val, name_val in zip(df["ID"], df["Wort/Vokabel"]):
            # Compare on the normalized text so 1, 1.0 and "1" count as one ID
            # and missing values are ignored.
            key = _normalize_id(id_val)
            if not key or key in seen_ids:
                continue
            seen_ids.add(key)
            master_rows.append({"ID": id_val, "Name": name_val})
    master_df = pd.DataFrame(master_rows, columns=["ID", "Name"])
    logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
    return master_df


# -------------------------------------------------
# FORMATTING AND SAVING
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
    """Write *df* to *writer* as *sheet_name* and apply basic formatting.

    Every column is widened to its longest cell or header text plus two
    characters, and all cells including the header row are left-aligned.

    Args:
        df: Table to write.
        sheet_name: Name of the target worksheet.
        writer: An open ``pandas.ExcelWriter`` using the openpyxl engine.

    Raises:
        ImportError: If openpyxl is not installed.
    """
    if get_column_letter is None or Alignment is None:
        raise ImportError("openpyxl wird für die Excel-Ausgabe benötigt")

    df.to_excel(writer, sheet_name=sheet_name, index=False)
    worksheet = writer.sheets[sheet_name]
    left_aligned = Alignment(horizontal='left')
    for col_idx, col in enumerate(df.columns, 1):
        # default=0 keeps an empty sheet from raising on max().
        longest_cell = max((len(str(cell)) for cell in df[col] if cell is not None), default=0)
        width = max(longest_cell, len(str(col))) + 2
        worksheet.column_dimensions[get_column_letter(col_idx)].width = width
        # Row 1 is the header, so the data occupies rows 2 .. len(df) + 1.
        for row_idx in range(1, len(df) + 2):
            worksheet.cell(row=row_idx, column=col_idx).alignment = left_aligned


def save_ods(processed_sheets, output_file):
    """Write all sheets to an ODS document at *output_file*.

    Every value is stored as text; missing values become empty cells.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame, written in
            iteration order.
        output_file: Target path of the ``.ods`` file.

    Raises:
        ImportError: If ezodf is not installed.
    """
    if ezodf is None:
        raise ImportError("ezodf wird für die ODS-Ausgabe benötigt")

    doc = ezodf.newdoc(doctype="ods")
    for name, df in processed_sheets.items():
        df = df.fillna("")
        # One extra row for the header; at least one column so that a table
        # without columns still produces a valid sheet size.
        sheet = ezodf.Sheet(name, size=(len(df) + 1, max(len(df.columns), 1)))
        doc.sheets += sheet
        for col_idx, col_name in enumerate(df.columns):
            sheet[0, col_idx].set_value(str(col_name))
        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                if value is None or str(value).lower() == "nan":
                    value = ""
                sheet[row_idx, col_idx].set_value(str(value))
    doc.saveas(output_file)
    logging.info(f"ODS-Datei gespeichert: {output_file}")


# -------------------------------------------------
# MAIN PROGRAM
# -------------------------------------------------
def main():
    """Read INPUT_FILE, restructure its sheets and write OUTPUT_FILE.

    Category sheets are processed with :func:`process_category_df` and
    :func:`merge_new_terms`; sheets without an ID/term column are passed
    through unchanged.  The master sheet is rebuilt from the category sheets.
    Sheets are written in SHEET_ORDER, followed by any remaining sheets in
    their input order.  Problems with the configured paths are logged and
    end the run without raising.
    """
    if not os.path.exists(INPUT_FILE):
        logging.error(f"Datei {INPUT_FILE} existiert nicht.")
        return

    # openpyxl cannot read the legacy binary .xls format; pandas uses xlrd for it.
    engines = {".xlsx": "openpyxl", ".xls": "xlrd", ".ods": "odf"}
    ext = os.path.splitext(INPUT_FILE)[1].lower()
    engine = engines.get(ext)
    if engine is None:
        logging.error("Nicht unterstütztes Dateiformat")
        return

    # Validate the target before doing any work; the openpyxl writer does not
    # accept the .xls extension.
    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out not in (".xlsx", ".ods"):
        logging.error(f"Nicht unterstütztes Ausgabeformat: {ext_out}")
        return

    logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
    processed_sheets = {}
    category_dfs = []
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(INPUT_FILE, engine=engine) as xls:
        for sheet_name in xls.sheet_names:
            if sheet_name == MASTER_SHEET_NAME:
                continue
            df = xls.parse(sheet_name)
            df_new = process_category_df(df, sheet_name)
            if df_new is None:
                processed_sheets[sheet_name] = df
                continue
            df_merged = merge_new_terms(df, df_new)
            processed_sheets[sheet_name] = df_merged
            category_dfs.append(df_merged)

    processed_sheets[MASTER_SHEET_NAME] = build_master_df(category_dfs)

    ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}
    # Keep sheets that are not listed in SHEET_ORDER instead of dropping them.
    for name, df in processed_sheets.items():
        if name not in ordered_sheets:
            logging.info(f"Sheet '{name}' steht nicht in SHEET_ORDER und wird angehängt")
            ordered_sheets[name] = df

    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if ext_out == ".xlsx":
        with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
            for name, df in ordered_sheets.items():
                format_excel_sheet(df, name, writer)
        logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
    else:
        save_ods(ordered_sheets, OUTPUT_FILE)


if __name__ == "__main__":
    main()