"""Synchronise the category sheets of a vocabulary workbook with its master sheet.

The script reads every sheet of ``INPUT_FILE`` (ODS via ezodf, XLSX via
pandas/openpyxl), normalises each category sheet into a staircase layout,
removes category rows whose ID is not listed in the master sheet and writes
the result to ``OUTPUT_FILE`` in the same format as the input.
"""

import os
import re
import logging
import datetime

import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf

# ----------------- CONFIGURATION -----------------
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
MASTER_SHEET_NAME = "Masterstruktur"

# The output is written next to the input, tagged with today's date (yy.mm.dd).
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Column layout shared by every processed category sheet.
CATEGORY_COLUMNS = ["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"]


# ----------------- HELPER FUNCTIONS -----------------
def _cell_text(value):
    """Return a cell value as stripped text, mapping None/NaN/NA to ``""``.

    ``pd.read_excel`` represents empty cells as NaN; a plain ``str()`` would
    turn those into the non-empty text ``"nan"``.
    """
    if value is None:
        return ""
    if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def load_file(input_file):
    """Check the file format and choose the reader for it.

    Args:
        input_file: Path of the workbook to load.

    Returns:
        ``(xls, engine)``: for ``.xlsx``/``.xls`` a ``pd.ExcelFile`` and
        ``"openpyxl"``; for ``.ods`` ``None`` and ``"odf"`` (ODS sheets are
        read directly through ezodf).

    Raises:
        ValueError: If the extension is none of ``.xlsx``, ``.xls``, ``.ods``.
    """
    ext = os.path.splitext(input_file)[1].lower()
    if ext in [".xlsx", ".xls"]:
        # NOTE(review): openpyxl targets the OOXML formats; legacy .xls files
        # probably cannot be opened with it -- confirm whether .xls input is
        # really expected here.
        engine = "openpyxl"
        xls = pd.ExcelFile(input_file, engine=engine)
    elif ext == ".ods":
        engine = "odf"
        xls = None  # ODS is read directly through ezodf
    else:
        raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")
    logging.info(f"Lade Datei {input_file} mit Engine '{engine}'")
    return xls, engine


def read_ods_sheet(filename, sheet_name, doc=None):
    """Read one ODS sheet into a DataFrame of stripped strings.

    The first row is used as the header. Columns whose header cell is empty
    are skipped, and rows that are empty in every named column are dropped.

    Args:
        filename: Path of the ODS document; opened only when ``doc`` is None.
        sheet_name: Name of the sheet to read.
        doc: Optional already-opened ezodf document, so callers reading
            several sheets do not have to re-parse the file each time.

    Returns:
        A DataFrame with one string column per named header cell; an empty
        DataFrame without columns if the sheet has no rows at all.
    """
    if doc is None:
        doc = ezodf.opendoc(filename)
    sheet = doc.sheets[sheet_name]
    n_rows, n_cols = sheet.nrows(), sheet.ncols()
    if n_rows == 0:
        return pd.DataFrame()

    # Keep only columns that actually carry a header name.
    named_columns = []
    for col_idx in range(n_cols):
        name = _cell_text(sheet[0, col_idx].value)
        if name:
            named_columns.append((col_idx, name))
    # Collapse repeated header names; for duplicates the right-most column
    # wins because it overwrites the earlier value in the record dict below.
    headers = list(dict.fromkeys(name for _, name in named_columns))

    records = []
    for row_idx in range(1, n_rows):
        record = {
            name: _cell_text(sheet[row_idx, col_idx].value)
            for col_idx, name in named_columns
        }
        if any(record.values()):
            records.append(record)
    return pd.DataFrame(records, columns=headers)


def process_category_sheet(df):
    """Build the staircase hierarchy for one category sheet.

    Each input row produces at most one output row, decided by the first
    non-empty field in this order:

    * ``ID`` set: a category row carrying the ID and, as
      ``Unterunterkategorie``, the row's sub-sub-category or (if that is
      empty) its word.
    * ``Unterunterkategorie`` set: a row carrying only that value.
    * ``Wort/Vokabel`` set: a row carrying only the word.

    Rows with none of the three are dropped. The ``Unterkategorie`` column is
    always emitted empty. Missing columns are treated as empty, and None/NaN
    cells count as empty.

    Args:
        df: Raw sheet contents.

    Returns:
        A new DataFrame with the columns ``ID``, ``Unterkategorie``,
        ``Unterunterkategorie`` and ``Wort/Vokabel``.
    """
    df = df.copy()
    for col in CATEGORY_COLUMNS:
        if col not in df.columns:
            df[col] = ""

    rows = []
    cells = zip(df["ID"], df["Unterunterkategorie"], df["Wort/Vokabel"])
    for raw_id, raw_uuk, raw_word in cells:
        id_val = _cell_text(raw_id)
        uuk_val = _cell_text(raw_uuk)
        word_val = _cell_text(raw_word)

        if id_val:  # category row
            rows.append({"ID": id_val, "Unterkategorie": "",
                         "Unterunterkategorie": uuk_val or word_val, "Wort/Vokabel": ""})
        elif uuk_val:  # sub-sub-category row
            rows.append({"ID": "", "Unterkategorie": "",
                         "Unterunterkategorie": uuk_val, "Wort/Vokabel": ""})
        elif word_val:  # vocabulary row
            rows.append({"ID": "", "Unterkategorie": "",
                         "Unterunterkategorie": "", "Wort/Vokabel": word_val})

    return pd.DataFrame(rows, columns=CATEGORY_COLUMNS)


def remove_empty_vocabulary_rows(df):
    """Return a copy of ``df`` without rows whose ``Wort/Vokabel`` is empty.

    Whitespace-only and None/NaN cells count as empty. The index of the
    result is renumbered from 0.
    """
    has_word = df["Wort/Vokabel"].map(_cell_text) != ""
    return df[has_word].copy().reset_index(drop=True)


def sync_master_and_sheets(master_df, category_dfs):
    """Drop category rows whose ID is unknown to the master sheet.

    Rows with an empty ID (sub-sub-category and vocabulary rows) are always
    kept.

    Args:
        master_df: Master sheet with the columns ``ID`` and ``Kategorie``.
        category_dfs: Mapping of sheet name to processed category DataFrame.

    Returns:
        ``(new_master, updated_dfs, summary)``: the master as an
        ``ID``/``Kategorie`` frame sorted by ID text (for a repeated ID the
        last ``Kategorie`` wins), the filtered sheets, and per sheet a dict
        ``{"removed": <number of dropped rows>}``.
    """
    master_ids = master_df["ID"].map(_cell_text)
    master_dict = dict(zip(master_ids, master_df["Kategorie"]))

    updated_dfs = {}
    summary = {}
    for sheet_name, df in category_dfs.items():
        if "ID" in df.columns:
            ids = df["ID"].map(_cell_text)
            unknown = (ids != "") & ~ids.isin(list(master_dict))
            kept = df[~unknown].reset_index(drop=True)
            removed = int(unknown.sum())
        else:
            # Without an ID column no row can reference a category.
            kept = df.reset_index(drop=True)
            removed = 0
        updated_dfs[sheet_name] = kept
        summary[sheet_name] = {"removed": removed}

    # Explicit columns keep the frame well-formed even for an empty master.
    new_master = pd.DataFrame(
        [{"ID": k, "Kategorie": v} for k, v in sorted(master_dict.items())],
        columns=["ID", "Kategorie"],
    )
    return new_master, updated_dfs, summary


def save_excel(processed_sheets, output_file):
    """Write all sheets to an Excel workbook via openpyxl.

    Each column is sized to its longest text (header included) plus two
    characters, and every written cell is left-aligned.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame.
        output_file: Path of the workbook to create.
    """
    left_aligned = Alignment(horizontal='left')
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in processed_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            ws = writer.sheets[sheet_name]
            last_row = len(df) + 1  # data rows plus the header row
            for col_idx, col in enumerate(df.columns, 1):
                longest = int(df[col].astype(str).map(len).max()) if len(df) > 0 else 0
                width = max(longest, len(str(col))) + 2
                ws.column_dimensions[get_column_letter(col_idx)].width = width
                for row_idx in range(1, last_row + 1):
                    ws.cell(row=row_idx, column=col_idx).alignment = left_aligned


def save_ods(processed_sheets, output_file):
    """Write all sheets to a new ODS document via ezodf.

    The first row of every sheet holds the column names; missing values are
    written as empty strings.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame.
        output_file: Path of the document to create.
    """
    doc = ezodf.newdoc(doctype="ods", filename=output_file)
    for name, df in processed_sheets.items():
        sheet = ezodf.Sheet(name, size=(len(df) + 1, len(df.columns)))
        doc.sheets += sheet
        for col_idx, col_name in enumerate(df.columns):
            sheet[0, col_idx].set_value(col_name)
        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                sheet[row_idx, col_idx].set_value("" if pd.isna(value) else value)
    doc.save()


# ----------------- MAIN PROGRAM -----------------
def main():
    """Load the workbook, sync all category sheets with the master and save."""
    xls, engine = load_file(INPUT_FILE)

    if engine == "odf":
        # Open the document once and reuse it for every sheet.
        doc = ezodf.opendoc(INPUT_FILE)
        sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
        category_dfs = {
            name: process_category_sheet(read_ods_sheet(INPUT_FILE, name, doc=doc))
            for name in sheet_names
        }
        master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME, doc=doc)
    else:
        sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
        category_dfs = {}
        for sheet_name in sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
            df.columns = [str(c).strip() for c in df.columns]
            category_dfs[sheet_name] = process_category_sheet(df)
        master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
        master_df.columns = [str(c).strip() for c in master_df.columns]

    new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)

    # NOTE(review): process_category_sheet emits category and
    # sub-sub-category rows with an empty "Wort/Vokabel", so the filter below
    # also removes those hierarchy rows -- confirm this is intended.
    processed_sheets = {MASTER_SHEET_NAME: new_master}
    processed_sheets.update({k: remove_empty_vocabulary_rows(v) for k, v in updated_dfs.items()})

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx", ".xls"]:
        save_excel(processed_sheets, OUTPUT_FILE)
    else:
        save_ods(processed_sheets, OUTPUT_FILE)
    logging.info(f"Datei gespeichert: {OUTPUT_FILE}")

    logging.info("===== SYNC SUMMARY =====")
    for sheet, info in summary.items():
        logging.info(f"{sheet}: {info}")


if __name__ == "__main__":
    main()