GND_Skript_Test/NV_Master_EditorFAIL.py
2025-10-10 09:46:41 +02:00

172 lines
7.1 KiB
Python

import os
import re
import logging
import datetime
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
import ezodf
# ----------------- CONFIGURATION -----------------
# Path of the spreadsheet to process; load_file accepts .ods, .xlsx and .xls.
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
# Name of the sheet that main() reads as the master list of categories.
MASTER_SHEET_NAME = "Masterstruktur"
# Date stamp (two-digit year.month.day), computed once when the module is loaded.
today = datetime.datetime.today().strftime("%y.%m.%d")
base, ext = os.path.splitext(INPUT_FILE)
# The result is written next to the input and keeps its extension:
# <base>_Updated_<date><ext>.
OUTPUT_FILE = f"{base}_Updated_{today}{ext}"
# Configure logging at INFO level; each record shows timestamp, level and message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# ----------------- HELPER FUNCTIONS -----------------
def load_file(input_file):
    """
    Check the file format and pick the matching reader.

    Args:
        input_file: Path of the spreadsheet. Only its extension is inspected,
            case-insensitively.

    Returns:
        A tuple ``(xls, engine)``. For ``.xlsx``/``.xls`` files ``xls`` is an
        open ``pd.ExcelFile`` (the caller is responsible for closing it) and
        ``engine`` is the pandas engine it was opened with. For ``.ods``
        files ``xls`` is ``None`` and ``engine`` is ``"odf"``, because ODS
        files are read directly through ezodf.

    Raises:
        ValueError: If the extension is not ``.xlsx``, ``.xls`` or ``.ods``.
    """
    ext = os.path.splitext(input_file)[1].lower()
    if ext == ".xlsx":
        engine = "openpyxl"
    elif ext == ".xls":
        # openpyxl cannot read the legacy binary .xls format; pandas reads
        # those files through the xlrd engine.
        engine = "xlrd"
    elif ext == ".ods":
        engine = "odf"
    else:
        raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")

    # ODS is read directly via ezodf, so no ExcelFile is opened for it.
    xls = None if engine == "odf" else pd.ExcelFile(input_file, engine=engine)
    logging.info("Lade Datei %s mit Engine '%s'", input_file, engine)
    return xls, engine
def read_ods_sheet(filename, sheet_name):
    """
    Read one sheet of an ODS document into a DataFrame of stripped strings.

    The first row is taken as the header. Columns whose header cell is empty
    are ignored, and rows whose retained cells are all empty are skipped.

    Args:
        filename: Path of the ODS file, opened with ``ezodf.opendoc``.
        sheet_name: Name of the sheet to read.

    Returns:
        A ``pd.DataFrame`` with one column per non-empty header cell. Every
        value is a string; empty cells are represented by ``""``.
    """
    doc = ezodf.opendoc(filename)
    sheet = doc.sheets[sheet_name]
    if sheet.nrows() == 0:
        # No header row to read at all.
        return pd.DataFrame()

    # Remember (column index, header) pairs so columns with an empty header
    # cell can be skipped; stringifying None would otherwise name every such
    # column "None" and make them collide.
    columns = []
    for col_idx in range(sheet.ncols()):
        header_val = sheet[0, col_idx].value
        header = "" if header_val is None else str(header_val).strip()
        if header:
            columns.append((col_idx, header))
    headers = [header for _, header in columns]

    data = []
    for row_idx in range(1, sheet.nrows()):
        row = {}
        for col_idx, header in columns:
            cell_val = sheet[row_idx, col_idx].value
            row[header] = "" if cell_val is None else str(cell_val).strip()
        # Keep the row only if at least one retained cell holds text.
        if any(row.values()):
            data.append(row)
    return pd.DataFrame(data, columns=headers)
def process_category_sheet(df):
    """
    Build the staircase-like hierarchy of a category sheet.

    Each input row is classified by the first non-empty cell among "ID",
    "Unterunterkategorie" and "Wort/Vokabel" (in that order) and emitted as a
    row that fills only the column of its level:

    * ID present: category row; keeps the ID and puts the sub-sub-category
      (or, if that is empty, the word) into "Unterunterkategorie".
    * only "Unterunterkategorie" present: sub-sub-category row.
    * only "Wort/Vokabel" present: vocabulary row.

    Rows with none of the three are dropped. Missing cells (None/NaN, as
    produced by ``pd.read_excel`` for empty cells) count as empty.

    Args:
        df: Sheet contents. Missing hierarchy columns are treated as empty.
            The input is not modified.

    Returns:
        A new ``pd.DataFrame`` with exactly the columns "ID",
        "Unterkategorie", "Unterunterkategorie" and "Wort/Vokabel".
    """
    def _text(value):
        # str(NaN) would be the truthy string "nan", so map missing to "".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value).strip()

    hierarchy_cols = ["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"]
    df = df.copy()
    for col in hierarchy_cols:
        if col not in df.columns:
            df[col] = ""

    rows = []
    cells = zip(df["ID"], df["Unterunterkategorie"], df["Wort/Vokabel"])
    for raw_id, raw_uuk, raw_word in cells:
        id_val = _text(raw_id)
        uuk_val = _text(raw_uuk)
        word_val = _text(raw_word)
        if id_val:  # category row
            rows.append({
                "ID": id_val,
                "Unterkategorie": "",
                "Unterunterkategorie": uuk_val or word_val,
                "Wort/Vokabel": "",
            })
        elif uuk_val:  # sub-sub-category row
            rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": uuk_val,
                "Wort/Vokabel": "",
            })
        elif word_val:  # vocabulary row
            rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": word_val,
            })
    return pd.DataFrame(rows, columns=hierarchy_cols)
def remove_empty_vocabulary_rows(df):
    """
    Return only the rows whose "Wort/Vokabel" cell holds text.

    A cell counts as empty when it is missing (None/NaN) or contains only
    whitespace.

    NOTE(review): rows that represent a category or sub-sub-category carry an
    empty "Wort/Vokabel" cell and are therefore dropped as well — confirm
    that this is intended.

    Args:
        df: DataFrame with a "Wort/Vokabel" column.

    Returns:
        A filtered copy of ``df`` with a fresh 0-based index.
    """
    words = df["Wort/Vokabel"]
    # Test for missing values first: astype(str) would turn them into the
    # non-empty strings "nan"/"None" and the rows would wrongly be kept.
    is_blank = words.isna() | (words.astype(str).str.strip() == "")
    return df[~is_blank].copy().reset_index(drop=True)
def sync_master_and_sheets(master_df, category_dfs):
    """
    Synchronise the category sheets with the master list.

    Rows of a category sheet that carry an ID unknown to the master are
    removed. Rows without an ID (e.g. vocabulary rows) are always kept.

    Args:
        master_df: Master sheet with the columns "ID" and "Kategorie".
            IDs are compared as stripped strings; missing IDs (None/NaN)
            count as empty. Rows where both ID and category are empty are
            ignored. If an ID occurs more than once, the last row wins.
        category_dfs: Mapping of sheet name to that sheet's DataFrame.

    Returns:
        A tuple ``(new_master, updated_dfs, summary)``:

        * ``new_master``: DataFrame with the columns "ID" and "Kategorie",
          one row per master ID, sorted by ID as a string.
        * ``updated_dfs``: Mapping of sheet name to the filtered DataFrame
          (index reset).
        * ``summary``: Mapping of sheet name to ``{"removed": <count>}``.

    Raises:
        KeyError: If ``master_df`` lacks the "ID" or "Kategorie" column.
    """
    def _text(value):
        # str(NaN) would be the string "nan"; treat missing values as empty.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value).strip()

    missing = [col for col in ("ID", "Kategorie") if col not in master_df.columns]
    if missing:
        raise KeyError(f"Master sheet is missing column(s): {', '.join(missing)}")

    master_dict = {}
    for raw_id, category in zip(master_df["ID"], master_df["Kategorie"]):
        master_id = _text(raw_id)
        if not master_id and not _text(category):
            continue  # completely empty master row
        master_dict[master_id] = category

    updated_dfs = {}
    summary = {}
    for sheet_name, df in category_dfs.items():
        if "ID" in df.columns:
            id_texts = [_text(value) for value in df["ID"]]
        else:
            # Without an ID column no row can reference an unknown category.
            id_texts = [""] * len(df)

        kept_positions = []
        removed = 0
        for position, id_val in enumerate(id_texts):
            if id_val and id_val not in master_dict:
                removed += 1
            else:
                kept_positions.append(position)

        updated_dfs[sheet_name] = df.iloc[kept_positions].reset_index(drop=True)
        summary[sheet_name] = {"removed": removed}

    # Passing the columns explicitly keeps the header even for an empty master.
    new_master = pd.DataFrame(
        [{"ID": key, "Kategorie": master_dict[key]} for key in sorted(master_dict)],
        columns=["ID", "Kategorie"],
    )
    return new_master, updated_dfs, summary
def save_excel(processed_sheets, output_file):
    """
    Write every DataFrame as its own worksheet of an Excel workbook.

    Each column is widened to its longest cell text (or header, if longer)
    plus two characters, and all cells including the header row are
    left-aligned.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame, written in
            iteration order.
        output_file: Target path, written with the openpyxl engine.
    """
    # One shared style object instead of a new Alignment per cell.
    left_aligned = Alignment(horizontal="left")
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in processed_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            ws = writer.sheets[sheet_name]
            last_row = len(df) + 1  # header row plus one row per record
            for col_idx, col in enumerate(df.columns, 1):
                longest_cell = df[col].astype(str).map(len).max() if len(df) > 0 else 0
                # str(col): column labels are not guaranteed to be strings.
                width = int(max(longest_cell, len(str(col)))) + 2
                ws.column_dimensions[get_column_letter(col_idx)].width = width
                for row_idx in range(1, last_row + 1):
                    ws.cell(row=row_idx, column=col_idx).alignment = left_aligned
def save_ods(processed_sheets, output_file):
    """
    Write every DataFrame as its own sheet of a new ODS document.

    The first row of each sheet holds the column labels; the records follow
    from the second row on. Missing values are written as empty strings.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame, written in
            iteration order.
        output_file: Path the ezodf document is created for and saved to.
    """
    document = ezodf.newdoc(doctype="ods", filename=output_file)
    for sheet_title, frame in processed_sheets.items():
        # One extra row for the header line.
        dimensions = (len(frame) + 1, len(frame.columns))
        target = ezodf.Sheet(sheet_title, size=dimensions)
        document.sheets += target

        for c, label in enumerate(frame.columns):
            target[0, c].set_value(label)

        for r, record in enumerate(frame.itertuples(index=False), start=1):
            for c, cell in enumerate(record):
                if pd.isna(cell):
                    target[r, c].set_value("")
                else:
                    target[r, c].set_value(cell)
    document.save()
# ----------------- MAIN PROGRAM -----------------
def main():
    """
    Run the full synchronisation for INPUT_FILE and write OUTPUT_FILE.

    Steps:
        1. Read all sheets of INPUT_FILE (ODS via ezodf, Excel via pandas);
           every sheet except MASTER_SHEET_NAME is turned into a hierarchy
           with process_category_sheet.
        2. Synchronise the category sheets against the master sheet.
        3. Filter each category sheet with remove_empty_vocabulary_rows and
           save everything in the format given by OUTPUT_FILE's extension.
        4. Log the output path and the per-sheet summary.
    """
    xls, engine = load_file(INPUT_FILE)
    try:
        if engine == "odf":
            doc = ezodf.opendoc(INPUT_FILE)
            sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
            category_dfs = {
                name: process_category_sheet(read_ods_sheet(INPUT_FILE, name))
                for name in sheet_names
            }
            master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
        else:
            def read_sheet(name):
                # The ExcelFile already carries its engine, so it is not
                # passed again.
                df = xls.parse(sheet_name=name)
                df.columns = [str(c).strip() for c in df.columns]
                # Empty cells arrive as NaN; downstream code stringifies cells
                # and would see "nan", so replace missing values with "".
                return df.astype(object).where(df.notna(), "")

            sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
            category_dfs = {
                name: process_category_sheet(read_sheet(name)) for name in sheet_names
            }
            master_df = read_sheet(MASTER_SHEET_NAME)
    finally:
        # Release the file handle held by pd.ExcelFile (None for ODS input).
        if xls is not None:
            xls.close()

    new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)

    processed_sheets = {MASTER_SHEET_NAME: new_master}
    # NOTE(review): this filter keeps only rows with a non-empty
    # "Wort/Vokabel" cell, which also removes the category rows created by
    # process_category_sheet — confirm that this is intended.
    processed_sheets.update(
        {name: remove_empty_vocabulary_rows(df) for name, df in updated_dfs.items()}
    )

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx", ".xls"]:
        save_excel(processed_sheets, OUTPUT_FILE)
    else:
        save_ods(processed_sheets, OUTPUT_FILE)

    logging.info("Datei gespeichert: %s", OUTPUT_FILE)
    logging.info("===== SYNC SUMMARY =====")
    for sheet, info in summary.items():
        logging.info("%s: %s", sheet, info)


if __name__ == "__main__":
    main()