172 lines
7.1 KiB
Python
172 lines
7.1 KiB
Python
import os
|
|
import re
|
|
import logging
|
|
import datetime
|
|
import pandas as pd
|
|
from openpyxl.utils import get_column_letter
|
|
from openpyxl.styles import Alignment
|
|
import ezodf
|
|
|
|
# ----------------- CONFIGURATION -----------------
# Workbook that is read; its extension selects the reader and the writer.
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"

# Sheet that holds the master list of IDs and categories.
MASTER_SHEET_NAME = "Masterstruktur"

# Date stamp (two-digit year, month, day separated by dots) for the output name.
today = datetime.datetime.now().strftime("%y.%m.%d")

# The result is written next to the input file and keeps its extension.
base, ext = os.path.splitext(INPUT_FILE)
OUTPUT_FILE = "{}_Updated_{}{}".format(base, today, ext)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
|
|
|
# ----------------- HELPER FUNCTIONS -----------------
|
|
|
|
def load_file(input_file):
    """Check the file format and prepare a reader for it.

    Args:
        input_file: Path of the workbook. The format is derived from the
            file extension, compared case-insensitively.

    Returns:
        A tuple ``(xls, engine)``. For ``.xlsx`` this is an opened
        ``pd.ExcelFile`` and ``"openpyxl"``; for ``.xls`` an opened
        ``pd.ExcelFile`` and ``"xlrd"``. For ``.ods`` it is
        ``(None, "odf")``: the file is not opened here because ODS
        documents are read directly through ezodf.

    Raises:
        ValueError: If the extension is none of ``.xlsx``, ``.xls``, ``.ods``.
    """
    ext = os.path.splitext(input_file)[1].lower()

    # openpyxl cannot read the legacy binary .xls format, so that extension
    # has to go through pandas' xlrd engine instead.
    excel_engines = {".xlsx": "openpyxl", ".xls": "xlrd"}

    if ext in excel_engines:
        engine = excel_engines[ext]
        xls = pd.ExcelFile(input_file, engine=engine)
    elif ext == ".ods":
        engine = "odf"
        xls = None  # ODS is read directly through ezodf
    else:
        raise ValueError(f"Nicht unterstütztes Dateiformat: {ext}")

    logging.info("Lade Datei %s mit Engine '%s'", input_file, engine)
    return xls, engine
|
|
|
|
def read_ods_sheet(filename, sheet_name):
    """Read one sheet of an ODS document into a DataFrame of strings.

    The first row supplies the column names (each converted with ``str()``
    and stripped). Every following row becomes one record: cell values that
    are ``None`` become ``""``, everything else is converted with ``str()``
    and stripped. Rows in which every cell ends up empty are skipped.

    Args:
        filename: Path of the ODS document, opened with ``ezodf.opendoc``.
        sheet_name: Name of the sheet to read.

    Returns:
        A ``pd.DataFrame`` whose columns are the header texts.
    """
    table = ezodf.opendoc(filename).sheets[sheet_name]

    # NOTE(review): a header cell whose value is None would become the text
    # "None" here, because str() is applied before any emptiness check.
    headers = [str(table[0, c].value).strip() for c in range(table.ncols())]

    records = []
    for r in range(1, table.nrows()):
        raw_cells = (table[r, c].value for c in range(len(headers)))
        texts = ["" if raw is None else str(raw).strip() for raw in raw_cells]
        # Keep the row only if at least one cell carries text.
        if any(texts):
            records.append(dict(zip(headers, texts)))

    return pd.DataFrame(records, columns=headers)
|
|
|
|
def process_category_sheet(df):
    """Build the staircase-like hierarchy for one category sheet.

    Each input row is classified by the first non-empty field among
    ``ID``, ``Unterunterkategorie`` and ``Wort/Vokabel`` (in that order):

    * ``ID`` set: category row. The output keeps the ID and puts the
      ``Unterunterkategorie`` text (or, if that is empty, the
      ``Wort/Vokabel`` text) into ``Unterunterkategorie``.
    * only ``Unterunterkategorie`` set: the output carries just that text.
    * only ``Wort/Vokabel`` set: the output carries just that text.
    * nothing set: the row is dropped.

    The ``Unterkategorie`` column is always emitted empty. Missing input
    columns are treated as empty, and the input frame is not modified.

    Args:
        df: Raw sheet contents as a ``pd.DataFrame``.

    Returns:
        A new ``pd.DataFrame`` with exactly the columns ``ID``,
        ``Unterkategorie``, ``Unterunterkategorie`` and ``Wort/Vokabel``.
    """
    columns = ["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"]

    def _text(value):
        # Blank cells read via pd.read_excel arrive as NaN/None; str() would
        # turn them into the truthy texts "nan"/"None" and every blank-ID row
        # would be mistaken for a category row.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value).strip()

    rows = []
    for _, r in df.iterrows():
        id_val = _text(r.get("ID", ""))
        uuk_val = _text(r.get("Unterunterkategorie", ""))
        word_val = _text(r.get("Wort/Vokabel", ""))

        if id_val:  # category row
            rows.append({
                "ID": id_val,
                "Unterkategorie": "",
                "Unterunterkategorie": uuk_val or word_val,
                "Wort/Vokabel": "",
            })
        elif uuk_val:  # sub-sub-category row
            rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": uuk_val,
                "Wort/Vokabel": "",
            })
        elif word_val:  # vocabulary row
            rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": word_val,
            })

    return pd.DataFrame(rows, columns=columns)
|
|
|
|
def remove_empty_vocabulary_rows(df):
    """Drop every row whose ``Wort/Vokabel`` cell is empty or whitespace.

    The cell is converted with ``str()`` and stripped before the check.
    The result is an independent copy with a fresh 0-based index; the
    input frame is left untouched.

    NOTE(review): rows that carry only an ID or a sub-sub-category have an
    empty ``Wort/Vokabel`` cell and are therefore removed as well - confirm
    that this is intended.
    """
    word_text = df["Wort/Vokabel"].astype(str).str.strip()
    has_word = word_text.ne("")
    kept = df.loc[has_word].copy()
    return kept.reset_index(drop=True)
|
|
|
|
def sync_master_and_sheets(master_df, category_dfs):
    """Synchronise the category sheets with the master list.

    A sheet row is removed when its ``ID`` (converted with ``str()`` and
    stripped) is non-empty and does not occur in the master. Rows with an
    empty ID, i.e. vocabulary rows, are always kept.

    Args:
        master_df: Master sheet with the columns ``ID`` and ``Kategorie``.
            It is not modified.
        category_dfs: Mapping of sheet name to that sheet's DataFrame.

    Returns:
        A tuple ``(new_master, updated_dfs, summary)``:

        * ``new_master``: DataFrame with the columns ``ID`` and
          ``Kategorie``, one row per distinct stripped ID, sorted by the ID
          text. For a repeated ID the last ``Kategorie`` wins.
        * ``updated_dfs``: mapping of sheet name to the filtered DataFrame
          with a fresh 0-based index.
        * ``summary``: mapping of sheet name to ``{"removed": count}``.

    Raises:
        KeyError: If ``master_df`` lacks the ``ID`` or ``Kategorie`` column.
    """
    master_ids = master_df["ID"].astype(str).str.strip()
    master_dict = dict(zip(master_ids, master_df["Kategorie"]))
    known_ids = list(master_dict)

    updated_dfs = {}
    summary = {}
    for sheet_name, df in category_dfs.items():
        if "ID" in df.columns:
            ids = df["ID"].astype(str).str.strip()
            # Only rows that name an ID can be orphaned; blank-ID rows stay.
            orphaned = (ids != "") & ~ids.isin(known_ids)
        else:
            orphaned = pd.Series(False, index=df.index)

        updated_dfs[sheet_name] = df[~orphaned].reset_index(drop=True)
        summary[sheet_name] = {"removed": int(orphaned.sum())}

    # Explicit columns keep the master's layout even when it has no rows.
    new_master = pd.DataFrame(
        [{"ID": k, "Kategorie": v} for k, v in sorted(master_dict.items())],
        columns=["ID", "Kategorie"],
    )
    return new_master, updated_dfs, summary
|
|
|
|
def save_excel(processed_sheets, output_file):
    """Write every DataFrame to its own worksheet of an Excel workbook.

    Each sheet is written without the index. Afterwards every column is
    widened to its longest text (header included) plus two characters, and
    all cells of the header and data rows are left-aligned.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame.
        output_file: Path of the workbook to create; written with the
            openpyxl engine.
    """
    # One shared style object is enough; there is no need to build a new
    # Alignment for every single cell.
    left_aligned = Alignment(horizontal="left")

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        for sheet_name, df in processed_sheets.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            ws = writer.sheets[sheet_name]
            last_row = len(df) + 1  # header row plus data rows

            for col_idx, col in enumerate(df.columns, 1):
                longest_cell = df[col].astype(str).map(len).max() if len(df) > 0 else 0
                # str() so that non-string column labels do not break len().
                max_len = max(longest_cell, len(str(col))) + 2
                ws.column_dimensions[get_column_letter(col_idx)].width = max_len

                for row_idx in range(1, last_row + 1):
                    ws.cell(row=row_idx, column=col_idx).alignment = left_aligned
|
|
|
|
def save_ods(processed_sheets, output_file):
    """Write every DataFrame to its own sheet of a new ODS document.

    Each sheet gets one header row with the column labels followed by the
    data rows. Values for which ``pd.isna`` is true are written as an empty
    string. The document is saved once, after all sheets are filled.

    Args:
        processed_sheets: Mapping of sheet name to DataFrame.
        output_file: Path under which the ODS document is saved.
    """
    document = ezodf.newdoc(doctype="ods", filename=output_file)

    for title, frame in processed_sheets.items():
        n_rows = len(frame)
        n_cols = len(frame.columns)
        # One extra row for the header line.
        table = ezodf.Sheet(title, size=(n_rows + 1, n_cols))
        document.sheets += table

        for c, label in enumerate(frame.columns):
            table[0, c].set_value(label)

        for r, record in enumerate(frame.itertuples(index=False), start=1):
            for c, cell in enumerate(record):
                content = "" if pd.isna(cell) else cell
                table[r, c].set_value(content)

    document.save()
|
|
|
|
# ----------------- MAIN PROGRAM -----------------
def main():
    """Read the workbook, synchronise all sheets with the master, and save.

    Steps:
        1. Load every sheet except the master sheet and turn it into the
           staircase layout via ``process_category_sheet``; load the master
           sheet unchanged.
        2. Remove sheet rows whose ID is unknown to the master via
           ``sync_master_and_sheets``.
        3. Filter the category sheets with ``remove_empty_vocabulary_rows``
           and write everything to ``OUTPUT_FILE`` (Excel writer for
           ``.xlsx``/``.xls``, ODS writer otherwise).
        4. Log the output path and the per-sheet removal counts.
    """
    xls, engine = load_file(INPUT_FILE)

    if engine == "odf":
        doc = ezodf.opendoc(INPUT_FILE)
        sheet_names = [s.name for s in doc.sheets if s.name != MASTER_SHEET_NAME]
        category_dfs = {
            name: process_category_sheet(read_ods_sheet(INPUT_FILE, name))
            for name in sheet_names
        }
        master_df = read_ods_sheet(INPUT_FILE, MASTER_SHEET_NAME)
    else:
        # The context manager closes the workbook handle once all sheets are
        # read, including when reading one of them fails.
        with xls:
            sheet_names = [s for s in xls.sheet_names if s != MASTER_SHEET_NAME]
            category_dfs = {}
            for sheet_name in sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
                df.columns = [str(c).strip() for c in df.columns]
                category_dfs[sheet_name] = process_category_sheet(df)
            master_df = pd.read_excel(xls, sheet_name=MASTER_SHEET_NAME, engine=engine)
            master_df.columns = [str(c).strip() for c in master_df.columns]

    new_master, updated_dfs, summary = sync_master_and_sheets(master_df, category_dfs)

    processed_sheets = {MASTER_SHEET_NAME: new_master}
    # NOTE(review): this filter keeps only rows with a non-empty Wort/Vokabel
    # cell, so the ID and sub-sub-category rows of each sheet are dropped
    # from the output - confirm that this is intended.
    processed_sheets.update(
        {k: remove_empty_vocabulary_rows(v) for k, v in updated_dfs.items()}
    )

    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out in [".xlsx", ".xls"]:
        save_excel(processed_sheets, OUTPUT_FILE)
    else:
        save_ods(processed_sheets, OUTPUT_FILE)

    logging.info("Datei gespeichert: %s", OUTPUT_FILE)
    logging.info("===== SYNC SUMMARY =====")
    for sheet, info in summary.items():
        logging.info("%s: %s", sheet, info)


# Run the synchronisation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|