# GND_Skript_Test/Masterfile_Editor.py
# 2025-10-10 09:46:41 +02:00
#
# 213 lines
# 7.6 KiB
# Python

import os
import re
import logging
import pandas as pd
import ezodf
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment
# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
# Workbook that main() reads; its extension selects the pandas reader engine.
INPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Input CSV/Normvokabular_INTERN/NV_MASTER.ods"
# Workbook that main() writes; its extension selects the Excel or ODS writer.
OUTPUT_FILE = r"/home/jarnold/projects/GND-Skript Test/Normvokabular_INTERN/NV_MASTER_Updated.ods"
# Overview sheet: skipped while reading and rebuilt from the category sheets.
MASTER_SHEET_NAME = "Masterstruktur"
# Order in which sheets are written; main() only writes sheets named here.
SHEET_ORDER = [
"Masterstruktur",
"1 Figur",
"2 Objekt",
"3 Flora",
"4 Fauna",
"5 Landschaft",
"6 Phänomene, Erscheinungen",
"7 Architektur",
"8 Verzierungen, Ornamentik",
"9 Aktivität, Handlung, Pose"
]
# Log at INFO level with timestamp and severity in every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# -------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------
def detect_id_and_name(df):
    """Find the ID column and the name/word column of a sheet.

    Headers are compared after ``str()`` conversion, whitespace stripping and
    lower-casing. ``"id"`` marks the ID column; ``"name"``, ``"wort"`` or
    ``"wort/vokabel"`` marks the name column. If several headers qualify, the
    right-most one is used.

    Args:
        df: DataFrame whose column labels are inspected.

    Returns:
        Tuple ``(id_col, name_col)`` with the original column labels; an entry
        is ``None`` when no matching column exists (a warning is logged then).
    """
    name_aliases = ["name", "wort", "wort/vokabel"]
    id_col = None
    name_col = None
    for label in df.columns:
        normalized = str(label).strip().lower()
        if normalized == "id":
            id_col = label
        elif normalized in name_aliases:
            name_col = label
    if id_col is None or name_col is None:
        logging.warning(f"Sheet hat keine ID oder Name/Wort-Spalte: {df.columns}")
    return id_col, name_col
def parse_id_level(id_val):
    """Return the hierarchy depth encoded in a dotted numeric ID.

    ``"3"`` is depth 1, ``"3.1"`` depth 2 and ``"3.1.4"`` depth 3. The value is
    converted with ``str()`` and stripped before matching.

    Args:
        id_val: Cell value of the ID column (any scalar).

    Returns:
        1, 2 or 3 for a valid ID; ``None`` for missing values and for anything
        that is not one to three dot-separated digit groups.
    """
    if pd.isna(id_val):
        return None
    text = str(id_val).strip()
    is_valid = re.match(r'^\d+(\.\d+){0,2}$', text)
    # Number of dot-separated parts == number of dots + 1.
    return text.count(".") + 1 if is_valid else None
def process_category_df(df, sheet_name):
    """Convert one category sheet into the four-column vocabulary layout.

    Each input row becomes one output row. Rows whose ID is a valid dotted
    number (see ``parse_id_level``) keep their ID and are annotated with the
    name of the enclosing depth-2 / depth-3 entry. All other rows keep only
    their word; their ID is written as an empty string. Rows with neither ID
    nor name are dropped.

    Args:
        df: Raw sheet contents.
        sheet_name: Sheet title, used only in the log message.

    Returns:
        DataFrame with the columns ``ID``, ``Unterkategorie``,
        ``Unterunterkategorie`` and ``Wort/Vokabel``, or ``None`` when the
        sheet lacks an ID column or a name column.
    """
    id_col, name_col = detect_id_and_name(df)
    if id_col is None or name_col is None:
        return None

    output_columns = ["ID", "Unterkategorie", "Unterunterkategorie", "Wort/Vokabel"]
    # Name of the most recent entry seen at hierarchy depth 2 and 3.
    current_level = {2: None, 3: None}
    new_rows = []

    # Iterate the two columns directly: iterrows() builds a Series per row and
    # may upcast values (e.g. int IDs to float), which would change str(id).
    for raw_id, raw_name in zip(df[id_col], df[name_col]):
        id_val = raw_id if pd.notna(raw_id) else ""
        name_val = raw_name if pd.notna(raw_name) else ""
        if not id_val and not name_val:
            continue

        level = parse_id_level(id_val)
        if not level:
            # No usable ID: keep the word without hierarchy information.
            new_rows.append({
                "ID": "",
                "Unterkategorie": "",
                "Unterunterkategorie": "",
                "Wort/Vokabel": name_val
            })
            continue

        if level >= 2:
            current_level[level] = name_val
        # Entering a new entry at any depth invalidates everything below it,
        # including when a new top-level ID starts; otherwise a depth-3 row
        # could inherit the sub-category of the previous top-level group.
        for deeper in range(level + 1, 4):
            current_level[deeper] = None

        sub_name = current_level[2] if level >= 2 else ""
        sub_sub_name = current_level[3] if level >= 3 else ""
        new_rows.append({
            "ID": id_val,
            # A missing parent entry is written as "" rather than None.
            "Unterkategorie": "" if sub_name is None else sub_name,
            "Unterunterkategorie": "" if sub_sub_name is None else sub_sub_name,
            "Wort/Vokabel": name_val
        })

    df_new = pd.DataFrame(new_rows, columns=output_columns)
    logging.info(f"Sheet '{sheet_name}' verarbeitet: {len(df_new)} Zeilen")
    return df_new
def merge_new_terms(original_df, processed_df):
    """Append ID-less words from ``original_df`` that ``processed_df`` lacks.

    Words are compared case-insensitively after stripping whitespace. A word
    is only appended once, even if it occurs several times in ``original_df``.

    Args:
        original_df: Raw sheet as read from the workbook.
        processed_df: Normalized sheet with a ``Wort/Vokabel`` column.

    Returns:
        ``processed_df`` itself when nothing was added (or when
        ``original_df`` has no name column); otherwise a new DataFrame with
        the additional rows appended.
    """
    orig_id_col, orig_name_col = detect_id_and_name(original_df)
    if orig_name_col is None:
        return processed_df

    def _as_text(value):
        # Empty cells arrive as NaN/None; plain str() would turn them into the
        # literal text "nan"/"None" and make them look like real content.
        return "" if pd.isna(value) else str(value).strip()

    existing_words = {_as_text(word).lower() for word in processed_df["Wort/Vokabel"].dropna()}

    names = original_df[orig_name_col]
    # Use the detected ID column (any accepted spelling), not a fixed "ID" label.
    ids = original_df[orig_id_col] if orig_id_col is not None else [""] * len(names)

    new_rows = []
    for raw_id, raw_name in zip(ids, names):
        name = _as_text(raw_name)
        if not name or _as_text(raw_id):
            continue
        key = name.lower()
        if key in existing_words:
            continue
        existing_words.add(key)  # prevents adding the same new word twice
        new_rows.append({"ID": "", "Unterkategorie": "", "Unterunterkategorie": "", "Wort/Vokabel": name})

    if not new_rows:
        return processed_df
    df_new = pd.concat([processed_df, pd.DataFrame(new_rows)], ignore_index=True)
    logging.info(f"{len(new_rows)} neue Wörter übernommen.")
    return df_new
def build_master_df(category_dfs):
    """Collect every distinct ID of the category sheets into one overview.

    The first occurrence of an ID determines its name; later duplicates are
    ignored. Rows with an empty or missing ID are skipped.

    Args:
        category_dfs: Iterable of DataFrames that each have the columns
            ``ID`` and ``Wort/Vokabel``.

    Returns:
        DataFrame with the columns ``ID`` and ``Name``. The columns are
        present even when no ID was found.
    """
    seen_ids = set()
    master_rows = []
    for df in category_dfs:
        for id_val, name_val in zip(df["ID"], df["Wort/Vokabel"]):
            # NaN is truthy, so it has to be excluded explicitly.
            if pd.isna(id_val) or not id_val or id_val in seen_ids:
                continue
            seen_ids.add(id_val)
            master_rows.append({"ID": id_val, "Name": name_val})
    # Explicit columns keep the header row intact for an empty result.
    master_df = pd.DataFrame(master_rows, columns=["ID", "Name"])
    logging.info(f"Masterstruktur enthält {len(master_df)} eindeutige IDs")
    return master_df
# -------------------------------------------------
# FORMATTING AND SAVING
# -------------------------------------------------
def format_excel_sheet(df, sheet_name, writer):
    """Write ``df`` to a sheet and apply column widths and left alignment.

    Each column is made as wide as its longest cell text or its header,
    whichever is longer, plus two characters of padding. Header and data
    cells are left-aligned.

    Args:
        df: Data to write (written without the index).
        sheet_name: Title of the target sheet.
        writer: Open ``pd.ExcelWriter``; the worksheet is taken from
            ``writer.sheets`` after writing.
    """
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    worksheet = writer.sheets[sheet_name]
    left_aligned = Alignment(horizontal='left')
    last_row = len(df) + 1  # header row + data rows
    for col_idx, col in enumerate(df.columns, 1):
        # default=0 keeps a sheet with headers but no data rows from raising.
        longest_cell = max(
            (len(str(cell)) if cell is not None else 0 for cell in df[col]),
            default=0,
        )
        # str(col): column labels are not guaranteed to be strings.
        width = max(longest_cell, len(str(col))) + 2
        worksheet.column_dimensions[get_column_letter(col_idx)].width = width
        for row_idx in range(1, last_row + 1):
            worksheet.cell(row=row_idx, column=col_idx).alignment = left_aligned
def save_ods(processed_sheets, output_file):
    """Write all sheets to a new ODS document.

    Every cell is stored as text. Missing values, ``None`` and any value
    whose text form is "nan" (case-insensitive) are written as empty strings.

    Args:
        processed_sheets: Mapping of sheet title to DataFrame; sheets are
            created in the mapping's iteration order.
        output_file: Path of the ODS file to create.
    """
    document = ezodf.newdoc(doctype="ods")
    for title, frame in processed_sheets.items():
        filled = frame.fillna("")
        row_count = len(filled) + 1  # one extra row for the header
        column_count = len(filled.columns)
        table = ezodf.Sheet(title, size=(row_count, column_count))
        document.sheets += table

        for c, header in enumerate(filled.columns):
            table[0, c].set_value(str(header))

        for r, record in enumerate(filled.itertuples(index=False), start=1):
            for c, cell_value in enumerate(record):
                blank = cell_value is None or str(cell_value).lower() == "nan"
                table[r, c].set_value("" if blank else str(cell_value))

    document.saveas(output_file)
    logging.info(f"ODS-Datei gespeichert: {output_file}")
# -------------------------------------------------
# MAIN PROGRAM
# -------------------------------------------------
def main():
    """Read the input workbook, normalize all sheets and write the result.

    Every sheet except the master sheet is normalized with
    ``process_category_df`` and ``merge_new_terms``; sheets without an ID or
    name column are passed through unchanged. The master sheet is rebuilt
    from the normalized sheets. Output sheets follow ``SHEET_ORDER``.

    Problems with the input path or the file formats are logged and end the
    run without raising.
    """
    if not os.path.exists(INPUT_FILE):
        logging.error(f"Datei {INPUT_FILE} existiert nicht.")
        return

    ext = os.path.splitext(INPUT_FILE)[1].lower()
    # NOTE(review): pandas documents openpyxl for .xlsx/.xlsm only; reading or
    # writing ".xls" with it is expected to fail - confirm whether .xls is needed.
    if ext in [".xlsx", ".xls"]:
        engine = "openpyxl"
    elif ext == ".ods":
        engine = "odf"
    else:
        logging.error("Nicht unterstütztes Dateiformat")
        return

    # Validate the output format up front instead of silently writing nothing
    # after all the processing has been done.
    ext_out = os.path.splitext(OUTPUT_FILE)[1].lower()
    if ext_out not in [".xlsx", ".xls", ".ods"]:
        logging.error(f"Nicht unterstütztes Ausgabeformat: {ext_out}")
        return

    logging.info(f"Lade Datei {INPUT_FILE} mit Engine '{engine}'")
    processed_sheets = {}
    category_dfs = []
    # The context manager closes the workbook handle even if a sheet fails.
    with pd.ExcelFile(INPUT_FILE, engine=engine) as xls:
        for sheet_name in xls.sheet_names:
            if sheet_name == MASTER_SHEET_NAME:
                continue
            df = pd.read_excel(xls, sheet_name=sheet_name, engine=engine)
            df_new = process_category_df(df, sheet_name)
            if df_new is None:
                processed_sheets[sheet_name] = df
                continue
            df_merged = merge_new_terms(df, df_new)
            processed_sheets[sheet_name] = df_merged
            category_dfs.append(df_merged)

    processed_sheets[MASTER_SHEET_NAME] = build_master_df(category_dfs)

    ordered_sheets = {name: processed_sheets[name] for name in SHEET_ORDER if name in processed_sheets}
    # Sheets that are not listed in SHEET_ORDER are not written; say so.
    for name in processed_sheets:
        if name not in ordered_sheets:
            logging.warning(f"Sheet '{name}' steht nicht in SHEET_ORDER und wird nicht geschrieben.")

    if ext_out in [".xlsx", ".xls"]:
        with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
            for name, df in ordered_sheets.items():
                format_excel_sheet(df, name, writer)
        logging.info(f"Excel-Datei gespeichert: {OUTPUT_FILE}")
    else:
        save_ods(ordered_sheets, OUTPUT_FILE)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()