#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ NormVokabular Mapper – Version 1.4.1 - Detailliertes (DEBUG) Batch-Logging: gepufferte Logs werden periodisch in Konsole + Datei geschrieben - Getty AAT (SPARQL via requests) – API-polite, timeout/retries/backoff - Fehlertoleranz: API-Ausfälle führen nicht zum Totalabsturz - Fehlende Begriffe -> separate Datei (gleiches Format wie Output) - Bestehende Normalisierung/Lemmatisierung/Stemming wird weiterverwendet - Batch-Logging-Modus (konfigurierbar) """ from __future__ import annotations import os import sys import re import time import json import threading import queue import requests import pandas as pd from pathlib import Path from collections import defaultdict from difflib import SequenceMatcher from datetime import datetime # Optional libs try: from rapidfuzz import fuzz RAPIDFUZZ_AVAILABLE = True except Exception: RAPIDFUZZ_AVAILABLE = False try: import spacy nlp = spacy.load("de_core_news_sm") SPACY_AVAILABLE = True except Exception: SPACY_AVAILABLE = False nlp = None # ========================= # Config & Pfade # ========================= INPUT_DIR = Path("Input CSV") OUTPUT_DIR = Path("Auswertung Ergebnisse") OUTPUT_DIR.mkdir(exist_ok=True) NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods") CACHE_FILE = "api_cache.json" STOPWORDS = {"mit","ohne","der","die","das","ein","eine","und","zu","von","im","in","auf","an","als","bei","für","aus","dem","den","des","eines","einer"} CONF_THRESHOLD = 0.75 TIMEOUT_DEFAULT = 5 MAX_RETRIES_DEFAULT = 3 BACKOFF_FACTOR_DEFAULT = 2 HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"} API_ACTIVE = {"gnd": True, "wikidata": True, "aat": True} FAIL_COUNTER = {"gnd": 0, "wikidata": 0, "aat": 0} # Logging file LOG_FILE = OUTPUT_DIR / "mapper_log.txt" # Batch logging parameters LOG_BATCH_SIZE = 100 # flush wenn >= Einträge LOG_FLUSH_INTERVAL = 5.0 # Sekunden zwischen Flushes (Batch-Logging) LOG_LEVEL = "DEBUG" # ausführlich gewünscht # ========================= # Buffered/Batched Logger # ========================= class BatchLogger: def __init__(self, logfile: Path, flush_interval: float = 5.0, batch_size: int = 100, level: str = "DEBUG"): self.logfile = logfile self.flush_interval = flush_interval self.batch_size = batch_size self.level = level self.q = queue.Queue() self._stop_event = threading.Event() self._thread = threading.Thread(target=self._worker, daemon=True, name="BatchLoggerThread") # Ensure logfile exists try: logfile.parent.mkdir(parents=True, exist_ok=True) logfile.touch(exist_ok=True) except Exception: pass self._thread.start() def _format(self, level: str, msg: str) -> str: ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") return f"{ts} - {level} - {msg}" def log(self, level: str, msg: str): if self._stop_event.is_set(): return formatted = self._format(level, msg) self.q.put((level, formatted)) # If queue too big, trigger immediate flush by putting a special token if self.q.qsize() >= self.batch_size: self.q.put(("__FLUSH__", "__FLUSH__")) def debug(self, msg: str): if LOG_LEVEL in ("DEBUG",): self.log("DEBUG", msg) def info(self, msg: str): self.log("INFO", msg) def warning(self, msg: str): self.log("WARNING", msg) def error(self, msg: str): self.log("ERROR", msg) def exception(self, msg: str): self.log("EXCEPTION", msg) def _worker(self): buffer = [] last_flush = time.time() while not self._stop_event.is_set() or not self.q.empty(): try: item = None try: item = self.q.get(timeout=self.flush_interval) except queue.Empty: # time-based flush if buffer: self._flush_buffer(buffer) buffer = [] last_flush = time.time() continue if item is None: continue level, formatted = item if level == "__FLUSH__": if buffer: self._flush_buffer(buffer) buffer = [] last_flush = time.time() continue buffer.append((level, formatted)) # flush conditions if len(buffer) >= self.batch_size or (time.time() - last_flush) >= self.flush_interval: self._flush_buffer(buffer) buffer = [] last_flush = time.time() except Exception as e: # As a last resort, write error immediately to stderr try: sys.stderr.write(f"BatchLogger worker error: {e}\n") except Exception: pass time.sleep(0.5) # final flush if buffer: self._flush_buffer(buffer) def _flush_buffer(self, buffer): if not buffer: return # write to console and file try: # console out_lines = [f"{line}\n" for _, line in buffer] # write to stdout try: sys.stdout.writelines(out_lines) sys.stdout.flush() except Exception: pass # append to file try: with open(self.logfile, "a", encoding="utf-8") as f: f.writelines(out_lines) except Exception as e: try: sys.stderr.write(f"BatchLogger file write error: {e}\n") except Exception: pass except Exception: pass def stop(self): self._stop_event.set() # put sentinel to wake worker try: self.q.put(("__FLUSH__", "__FLUSH__")) except Exception: pass self._thread.join(timeout=5.0) # Instantiate logger logger = BatchLogger(LOG_FILE, flush_interval=LOG_FLUSH_INTERVAL, batch_size=LOG_BATCH_SIZE, level=LOG_LEVEL) logger.info("Starte NormVokabular Mapper v1.4.1 (Batch-Logging aktiv)") # ========================= # Cache laden/speichern # ========================= if os.path.exists(CACHE_FILE): try: with open(CACHE_FILE,"r",encoding="utf-8") as f: CACHE = json.load(f) logger.debug(f"Cache geladen ({len(CACHE)} Einträge).") except Exception as e: logger.warning(f"Cache konnte nicht geladen werden: {e}") CACHE = {} else: CACHE = {} def save_cache(): try: with open(CACHE_FILE,"w",encoding="utf-8") as f: json.dump(CACHE, f, indent=2, ensure_ascii=False) logger.debug("Cache gespeichert.") except Exception as e: logger.error(f"Cache konnte nicht gespeichert werden: {e}") # ========================= # Normalisierung / Lemma / Tokenization # ========================= def normalize_text(s): if not s: return "" s = str(s).lower().strip() s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s) s = re.sub(r"\s+"," ",s) return s lemma_cache = {} def lemmatize_term(term): term_norm = normalize_text(term) if term_norm in lemma_cache: return lemma_cache[term_norm] if SPACY_AVAILABLE and nlp: try: doc = nlp(term_norm) lemma = " ".join([token.lemma_ for token in doc]) except Exception: lemma = term_norm else: lemma = term_norm lemma_cache[term_norm] = lemma return lemma def compound_split(term): if not term: return [] parts = [p for p in re.split(r"[\s\-_/]+", term) if p] return parts if parts else [term] # ========================= # Normvokabular laden & Index # ========================= def load_normvokabular(file_path): try: sheets = pd.read_excel(file_path, sheet_name=None, engine="odf" if file_path.suffix.lower()==".ods" else None) except Exception as e: logger.error(f"Normvokabular konnte nicht geladen werden: {e}") raise norm_dict = {} stem_index = defaultdict(list) lemma_norm_map = {} for sheet_name, df in sheets.items(): if sheet_name.lower() in ["master", "übersicht"]: continue df = df.dropna(how="all", axis=1) df.columns = [str(c).strip() for c in df.columns] id_col = next((c for c in df.columns if "ID" in c), None) word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c or "Begriff" in c), None) if not id_col or not word_col: continue current_parent_id = None for _, row in df.iterrows(): row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None if row_id: current_parent_id = row_id if not row_word: continue assigned_parent_id = current_parent_id entry = {"Name": row_word, "ID": assigned_parent_id or "", "Sheet": sheet_name, "Own_ID": row_id or ""} key = normalize_text(row_word) norm_dict[key] = entry lemma = lemmatize_term(key) stem_index[lemma].append(entry) if lemma not in lemma_norm_map: lemma_norm_map[lemma] = entry logger.info(f"Normvokabular geladen: {len(norm_dict)} Einträge, {len(stem_index)} Stems") return norm_dict, stem_index, lemma_norm_map # ========================= # Mapping & Vorschläge # ========================= def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3): term_norm = normalize_text(term) term_lemma = lemmatize_term(term) if term_norm in norm_dict: e = norm_dict[term_norm] logger.debug(f"map_to_norm: exakter Treffer für '{term}' -> {e['Name']}") return e["Name"], e["ID"], [] if term_lemma in stem_index: e = stem_index[term_lemma][0] logger.debug(f"map_to_norm: Lemma-Treffer für '{term}' -> {e['Name']}") return e["Name"], e["ID"], [] tokens = compound_split(term_norm) if len(tokens) == 1: suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n) logger.debug(f"map_to_norm: KEIN TREFFER für '{term}', Vorschläge: {suggestions}") return "KEIN TREFFER", "", suggestions else: token_matches = [] for t in tokens: t_lemma = lemmatize_term(t) if t_lemma in stem_index: e = stem_index[t_lemma][0] token_matches.append((t, e["Name"], e["ID"])) else: sugg = get_suggestions(t_lemma, lemma_norm_map, top_n) token_matches.append((t, "KEIN TREFFER", "", sugg)) combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"] logger.debug(f"map_to_norm: Kompositum '{term}' -> combined_suggestions: {combined_suggestions}") return "KEIN TREFFER", "", combined_suggestions def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD): candidates = [] for key_lemma, entry in lemma_norm_map.items(): if RAPIDFUZZ_AVAILABLE: score = fuzz.token_set_ratio(term_lemma, key_lemma)/100 else: score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio() if key_lemma.lower().startswith(term_lemma.lower()): score = min(score + 0.1, 1.0) if score >= threshold: candidates.append((score, entry["Name"], entry["ID"])) candidates.sort(reverse=True) return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]] # ========================= # Generic request with retries & caching # ========================= def request_with_retries_generic(api_name, url, params=None, headers=None, timeout=TIMEOUT_DEFAULT, max_retries=MAX_RETRIES_DEFAULT, backoff=BACKOFF_FACTOR_DEFAULT): cache_key = url + (json.dumps(params, sort_keys=True, ensure_ascii=False) if params else "") if cache_key in CACHE: logger.debug(f"[Cache] {api_name}: {cache_key}") return CACHE[cache_key] retries = 0 while retries < max_retries: try: r = requests.get(url, params=params, headers=headers or HEADERS, timeout=timeout) if r.status_code == 200: try: data = r.json() except Exception: data = r.text CACHE[cache_key] = data FAIL_COUNTER[api_name] = 0 logger.debug(f"[{api_name}] Erfolgreiche Antwort für {url}") return data else: logger.warning(f"[{api_name}] HTTP {r.status_code} für {url}") raise ValueError(f"HTTP {r.status_code}") except Exception as e: retries += 1 wait = backoff ** retries logger.warning(f"[{api_name}] Fehler ({retries}/{max_retries}) für {url}: {e}. Warte {wait}s") time.sleep(wait) FAIL_COUNTER[api_name] += 1 if FAIL_COUNTER[api_name] >= 10: API_ACTIVE[api_name] = False logger.error(f"[{api_name}] Deaktiviere API nach zu vielen Fehlern.") return None # ========================= # GND / Wikidata (bestehend) # ========================= def batch_query_gnd(terms): results = {} if not API_ACTIVE.get("gnd", False): for t in terms: results[t] = "" return results logger.info(f"[GND] Starte GND-Abfragen für {len(terms)} Terme") start = time.time() for idx, t in enumerate(terms, start=1): logger.debug(f"[GND] ({idx}/{len(terms)}) Anfrage für '{t}'") url = "https://lobid.org/gnd/search" params = {"q": t, "format": "json"} data = request_with_retries_generic("gnd", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT) top = "" try: if data and "member" in data: cands = [(doc.get("preferredName","") or doc.get("name",""), SequenceMatcher(None, t.lower(), (doc.get("preferredName","") or doc.get("name","")).lower()).ratio()) for doc in data["member"] if doc.get("preferredName","") or doc.get("name","")] cands = [c for c in cands if c[1] >= 0.75] if cands: top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0] except Exception as e: logger.debug(f"[GND] Fehler bei Verarbeitung für '{t}': {e}") results[t] = top elapsed = time.time() - start logger.info(f"[GND] Fertig. Dauer: {elapsed:.1f}s") return results def batch_query_wikidata(terms): results = {} if not API_ACTIVE.get("wikidata", False): for t in terms: results[t] = "" return results logger.info(f"[WD] Starte Wikidata-Abfragen für {len(terms)} Terme") start = time.time() for idx, t in enumerate(terms, start=1): logger.debug(f"[WD] ({idx}/{len(terms)}) Anfrage für '{t}'") url = "https://www.wikidata.org/w/api.php" params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"} data = request_with_retries_generic("wikidata", url, params=params, headers=HEADERS, timeout=TIMEOUT_DEFAULT) top = "" try: if data and "search" in data: cands = [(e.get("label",""), SequenceMatcher(None, t.lower(), e.get("label","").lower()).ratio()) for e in data["search"] if e.get("label","")] cands = [c for c in cands if c[1] >= 0.70] if cands: top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0] except Exception as e: logger.debug(f"[WD] Fehler bei Verarbeitung für '{t}': {e}") results[t] = top elapsed = time.time() - start logger.info(f"[WD] Fertig. Dauer: {elapsed:.1f}s") return results # ========================= # Getty AAT Abfrage – robust & API-polite (requests) # ========================= def batch_query_getty_aat(terms): results = {} if not API_ACTIVE.get("aat", False): for t in terms: results[t] = "" return results endpoint = "https://vocab.getty.edu/sparql" headers = {"Accept": "application/sparql-results+json", "User-Agent": HEADERS.get("User-Agent")} TIMEOUT = 8 MAX_RETRIES = 3 BACKOFF_FACTOR = 2 FAIL_LIMIT = 5 fail_counter_local = 0 logger.info(f"[AAT] Starte Getty AAT-Abgleich für {len(terms)} Terme") start_all = time.time() for idx, term in enumerate(terms, start=1): term_norm = lemmatize_term(normalize_text(term)) tokens = compound_split(term_norm) logger.debug(f"[AAT] ({idx}/{len(terms)}) Begriff '{term}' -> Tokens: {tokens}") query_fragments = [] for tkn in tokens: t_escaped = tkn.replace('"', '\\"') qf = f""" ?concept skos:prefLabel ?label . FILTER(lang(?label)='de' && CONTAINS(LCASE(?label), LCASE("{t_escaped}"))) """ query_fragments.append(f"{{ {qf} }}") query_body = " UNION ".join(query_fragments) if query_fragments else "" query = f"PREFIX skos: SELECT ?label ?concept WHERE {{ {query_body} }} LIMIT 10" retries = 0 success = False start_term = time.time() while retries < MAX_RETRIES and not success: try: logger.debug(f"[AAT] Anfrage (Retry {retries}) für '{term}'") r = requests.get(endpoint, params={"query": query}, headers=headers, timeout=TIMEOUT) if r.status_code != 200: raise ValueError(f"HTTP {r.status_code}") ret = r.json() candidates = [(b['label']['value'], b['concept']['value']) for b in ret.get("results", {}).get("bindings", [])] if candidates: scored = [ (c[0], c[1], SequenceMatcher(None, term_norm, lemmatize_term(normalize_text(c[0]))).ratio()) for c in candidates ] top = max(scored, key=lambda x: x[2]) results[term] = top[0] logger.debug(f"[AAT] Treffer für '{term}': {results[term]} (Score: {top[2]:.3f})") else: results[term] = "" logger.debug(f"[AAT] Kein Treffer für '{term}'") success = True except Exception as e: retries += 1 wait = BACKOFF_FACTOR ** retries logger.warning(f"[AAT] Fehler ({retries}/{MAX_RETRIES}) für '{term}': {e} – warte {wait}s") time.sleep(wait) if retries == MAX_RETRIES: results[term] = "" fail_counter_local += 1 # polite delay time.sleep(1.0) elapsed_term = time.time() - start_term logger.debug(f"[AAT] Dauer für '{term}': {elapsed_term:.2f}s") if fail_counter_local >= FAIL_LIMIT: logger.error("[AAT] Zu viele Fehler lokal - breche AAT-Abfragen ab.") for t_rem in terms[idx:]: results[t_rem] = "" FAIL_COUNTER["aat"] += fail_counter_local API_ACTIVE["aat"] = False break elapsed_all = time.time() - start_all logger.info(f"[AAT] Getty AAT-Abgleich abgeschlossen. Dauer: {elapsed_all:.1f}s") return results # ========================= # Markierung / Export (Excel/ODS) # ========================= def mark_norm_hits(file_path): ext = file_path.suffix.lower() try: if ext in [".xlsx", ".xls"]: from openpyxl import load_workbook from openpyxl.styles import PatternFill wb = load_workbook(file_path) ws = wb.active col_map = {cell.value: idx+1 for idx, cell in enumerate(ws[1])} norm_col = col_map.get("Norm_Treffer", None) if not norm_col: logger.debug("Spalte 'Norm_Treffer' nicht gefunden (mark_norm_hits).") wb.save(file_path) return green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid") red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid") for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col): cell = row[0] if cell.value and cell.value != "KEIN TREFFER": cell.fill = green_fill else: cell.fill = red_fill wb.save(file_path) elif ext==".ods": df = pd.read_excel(file_path, engine="odf") df["Norm_Status"] = df["Norm_Treffer"].apply(lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x!="KEIN TREFFER" else "Kein Treffer") df.to_excel(file_path, index=False, engine="odf") except Exception as e: logger.warning(f"Fehler beim Markieren der Treffer in {file_path}: {e}") # ========================= # Fehlende Begriffe -> separate Datei # ========================= def export_missing_terms(out_df, output_file): missing_df = out_df[ (out_df["Norm_Treffer"] == "KEIN TREFFER") & (out_df["Norm_Vorschlag"].isna() | (out_df["Norm_Vorschlag"].str.strip() == "")) ][["Begriff"]].drop_duplicates() count_missing = len(missing_df) logger.info(f"Anzahl Begriffe ohne Treffer und Vorschläge: {count_missing}") if count_missing == 0: return ext = output_file.suffix.lower() base_name = output_file.stem missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe{output_file.suffix}" version = 1 while missing_file.exists(): missing_file = OUTPUT_DIR / f"{base_name}_fehlende_Begriffe_({version}){output_file.suffix}" version += 1 try: if ext in [".xlsx", ".xls"]: missing_df.to_excel(missing_file, index=False, engine="openpyxl") elif ext == ".ods": missing_df.to_excel(missing_file, index=False, engine="odf") else: missing_df.to_csv(missing_file, index=False, sep=";") logger.info(f"Fehlende Begriffe gespeichert: {missing_file}") except Exception as e: logger.error(f"Fehler beim Speichern der fehlenden Begriffe: {e}") # ========================= # Haupt-Loop: Verarbeitung Input-Dateien # ========================= def process_files(): overall_start = time.time() try: norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE) except Exception as e: logger.error("Normvokabular konnte nicht geladen werden. Beende.") raise total_terms = 0 total_hits = 0 if not INPUT_DIR.exists(): logger.error(f"Eingabeordner {INPUT_DIR} fehlt") raise SystemExit(1) files = list(INPUT_DIR.glob("*")) if not files: logger.info("Keine Dateien gefunden") return logger.info(f"Starte Verarbeitung von {len(files)} Dateien") for file_idx, file_path in enumerate(files, start=1): if not file_path.suffix.lower() in [".csv",".ods",".xls",".xlsx"]: logger.debug(f"Übersprungen (kein unterstütztes Format): {file_path.name}") continue logger.info(f"[Datei {file_idx}/{len(files)}] Verarbeite: {file_path.name}") file_start = time.time() try: if file_path.suffix.lower() == ".csv": df = pd.read_csv(file_path) else: df = pd.read_excel(file_path, engine="odf" if file_path.suffix.lower()==".ods" else None) except Exception as e: logger.error(f"Fehler beim Lesen von {file_path.name}: {e}") continue df = df.dropna(how="all") df.columns = [str(c).strip() for c in df.columns] besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None) box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None) urh_col = next((c for c in df.columns if "Urheber" in c), None) if not besch_col: logger.warning(f"Spalte 'Objektbeschreibung' nicht gefunden in {file_path.name}. Datei übersprungen.") continue row_terms_map = [] for r_idx, row in enumerate(df.itertuples(index=False), start=1): try: besch = str(row[df.columns.get_loc(besch_col)]).strip() if pd.notna(row[df.columns.get_loc(besch_col)]) else "" except Exception: besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else "" if not besch: continue obj_box = row[df.columns.get_loc(box_col)] if box_col and box_col in df.columns else "" urheber = row[df.columns.get_loc(urh_col)] if urh_col and urh_col in df.columns else "" clauses = [c.strip() for c in re.split(r",", besch) if c.strip()] terms = [] for clause in clauses: parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()] for p in parts: if p.lower() in STOPWORDS: continue if re.fullmatch(r"\d+", p): continue terms.append(p) row_terms_map.append((obj_box, urheber, terms)) if (r_idx % 200) == 0: logger.debug(f"[{file_path.name}] Zeile {r_idx} verarbeitet") all_terms = list({t for _,_,terms in row_terms_map for t in terms}) logger.info(f"[{file_path.name}] Gefundene unique Terme: {len(all_terms)}") total_unique_terms = len(all_terms) # API-Abfragen t0 = time.time() gnd_results = batch_query_gnd(all_terms) t1 = time.time() logger.info(f"[{file_path.name}] GND-Abfragen Dauer: {t1-t0:.1f}s") wd_results = batch_query_wikidata(all_terms) t2 = time.time() logger.info(f"[{file_path.name}] Wikidata-Abfragen Dauer: {t2-t1:.1f}s") aat_results = batch_query_getty_aat(all_terms) if API_ACTIVE.get("aat", False) else {t:"" for t in all_terms} t3 = time.time() logger.info(f"[{file_path.name}] AAT-Abfragen Dauer: {t3-t2:.1f}s") # Build output rows output_rows = [] processed_count = 0 for obj_box, urheber, terms in row_terms_map: for term in terms: norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map) total_terms += 1 if norm_name != "KEIN TREFFER": total_hits += 1 out_row = { "Box": obj_box, "Objekt/Ebene": obj_box, "Urheber": urheber, "Begriff": term, "Norm_Treffer": norm_name, "Norm_ID": norm_id, "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "", "GND_Top1": gnd_results.get(term,""), "WD_Top1": wd_results.get(term,""), "AAT_Top1": aat_results.get(term,"") } output_rows.append(out_row) processed_count += 1 if (processed_count % 200) == 0: logger.debug(f"[{file_path.name}] {processed_count}/{total_unique_terms} Terme verarbeitet") out_df = pd.DataFrame(output_rows) output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{file_path.suffix}" version = 1 while output_file.exists(): output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){file_path.suffix}" version += 1 engine = "odf" if output_file.suffix.lower()==".ods" else None try: out_df.to_excel(output_file, index=False, engine=engine) logger.info(f"[{file_path.name}] Auswertung gespeichert: {output_file}") except Exception as e: logger.error(f"[{file_path.name}] Fehler beim Speichern der Auswertung {output_file}: {e}") continue export_missing_terms(out_df, output_file) mark_norm_hits(output_file) file_elapsed = time.time() - file_start logger.info(f"[Datei {file_idx}/{len(files)}] Fertig ({file_elapsed:.1f}s)") overall_elapsed = time.time() - overall_start logger.info(f"Fertig. Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular. Gesamtzeit: {overall_elapsed:.1f}s") # ========================= # Main # ========================= if __name__ == "__main__": try: process_files() except KeyboardInterrupt: logger.warning("Abbruch durch Benutzer (KeyboardInterrupt).") except SystemExit: logger.warning("SystemExit aufgetreten.") except Exception as e: logger.exception(f"Ungefangener Fehler: {e}") finally: # Stop logger (flush remaining logs) try: save_cache() except Exception: pass try: logger.info("Beende.") logger.stop() except Exception: pass