import os import sys import time import json import requests import pandas as pd from pathlib import Path from difflib import SequenceMatcher import argparse # ========================= # Argumente / Dry-Run # ========================= parser = argparse.ArgumentParser() parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren') args = parser.parse_args() DRY_RUN = args.dry_run # ========================= # Konfiguration # ========================= INPUT_DIR = Path("Input CSV") OUTPUT_DIR = Path("Auswertung Ergebnisse") OUTPUT_DIR.mkdir(exist_ok=True) TIMEOUT = 5 MAX_RETRIES = 3 BACKOFF_FACTOR = 2 MAX_CONSECUTIVE_FAILURES = 10 CACHE_FILE = "api_cache.json" if os.path.exists(CACHE_FILE): with open(CACHE_FILE, "r", encoding="utf-8") as f: CACHE = json.load(f) else: CACHE = {} API_ACTIVE = {"gnd": True, "wikidata": True} FAIL_COUNTER = {"gnd": 0, "wikidata": 0} HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"} # ========================= # Logging # ========================= def log(level, msg): print(f"[{level}] {msg}") # ========================= # Cache speichern # ========================= def save_cache(): with open(CACHE_FILE, "w", encoding="utf-8") as f: json.dump(CACHE, f, indent=2, ensure_ascii=False) # ========================= # Request mit Retry & Backoff # ========================= def request_with_retries(api_name, url, params=None): if DRY_RUN: return {"dummy": True} if not API_ACTIVE[api_name]: return None cache_key = url + (str(params) if params else "") if cache_key in CACHE: return CACHE[cache_key] retries = 0 while retries < MAX_RETRIES: try: r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS) if r.status_code == 200: try: data = r.json() except: data = r.text CACHE[cache_key] = data save_cache() FAIL_COUNTER[api_name] = 0 return data elif r.status_code in [403, 429]: log("ERROR", f"{api_name.upper()} HTTP {r.status_code} – Stopschalter aktiviert") API_ACTIVE[api_name] = False return None else: log("ERROR", f"{api_name.upper()} HTTP {r.status_code}") except requests.exceptions.Timeout: log("ERROR", f"Timeout bei {api_name.upper()}") except Exception as e: log("ERROR", f"Fehler bei {api_name.upper()}: {e}") retries += 1 sleep_time = min(BACKOFF_FACTOR ** retries, 30) time.sleep(sleep_time) FAIL_COUNTER[api_name] += 1 if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES: log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} – Stopschalter aktiviert") API_ACTIVE[api_name] = False return None # ========================= # API-Abfragen mit Confidence # ========================= def query_gnd(term, min_conf=0.6): if DRY_RUN or not API_ACTIVE["gnd"]: return "TEST_GND", 1.0 url = f"https://lobid.org/gnd/search?q={term}&format=json" data = request_with_retries("gnd", url) if not data: return "API nicht erreichbar", 0.0 results = [] scores = [] for doc in data.get("member", []): name = doc.get("preferredName", "") conf = SequenceMatcher(None, term.lower(), name.lower()).ratio() if conf >= min_conf: results.append(name) scores.append(conf) if results: return ", ".join(results), max(scores) return "ohne Ergebnis", 0.0 def query_wikidata(term, min_conf=0.5): if DRY_RUN or not API_ACTIVE["wikidata"]: return "TEST_WD", 1.0 url = "https://www.wikidata.org/w/api.php" params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"} data = request_with_retries("wikidata", url, params) if not data: return "API nicht erreichbar", 0.0 results = [] scores = [] for entry in data.get("search", []): match_info = entry.get("match", {}) score = match_info.get("score", 0.0) if score >= min_conf: results.append(entry["label"]) scores.append(score) if results: return ", ".join(results), max(scores) return "ohne Ergebnis", 0.0 # ========================= # Input laden # ========================= def load_input_file(file_path): try: if file_path.suffix.lower() == ".ods": df = pd.read_excel(file_path, engine="odf", header=None) elif file_path.suffix.lower() == ".xlsx": df = pd.read_excel(file_path, engine="openpyxl", header=None) elif file_path.suffix.lower() == ".csv": df = pd.read_csv(file_path, header=None) else: log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}") return None return df except Exception as e: log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}") return None # ========================= # Header-Zeile suchen # ========================= def find_header_row(df, keywords=["objektbeschreibung", "objekt/ebene"]): for i, row in df.iterrows(): row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row] if any(kw in cell for kw in keywords for cell in row_lower): return i, row_lower return None, None # ========================= # Verarbeitung # ========================= def process_files(): all_terms = [] output_rows = [] for file_path in INPUT_DIR.glob("*"): if not file_path.suffix.lower() in [".csv", ".xlsx", ".ods"]: continue log("INFO", f"Verarbeite {file_path.name}") df = load_input_file(file_path) if df is None: continue header_idx, header_row = find_header_row(df) if header_idx is None: log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}") continue df.columns = header_row df = df.iloc[header_idx+1:].reset_index(drop=True) col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None) col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None) if not col_objdesc: log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}") continue term_list = [] obj_level_list = [] for _, row in df.iterrows(): terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else "" if not terms: continue for term in [t.strip() for t in terms.split(",") if t.strip()]: term_list.append(term) obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "") # API-Abfragen gnd_results = [] gnd_scores = [] wikidata_results = [] wikidata_scores = [] for term in term_list: gnd_res, gnd_conf = query_gnd(term) wikidata_res, wd_conf = query_wikidata(term) gnd_results.append(gnd_res) gnd_scores.append(gnd_conf) wikidata_results.append(wikidata_res) wikidata_scores.append(wd_conf) for idx, term in enumerate(term_list): output_rows.append({ "Begriff": term, "Quelle": file_path.name, "Objekt/Ebene": obj_level_list[idx], "GND": gnd_results[idx], "GND_Confidence": gnd_scores[idx], "Wikidata": wikidata_results[idx], "Wikidata_Confidence": wikidata_scores[idx] }) all_terms.extend(term_list) # Hauptoutput out_df = pd.DataFrame(output_rows) out_file = OUTPUT_DIR / "Auswertung_gesamt.ods" out_df.to_excel(out_file, index=False, engine="odf") log("INFO", f"Hauptauswertung gespeichert: {out_file}") # Rohdatei raw_terms = pd.Series(all_terms).value_counts().reset_index() raw_terms.columns = ["Begriff", "Häufigkeit"] raw_file = OUTPUT_DIR / "Rohbegriffe.ods" raw_terms.to_excel(raw_file, index=False, engine="odf") log("INFO", f"Rohbegriffe gespeichert: {raw_file}") # ========================= # Main # ========================= if __name__ == "__main__": if not INPUT_DIR.exists(): log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!") sys.exit(1) process_files()