GND_Skript_Test/VLG_API_multi.py

import os
import sys
import time
import json
import requests
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher
import argparse
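# Third-party requirements implied by the imports and engines used below
# (package names are the usual pip names; pinning versions is left open):
# requests and pandas are imported directly, and pandas needs odfpy for the
# "odf" engine and openpyxl for the "openpyxl" engine, e.g.:
#   pip install requests pandas odfpy openpyxl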
# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='API-Abfragen simulieren')
args = parser.parse_args()
DRY_RUN = args.dry_run
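# Example invocation (sketch): process the files in "Input CSV" for real, or
# simulate all API calls with the dry-run flag:
#   python VLG_API_multi.py
#   python VLG_API_multi.py --dry-run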
# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
MAX_CONSECUTIVE_FAILURES = 10
CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}
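# Expected working-directory layout (illustrative sketch based on the paths
# above; the file names inside "Input CSV" are placeholders):
#   Input CSV/             <- source tables (.csv, .xlsx, .ods)
#   Auswertung Ergebnisse/ <- created automatically, receives the output .ods files
#   api_cache.json         <- created/updated as API responses come in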
# =========================
# Logging
# =========================
def log(level, msg):
    print(f"[{level}] {msg}")
# =========================
# Save cache
# =========================
def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)
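# The cache maps "url + str(params)" keys to the parsed JSON of a successful
# response (or the raw text if the body was not JSON). A minimal sketch of one
# entry, with a made-up term and a truncated response:
#   {
#     "https://lobid.org/gnd/search{'q': 'Glocke', 'format': 'json'}": {"member": [...]}
#   }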
# =========================
# Request with retry & backoff
# =========================
def request_with_retries(api_name, url, params=None):
    if DRY_RUN:
        return {"dummy": True}
    if not API_ACTIVE[api_name]:
        return None
    cache_key = url + (str(params) if params else "")
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    # Response body was not valid JSON; keep the raw text instead.
                    data = r.text
                CACHE[cache_key] = data
                save_cache()
                FAIL_COUNTER[api_name] = 0
                return data
            elif r.status_code in [403, 429]:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code} Stopschalter aktiviert")
                API_ACTIVE[api_name] = False
                return None
            else:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
        except requests.exceptions.Timeout:
            log("ERROR", f"Timeout bei {api_name.upper()}")
        except Exception as e:
            log("ERROR", f"Fehler bei {api_name.upper()}: {e}")
        retries += 1
        sleep_time = min(BACKOFF_FACTOR ** retries, 30)
        time.sleep(sleep_time)
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
        log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} Fehler bei {api_name.upper()} Stopschalter aktiviert")
        API_ACTIVE[api_name] = False
    return None
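# Retry/backoff behaviour in numbers (derived from the constants above): with
# MAX_RETRIES = 3 and BACKOFF_FACTOR = 2, a persistently failing request is
# attempted three times, sleeping min(2**retries, 30) seconds after each
# failure, i.e. 2 s, 4 s and 8 s. Only an exhausted request increments
# FAIL_COUNTER; ten exhausted requests in a row (MAX_CONSECUTIVE_FAILURES)
# disable that API, while any success resets the counter.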
# =========================
# API queries with confidence scores
# =========================
def query_gnd(term, min_conf=0.6):
    if DRY_RUN or not API_ACTIVE["gnd"]:
        return "TEST_GND", 1.0
    # Pass the search term via params so requests URL-encodes it safely.
    url = "https://lobid.org/gnd/search"
    data = request_with_retries("gnd", url, {"q": term, "format": "json"})
    if not data:
        return "API nicht erreichbar", 0.0
    results = []
    scores = []
    for doc in data.get("member", []):
        name = doc.get("preferredName", "")
        conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
        if conf >= min_conf:
            results.append(name)
            scores.append(conf)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0
def query_wikidata(term, min_conf=0.5):
    if DRY_RUN or not API_ACTIVE["wikidata"]:
        return "TEST_WD", 1.0
    url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
    data = request_with_retries("wikidata", url, params)
    if not data:
        return "API nicht erreichbar", 0.0
    results = []
    scores = []
    for entry in data.get("search", []):
        # Entries without a numeric match score default to 0.0 and are dropped below min_conf.
        match_info = entry.get("match", {})
        score = match_info.get("score", 0.0)
        if score >= min_conf:
            results.append(entry["label"])
            scores.append(score)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0
# =========================
# Load input files
# =========================
def load_input_file(file_path):
    try:
        if file_path.suffix.lower() == ".ods":
            df = pd.read_excel(file_path, engine="odf", header=None)
        elif file_path.suffix.lower() == ".xlsx":
            df = pd.read_excel(file_path, engine="openpyxl", header=None)
        elif file_path.suffix.lower() == ".csv":
            df = pd.read_csv(file_path, header=None)
        else:
            log("WARNING", f"Unbekanntes Dateiformat: {file_path.name}")
            return None
        return df
    except Exception as e:
        log("ERROR", f"Fehler beim Laden von {file_path.name}: {e}")
        return None
# =========================
# Find the header row
# =========================
def find_header_row(df, keywords=("objektbeschreibung", "objekt/ebene")):
    for i, row in df.iterrows():
        row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
        if any(kw in cell for kw in keywords for cell in row_lower):
            return i, row_lower
    return None, None
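# Minimal sketch of what the detection does (toy frame, cell values invented):
#   df = pd.DataFrame([["Inventar Musterhof", None], ["Objekt/Ebene", "Objektbeschreibung"], ["Saal", "Glocke, Orgel"]])
#   find_header_row(df)  # -> (1, ["objekt/ebene", "objektbeschreibung"])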
# =========================
# Processing
# =========================
def process_files():
    all_terms = []
    output_rows = []
    for file_path in INPUT_DIR.glob("*"):
        if file_path.suffix.lower() not in [".csv", ".xlsx", ".ods"]:
            continue
        log("INFO", f"Verarbeite {file_path.name}")
        df = load_input_file(file_path)
        if df is None:
            continue
        header_idx, header_row = find_header_row(df)
        if header_idx is None:
            log("WARNING", f"Keine Header-Zeile gefunden in {file_path.name}")
            continue
        df.columns = header_row
        df = df.iloc[header_idx + 1:].reset_index(drop=True)
        col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
        col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
        if not col_objdesc:
            log("WARNING", f"Keine Spalte 'Objektbeschreibung' in {file_path.name}")
            continue
        term_list = []
        obj_level_list = []
        for _, row in df.iterrows():
            terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
            if not terms:
                continue
            for term in [t.strip() for t in terms.split(",") if t.strip()]:
                term_list.append(term)
                obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")
        # API queries
        gnd_results = []
        gnd_scores = []
        wikidata_results = []
        wikidata_scores = []
        for term in term_list:
            gnd_res, gnd_conf = query_gnd(term)
            wikidata_res, wd_conf = query_wikidata(term)
            gnd_results.append(gnd_res)
            gnd_scores.append(gnd_conf)
            wikidata_results.append(wikidata_res)
            wikidata_scores.append(wd_conf)
        for idx, term in enumerate(term_list):
            output_rows.append({
                "Begriff": term,
                "Quelle": file_path.name,
                "Objekt/Ebene": obj_level_list[idx],
                "GND": gnd_results[idx],
                "GND_Confidence": gnd_scores[idx],
                "Wikidata": wikidata_results[idx],
                "Wikidata_Confidence": wikidata_scores[idx]
            })
        all_terms.extend(term_list)
    # Main output
    out_df = pd.DataFrame(output_rows)
    out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
    out_df.to_excel(out_file, index=False, engine="odf")
    log("INFO", f"Hauptauswertung gespeichert: {out_file}")
    # Raw term frequencies
    raw_terms = pd.Series(all_terms).value_counts().reset_index()
    raw_terms.columns = ["Begriff", "Häufigkeit"]
    raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
    raw_terms.to_excel(raw_file, index=False, engine="odf")
    log("INFO", f"Rohbegriffe gespeichert: {raw_file}")
# =========================
# Main
# =========================
if __name__ == "__main__":
    if not INPUT_DIR.exists():
        log("CRITICAL", f"Eingabeordner {INPUT_DIR} fehlt!")
        sys.exit(1)
    process_files()