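"""Batch term lookup against the lobid GND and Wikidata APIs.

Reads object descriptions from spreadsheet files in INPUT_DIR, queries both
APIs with caching, retries, and a per-API kill switch, and writes scored
results as ODS files to OUTPUT_DIR.
"""
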
import argparse
import json
import os
import sys
import time
from difflib import SequenceMatcher
from pathlib import Path

import pandas as pd
import requests

# =========================
# Arguments / dry run
# =========================
parser = argparse.ArgumentParser()
parser.add_argument('--dry-run', action='store_true', help='simulate API queries')
args = parser.parse_args()
DRY_RUN = args.dry_run

# =========================
# Configuration
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)

TIMEOUT = 5                     # seconds per request
MAX_RETRIES = 3
BACKOFF_FACTOR = 2              # base of the exponential backoff
MAX_CONSECUTIVE_FAILURES = 10   # failures before an API is switched off

# Persistent response cache so reruns do not repeat earlier API calls
CACHE_FILE = "api_cache.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}

# Kill-switch state and consecutive-failure counters per API
API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

# =========================
# Logging
# =========================
def log(level, msg):
    print(f"[{level}] {msg}")


# =========================
# Save cache
# =========================
def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)

# =========================
# Request with retry & backoff
# =========================
def request_with_retries(api_name, url, params=None):
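    """GET url with caching, exponential backoff, and a per-API kill switch.

    Returns parsed JSON (or raw text) on success, or None when the API is
    disabled or all retries fail. HTTP 403/429 disables the API immediately.
    """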
    if DRY_RUN:
        return {"dummy": True}
    if not API_ACTIVE[api_name]:
        return None

    # Serve repeated queries from the persistent cache
    cache_key = url + (str(params) if params else "")
    if cache_key in CACHE:
        return CACHE[cache_key]

    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                save_cache()
                FAIL_COUNTER[api_name] = 0
                return data
            elif r.status_code in (403, 429):
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code} - kill switch activated")
                API_ACTIVE[api_name] = False
                return None
            else:
                log("ERROR", f"{api_name.upper()} HTTP {r.status_code}")
        except requests.exceptions.Timeout:
            log("ERROR", f"Timeout for {api_name.upper()}")
        except Exception as e:
            log("ERROR", f"Error for {api_name.upper()}: {e}")

        # Exponential backoff, capped at 30 seconds
        retries += 1
        sleep_time = min(BACKOFF_FACTOR ** retries, 30)
        time.sleep(sleep_time)

    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= MAX_CONSECUTIVE_FAILURES:
        log("CRITICAL", f"{MAX_CONSECUTIVE_FAILURES} consecutive failures for {api_name.upper()} - kill switch activated")
        API_ACTIVE[api_name] = False
    return None

# =========================
# API queries with confidence scoring
# =========================
def query_gnd(term, min_conf=0.6):
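    """Search the lobid GND API for term; return (matching names, best confidence).

    Confidence is the SequenceMatcher ratio between the query and each
    preferredName; only matches at or above min_conf are kept.
    """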
    if DRY_RUN:
        return "TEST_GND", 1.0
    if not API_ACTIVE["gnd"]:
        # Once the kill switch has fired, report the outage instead of test data
        return "API nicht erreichbar", 0.0

    # Pass the term via params so requests URL-encodes it
    url = "https://lobid.org/gnd/search"
    data = request_with_retries("gnd", url, {"q": term, "format": "json"})
    if not data:
        return "API nicht erreichbar", 0.0

    results = []
    scores = []
    for doc in data.get("member", []):
        name = doc.get("preferredName", "")
        conf = SequenceMatcher(None, term.lower(), name.lower()).ratio()
        if conf >= min_conf:
            results.append(name)
            scores.append(conf)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0

def query_wikidata(term, min_conf=0.5):
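    """Search Wikidata via wbsearchentities; return (matching labels, best confidence)."""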
    if DRY_RUN:
        return "TEST_WD", 1.0
    if not API_ACTIVE["wikidata"]:
        # Once the kill switch has fired, report the outage instead of test data
        return "API nicht erreichbar", 0.0

    url = "https://www.wikidata.org/w/api.php"
    params = {"action": "wbsearchentities", "search": term, "language": "de", "format": "json"}
    data = request_with_retries("wikidata", url, params)
    if not data:
        return "API nicht erreichbar", 0.0

    results = []
    scores = []
    for entry in data.get("search", []):
        # wbsearchentities returns no numeric match score, so the original
        # lookup of match["score"] always fell back to 0.0; rate labels with
        # SequenceMatcher instead, mirroring query_gnd
        label = entry.get("label", "")
        conf = SequenceMatcher(None, term.lower(), label.lower()).ratio()
        if conf >= min_conf:
            results.append(label)
            scores.append(conf)
    if results:
        return ", ".join(results), max(scores)
    return "ohne Ergebnis", 0.0


# =========================
# Load input files
# =========================
def load_input_file(file_path):
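    """Read a CSV/XLSX/ODS file into a headerless DataFrame, or None on failure."""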
    try:
        suffix = file_path.suffix.lower()
        if suffix == ".ods":
            df = pd.read_excel(file_path, engine="odf", header=None)
        elif suffix == ".xlsx":
            df = pd.read_excel(file_path, engine="openpyxl", header=None)
        elif suffix == ".csv":
            df = pd.read_csv(file_path, header=None)
        else:
            log("WARNING", f"Unknown file format: {file_path.name}")
            return None
        return df
    except Exception as e:
        log("ERROR", f"Failed to load {file_path.name}: {e}")
        return None


# =========================
# Find the header row
# =========================
def find_header_row(df, keywords=("objektbeschreibung", "objekt/ebene")):
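    """Return (index, lower-cased cells) of the first row containing a keyword."""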
    for i, row in df.iterrows():
        row_lower = [str(cell).lower() if pd.notna(cell) else "" for cell in row]
        if any(kw in cell for kw in keywords for cell in row_lower):
            return i, row_lower
    return None, None


# =========================
# Processing
# =========================
def process_files():
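    """Extract terms from every input file, query both APIs, and write ODS results."""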
    all_terms = []
    output_rows = []

    for file_path in INPUT_DIR.glob("*"):
        if file_path.suffix.lower() not in (".csv", ".xlsx", ".ods"):
            continue
        log("INFO", f"Processing {file_path.name}")
        df = load_input_file(file_path)
        if df is None:
            continue

        header_idx, header_row = find_header_row(df)
        if header_idx is None:
            log("WARNING", f"No header row found in {file_path.name}")
            continue
        df.columns = header_row
        df = df.iloc[header_idx + 1:].reset_index(drop=True)

        col_objdesc = next((col for col in df.columns if "objektbeschreibung" in str(col).lower()), None)
        col_objlevel = next((col for col in df.columns if "objekt/ebene" in str(col).lower()), None)
        if col_objdesc is None:
            log("WARNING", f"No 'Objektbeschreibung' column in {file_path.name}")
            continue

        # Split comma-separated object descriptions into individual terms
        term_list = []
        obj_level_list = []
        for _, row in df.iterrows():
            terms = str(row[col_objdesc]) if pd.notna(row[col_objdesc]) else ""
            if not terms:
                continue
            for term in [t.strip() for t in terms.split(",") if t.strip()]:
                term_list.append(term)
                obj_level_list.append(row[col_objlevel] if col_objlevel and pd.notna(row[col_objlevel]) else "")

        # API queries
        gnd_results = []
        gnd_scores = []
        wikidata_results = []
        wikidata_scores = []

        for term in term_list:
            gnd_res, gnd_conf = query_gnd(term)
            wikidata_res, wd_conf = query_wikidata(term)
            gnd_results.append(gnd_res)
            gnd_scores.append(gnd_conf)
            wikidata_results.append(wikidata_res)
            wikidata_scores.append(wd_conf)

        for idx, term in enumerate(term_list):
            output_rows.append({
                "Begriff": term,
                "Quelle": file_path.name,
                "Objekt/Ebene": obj_level_list[idx],
                "GND": gnd_results[idx],
                "GND_Confidence": gnd_scores[idx],
                "Wikidata": wikidata_results[idx],
                "Wikidata_Confidence": wikidata_scores[idx]
            })
        all_terms.extend(term_list)

    # Main result file
    out_df = pd.DataFrame(output_rows)
    out_file = OUTPUT_DIR / "Auswertung_gesamt.ods"
    out_df.to_excel(out_file, index=False, engine="odf")
    log("INFO", f"Main results saved: {out_file}")

    # Raw term list with frequencies
    raw_terms = pd.Series(all_terms).value_counts().reset_index()
    raw_terms.columns = ["Begriff", "Häufigkeit"]
    raw_file = OUTPUT_DIR / "Rohbegriffe.ods"
    raw_terms.to_excel(raw_file, index=False, engine="odf")
    log("INFO", f"Raw terms saved: {raw_file}")


# =========================
# Main
# =========================
if __name__ == "__main__":
    if not INPUT_DIR.exists():
        log("CRITICAL", f"Input folder {INPUT_DIR} is missing!")
        sys.exit(1)
    process_files()
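
# Example invocations (the script file name below is assumed):
#   python vlg_api_multi.py             # live lookups against GND and Wikidata
#   python vlg_api_multi.py --dry-run   # simulated responses, no network access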