# 2025-10-10 09:46:41 +02:00
#
# 102 lines
# 3.6 KiB
# Python

import pandas as pd
import requests
import time
import os
def match_gnd(token, delay=0.3):
    """Query the lobid.org GND search for a keyword and return the first hit.

    Args:
        token: Search term, sent as the ``q`` query parameter.
        delay: Seconds to pause after the request so that successive calls
            are throttled. Values <= 0 disable the pause.

    Returns:
        A ``(preferredName, gndIdentifier)`` tuple taken from the first entry
        of the response's ``member`` list, or ``(None, None)`` when the
        request fails, the status is not 200, or there are no members.
    """
    try:
        # Let requests build the query string so that characters such as
        # '&', '#', '+' or '=' inside the token are escaped instead of being
        # interpreted as URL syntax.
        resp = requests.get(
            "https://lobid.org/gnd/search",
            params={"q": token, "format": "json"},
            timeout=5,
        )
        if resp.status_code == 200:
            data = resp.json()
            members = data.get('member') if isinstance(data, dict) else None
            if members:
                first = members[0]
                return first.get('preferredName'), first.get('gndIdentifier')
    except Exception as e:
        # Best-effort lookup: report the problem and fall through to "no match".
        print(f"Fehler bei GND-Abfrage für '{token}': {e}")
    finally:
        # Throttle after every request, including successful ones, which
        # previously returned before the pause was reached.
        if delay and delay > 0:
            time.sleep(delay)
    return None, None
def load_exlibris_refs(path):
    """Read the ex-libris CSV and collapse its rows into one record per object.

    A row with a non-empty value in the ``0`` scan-level column starts a new
    object. Following rows without such a value only contribute additional
    scan references to the object that is currently open; rows that appear
    before the first object are ignored.

    For every object:

    * an empty ``Inventarnummer`` is replaced by a running placeholder
      (``PL-0001``, ``PL-0002``, ...);
    * ``Objektbeschreibung`` is split on commas and each token is looked up
      with ``match_gnd`` until one yields a GND identifier.

    Args:
        path: CSV source, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame with one row per object, restricted to the known core
        fields that are present, in a fixed order. ``ScanReferenzen`` holds a
        list of ``{'level': ..., 'scan_ref': ...}`` dicts.
    """
    level_names = ('0', '1', '2', '3', '4')

    df = pd.read_csv(path, dtype=str, header=0)

    # Columns are addressed by exact name below, so normalise stray
    # whitespace in the header once instead of stripping in only some places.
    df.columns = [str(c).strip() for c in df.columns]

    # First column without a header -> "Kürzel". pandas labels a blank header
    # cell "Unnamed: <n>" rather than '', so both spellings must be checked.
    first_col = df.columns[0]
    if first_col == '' or first_col.startswith('Unnamed:'):
        df = df.rename(columns={first_col: 'Kürzel'})

    df = df.fillna('')

    # Scan-level columns vs. descriptive (core) columns.
    level_cols = [c for c in df.columns if c in level_names]
    core_cols = [c for c in df.columns if c not in level_names]
    has_level_0 = '0' in level_cols

    obj_list = []
    current_obj = None
    placeholder_counter = 1

    for _, row in df.iterrows():
        row_refs = [
            {'level': c, 'scan_ref': row[c].strip()}
            for c in level_cols
            if row[c].strip()
        ]

        starts_object = has_level_0 and bool(row['0'].strip())
        if not starts_object:
            # Continuation row: attach its scans to the open object, if any.
            if current_obj is not None:
                current_obj['ScanReferenzen'].extend(row_refs)
            continue

        # A new object begins; flush the previous one first.
        if current_obj is not None:
            obj_list.append(current_obj)

        current_obj = {col: row[col] for col in core_cols}

        # Placeholder inventory number when none is given.
        if not current_obj.get('Inventarnummer', '').strip():
            current_obj['Inventarnummer'] = f'PL-{placeholder_counter:04d}'
            placeholder_counter += 1

        # GND matching: the first token that resolves to an identifier wins.
        gnd_name = gnd_id = None
        for token in current_obj.get('Objektbeschreibung', '').split(','):
            token = token.strip()
            if not token:
                continue
            name, gid = match_gnd(token)
            if gid:
                gnd_name, gnd_id = name, gid
                break
        current_obj['GND_Name'] = gnd_name
        current_obj['GND_ID'] = gnd_id
        current_obj['ScanReferenzen'] = row_refs

    if current_obj is not None:
        obj_list.append(current_obj)

    out_df = pd.DataFrame(obj_list)
    core_fields = ['Kürzel', 'Inventarnummer', 'Standort', 'Jahr', 'Urheber', 'Eigner',
                   'Objektbeschreibung', 'Material', 'Maße (in cm)',
                   'Objekttyp', 'Inschrift', 'Anmerkungen', 'ScanReferenzen',
                   'GND_Name', 'GND_ID']
    available = [c for c in core_fields if c in out_df.columns]
    return out_df[available]
# ====================
# Main script
# ====================
if __name__ == "__main__":
    output_file = "Testergebnis.csv"

    # Look for CSV files in the current directory. The result file of an
    # earlier run lives in the same directory and must not be picked up as
    # input; sorting makes the "first" file deterministic, because
    # os.listdir() returns entries in arbitrary order.
    csv_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.csv') and f.lower() != output_file.lower()
    )
    if not csv_files:
        print("Keine CSV-Datei im aktuellen Ordner gefunden.")
        # exit() is an interactive helper injected by the site module and is
        # not guaranteed to exist; SystemExit always is.
        raise SystemExit(1)

    # Take the first CSV found.
    input_csv = csv_files[0]
    print(f"Verwende CSV-Datei: {input_csv}")

    df = load_exlibris_refs(input_csv)

    # Store the prepared data.
    df.to_csv(output_file, index=False)
    print(f"Aufbereitete Daten gespeichert als {output_file}")