import os
import sys
import re
import time
import json
import pandas as pd
import requests
from pathlib import Path
from collections import defaultdict
from difflib import SequenceMatcher

# RapidFuzz for token-based fuzzy matching
try:
    from rapidfuzz import fuzz
    RAPIDFUZZ_AVAILABLE = True
    print("RapidFuzz verfügbar")
except ImportError:
    RAPIDFUZZ_AVAILABLE = False
    print("RapidFuzz nicht verfügbar – nutze SequenceMatcher")

# spaCy lemmatizer (optional; without it terms are only normalised)
try:
    import spacy
    nlp = spacy.load("de_core_news_sm")
    SPACY_AVAILABLE = True
    print("Spacy Lemmatizer aktiviert")
except Exception:
    SPACY_AVAILABLE = False
    nlp = None
    print("Spacy nicht verfügbar – nutze naive Stemmer")

# =========================
# Paths & config
# =========================
INPUT_DIR = Path("Input CSV")
OUTPUT_DIR = Path("Auswertung Ergebnisse")
OUTPUT_DIR.mkdir(exist_ok=True)
NORMVOC_FILE = Path("Input CSV/Normvokabular_INTERN/NV_MASTER.ods")
CACHE_FILE = "api_cache.json"

STOPWORDS = {"mit", "ohne", "der", "die", "das", "ein", "eine", "und", "zu", "von", "im", "in",
             "auf", "an", "als", "bei", "für", "aus", "dem", "den", "des", "eines", "einer"}

CONF_THRESHOLD = 0.75
TIMEOUT = 5
MAX_RETRIES = 3
BACKOFF_FACTOR = 2
HEADERS = {"User-Agent": "VLG_API_multi/1.0 (projekt@example.com)"}

API_ACTIVE = {"gnd": True, "wikidata": True}
FAIL_COUNTER = {"gnd": 0, "wikidata": 0}

# API response cache (persisted as JSON between runs)
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        CACHE = json.load(f)
else:
    CACHE = {}


def save_cache():
    with open(CACHE_FILE, "w", encoding="utf-8") as f:
        json.dump(CACHE, f, indent=2, ensure_ascii=False)


# =========================
# Normalisation / lemma
# =========================
def normalize_text(s):
    if not s:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r"[\(\)\[\]\"'\\.,;:\?!]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s


# Lemma cache
lemma_cache = {}


def lemmatize_term(term):
    term_norm = normalize_text(term)
    if term_norm in lemma_cache:
        return lemma_cache[term_norm]
    if SPACY_AVAILABLE and nlp:
        doc = nlp(term_norm)
        lemma = " ".join([token.lemma_ for token in doc])
    else:
        lemma = term_norm
    lemma_cache[term_norm] = lemma
    return lemma


# =========================
# Compound splitting (simple approach)
# =========================
def compound_split(term):
    # Naive split on capitalised components; returns the term itself if nothing matches.
    parts = re.findall(r"[A-ZÄÖÜ][a-zäöü]+", term)
    return parts if parts else [term]


# =========================
# Load controlled vocabulary & prepare lemmas
# =========================
def load_normvokabular(file_path):
    sheets = pd.read_excel(file_path, sheet_name=None,
                           engine="odf" if file_path.suffix.lower() == ".ods" else None)
    norm_dict = {}
    stem_index = defaultdict(list)
    lemma_norm_map = {}  # one representative entry per lemma (used for fuzzy suggestions)
    for sheet_name, df in sheets.items():
        if sheet_name.lower() in ["master", "übersicht"]:
            continue
        df = df.dropna(how="all", axis=1)
        df.columns = [str(c).strip() for c in df.columns]
        id_col = next((c for c in df.columns if "ID" in c), None)
        word_col = next((c for c in df.columns if "Wort" in c or "Vokabel" in c), None)
        if not id_col or not word_col:
            continue
        current_parent_id = None
        for _, row in df.iterrows():
            row_id = str(row[id_col]).strip() if pd.notna(row[id_col]) else None
            row_word = str(row[word_col]).strip() if pd.notna(row[word_col]) else None
            # Rows without an own ID inherit the last seen ID as their parent.
            if row_id:
                current_parent_id = row_id
            if not row_word:
                continue
            assigned_parent_id = current_parent_id
            entry = {
                "Name": row_word,
                "ID": assigned_parent_id,   # parent ID
                "Sheet": sheet_name,
                "Own_ID": row_id or ""      # own ID, if present
            }
            key = normalize_text(row_word)
            norm_dict[key] = entry
            lemma = lemmatize_term(key)
            stem_index[lemma].append(entry)
            if lemma not in lemma_norm_map:
                lemma_norm_map[lemma] = entry
    return norm_dict, stem_index, lemma_norm_map
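
# --- Illustration only: parent-ID inheritance in load_normvokabular ---------
# A minimal sketch with hypothetical sheet data (the IDs, words and the helper
# name _demo_parent_id_inheritance are invented for this example). It mirrors
# the loop above: a row whose ID cell is empty inherits the last seen ID as
# its parent. The function is never called by the pipeline.
def _demo_parent_id_inheritance():
    demo_sheet = pd.DataFrame({
        "ID":   ["M-01", None,         "M-02"],
        "Wort": ["Holz", "Eichenholz", "Metall"],
    })
    current_parent_id = None
    entries = []
    for _, row in demo_sheet.iterrows():
        row_id = str(row["ID"]).strip() if pd.notna(row["ID"]) else None
        if row_id:
            current_parent_id = row_id
        entries.append({"Name": row["Wort"], "ID": current_parent_id, "Own_ID": row_id or ""})
    # "Eichenholz" ends up with ID "M-01" (inherited) and an empty Own_ID.
    return entries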
# =========================
# Mapping & suggestions
# =========================
def map_to_norm(term, norm_dict, stem_index, lemma_norm_map, top_n=3):
    term_norm = normalize_text(term)
    term_lemma = lemmatize_term(term)
    # Exact match
    if term_norm in norm_dict:
        e = norm_dict[term_norm]
        return e["Name"], e["ID"], []
    # Lemma match
    if term_lemma in stem_index:
        e = stem_index[term_lemma][0]
        return e["Name"], e["ID"], []
    # No hit → try a compound split
    tokens = compound_split(term)
    if len(tokens) == 1:
        suggestions = get_suggestions(term_lemma, lemma_norm_map, top_n)
        return "KEIN TREFFER", "", suggestions
    else:
        token_matches = []
        for t in tokens:
            t_lemma = lemmatize_term(t)
            if t_lemma in stem_index:
                e = stem_index[t_lemma][0]
                token_matches.append((t, e["Name"], e["ID"]))
            else:
                sugg = get_suggestions(t_lemma, lemma_norm_map, top_n)
                token_matches.append((t, "KEIN TREFFER", "", sugg))
        # Matched components are reported as suggestions for the full term.
        combined_suggestions = [f"{m[1]} ({m[2]})" for m in token_matches if m[1] != "KEIN TREFFER"]
        return "KEIN TREFFER", "", combined_suggestions


def get_suggestions(term_lemma, lemma_norm_map, top_n=3, threshold=CONF_THRESHOLD):
    candidates = []
    for key_lemma, entry in lemma_norm_map.items():
        if RAPIDFUZZ_AVAILABLE:
            score = fuzz.token_set_ratio(term_lemma, key_lemma) / 100
        else:
            score = SequenceMatcher(None, term_lemma.lower(), key_lemma.lower()).ratio()
        # Small bonus if the vocabulary lemma starts with the query lemma.
        if key_lemma.lower().startswith(term_lemma.lower()):
            score = min(score + 0.1, 1.0)
        if score >= threshold:
            candidates.append((score, entry["Name"], entry["ID"]))
    candidates.sort(reverse=True)
    return [f"{name} ({id_})" for _, name, id_ in candidates[:top_n]]


# =========================
# API queries
# =========================
def request_with_retries(api_name, url, params=None):
    cache_key = url + str(params)
    if cache_key in CACHE:
        return CACHE[cache_key]
    retries = 0
    while retries < MAX_RETRIES:
        try:
            r = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
            if r.status_code == 200:
                try:
                    data = r.json()
                except ValueError:
                    data = r.text
                CACHE[cache_key] = data
                FAIL_COUNTER[api_name] = 0
                return data
        except requests.RequestException:
            pass
        retries += 1
        # Exponential backoff, capped at 30 seconds.
        time.sleep(min(BACKOFF_FACTOR ** retries, 30))
    # After 10 consecutive failed requests the API is disabled for this run.
    FAIL_COUNTER[api_name] += 1
    if FAIL_COUNTER[api_name] >= 10:
        API_ACTIVE[api_name] = False
    return None


def batch_query_gnd(terms):
    results = {}
    if not API_ACTIVE.get("gnd", False):
        for t in terms:
            results[t] = ""
        return results
    for t in terms:
        url = "https://lobid.org/gnd/search"
        params = {"q": t, "format": "json"}
        data = request_with_retries("gnd", url, params)
        top = ""
        if data and "member" in data:
            cands = [(doc.get("preferredName", "") or doc.get("name", ""),
                      SequenceMatcher(None, t.lower(),
                                      (doc.get("preferredName", "") or doc.get("name", "")).lower()).ratio())
                     for doc in data["member"]
                     if doc.get("preferredName", "") or doc.get("name", "")]
            cands = [c for c in cands if c[1] >= 0.75]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results


def batch_query_wikidata(terms):
    results = {}
    if not API_ACTIVE.get("wikidata", False):
        for t in terms:
            results[t] = ""
        return results
    for t in terms:
        url = "https://www.wikidata.org/w/api.php"
        params = {"action": "wbsearchentities", "search": t, "language": "de", "format": "json"}
        data = request_with_retries("wikidata", url, params)
        top = ""
        if data and "search" in data:
            cands = [(e.get("label", ""),
                      SequenceMatcher(None, t.lower(), e.get("label", "").lower()).ratio())
                     for e in data["search"] if e.get("label", "")]
            cands = [c for c in cands if c[1] >= 0.70]
            if cands:
                top = sorted(cands, key=lambda x: x[1], reverse=True)[0][0]
        results[t] = top
    return results
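
# --- Illustration only: the matching cascade of map_to_norm -----------------
# A minimal sketch with a hypothetical one-entry vocabulary (the entry
# "Schachtel", ID "V-07", and the helper name _demo_matching_cascade are
# invented for this example). Exact and lemma matches return the entry
# directly; a term like "Holzschachtel" has no internal capital, so
# compound_split() returns it unchanged and get_suggestions() may propose
# "Schachtel (V-07)" if the fuzzy score reaches CONF_THRESHOLD. The function
# is never called by the pipeline.
def _demo_matching_cascade():
    entry = {"Name": "Schachtel", "ID": "V-07", "Sheet": "Demo", "Own_ID": "V-07"}
    key = normalize_text(entry["Name"])
    demo_norm_dict = {key: entry}
    demo_stem_index = defaultdict(list)
    demo_stem_index[lemmatize_term(key)].append(entry)
    demo_lemma_map = {lemmatize_term(key): entry}
    exact_hit = map_to_norm("Schachtel", demo_norm_dict, demo_stem_index, demo_lemma_map)
    fuzzy_only = map_to_norm("Holzschachtel", demo_norm_dict, demo_stem_index, demo_lemma_map)
    return exact_hit, fuzzy_only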
# =========================
# Highlighting / export
# =========================
def mark_norm_hits(file_path):
    ext = file_path.suffix.lower()
    if ext in [".xlsx", ".xls"]:
        # Excel output: colour the "Norm_Treffer" column green (hit) or red (no hit).
        from openpyxl import load_workbook
        from openpyxl.styles import PatternFill
        wb = load_workbook(file_path)
        ws = wb.active
        green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        col_map = {cell.value: idx + 1 for idx, cell in enumerate(ws[1])}
        norm_col = col_map.get("Norm_Treffer", None)
        if not norm_col:
            print("Spalte 'Norm_Treffer' nicht gefunden")
            return
        for row in ws.iter_rows(min_row=2, min_col=norm_col, max_col=norm_col):
            cell = row[0]
            if cell.value and cell.value != "KEIN TREFFER":
                cell.fill = green_fill
            else:
                cell.fill = red_fill
        wb.save(file_path)
    elif ext == ".ods":
        # ODS output: no cell fills are written here; add a textual status column instead.
        df = pd.read_excel(file_path, engine="odf")
        df["Norm_Status"] = df["Norm_Treffer"].apply(
            lambda x: "Treffer" if pd.notna(x) and str(x).strip() and x != "KEIN TREFFER" else "Kein Treffer")
        df.to_excel(file_path, index=False, engine="odf")
# =========================
# Processing of input files
# =========================
def process_files():
    norm_dict, stem_index, lemma_norm_map = load_normvokabular(NORMVOC_FILE)
    total_terms = 0
    total_hits = 0
    if not INPUT_DIR.exists():
        print(f"Eingabeordner {INPUT_DIR} fehlt")
        sys.exit(1)
    files = list(INPUT_DIR.glob("*"))
    if not files:
        print("Keine Dateien gefunden")
        return
    for file_path in files:
        if file_path.suffix.lower() not in [".csv", ".ods", ".xls", ".xlsx"]:
            continue
        print(f"Verarbeite Datei: {file_path.name}")
        try:
            if file_path.suffix.lower() == ".csv":
                df = pd.read_csv(file_path)
            else:
                df = pd.read_excel(file_path,
                                   engine="odf" if file_path.suffix.lower() == ".ods" else None)
        except Exception as e:
            print(f"Fehler beim Lesen von {file_path.name}: {e}")
            continue
        df = df.dropna(how="all")
        df.columns = [str(c).strip() for c in df.columns]
        besch_col = next((c for c in df.columns if "Objektbeschreibung" in c), None)
        box_col = next((c for c in df.columns if "Objekt/Ebene" in c), None)
        urh_col = next((c for c in df.columns if "Urheber" in c), None)
        if not besch_col:
            continue
        # Split each description at commas, then at whitespace; drop stopwords and pure numbers.
        row_terms_map = []
        for _, row in df.iterrows():
            besch = str(row[besch_col]).strip() if pd.notna(row[besch_col]) else ""
            if not besch:
                continue
            obj_box = row[box_col] if box_col else ""
            urheber = row[urh_col] if urh_col else ""
            clauses = [c.strip() for c in re.split(r",", besch) if c.strip()]
            terms = []
            for clause in clauses:
                parts = [p.strip() for p in re.split(r"\s+", clause) if p.strip()]
                for p in parts:
                    if p.lower() in STOPWORDS:
                        continue
                    if re.fullmatch(r"\d+", p):
                        continue
                    terms.append(p)
            row_terms_map.append((obj_box, urheber, terms))
        # Query GND and Wikidata once per unique term.
        all_terms = list({t for _, _, terms in row_terms_map for t in terms})
        gnd_results = batch_query_gnd(all_terms)
        wd_results = batch_query_wikidata(all_terms)
        output_rows = []
        for obj_box, urheber, terms in row_terms_map:
            for term in terms:
                norm_name, norm_id, suggestions = map_to_norm(term, norm_dict, stem_index, lemma_norm_map)
                total_terms += 1
                if norm_name != "KEIN TREFFER":
                    total_hits += 1
                out_row = {
                    "Box": obj_box,
                    "Objekt/Ebene": obj_box,
                    "Urheber": urheber,
                    "Begriff": term,
                    "Norm_Treffer": norm_name,
                    "Norm_ID": norm_id,
                    "Norm_Vorschlag": ", ".join(suggestions) if suggestions else "",
                    "GND_Top1": gnd_results.get(term, ""),
                    "WD_Top1": wd_results.get(term, "")
                }
                output_rows.append(out_row)
        out_df = pd.DataFrame(output_rows)
        # to_excel cannot write real .csv/.xls files, so fall back to .xlsx for those inputs.
        out_suffix = file_path.suffix.lower() if file_path.suffix.lower() in (".ods", ".xlsx") else ".xlsx"
        output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}{out_suffix}"
        version = 1
        while output_file.exists():
            output_file = OUTPUT_DIR / f"Auswertung_{file_path.stem}_({version}){out_suffix}"
            version += 1
        engine = "odf" if output_file.suffix.lower() == ".ods" else None
        out_df.to_excel(output_file, index=False, engine=engine)
        mark_norm_hits(output_file)
        print(f"Auswertung gespeichert: {output_file}")
    save_cache()
    print(f"Gesamt: {total_terms} Begriffe, {total_hits} Treffer im Normvokabular")


# =========================
# Main
# =========================
if __name__ == "__main__":
    process_files()
    print("Fertig")
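
# --- Usage notes (sketch; paths are taken from the config block above) ------
# Expected layout relative to the working directory:
#   Input CSV/                                    input files (.csv, .ods, .xls, .xlsx)
#   Input CSV/Normvokabular_INTERN/NV_MASTER.ods  controlled vocabulary (sheets "Master"/"Übersicht" are skipped)
#   Auswertung Ergebnisse/                        created automatically, receives the results
# Input files need a column whose name contains "Objektbeschreibung"; columns
# containing "Objekt/Ebene" and "Urheber" are optional. GND/Wikidata responses
# are cached in api_cache.json next to the script, so repeated runs stay fast.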