"""
ICAC Data Fetcher v2.2 — Scraping Web Systématique
Fix critique : filtre INSEE/CP strict pour RNA et assoce.fr.
Fouille web PRIMAIRE pour toute requête, RAG secondaire.
v2.2: Intégration subventions 2025 + audit mandats.

Sources:
  - besseges.fr (toujours)
  - RNA / data.gouv.fr (associations)
  - Journal Officiel Associations
  - Annuaire Entreprises data.gouv.fr
  - Pappers (dirigeants)
  - Aides-Territoires
  - BOAMP (marchés publics)
  - RNE (élus)
  - DGFiP: data.economie.gouv.fr (balances comptables)
  - SIRENE: recherche-entreprises.api.gouv.fr
"""

import asyncio
import json
import logging
import re
import time
import unicodedata
from pathlib import Path
from typing import Optional
from urllib.parse import quote_plus

import httpx
from bs4 import BeautifulSoup

log = logging.getLogger("icac.data_fetcher")

BASE_DIR = Path(__file__).parent.parent
CACHE_DIR = BASE_DIR / "cache" / "data"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# INSEE → SIREN mapping (known)
INSEE_SIREN = {
    "30034": "213000375",
}

# INSEE officiel → code commune API recherche-entreprises (SIRENE)
# Le code commune SIRENE peut différer du code INSEE administratif
INSEE_TO_SIRENE_COMMUNE = {
    "30034": "30037",   # Bessèges : INSEE 30034, SIRENE commune 30037
}


def _cache_get(key: str, max_age: int) -> Optional[str]:
    path = CACHE_DIR / f"{key}.json"
    if not path.exists():
        return None
    try:
        data = json.loads(path.read_text())
        if time.time() - data.get("ts", 0) > max_age:
            return None
        return data.get("content", "")
    except Exception:
        return None


def _cache_set(key: str, content: str):
    path = CACHE_DIR / f"{key}.json"
    path.write_text(json.dumps({"ts": time.time(), "content": content}, ensure_ascii=False))


# ════════════════════════════════════════════════════════════
#  UTILITAIRES PARTAGÉS
# ════════════════════════════════════════════════════════════

async def _scrape_clean(url: str, max_chars: int = 2000) -> str:
    """Download *url* and return its cleaned text content.

    Returns an empty string when the response status is not 200 or when
    any exception occurs (the exception is logged as a warning).
    """
    client_options = {
        "timeout": 12,
        "headers": {"User-Agent": "Mozilla/5.0 (compatible; ICAC/1.0; mairie-bot)"},
        "follow_redirects": True,
    }
    try:
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(url)
            if response.status_code == 200:
                return _scrape_clean_from_html(response.text, max_chars)
            return ""
    except Exception as exc:
        log.warning("_scrape_clean %s: %s", url, exc)
        return ""


def _scrape_clean_from_html(html: str, max_chars: int = 2000) -> str:
    """Strip layout/script elements from *html* and return plain text.

    The text is whitespace-joined, runs of three or more whitespace
    characters are collapsed to two spaces, and the result is truncated
    to *max_chars* characters.
    """
    noise_tags = [
        "nav", "header", "footer",
        "script", "style", "aside",
        "form", "button",
    ]
    document = BeautifulSoup(html, "html.parser")
    for node in document.find_all(noise_tags):
        node.decompose()
    flattened = document.get_text(" ", strip=True)
    collapsed = re.sub(r"\s{3,}", "  ", flattened)
    return collapsed[:max_chars]


def _extract_entity(query: str) -> Optional[str]:
    """Extrait le premier nom propre significatif."""
    stopwords = {
        "Quels", "Quelle", "Quelles", "Comment", "Combien",
        "Bessèges", "Commune", "Mairie", "Donnez", "Donne",
        "Trouver", "Chercher", "Quel", "Qui", "Est", "Sont",
        "Association", "Club", "Entreprise", "Société",
        "Liste", "Rechercher", "Historique", "Les", "Des", "Sur",
    }
    names = re.findall(
        r'\b[A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ][a-zàâéèêëîïôùûüç]+'
        r'(?:\s+[A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ][a-zàâéèêëîïôùûüç]+)*\b',
        query
    )
    entities = [n for n in names if n not in stopwords]
    return entities[0] if entities else None


# ════════════════════════════════════════════════════════════
#  FONCTIONS DE SCRAPING SPÉCIALISÉES
# ════════════════════════════════════════════════════════════

async def _fetch_besseges_search(query: str, commune: str) -> dict:
    """Query the internal search engine of besseges.fr (always called).

    Args:
        query: Raw user query; it is URL-encoded before being sent so that
            characters such as ``&``, ``#`` or ``%`` cannot corrupt the
            search parameter.
        commune: Unused; kept for signature compatibility with callers.

    Returns:
        A source dict with ``domain``, ``text`` and ``url``. When the
        search page yields fewer than 50 characters, the home page is
        scraped instead and ``url`` points to the home page, so that the
        reported URL always matches the page the text came from.
    """
    url = f"https://www.besseges.fr/?s={quote_plus(query)}"
    txt = await _scrape_clean(url, max_chars=3000)
    if not txt or len(txt) < 50:
        # Fallback: home page (and report that URL, not the search URL).
        url = "https://www.besseges.fr"
        txt = await _scrape_clean(url, max_chars=2000)
    return {
        "domain": "besseges.fr",
        "text":   txt,
        "url":    url
    }


# Lower-cased dirigeant roles accepted as the leader shown in the summary.
_RNA_LEADER_ROLES = frozenset({"président", "president", "gérant", "directeur"})


def _rna_fold(value) -> str:
    """Return *value* upper-cased without diacritics ('Bessèges' -> 'BESSEGES')."""
    decomposed = unicodedata.normalize("NFKD", str(value or ""))
    return "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).upper()


def _rna_siege_in_commune(item: dict, commune: str, cp: str) -> bool:
    """Tell whether the head office of *item* is in the configured commune."""
    siege = item.get("siege") or {}
    wanted_cp = str(cp or "")
    if wanted_cp and str(siege.get("code_postal") or "") == wanted_cp:
        return True
    # Accent-insensitive name comparison replaces the former hard-coded
    # "BESS" substring test, which also accepted unrelated communes.
    wanted_name = _rna_fold(commune)
    return bool(wanted_name) and wanted_name in _rna_fold(
        siege.get("libelle_commune")
    )


def _rna_leader(item: dict) -> str:
    """Return 'first-name last-name' of the first leader-like dirigeant, or ''."""
    for d in item.get("dirigeants") or []:
        if (d.get("qualite") or "").lower() not in _RNA_LEADER_ROLES:
            continue
        # NOTE(review): the API appears to expose first names as `prenoms`
        # and legal entities as `denomination`; the original `prenom`/`nom`
        # keys are kept as fallbacks — confirm against the API schema.
        first = d.get("prenoms") or d.get("prenom") or ""
        last = d.get("nom") or d.get("denomination") or ""
        return f"{first} {last}".strip()
    return ""


def _rna_format_item(item: dict) -> str:
    """Render one API result as the multi-line text block used in sources."""
    siege = item.get("siege") or {}
    adresse = siege.get("geo_adresse") or siege.get("adresse") or ""
    etat = "Active" if item.get("etat_administratif") == "A" else "Dissoute"
    rna_id = (item.get("complements") or {}).get("identifiant_association") or ""
    return (
        f"■ {item.get('nom_complet') or ''}\n"
        f"  RNA      : {rna_id or 'N/A'}\n"
        f"  SIREN    : {item.get('siren') or ''}\n"
        f"  Adresse  : {adresse}\n"
        f"  Création : {item.get('date_creation') or ''}\n"
        f"  État     : {etat}\n"
        f"  Président: {_rna_leader(item) or 'N/A'}"
    )


async def _fetch_rna_besseges(commune_cfg: dict) -> dict:
    """List the associations registered in the configured commune.

    Queries recherche-entreprises.api.gouv.fr with the SIRENE commune code
    (which may differ from the administrative INSEE code, see
    ``INSEE_TO_SIRENE_COMMUNE``) and ``nature_juridique=9220``, then keeps
    only the results whose head office matches the configured postal code
    or commune name.

    Args:
        commune_cfg: Commune settings; reads ``insee``, ``cp`` and
            ``nom_court`` (defaults: Bessèges values).

    Returns:
        A source dict with ``domain``, ``text`` and ``url``. ``text`` is
        empty when nothing matched or when the request failed (failures
        are logged, never raised).
    """
    insee = commune_cfg.get("insee", "30034")
    cp = commune_cfg.get("cp", "30160")
    commune = commune_cfg.get("nom_court", "Bessèges")

    # Use the SIRENE commune code, not the administrative INSEE code.
    sirene_commune = INSEE_TO_SIRENE_COMMUNE.get(insee, insee)

    url = (
        f"https://recherche-entreprises.api.gouv.fr/search"
        f"?code_commune={sirene_commune}"
        f"&nature_juridique=9220"
        f"&page=1&per_page=25"
        f"&mtm_campaign=icac"
    )

    results_text = []

    try:
        async with httpx.AsyncClient(
            timeout=8, follow_redirects=True,
            headers={"User-Agent": "ICAC/2.1 mairie-bot"}
        ) as client:
            r = await client.get(url)
            if r.status_code == 200:
                items = r.json().get("results") or []

                log.info(
                    "RNA API: %d résultats bruts pour commune SIRENE %s (INSEE %s)",
                    len(items), sirene_commune, insee
                )

                # Post-API filter: keep only head offices in the commune.
                filtered = []
                for item in items:
                    if _rna_siege_in_commune(item, commune, cp):
                        filtered.append(item)
                    else:
                        siege = item.get("siege") or {}
                        log.debug(
                            "RNA exclu (siège hors commune): %s → %s %s",
                            item.get("nom_complet", "?"),
                            siege.get("libelle_commune", ""),
                            siege.get("code_postal", "")
                        )

                log.info(
                    "RNA filtrés Bessèges: %d/%d",
                    len(filtered), len(items)
                )

                results_text = [_rna_format_item(item) for item in filtered]
            else:
                log.warning("RNA API: HTTP %s", r.status_code)

    except Exception as e:
        # Network/JSON boundary: stay best-effort and return an empty text.
        log.warning("RNA API: %s", e)

    return {
        "domain": "RNA/recherche-entreprises.api.gouv.fr",
        "text": (
            f"{len(results_text)} associations à "
            f"{commune} ({cp}) :\n\n"
            + "\n\n".join(results_text)
        ) if results_text else "",
        "url": url
    }


async def _fetch_jo_association(query: str, commune: str) -> dict:
    """Search the Journal Officiel association notices.

    The search term is the entity extracted from *query*, or *commune*
    when no entity is found. It is URL-encoded so that accents and
    reserved characters (``&``, ``#``...) reach the server intact.

    Returns:
        A source dict (``domain``, ``text``, ``url``) when the page yields
        more than 80 characters of text, otherwise an empty dict.
    """
    name = _extract_entity(query) or commune
    url = (
        "https://www.journal-officiel.gouv.fr"
        "/pages/associations-recherche/"
        f"?fullText={quote_plus(name)}"
        f"&typeAnnonce=fond"
    )
    txt = await _scrape_clean(url, max_chars=2000)
    if txt and len(txt) > 80:
        return {
            "domain": "journal-officiel.gouv.fr",
            "text":   txt,
            "url":    url
        }
    return {}


# Building blocks for a capitalised person name ("Jean Dupont",
# "Jean-Pierre DUPONT"): every word must START with an upper-case letter.
_ASSOCE_UP = "A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ"
_ASSOCE_LOW = "a-zàâéèêëîïôùûüç"
_ASSOCE_WORD = (
    rf"[{_ASSOCE_UP}][{_ASSOCE_UP}{_ASSOCE_LOW}]+"
    rf"(?:-[{_ASSOCE_UP}][{_ASSOCE_UP}{_ASSOCE_LOW}]+)*"
)
_ASSOCE_PERSON = rf"({_ASSOCE_WORD}(?:\s+{_ASSOCE_WORD})+)"


def _parse_assoce_page(soup, entity: str, commune: str, cp: str = "30160") -> list:
    """Extract association records from an assoce.fr listing page.

    A block is kept when its text mentions Bessèges (name, 30160 or 30034)
    or, when *entity* is given, when it contains the entity name
    (case-insensitive), which deliberately relaxes the geographic filter.

    Args:
        soup: Parsed page (BeautifulSoup document).
        entity: Optional entity name extracted from the user query.
        commune: Currently unused; the geographic markers are hard-coded.
        cp: Currently unused, for the same reason.

    Returns:
        A list of dicts with the keys found among ``nom``, ``creation``,
        ``statut``, ``rna``, ``objet``, ``dirigeant`` and ``url_fiche``.
        Records without a name longer than 3 characters are dropped.
    """
    assos = []
    entity_low = entity.lower() if entity else None

    # Primary structure on assoce.fr: <li class="block"> elements.
    blocks = soup.find_all("li", class_="block")

    # Fallback: containers of links pointing to a WALDEC record.
    if not blocks:
        blocks = []
        for a in soup.find_all("a", href=re.compile(r"waldec")):
            parent = a.find_parent(["div", "article", "li", "tr"])
            if parent and parent not in blocks:
                blocks.append(parent)

    for block in blocks:
        text = block.get_text(" ", strip=True)

        # Strict filter: Bessèges / 30160 / 30034 only.
        is_besseges = any(x in text for x in [
            "Bessèges", "BESSÈGES", "BESSEGES",
            "30160", "30034"
        ])

        # A specific entity search is less strict on geography.
        is_entity_match = (
            entity_low and
            entity_low in text.lower()
        )

        if not is_besseges and not is_entity_match:
            continue

        asso = {}

        # Name: styled <span> (assoce.fr structure) or a heading/link.
        name_span = block.find("span", style=True)
        if name_span:
            asso["nom"] = name_span.get_text(strip=True)[:100]
        else:
            nom_tag = (
                block.find(["h2", "h3", "h4"]) or
                block.find("strong") or
                block.find("a", href=re.compile(r"waldec"))
            )
            if nom_tag:
                asso["nom"] = nom_tag.get_text(strip=True)[:100]

        # Dates from the aoeData <div>.
        data_div = block.find("div", class_="aoeData")
        if data_div:
            dp = data_div.find("span", class_="dpubli-token")
            if dp:
                asso["creation"] = dp.get_text(strip=True)
            ddisso = data_div.get("data-ddisso", "")
            # "0001-01-01" is treated as "no dissolution date".
            if ddisso and ddisso != "0001-01-01":
                asso["statut"] = "dissoute"
            else:
                asso["statut"] = "active"

        # RNA identifier.
        rna_m = re.search(r'W\d{9}', text)
        if rna_m:
            asso["rna"] = rna_m.group(0)

        # Purpose: third direct <div> child when present.
        divs = block.find_all("div", recursive=False)
        if len(divs) > 2:
            objet_text = divs[2].get_text(strip=True)
            if objet_text and "Fiche" not in objet_text:
                asso["objet"] = objet_text[:250]

        if not asso.get("objet"):
            for label in ["objet", "but", "activité", "mission"]:
                # Word boundaries stop "but" from matching inside words
                # such as "début" or "attribut"; plural labels still match.
                m = re.search(
                    rf"\b(?:{label})s?\b\s*[:\-]?\s*(.{{15,300}})",
                    text, re.IGNORECASE
                )
                if m:
                    asso["objet"] = m.group(1)[:250]
                    break

        # Leader: only the label is case-insensitive. A global IGNORECASE
        # flag would also relax the capitalised-name classes and capture
        # ordinary phrases ("de la commune ...") as a person name.
        for label in ["président", "présidente", "dirigeant", "représentant"]:
            m = re.search(
                rf"\b(?i:{label})\b\s*[:\-]?\s*{_ASSOCE_PERSON}",
                text
            )
            if m:
                asso["dirigeant"] = m.group(1).strip()
                break

        # URL of the detailed record.
        a_tag = block.find("a", href=re.compile(r"waldec"))
        if a_tag:
            href = a_tag["href"]
            asso["url_fiche"] = (
                href if href.startswith("http")
                # Tolerate relative hrefs with or without a leading slash.
                else f"https://assoce.fr/{href.lstrip('/')}"
            )

        if asso.get("nom") and len(asso.get("nom", "")) > 3:
            assos.append(asso)

    return assos


async def _fetch_assoce(query: str, commune_cfg: dict) -> list:
    """Scrape assoce.fr for the associations of the configured commune.

    Candidate URLs are tried in order (entity searches first when the
    query names an entity, then commune pages); the first page that
    mentions Bessèges and yields data wins.

    Args:
        query: Raw user query, used to extract an optional entity name.
        commune_cfg: Commune settings; reads ``nom_court`` and ``cp``.

    Returns:
        A list with at most one source dict (``domain``, ``text``, ``url``
        and, for parsed results, ``count``). Empty when nothing usable was
        found; request errors are logged and skipped.
    """
    commune = commune_cfg.get("nom_court", "Bessèges")
    cp = commune_cfg.get("cp", "30160")
    entity = _extract_entity(query)

    results = []

    # Accent-free lower-case slug ("Bessèges" -> "besseges"); works for
    # any diacritic, not only "è"/"é".
    commune_slug = "".join(
        ch for ch in unicodedata.normalize("NFKD", commune.lower())
        if not unicodedata.combining(ch)
    )

    def search_url(*terms) -> str:
        # URL-encode the terms so accents and reserved characters survive.
        return "https://assoce.fr/search?q=" + quote_plus(
            " ".join(str(t) for t in terms)
        )

    urls_to_try = []

    # A specific entity name is searched first.
    if entity:
        urls_to_try.append(search_url(entity.lower(), commune_slug))
        urls_to_try.append(search_url(entity.lower(), cp))

    # Commune pages (accent-free slug).
    urls_to_try.extend([
        f"https://assoce.fr/{commune_slug}/",
        f"https://assoce.fr/{cp}-{commune_slug}/",
        search_url(commune_slug, cp),
    ])

    async with httpx.AsyncClient(
        headers={"User-Agent": "ICAC/2.1 mairie-bot"},
        timeout=8,
        follow_redirects=True
    ) as client:

        for url in urls_to_try:
            try:
                r = await client.get(url)
                if r.status_code != 200:
                    continue

                soup = BeautifulSoup(r.text, "html.parser")
                text = soup.get_text(" ", strip=True)

                # Strict filter: the page must mention Bessèges.
                page_has_besseges = any(x in text for x in [
                    "Bessèges", "BESSÈGES", "BESSEGES",
                    "30160", "30034"
                ])
                if not page_has_besseges:
                    log.info(
                        "assoce.fr: page sans Bessèges ignorée → %s",
                        url
                    )
                    continue

                # Parse the records with the geographic filter.
                assos = _parse_assoce_page(soup, entity, commune, cp)

                if assos:
                    lines = [
                        f"■ {a['nom']}\n"
                        f"  RNA      : {a.get('rna', 'N/A')}\n"
                        f"  Objet    : {a.get('objet', 'N/A')}\n"
                        f"  Dirigeant: {a.get('dirigeant', 'N/A')}\n"
                        f"  Statut   : {a.get('statut', 'active')}\n"
                        f"  URL      : {a.get('url_fiche', '')}"
                        for a in assos
                    ]
                    results.append({
                        "domain": "assoce.fr (WALDEC)",
                        "text": (
                            # Use the configured commune name rather than
                            # a hard-coded one.
                            f"{len(assos)} associations à "
                            f"{commune} ({cp}) :\n\n"
                            + "\n\n".join(lines)
                        ),
                        "url": url,
                        "count": len(assos)
                    })
                    log.info(
                        "assoce.fr OK: %d assos Bessèges via %s",
                        len(assos), url
                    )
                    break

                # Nothing parsed but the page is relevant: send raw text.
                if len(text) > 200:
                    results.append({
                        "domain": "assoce.fr (WALDEC)",
                        "text": text[:4000],
                        "url": url
                    })
                    log.info("assoce.fr texte brut: %dc", len(text))
                    break

            except Exception as e:
                # Best effort per URL: log and try the next candidate.
                log.warning("_fetch_assoce %s: %s", url, e)

    if not results:
        log.warning("assoce.fr: aucun résultat pour Bessèges")
    return results


# Capitalised person name for record pages ("Jean Dupont",
# "Jean-Pierre DUPONT"): every word must START with an upper-case letter.
_FICHE_UP = "A-ZÀÂÉÈÊËÎÏÔÙÛÜÇ"
_FICHE_LOW = "a-zàâéèêëîïôùûüç"
_FICHE_WORD = (
    rf"[{_FICHE_UP}][{_FICHE_UP}{_FICHE_LOW}]+"
    rf"(?:-[{_FICHE_UP}][{_FICHE_UP}{_FICHE_LOW}]+)*"
)
_FICHE_PERSON = rf"({_FICHE_WORD}(?:\s+{_FICHE_WORD})+)"


def _parse_assoce_fiche(soup) -> dict:
    """Parse a single assoce.fr/waldec/WXXX/ record page.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        A dict holding whichever of ``nom``, ``rna``, ``adresse``,
        ``objet`` and ``dirigeant`` could be extracted; possibly empty.
    """
    text = soup.get_text(" ", strip=True)
    asso = {}

    # Title: first <h1>.
    h1 = soup.find("h1")
    if h1:
        asso["nom"] = h1.get_text(strip=True)

    # RNA identifier found in the page text.
    rna_m = re.search(r'W\d{9,10}', text)
    if rna_m:
        asso["rna"] = rna_m.group(0)

    # Commune / head office: text up to a "(NNNNN)" code or "Domaines".
    for label in ["Commune", "Siège"]:
        m = re.search(rf"{label}\s+(.+?)(?:\s*\(\d{{5}}\)|\s*Domaines)", text)
        if m:
            asso["adresse"] = m.group(1).strip()[:100]
            break

    # Purpose / activity, delimited by the next known section label.
    m = re.search(r"(?:Activité|Mission|Objet)\s+(.{10,300}?)(?:\s+Revendiquez|\s+Commune|\s+Contact)",
                  text, re.DOTALL)
    if m:
        asso["objet"] = m.group(1).strip()[:200]

    # Leader: only the label is case-insensitive. A global IGNORECASE flag
    # would also relax the capitalised-name classes and capture ordinary
    # phrases ("de la commune ...") as a person name.
    for label in ["Président", "Dirigeant", "Représentant"]:
        m = re.search(
            rf"(?i:{label})\s*:?\s*{_FICHE_PERSON}",
            text
        )
        if m:
            asso["dirigeant"] = m.group(1).strip()
            break

    return asso


async def _fetch_assoce_direct(rna: str) -> dict:
    """Fetch the assoce.fr record of one association from its RNA number.

    Intended for cases where the RNA is already known from another source.
    Returns a source dict (``domain``, ``text``, ``url``), or an empty dict
    when the page is not served with status 200, nothing could be parsed,
    or an exception occurred (logged as a warning).
    """
    fiche_url = f"https://assoce.fr/waldec/{rna}/"
    http_options = {
        "headers": {"User-Agent": "Mozilla/5.0 (compatible; ICAC/1.0)"},
        "timeout": 6,
        "follow_redirects": True,
    }
    try:
        async with httpx.AsyncClient(**http_options) as session:
            reply = await session.get(fiche_url)
            if reply.status_code != 200:
                return {}

            fiche = _parse_assoce_fiche(BeautifulSoup(reply.text, "html.parser"))
            if not fiche:
                return {}

            summary = "\n".join((
                f"FICHE WALDEC : {fiche.get('nom', 'N/A')}",
                f"  RNA      : {fiche.get('rna', rna)}",
                f"  Objet    : {fiche.get('objet', 'N/A')}",
                f"  Adresse  : {fiche.get('adresse', 'N/A')}",
                f"  Dirigeant: {fiche.get('dirigeant', 'N/A')}",
            ))
            return {
                "domain": f"assoce.fr/waldec/{rna}",
                "text":   summary,
                "url":    fiche_url
            }
    except Exception as exc:
        log.warning("assoce direct %s: %s", rna, exc)
        return {}


async def _fetch_annuaire_entreprises(query: str, commune: str, dept: str) -> dict:
    """Search annuaire-entreprises.data.gouv.fr for a company.

    The search term is the entity extracted from *query*, or the first 40
    characters of the query. Both the term and *dept* are URL-encoded,
    because the raw query may contain reserved characters (``&``, ``#``,
    ``?``...) that would otherwise corrupt the request.

    Args:
        query: Raw user query.
        commune: Unused; kept for signature compatibility with callers.
        dept: Department code sent as ``cp_dep``.

    Returns:
        A source dict (``domain``, ``text``, ``url``) when the page yields
        more than 80 characters of text, otherwise an empty dict.
    """
    name = _extract_entity(query) or query[:40]
    url = (
        "https://annuaire-entreprises.data.gouv.fr"
        f"/rechercher?terme={quote_plus(name)}"
        f"&cp_dep={quote_plus(str(dept))}"
    )
    txt = await _scrape_clean(url, max_chars=2500)
    if txt and len(txt) > 80:
        return {
            "domain": "annuaire-entreprises.data.gouv.fr",
            "text":   txt,
            "url":    url
        }
    return {}


async def _fetch_pappers(query: str, commune: str) -> dict:
    """Search pappers.fr for enriched company-officer information.

    The search term is the entity extracted from *query*, or the first 40
    characters of the query. The term and the city are URL-encoded so that
    accents and reserved characters cannot corrupt the request.

    Returns:
        A source dict (``domain``, ``text``, ``url``) when the page yields
        more than 100 characters of text, otherwise an empty dict.
    """
    name = _extract_entity(query) or query[:40]
    url = (
        f"https://www.pappers.fr/recherche"
        f"?q={quote_plus(name)}"
        f"&ville={quote_plus(commune)}"
    )
    txt = await _scrape_clean(url, max_chars=2000)
    if txt and len(txt) > 100:
        return {
            "domain": "pappers.fr",
            "text":   txt,
            "url":    url
        }
    return {}


async def _fetch_elus(insee: str) -> dict:
    """Scrape the municipal council page of besseges.fr.

    *insee* is accepted but not used: the page URL is fixed. Returns a
    source dict when more than 80 characters were scraped, else ``{}``.
    """
    council_url = "https://www.besseges.fr/conseil-municipal/"
    scraped = await _scrape_clean(council_url, max_chars=2000)
    if not scraped or len(scraped) <= 80:
        return {}
    return {
        "domain": "besseges.fr/élus",
        "text":   scraped,
        "url":    council_url
    }


async def _fetch_aides_territoires(insee: str) -> dict:
    """Scrape the aides-territoires.beta.gouv.fr listing for a perimeter.

    *insee* is passed as the ``perimeter`` query parameter. Returns a
    source dict when more than 100 characters were scraped, else ``{}``.
    """
    listing_url = f"https://aides-territoires.beta.gouv.fr/aides/?perimeter={insee}"
    scraped = await _scrape_clean(listing_url, max_chars=2000)
    if not scraped or len(scraped) <= 100:
        return {}
    return {
        "domain": "aides-territoires.beta.gouv.fr",
        "text":   scraped,
        "url":    listing_url
    }


async def _fetch_boamp(commune: str) -> dict:
    """Search BOAMP (public procurement notices) for the commune.

    The commune name is URL-encoded so that accents, apostrophes and other
    reserved characters reach the search engine intact.

    Returns:
        A source dict (``domain``, ``text``, ``url``) when the page yields
        more than 100 characters of text, otherwise an empty dict.
    """
    url = (
        f"https://www.boamp.fr/pages/recherche/"
        f"?terms={quote_plus(commune)}"
    )
    txt = await _scrape_clean(url, max_chars=2000)
    if txt and len(txt) > 100:
        return {
            "domain": "boamp.fr",
            "text":   txt,
            "url":    url
        }
    return {}


async def _fetch_besseges_page(url: str) -> dict:
    """Scrape one specific page, reported under the besseges.fr domain.

    Returns a source dict when more than 50 characters were scraped,
    otherwise an empty dict.
    """
    scraped = await _scrape_clean(url, max_chars=2000)
    if not scraped or len(scraped) <= 50:
        return {}
    return {
        "domain": "besseges.fr",
        "text":   scraped,
        "url":    url
    }


# ════════════════════════════════════════════════════════════
#  FONCTION CENTRALE — fetch_for_query()
#  Appelée pour TOUTE requête sans exception
# ════════════════════════════════════════════════════════════

async def fetch_for_query(query: str, commune_cfg: dict) -> list:
    """Run the systematic web lookup for a query.

    The commune website is always searched; other sources are added when
    the lower-cased query contains one of the topic keywords (plain
    substring match). Local CSV/document lookups run synchronously first,
    then all web tasks run concurrently.

    Args:
        query: Raw user query.
        commune_cfg: Commune settings; reads ``insee``, ``nom_court`` and
            ``departement`` here and is forwarded to some fetchers.

    Returns:
        A list of source dicts (each with at least ``domain``, ``text``
        and ``url``). Failed tasks are logged and skipped.
    """
    sources = []
    q_low = query.lower()
    insee = commune_cfg.get("insee", "30034")
    commune = commune_cfg.get("nom_court", "Bessèges")
    # str() guards against a numeric department code in the config.
    dept = str(commune_cfg.get("departement", "30"))[:2]

    def mentions(keywords) -> bool:
        return any(k in q_low for k in keywords)

    subventions_added = False

    def add_local_subventions() -> None:
        # Both the association and the subvention branches may ask for the
        # local subventions document; look it up and add it only once.
        nonlocal subventions_added
        if subventions_added:
            return
        subventions_added = True
        from agents_python.csv_subventions import search_subventions
        sub_text = search_subventions(query)
        if sub_text:
            sources.append({
                "domain": "Subventions 2025 (document officiel)",
                "text": sub_text,
                "url": "data/documents/Tableau des Subventions 2025.pdf"
            })

    tasks = []

    # ── ALWAYS: official commune website ────────────────
    tasks.append(_fetch_besseges_search(query, commune))

    # ── ASSOCIATIONS ────────────────────────────────────
    if mentions([
        "association", "club", "comité", "président",
        "bureau", "siège", "bénévole", "loi 1901",
        "perséphone", "jardins", "sport", "culture",
        "musique", "théâtre", "foot", "tennis",
        "chasse", "pêche", "gym", "danse",
        "vie associative", "adhérent",
        "dirige", "préside", "félin", "maison",
    ]):
        tasks.append(_fetch_rna_besseges(commune_cfg))
        tasks.append(_fetch_jo_association(query, commune))
        tasks.append(_fetch_assoce(query, commune_cfg))
        # Municipal subventions when the query talks about amounts.
        if mentions(["subvention", "versé", "montant", "combien"]):
            add_local_subventions()

    # ── COMPANIES / OFFICERS ────────────────────────────
    if mentions([
        "entreprise", "société", "gérant", "dirigeant",
        "patron", "commerce", "artisan", "boulangerie",
        "restaurant", "garage", "pharmacie", "médecin",
        "notaire", "avocat", "siren", "siret",
    ]):
        # 1. Local CSV lookup first (no network round-trip).
        from agents_python.csv_entreprises import search_entreprises
        csv_text = search_entreprises(query)
        if csv_text:
            sources.append({
                "domain": "CSV local entreprises Bessèges",
                "text":   csv_text,
                "url":    "data/documents/besseges_entreprises.csv"
            })

        # 2. Web sources as a complement.
        tasks.append(_fetch_annuaire_entreprises(query, commune, dept))
        tasks.append(_fetch_pappers(query, commune))

    # ── ELECTED OFFICIALS / PUBLIC FIGURES ──────────────
    if mentions([
        "maire", "conseiller", "élu", "adjoint",
        "délégué", "président", "vice", "commission",
        "conseil municipal",
    ]):
        # _fetch_elus already scrapes the conseil-municipal page; a second
        # scrape of the same URL only duplicated the text in the sources.
        tasks.append(_fetch_elus(insee))

    # ── SUBVENTIONS ─────────────────────────────────────
    if mentions([
        "subvention", "aide", "financement", "detr",
        "dsil", "dotation", "fonds", "appel à projet",
        "europe", "région",
    ]):
        # Local 2025 subventions document (skipped if already added).
        add_local_subventions()
        tasks.append(_fetch_aides_territoires(insee))

    # ── PUBLIC PROCUREMENT ──────────────────────────────
    if mentions([
        "marché", "appel d'offre", "boamp",
        "attributaire", "prestataire",
    ]):
        tasks.append(_fetch_boamp(commune))

    # Run every web task concurrently; exceptions come back as values.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    for r in results:
        if isinstance(r, Exception):
            log.warning("fetch_for_query task error: %s", r)
            continue
        if r and isinstance(r, dict) and r.get("text"):
            sources.append(r)
        elif r and isinstance(r, list):
            # Some fetchers return a list of source dicts.
            sources.extend(
                [x for x in r if x and x.get("text")]
            )

    log.info(
        "fetch_for_query → %d sources pour '%s'",
        len(sources), query[:60]
    )
    return sources


# ════════════════════════════════════════════════════════════
#  CLASSE DataFetcher (API structurées — conservée)
# ════════════════════════════════════════════════════════════

class DataFetcher:
    """Fetch context data from official API sources for LLM enrichment.

    Every ``_fetch_*`` coroutine returns a plain-text block (the text itself
    is in French, it is meant to be injected into a prompt) or ``""`` when
    nothing could be retrieved. Non-empty results are persisted through the
    module-level file cache (``_cache_get`` / ``_cache_set``) and reused for
    the per-source TTL declared in ``self.sources``.
    """

    def __init__(self):
        # Per-source status flag and cache time-to-live in seconds
        # (compared against time.time() deltas by _cache_get).
        self.sources = {
            "data.gouv.fr": {"status": "active", "cache_ttl": 86400},
            "sirene": {"status": "active", "cache_ttl": 86400},
            "aides-territoires": {"status": "active", "cache_ttl": 3600},
            "jo-associations": {"status": "active", "cache_ttl": 86400},
        }
        log.info("DataFetcher initialized — %d sources active", len(self.sources))

    async def get_context(self, query: str, insee: str, category: str) -> str:
        """Return the structured-API context matching *category*.

        The management accounts ("comptes de gestion") are only injected for
        the financial categories; ``fetch_for_query()`` is responsible for
        the web context of every other category.

        Args:
            query: Raw user query (only used by the subventions context).
            insee: INSEE code of the commune.
            category: One of ``"subventions"``, ``"finances"``,
                ``"depenses"``, ``"entreprises"``, ``"associations"``; any
                other value yields an empty context.

        Returns:
            The non-empty context parts joined by a blank line, or ``""``.
        """
        context_parts = []

        if category == "subventions":
            context_parts.append(await self._fetch_subventions(insee, query))
        elif category in ("finances", "depenses"):
            # Only financial requests receive the management accounts.
            context_parts.append(await self._fetch_comptes(insee))
        elif category == "entreprises":
            context_parts.append(await self._fetch_entreprises(insee))
        elif category == "associations":
            context_parts.append(await self._fetch_associations(insee))
        # marches, deliberations, general -> no management accounts here;
        # the relevant context comes from fetch_for_query().

        return "\n\n".join([p for p in context_parts if p])

    @staticmethod
    def _query_digest(query: str) -> str:
        """Return a short digest of *query* that is stable across processes.

        The builtin ``hash()`` is salted per interpreter run for ``str``, so
        it cannot be used to name cache files that must survive a restart.
        """
        import hashlib

        return hashlib.sha256(query.encode("utf-8")).hexdigest()[:12]

    @staticmethod
    def _search_url(
        code_commune: str,
        nature_juridique: Optional[str] = None,
        per_page: int = 25,
    ) -> str:
        """Build a recherche-entreprises search URL for one commune."""
        url = (
            "https://recherche-entreprises.api.gouv.fr/search"
            f"?code_commune={code_commune}"
        )
        if nature_juridique:
            url += f"&nature_juridique={nature_juridique}"
        return f"{url}&page=1&per_page={per_page}&mtm_campaign=icac"

    @staticmethod
    def _summarize_comptes(records: list) -> tuple:
        """Aggregate balance records per accounting class.

        Returns:
            ``(classes, dette)`` where ``classes`` maps the first character
            of each account number (``"?"`` when the account is missing) to
            the summed ``deb`` / ``cre`` / ``sd`` / ``sc`` amounts, and
            ``dette`` is the summed ``sc`` of the accounts starting with
            ``"16"``.
        """
        classes = {}
        dette = 0
        for rec in records:
            # ``or ""`` also covers an explicit JSON null account number.
            compte = rec.get("compte") or ""
            bucket = classes.setdefault(
                compte[0] if compte else "?",
                {"deb": 0, "cre": 0, "sd": 0, "sc": 0},
            )
            bucket["deb"] += rec.get("obnetdeb", 0) or 0
            bucket["cre"] += rec.get("obnetcre", 0) or 0
            bucket["sd"] += rec.get("sd", 0) or 0
            bucket["sc"] += rec.get("sc", 0) or 0
            if compte.startswith("16"):
                dette += rec.get("sc", 0) or 0
        return classes, dette

    @staticmethod
    def _format_comptes(records: list, annee: int, insee: str, siren: str) -> str:
        """Render the accounts summary text for one fiscal year."""
        classes, dette = DataFetcher._summarize_comptes(records)

        dep_fonct = classes.get("6", {}).get("deb", 0)
        rec_fonct = classes.get("7", {}).get("cre", 0)
        epargne = rec_fonct - dep_fonct
        inv_deb = classes.get("2", {}).get("deb", 0)
        inv_cre = classes.get("2", {}).get("cre", 0)
        tresorerie = classes.get("5", {}).get("sd", 0)

        header = f"""COMPTES DE GESTION {annee} — COMMUNE INSEE {insee} (SIREN {siren})
Source: DGFiP via data.economie.gouv.fr
Nombre de comptes: {len(records)}

FONCTIONNEMENT:
  Dépenses (classe 6): {dep_fonct:,.2f} €
  Recettes (classe 7):  {rec_fonct:,.2f} €
  Épargne brute:        {epargne:,.2f} €

INVESTISSEMENT:
  Dépenses (classe 2): {inv_deb:,.2f} €
  Recettes (classe 2): {inv_cre:,.2f} €

DETTE:
  Encours dette (c.16): {dette:,.2f} €
  Trésorerie (classe 5 sd): {tresorerie:,.2f} €

CLASSES COMPTABLES:"""
        detail = "".join(
            f"\n  Classe {cls_num}: débit={classes[cls_num]['deb']:,.2f}€"
            f"  crédit={classes[cls_num]['cre']:,.2f}€"
            for cls_num in sorted(classes)
        )
        return header + detail

    async def _fetch_comptes(self, insee: str) -> str:
        """Fetch the commune's management accounts from data.economie.gouv.fr.

        Tries the 2023 dataset first, then 2022, and returns the summary of
        the first year that yields records. Returns ``""`` when the INSEE
        code has no known SIREN or when no year could be retrieved.
        """
        cache_key = f"comptes_{insee}"
        cached = _cache_get(cache_key, self.sources["data.gouv.fr"]["cache_ttl"])
        if cached:
            log.info("comptes(%s) — cache hit", insee)
            return cached

        siren = INSEE_SIREN.get(insee, "")
        if not siren:
            log.warning("No SIREN mapping for INSEE %s", insee)
            return ""

        for annee in (2023, 2022):
            dataset = f"balances-comptables-des-communes-en-{annee}"
            api_url = (
                f"https://data.economie.gouv.fr/api/explore/v2.1/catalog/datasets/{dataset}/records"
                f"?where=siren%3D%22{siren}%22%20AND%20cbudg%3D1"
                f"&select=compte,obnetdeb,obnetcre,sd,sc"
                f"&limit=100"
            )

            try:
                async with httpx.AsyncClient(timeout=5.0) as client:
                    resp = await client.get(api_url)
                    resp.raise_for_status()
                    records = resp.json().get("results", [])

                if not records:
                    continue

                text = self._format_comptes(records, annee, insee, siren)
                log.info("comptes(%s) — %d records for %d", insee, len(records), annee)
                _cache_set(cache_key, text)
                return text

            except Exception as e:
                # Best effort: any failure for one year falls back to the
                # next (older) dataset instead of aborting the request.
                log.warning("DGFiP fetch %d failed: %s", annee, e)
                continue

        log.error("No DGFiP data found for %s", insee)
        return ""

    async def _fetch_subventions(self, insee: str, query: str) -> str:
        """Build the subventions context for a commune.

        The text is a static list of funding schemes followed, when
        available, by the management accounts summary. *query* only
        contributes to the cache key; it does not change the content.
        """
        # Stable digest (not hash()) so cached files are found again after
        # a process restart.
        cache_key = f"subv_{insee}_{self._query_digest(query)}"
        cached = _cache_get(cache_key, self.sources["aides-territoires"]["cache_ttl"])
        if cached:
            log.info("subventions(%s) — cache hit", insee)
            return cached

        comptes_ctx = await self._fetch_comptes(insee)

        text = f"""CONTEXTE SUBVENTIONS — COMMUNE INSEE {insee}
Strate: < 5 000 habitants | Département: Gard (30) | Région: Occitanie

DISPOSITIFS PRINCIPAUX ACCESSIBLES:
- DETR (Dotation d'Équipement des Territoires Ruraux): communes < 2000 hab ou arrondissement éligible
- DSIL (Dotation de Soutien à l'Investissement Local): projets structurants
- FCTVA (Fonds de Compensation de la TVA): remboursement TVA sur investissements
- Fonds Vert: transition écologique (rénovation énergétique, biodiversité)
- DPV (Dotation Politique de la Ville): si quartiers prioritaires
- Aides Région Occitanie: contrats territoriaux, rénovation thermique
- Aides Département Gard: voirie, bâtiments communaux, culture

CALENDRIER INDICATIF:
- DETR/DSIL: dépôt dossiers sept-nov pour l'année suivante
- Fonds Vert: appels à projets continus
- Aides régionales: selon programmes en cours"""

        if comptes_ctx:
            text += f"\n\n{comptes_ctx}"

        log.info("subventions(%s) — static context + comptes", insee)
        _cache_set(cache_key, text)
        return text

    async def _fetch_entreprises(self, insee: str) -> str:
        """Fetch the first 25 companies of the commune from recherche-entreprises.

        Returns ``""`` when the API yields no result or the request fails.
        """
        cache_key = f"entreprises_{insee}"
        cached = _cache_get(cache_key, self.sources["sirene"]["cache_ttl"])
        if cached:
            log.info("entreprises(%s) — cache hit", insee)
            return cached

        # The commune code expected by the API may differ from the INSEE code.
        sirene_commune = INSEE_TO_SIRENE_COMMUNE.get(insee, insee)
        api_url = self._search_url(sirene_commune)

        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(api_url)
                resp.raise_for_status()
                data = resp.json()

            results = data.get("results", [])
            total = data.get("total_results", 0)

            if not results:
                return ""

            active_count = sum(1 for ent in results if ent.get("etat_administratif") == "A")

            header = f"""ENTREPRISES — COMMUNE INSEE {insee}
Source: recherche-entreprises.api.gouv.fr (SIRENE)
Total résultats: {total}
Établissements affichés: {len(results)} (25 premiers)
Actifs: {active_count}

DÉTAIL (25 premiers):"""

            lines = []
            for i, ent in enumerate(results[:25], 1):
                nom = ent.get("nom_complet", "?")
                siren = ent.get("siren", "?")
                etat = "Actif" if ent.get("etat_administratif") == "A" else "Fermé"
                naf = ent.get("activite_principale", "?")
                date_creation = ent.get("date_creation", "?")
                tranche = ent.get("tranche_effectif_salarie", "?")

                lines.append(f"\n  {i}. {nom}")
                lines.append(f"\n     SIREN: {siren} | NAF: {naf} | État: {etat}")
                lines.append(f"\n     Création: {date_creation} | Effectif: {tranche}")

            text = header + "".join(lines)
            log.info("entreprises(%s) — %d total, %d shown", insee, total, len(results))
            _cache_set(cache_key, text)
            return text

        except Exception as e:
            # Best effort: this context is optional for the caller.
            log.warning("SIRENE fetch failed: %s", e)
            return ""

    async def _fetch_associations(self, insee: str) -> str:
        """Fetch the commune's associations from recherche-entreprises.

        Queries legal-nature code 9220 first; when that returns nothing,
        falls back to codes 9210, 9230, 9240 and 9260 and merges their
        results. Returns ``""`` when nothing is found or the request fails.
        """
        cache_key = f"associations_{insee}"
        cached = _cache_get(cache_key, self.sources["jo-associations"]["cache_ttl"])
        if cached:
            log.info("associations(%s) — cache hit", insee)
            return cached

        sirene_commune = INSEE_TO_SIRENE_COMMUNE.get(insee, insee)
        api_url = self._search_url(sirene_commune, "9220")

        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(api_url)
                resp.raise_for_status()
                data = resp.json()

                results = data.get("results", [])
                total = data.get("total_results", 0)

                if not results:
                    # Fallback on the other legal-nature codes; a non-200
                    # answer for one code is simply skipped.
                    for nj in ["9210", "9230", "9240", "9260"]:
                        resp2 = await client.get(
                            self._search_url(sirene_commune, nj, per_page=10)
                        )
                        if resp2.status_code == 200:
                            d2 = resp2.json()
                            results.extend(d2.get("results", []))
                            total += d2.get("total_results", 0)

            if not results:
                return ""

            active = [r for r in results if r.get("etat_administratif") == "A"]

            header = f"""ASSOCIATIONS — COMMUNE INSEE {insee}
Source: RNA via recherche-entreprises.api.gouv.fr
Total: {total} | Actives: {len(active)}

LISTE (25 premières):"""

            lines = []
            for i, asso in enumerate(results[:25], 1):
                nom = asso.get("nom_complet", "?")
                siren = asso.get("siren", "?")
                etat = "Active" if asso.get("etat_administratif") == "A" else "Dissoute"
                date_creation = asso.get("date_creation", "?")
                naf = asso.get("activite_principale", "?")

                lines.append(f"\n  {i}. {nom}")
                lines.append(f"\n     SIREN: {siren} | État: {etat} | Création: {date_creation}")
                lines.append(f"\n     Activité (NAF): {naf}")

            text = header + "".join(lines)
            log.info("associations(%s) — %d total", insee, total)
            _cache_set(cache_key, text)
            return text

        except Exception as e:
            # Best effort: this context is optional for the caller.
            log.warning("Associations fetch failed: %s", e)
            return ""


# ════════════════════════════════════════════════════════════
#  FONCTIONS LEGACY (conservées pour compatibilité)
# ════════════════════════════════════════════════════════════

async def fetch_association_rna(name: str, commune_insee: str) -> dict:
    """Look up one association by name via recherche-entreprises.api.gouv.fr.

    The search is restricted to the given commune and to legal-nature code
    9220; only the first hit is used.

    Args:
        name: Association name, sent as the free-text ``q`` parameter.
        commune_insee: INSEE code of the commune (translated through
            ``INSEE_TO_SIRENE_COMMUNE`` when a mapping exists).

    Returns:
        A dict with keys ``found``, ``name``, ``rna``, ``objet``,
        ``adresse``, ``president``, ``date_creation`` and ``source``.
        ``found`` stays ``False`` when there is no hit or the request
        fails. ``objet`` and ``president`` are never populated by this
        function and remain ``None``.
    """
    result = {
        "found":         False,
        "name":          name,
        "rna":           None,
        "objet":         None,
        "adresse":       None,
        "president":     None,
        "date_creation": None,
        "source":        "RNA/JO"
    }
    try:
        sirene_commune = INSEE_TO_SIRENE_COMMUNE.get(commune_insee, commune_insee)
        # Let httpx encode the query string: a raw name containing "&", "#"
        # or "+" would otherwise corrupt the request.
        params = {
            "q": name,
            "code_commune": sirene_commune,
            "nature_juridique": "9220",
            "page": 1,
            "per_page": 5,
        }
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://recherche-entreprises.api.gouv.fr/search",
                params=params,
            )
            if resp.status_code == 200:
                results = resp.json().get("results", [])
                if results:
                    asso = results[0]
                    result["found"] = True
                    result["name"] = asso.get("nom_complet", name)
                    # ``or {}`` also covers an explicit JSON null value.
                    complements = asso.get("complements") or {}
                    result["rna"] = complements.get("identifiant_association")
                    result["date_creation"] = asso.get("date_creation")
                    siege = asso.get("siege") or {}
                    if siege:
                        result["adresse"] = siege.get("adresse_complete")
    except Exception as e:
        # Best effort: callers receive the default (not found) result.
        log.warning("RNA fetch %s: %s", name, e)
    return result


async def fetch_web_fallback(query: str) -> list:
    """Legacy web fallback, kept only for backward compatibility.

    Superseded by ``fetch_for_query()``: this wrapper simply forwards
    *query* to it with the hard-coded Bessèges commune configuration and
    returns whatever it yields.
    """
    besseges_cfg = dict(
        insee="30034",
        nom_court="Bessèges",
        cp="30160",
        departement="30",
    )
    sources = await fetch_for_query(query, besseges_cfg)
    return sources