#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
[CYBER-STRAT] Fetcher de sources medias
Recherche web (Serper API, ex-DuckDuckGo HTML) + extraction de contenu d'articles
Agent responsable : DATA_SCRAPER
"""

import urllib.request
import urllib.parse
import http.cookiejar
import json
import re
import os


# Liste blanche des domaines medias autorises (prompt_architect_v1.1.md)
# v4.6 : ajout sources academiques, medicales, blogs publics
# v4.9b : +18 domaines (radio, presse specialisee/regionale, associatif)
MEDIA_WHITELIST = [
    # Presse nationale
    "lemonde.fr", "lefigaro.fr", "liberation.fr", "rfi.fr", "france24.com",
    "tv5monde.com", "lepoint.fr", "lexpress.fr", "mediapart.fr",
    "courrierinternational.com", "bfmtv.com", "franceinfo.fr", "20minutes.fr",
    # v4.9b — Presse specialisee
    "strategies.fr", "lesinrocks.com", "telerama.fr", "nouvelobs.com",
    # v4.9b — Presse regionale
    "leparisien.fr", "ouest-france.fr", "sudouest.fr",
    "lagazettedescommunes.com",
    # v4.9b — Radio / podcasts
    "radiofrance.fr", "franceculture.fr", "franceinter.fr",
    # Sources academiques FR
    "cairn.info", "persee.fr", "hal.science",
    "hal.archives-ouvertes.fr", "erudit.org",
    # Sources medicales / psychiatrie / sante
    "collectifpsychiatrie.fr", "psycom.org", "santementale.fr",
    # Blogs publics Mediapart (pas de paywall)
    "blogs.mediapart.fr", "club.mediapart.fr",
    # Culture / livres
    "babelio.com", "decitre.fr",
    # v4.9b — Associatif / numerique
    "ffdn.org", "laquadrature.net", "nextinpact.com", "numerama.com",
    # v4.9d — Profils biographiques (numerique/militant/entrepreneur)
    "about.me", "f6s.com", "muckrack.com", "crunchbase.com", "github.com",
]

# Domaines rejetes (Wikipedia deja traite, reseaux sociaux, forums, e-commerce)
DOMAIN_BLACKLIST = [
    "wikipedia.org", "facebook.com", "twitter.com", "x.com", "instagram.com",
    "linkedin.com", "reddit.com", "youtube.com", "tiktok.com",
    "amazon.fr", "ebay.fr", "forum"
]

MAX_CHARS_PER_ARTICLE = 3000
FETCH_TIMEOUT = 8

# User-Agent realiste (regle data_scraper.md)
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)


# Themes declencheurs Reflets.info (cybersecurite, influence, hackers...)
REFLETS_THEMES = [
    "cybersecurite", "cybersécurité", "cyber securite", "cyber sécurité",
    "cyberattaque", "cyberguerre", "cyber guerre", "cyberdefense",
    "influence", "propagande", "desinformation", "désinformation",
    "chiffrement", "encryption", "cryptographie",
    "hacker", "hackers", "hacking", "piratage",
    "ransomware", "malware", "phishing",
    "surveillance", "vie privee", "vie privée",
    "darknet", "dark web", "pegasus", "predator", "nso group"
]


# Requetes declenchant Reflets.info en PRIORITE 1 (noms propres, pseudos)
REFLETS_PRIORITY_QUERIES = {
    "drapher", "kiketoa", "bluetouff", "shaman",
    "fabrice epelboin", "le pistolet et la pioche",
    "radio reflets", "antoine champagne", "yovan menkevick",
    "reflets.info", "reflets info"
}


def is_reflets_relevant(query):
    """Verifier si la requete correspond aux themes Reflets.info"""
    q = query.lower()
    return any(theme in q for theme in REFLETS_THEMES)


def is_reflets_priority(query):
    """Verifier si la requete correspond a un nom/pseudo prioritaire Reflets.
    Detection insensible a la casse, partielle (mot dans la requete).
    """
    q = query.lower().strip()
    # Match exact
    if q in REFLETS_PRIORITY_QUERIES:
        return True
    # Match partiel : un des termes est contenu dans la requete
    return any(r in q for r in REFLETS_PRIORITY_QUERIES)


def _log(level, message):
    """Log prefixe pour le module media"""
    print(f"[CYBER-STRAT][MEDIA][{level}] {message}")


def _strip_accents(text):
    """Retirer les accents d'un texte (ex: 'hérard' → 'herard')."""
    import unicodedata
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))


def _extract_lastname_mf(name):
    """v4.9b — Extraire le nom de famille (gere particules).
    Version locale media_fetcher (miroir de server._extract_lastname).
    'Marie Le Boiteux' → 'le boiteux'
    'Fabrice Epelboin'  → 'epelboin'
    """
    _PARTICLES = {
        "le", "la", "de", "du", "des",
        "von", "van", "di", "el", "al", "ben", "ibn",
    }
    parts = name.strip().split()
    if len(parts) < 2:
        return _strip_accents(name.strip().lower())
    lastname_parts = [parts[-1]]
    for i in range(len(parts) - 2, 0, -1):
        if parts[i].lower() in _PARTICLES:
            lastname_parts.insert(0, parts[i])
        else:
            break
    return _strip_accents(" ".join(lastname_parts).lower())


def _normalize_url(url):
    """v4.9b — Normalise une URL pour deduplication.
    Supprime : protocole, www., trailing /, tracking params.

    https://www.pascalherard.fr/  → pascalherard.fr
    http://pascalherard.fr        → pascalherard.fr
    https://pascalherard.fr?utm=x → pascalherard.fr
    """
    url = url.lower().strip()
    url = re.sub(r'^https?://', '', url)
    url = re.sub(r'^www\.', '', url)
    # Retirer les parametres de tracking AVANT le trailing /,
    # pour que "exemple.fr/?utm=x" donne "exemple.fr" et non "exemple.fr/"
    url = re.sub(r'\?utm[^#]*', '', url)
    url = re.sub(r'\?ref=[^#]*', '', url)
    url = url.rstrip('/')
    return url
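
# Exemples illustratifs (URLs hypothetiques) : trois ecritures d'une meme
# adresse convergent vers la meme cle de deduplication.
#   _normalize_url("https://www.pascalherard.fr/")           -> "pascalherard.fr"
#   _normalize_url("http://pascalherard.fr")                  -> "pascalherard.fr"
#   _normalize_url("https://pascalherard.fr?utm_source=abc")  -> "pascalherard.fr"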


# ── v4.9f : Detection paywall ─────────────────────────────────────────

_PAYWALL_PATTERNS = [
    "reserve aux abonnes",
    "article reserve",
    "acces reserve",
    "abonnez-vous",
    "pour lire cet article",
    "lire sur un seul appareil",
    "lecture du monde en cours sur un autre appareil",
    "contenu premium",
    "offre d'abonnement",
    "decouvrez nos offres",
    "connectez-vous pour acceder",
    "cree ton compte pour lire",
    "vous devez etre connecte",
    "already a subscriber",
    "subscribe to read",
    "premium content",
    "vous pouvez lire le monde",
    "cet article est reserve",
]


def _detect_paywall(text):
    """v4.9f — Detecte si le texte scrape est un message de paywall.
    Retourne True si paywall detecte.
    """
    if not text or len(text.strip()) < 100:
        return True
    text_lower = _strip_accents(text.lower())
    paywall_hits = sum(
        1 for p in _PAYWALL_PATTERNS if p in text_lower)
    # 2+ patterns ET texte court → paywall
    if paywall_hits >= 2 and len(text.strip()) < 500:
        return True
    # 1+ pattern ET texte tres court → paywall
    if paywall_hits >= 1 and len(text.strip()) < 200:
        return True
    return False
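
# Esquisse d'utilisation (textes hypothetiques) : un bandeau d'abonnement
# cumule plusieurs motifs sur un texte court et doit etre ecarte.
#   bandeau = ("Cet article est réservé aux abonnés du journal. Abonnez-vous "
#              "pour lire cet article en illimité et découvrez nos offres.")
#   _detect_paywall(bandeau)  -> True   (motifs multiples, texte court)
#   _detect_paywall("")       -> True   (texte vide ou < 100 caracteres)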


# ── v4.9f : Domaines bloques (scraping impossible) ────────────────────

_BLOCKED_SCRAPING_DOMAINS = [
    "cairn.info",       # 403 systematique (protection anti-scraping)
    "jstor.org",        # paywall academique
]


def _should_skip_scraping(url):
    """v4.9f — Verifie si le domaine est connu pour bloquer le scraping."""
    try:
        domain = urllib.parse.urlparse(url).netloc.replace("www.", "")
    except Exception:
        return False
    return any(blocked in domain for blocked in _BLOCKED_SCRAPING_DOMAINS)


# --- v4.9c : Serper API (remplace DDG Lite) ---

SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
SERPER_ENDPOINT = "https://google.serper.dev/search"
SERPER_IMG_ENDPOINT = "https://google.serper.dev/images"


def _serper_search(query, num=5, gl="fr", hl="fr"):
    """v4.9c — Recherche web via Serper API.
    Remplace _ddg_search_lite / _parse_ddg_lite.
    Retourne le meme format : [{"url":, "title":, "snippet":}, ...]
    """
    if not SERPER_API_KEY:
        _log("WARN", "SERPER_API_KEY manquante")
        return []

    payload = json.dumps({
        "q": query, "gl": gl, "hl": hl, "num": num,
    }).encode("utf-8")

    req = urllib.request.Request(
        SERPER_ENDPOINT,
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        results = []
        for item in data.get("organic", [])[:num]:
            url = item.get("link", "")
            if url:
                results.append({
                    "url": url,
                    "title": item.get("title", ""),
                    "snippet": item.get("snippet", ""),
                })
        _log("INFO",
            f"[SERPER] q='{query[:50]}' → {len(results)} resultats")
        return results
    except Exception as e:
        _log("ERROR", f"[SERPER] Erreur pour '{query[:50]}': {e}")
        return []
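
# Esquisse d'utilisation (requete hypothetique ; suppose la variable
# d'environnement SERPER_API_KEY definie, sinon WARN et liste vide).
#   hits = _serper_search('"Fabrice Epelboin" reflets', num=5)
#   for h in hits:
#       print(h["url"], "|", h["title"][:40])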


def _serper_image_search(query, num=10, gl="fr"):
    """v4.9c — Recherche images via Serper API.
    Retourne : [{"imageUrl":, "title":, "link":, "source":}, ...]
    """
    if not SERPER_API_KEY:
        return []

    payload = json.dumps({
        "q": query, "gl": gl, "num": num,
    }).encode("utf-8")

    req = urllib.request.Request(
        SERPER_IMG_ENDPOINT,
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        images = data.get("images", [])[:num]
        _log("INFO",
            f"[SERPER-IMG] q='{query[:50]}' → {len(images)} images")
        return images
    except Exception as e:
        _log("ERROR", f"[SERPER-IMG] Erreur: {e}")
        return []


def _is_personal_site(url, name, html_text):
    """v4.9b — Verifie qu'un site est bien le site personnel
    de la personne (pas une entreprise homonyme).

    Rejette si :
    - Le prenom+nom complets sont absents du contenu
    - Le contenu contient des marqueurs d'entreprise

    Args:
        url: URL du site
        name: nom complet de la personne
        html_text: texte extrait de la page (deja scrape)

    Retourne True si site personnel valide, False sinon.
    """
    _ENTREPRISE_MARKERS = [
        "sarl", "sas ", "eurl", "sci ",
        "btp", "construction", "devis gratuit",
        "nos réalisations", "nos realisations",
        "siret", "tva intra",
        "conditions générales", "conditions generales",
        "cgv", "société spécialisée", "societe specialisee",
        "entreprise spécialisée", "entreprise specialisee",
    ]

    text_lower = html_text.lower()[:3000]

    # Rejet si marqueur entreprise
    for marker in _ENTREPRISE_MARKERS:
        if marker in text_lower:
            _log("INFO",
                f"[SITE-PERSO] Rejete (entreprise "
                f"'{marker}'): {url[:50]}")
            return False

    # Verifier presence prenom+nom
    name_ascii = _strip_accents(name.lower())
    parts = name_ascii.split()
    firstname = parts[0] if parts else ""
    lastname = _extract_lastname_mf(name)
    text_ascii = _strip_accents(text_lower)

    has_lastname = bool(lastname and lastname in text_ascii)
    has_firstname = bool(
        firstname and len(firstname) > 2
        and firstname in text_ascii)

    if has_lastname and has_firstname:
        return True

    _log("INFO",
        f"[SITE-PERSO] Rejete ('{name}' absent): "
        f"{url[:50]}")
    return False
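
# Exemples illustratifs (contenus hypothetiques) : un portfolio qui mentionne
# prenom + nom est accepte, un site d'entreprise homonyme est rejete.
#   _is_personal_site("https://pascalherard.fr", "Pascal Hérard",
#                     "Pascal Hérard, journaliste. Portfolio et contact.")
#       -> True
#   _is_personal_site("https://herard-btp.fr", "Pascal Hérard",
#                     "Hérard BTP, SARL de construction, devis gratuit.")
#       -> False  (marqueurs entreprise : 'sarl', 'btp', 'devis gratuit')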


# ── v4.9e : Algorithme de discrimination de personnes ─────────────────

# Domaines d'activite pour clustering (mots-cles FR sans accents)
_ACTIVITY_DOMAINS = {
    "sante": [
        "psychiatre", "psychanalyste", "psychologue",
        "medecin", "chirurgien", "docteur", "pharmacien",
        "infirmier", "therapeute", "cardiologue",
        "dermatologue", "oncologue", "pediatre",
        "generaliste", "hopital", "clinique", "soignant",
    ],
    "journalisme": [
        "journaliste", "reporter", "chroniqueur",
        "editorialiste", "redacteur", "correspondant",
        "presentateur", "animateur", "redaction",
    ],
    "droit": [
        "avocat", "magistrat", "juge", "notaire",
        "juriste", "barreau", "tribunal", "juridique",
    ],
    "politique": [
        "depute", "senateur", "maire", "ministre",
        "conseiller", "politique", "assemblee",
        "parlement", "gouvernement", "secretaire d'etat",
    ],
    "recherche": [
        "chercheur", "professeur", "enseignant",
        "universitaire", "academique", "these",
        "laboratoire", "cnrs", "inria", "inserm",
        "universite", "doctorant", "maitre de conferences",
    ],
    "culture": [
        "ecrivain", "auteur", "romancier", "poete",
        "artiste", "musicien", "acteur", "comedien",
        "realisateur", "cineaste", "peintre", "sculpteur",
        "photographe", "dramaturge", "compositeur",
    ],
    "tech": [
        "ingenieur", "developpeur", "informatique",
        "startup", "entrepreneur", "cto", "ceo",
        "fondateur", "co-fondateur", "numerique",
        "digital", "software", "hacker", "cybersecurite",
    ],
    "sport": [
        "sportif", "athlete", "joueur", "entraineur",
        "champion", "competition", "football", "rugby",
        "tennis", "basket", "cyclisme", "olympique",
    ],
    "economie": [
        "economiste", "financier", "banquier", "trader",
        "directeur", "pdg", "president", "gerant",
        "manager", "consultant", "auditeur",
    ],
}

# Domaines web bruit (exclure du clustering)
_NOISE_DOMAINS = {
    "wikipedia.org", "wikidata.org", "wikimedia.org",
    "facebook.com", "twitter.com", "x.com", "instagram.com",
    "tiktok.com", "pinterest.com", "reddit.com",
    "youtube.com", "amazon.fr", "amazon.com", "fnac.com",
    "ebay.fr", "ebay.com", "leboncoin.fr",
    "societe.com", "societes.com", "verif.com", "manageo.fr",
    "pappers.fr", "infogreffe.fr", "dnb.com",
    "annuaire-entreprises.data.gouv.fr",
    "pagesjaunes.fr", "118712.fr", "copainsdavant.com",
    "entreprises.lefigaro.fr", "societe.leparisien.fr",
}

# Domaines profils / reseaux sociaux (signal fort)
_SOCIAL_DOMAINS = {
    "linkedin.com", "github.com", "about.me",
    "muckrack.com", "viadeo.com", "researchgate.net",
    "scholar.google.com", "orcid.org",
    "babelio.com", "senscritique.com",
    "imdb.com", "allocine.fr",
    "cairn.info", "theses.fr",
    "f6s.com", "crunchbase.com",
}


# ── v4.9f : Deduplication par domaine + compensation + media ──────────

_NATIONAL_MEDIA_DOMAINS = [
    # Presse ecrite
    "lemonde.fr", "lefigaro.fr", "liberation.fr", "leparisien.fr",
    "20minutes.fr", "lexpress.fr", "lepoint.fr", "nouvelobs.com",
    "mediapart.fr", "humanite.fr", "ouest-france.fr", "sudouest.fr",
    # Radio/TV
    "franceculture.fr", "radiofrance.fr", "franceinter.fr",
    "francetvinfo.fr", "franceinfo.fr", "tf1info.fr", "bfmtv.com",
    "lci.fr", "arte.tv", "rfi.fr",
    # TV / presse specialisee / tech
    "tv5monde.com", "strategies.fr", "telerama.fr",
    "numerama.com", "nextinpact.com", "zdnet.fr",
]

_DOMAIN_TO_BRAND = {
    "lemonde.fr": "lemonde",
    "lefigaro.fr": "lefigaro",
    "liberation.fr": "liberation",
    "leparisien.fr": "leparisien",
    "20minutes.fr": "20minutes",
    "lexpress.fr": "lexpress",
    "lepoint.fr": "lepoint",
    "nouvelobs.com": "nouvelobs",
    "mediapart.fr": "mediapart",
    "humanite.fr": "humanite",
    "ouest-france.fr": "ouest-france",
    "sudouest.fr": "sudouest",
    "franceculture.fr": "radiofrance",
    "radiofrance.fr": "radiofrance",
    "franceinter.fr": "radiofrance",
    "francetvinfo.fr": "francetv",
    "franceinfo.fr": "franceinfo",
    "tf1info.fr": "tf1",
    "bfmtv.com": "bfmtv",
    "lci.fr": "lci",
    "arte.tv": "arte",
    "rfi.fr": "rfi",
    "tv5monde.com": "radiofrance",
    "strategies.fr": "strategies",
    "telerama.fr": "telerama",
    "numerama.fr": "numerama",
    "nextinpact.com": "nextinpact",
    "zdnet.fr": "zdnet",
}

_FALLBACK_MEDIA_KEYWORDS = [
    "radiofrance", "lemonde", "franceinfo", "liberation",
]


def _deduplicate_by_domain(results, max_per_domain=2):
    """v4.9f — Limite a max_per_domain resultats par domaine.
    Conserve l'ordre original (premiers resultats Serper prioritaires).
    Retourne (deduplicated, removed_count).
    """
    domain_count = {}
    deduplicated = []
    removed = 0
    for r in results:
        try:
            domain = urllib.parse.urlparse(
                r.get("url", "")).netloc.replace("www.", "")
        except Exception:
            domain = ""
        count = domain_count.get(domain, 0)
        if count < max_per_domain:
            deduplicated.append(r)
            domain_count[domain] = count + 1
        else:
            removed += 1
    return deduplicated, removed
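
# Exemple illustratif (URLs hypothetiques) : avec max_per_domain=2, le
# troisieme resultat lemonde.fr est ecarte, l'ordre initial est conserve.
#   bruts = [
#       {"url": "https://www.lemonde.fr/a"},
#       {"url": "https://www.lemonde.fr/b"},
#       {"url": "https://www.lemonde.fr/c"},
#       {"url": "https://rfi.fr/d"},
#   ]
#   _deduplicate_by_domain(bruts, max_per_domain=2)
#       -> conserve /a, /b et rfi.fr/d ; removed == 1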


def _get_dominant_domain(results):
    """v4.9f — Identifie le domaine le plus frequent dans les resultats."""
    domain_counts = {}
    for r in results:
        try:
            domain = urllib.parse.urlparse(
                r.get("url", "")).netloc.replace("www.", "")
        except Exception:
            domain = ""
        domain_counts[domain] = domain_counts.get(domain, 0) + 1
    if not domain_counts:
        return ""
    return max(domain_counts, key=domain_counts.get)


def _search_with_compensation(query, initial_results,
                              max_per_domain=2, target_results=8):
    """v4.9f — Deduplique + lance une requete de compensation si besoin.
    Retourne (results, dominant_domain).
    """
    deduplicated, removed = _deduplicate_by_domain(
        initial_results, max_per_domain)

    dominant_domain = _get_dominant_domain(initial_results)

    if removed < 2 or len(deduplicated) >= target_results:
        _log("INFO",
            f"[DEDUP] {removed} doublons supprimes, "
            f"{len(deduplicated)} restants (compensation: NON)")
        return deduplicated, dominant_domain

    # Requete de compensation en excluant le domaine dominant
    comp_query = f'"{query}" -site:{dominant_domain}'
    _log("INFO",
        f"[DEDUP] {removed} doublons supprimes, "
        f"compensation: '{comp_query[:60]}'")
    extra_results = _serper_search(comp_query, num=10)

    # Combiner et re-dedupliquer
    seen_urls = {r.get("url", "") for r in deduplicated}
    for r in extra_results:
        if r.get("url", "") not in seen_urls:
            deduplicated.append(r)
            seen_urls.add(r.get("url", ""))

    final, _ = _deduplicate_by_domain(deduplicated, max_per_domain)
    _log("INFO",
        f"[DEDUP] Apres compensation: {len(final)} resultats")
    return final[:target_results], dominant_domain
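
# Esquisse d'enchainement (requete hypothetique) : deduplication puis, si au
# moins 2 doublons ont ete retires et que la cible n'est pas atteinte,
# requete Serper complementaire excluant le domaine dominant.
#   bruts = _serper_search('"Pascal Hérard"', num=10)
#   sources, dominant = _search_with_compensation(
#       "Pascal Hérard", bruts, max_per_domain=2, target_results=8)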


def _has_press_footprint(results):
    """v4.9f — Detecte si au moins 1 resultat provient d'un media national."""
    for r in results:
        try:
            domain = urllib.parse.urlparse(
                r.get("url", "")).netloc.replace("www.", "")
        except Exception:
            continue
        for media in _NATIONAL_MEDIA_DOMAINS:
            if media in domain:
                return True
    return False


def _search_national_media(query, initial_results,
                           existing_results, dominant_domain,
                           max_results=6):
    """v4.9f — Requete complementaire pour presse nationale.
    Utilise le nom de marque media comme mot-cle naturel.
    """
    # Identifier les marques media presentes dans les resultats initiaux
    detected_brands = set()
    for r in initial_results:
        try:
            domain = urllib.parse.urlparse(
                r.get("url", "")).netloc.replace("www.", "")
        except Exception:
            continue
        for media_domain, brand in _DOMAIN_TO_BRAND.items():
            if media_domain in domain:
                detected_brands.add(brand)

    # Choisir un mot-cle DIFFERENT du domaine dominant
    dominant_brand = _DOMAIN_TO_BRAND.get(dominant_domain, "")

    candidates = [b for b in detected_brands if b != dominant_brand]
    if not candidates:
        candidates = [
            b for b in _FALLBACK_MEDIA_KEYWORDS
            if b != dominant_brand
        ]

    if not candidates:
        return []

    media_keyword = candidates[0]

    # Une requete naturelle avec guillemets autour du nom
    search_query = f'"{query}" {media_keyword}'
    _log("INFO",
        f"[MEDIA-COMP] Requete media: '{search_query[:60]}'")
    media_results = _serper_search(search_query, num=10)

    # Filtrer les resultats deja presents
    seen_urls = {r.get("url", "") for r in existing_results}
    new_results = []
    for r in media_results:
        if r.get("url", "") not in seen_urls:
            new_results.append(r)
            seen_urls.add(r.get("url", ""))

    _log("INFO",
        f"[MEDIA-COMP] {len(new_results)} nouvelles sources")
    return new_results[:max_results]


_GENERIC_PLATFORM_DOMAINS = frozenset({
    "linkedin.com", "fr.linkedin.com",
    "facebook.com", "twitter.com", "x.com",
    "instagram.com", "youtube.com", "tiktok.com",
    "reddit.com", "pinterest.com",
    "amazon.fr", "amazon.com",
    "myheritage.com", "myheritage.fr",
    "copainsdavant.linternaute.com",
    "google.com", "google.fr",
})


def _is_trusted_source_domain(image_page_url, text_sources):
    """v4.9f — Verifie si le domaine de la page source de l'image
    est deja present dans les sources texte validees du pipeline.

    Exclut les plateformes generiques (LinkedIn, Facebook, etc.) qui
    hebergent des profils de personnes differentes sur le meme domaine.

    text_sources : liste d'URLs (str) ou de dicts avec cle "url".
    Retourne True si le domaine de image_page_url matche un domaine
    de text_sources ET n'est pas une plateforme generique.
    """
    if not text_sources or not image_page_url:
        return False
    try:
        img_domain = urllib.parse.urlparse(
            image_page_url).netloc.lower().replace("www.", "")
    except Exception:
        return False
    if not img_domain:
        return False
    # Exclure les plateformes generiques (multi-profils)
    if img_domain in _GENERIC_PLATFORM_DOMAINS:
        return False
    for src in text_sources:
        src_url = src.get("url", "") if isinstance(src, dict) else src
        if not src_url:
            continue
        try:
            src_domain = urllib.parse.urlparse(
                src_url).netloc.lower().replace("www.", "")
        except Exception:
            continue
        if img_domain == src_domain:
            return True
    return False
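
# Exemples illustratifs (URLs hypothetiques) :
#   textes = [{"url": "https://www.telerama.fr/article-xyz"}]
#   _is_trusted_source_domain("https://telerama.fr/portrait-photo", textes)
#       -> True   (meme domaine qu'une source texte validee)
#   _is_trusted_source_domain("https://fr.linkedin.com/in/qqn", textes)
#       -> False  (plateforme generique exclue)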


def _validate_portrait_name(filename_or_url, firstname, lastname):
    """v4.9f — Verifie qu'un portrait correspond a la bonne personne.
    Gere les noms composes (Le Boiteux), les prenoms composes
    (Marie-Therese), et les prenoms intercales (herve_elie-bokobza).
    v4.9f CORRECTION 2 : gestion stricte des particules de nom.

    Retourne True si l'image correspond probablement a la bonne personne.
    Retourne False si un homonyme est detecte.
    """
    # Normaliser : tout en minuscules, separateurs → espaces
    text = _strip_accents(filename_or_url.lower())
    text = re.sub(r"[-_./\\]", " ", text)
    words = text.split()

    firstname_lower = _strip_accents(firstname.lower().strip())
    firstname_lower = re.sub(r"[-_]", " ", firstname_lower)
    lastname_lower = _strip_accents(lastname.lower().strip())

    _particles = {
        "de", "du", "le", "la", "les", "des",
        "van", "von", "el", "al", "ben", "ibn",
    }

    # --- ETAPE 1 : Verifier la presence du nom de famille ---
    lastname_parts = lastname_lower.split()
    core_lastname = lastname_parts[-1]  # "boiteux" pour "le boiteux"
    has_particle = (
        len(lastname_parts) > 1
        and any(p in _particles for p in lastname_parts[:-1])
    )

    # 1a. Chercher le nom complet d'abord ("le boiteux", "ben osman")
    full_match = False
    full_match_pos = -1
    for i in range(len(words) - len(lastname_parts) + 1):
        if words[i:i + len(lastname_parts)] == lastname_parts:
            full_match = True
            full_match_pos = i
            break

    # 1b. Fallback : chercher le dernier mot seul ("boiteux", "osman")
    core_match = False
    core_match_pos = -1
    if not full_match:
        for i, w in enumerate(words):
            if w == core_lastname:
                core_match = True
                core_match_pos = i
                break

    if not full_match and not core_match:
        return False  # nom de famille absent → rejet

    # Position effective du nom (pour les checks d'adjacence)
    lastname_position = full_match_pos if full_match else core_match_pos

    # --- ETAPE 2 : Verifier le prenom ---
    firstname_parts = firstname_lower.split()
    core_firstname = firstname_parts[0]

    # Trouver toutes les positions du prenom
    firstname_positions = [
        i for i, w in enumerate(words) if w == core_firstname]

    # -----------------------------------------------------------
    # CAS A : Nom complet avec particule trouve (full_match)
    #         → logique standard (identique v4.9f)
    # -----------------------------------------------------------
    if full_match:
        if not firstname_positions:
            # Prenom absent — verifier absence de prenom contradictoire
            adj = []
            if full_match_pos > 0:
                adj.append(full_match_pos - 1)
            if full_match_pos > 1:
                adj.append(full_match_pos - 2)
            for idx in adj:
                if idx < 0 or idx >= len(words):
                    continue
                w = words[idx]
                if (len(w) >= 3 and w.isalpha()
                        and w not in _particles):
                    return False
            return True

        # Prenom present — verifier coherence
        for fp in firstname_positions:
            nxt = fp + 1
            if nxt < len(words):
                nw = words[nxt]
                if nw == lastname_parts[0]:
                    return True
                if nw in _particles:
                    after = fp + 2
                    if (after < len(words)
                            and words[after] == core_lastname):
                        return True
                    continue
                if (len(nw) >= 3 and nw.isalpha()
                        and nw not in _particles
                        and nw not in firstname_parts):
                    return False
            if fp + 1 == full_match_pos:
                return True
        return True

    # -----------------------------------------------------------
    # CAS B : Seul le core_lastname est trouve (sans particule)
    #         ET le nom contient une particule (has_particle=True)
    #         → verification STRICTE
    # -----------------------------------------------------------
    if core_match and has_particle:
        if not firstname_positions:
            # Prenom absent — verifier absence de prenom contradictoire
            if core_match_pos > 0:
                prev = words[core_match_pos - 1]
                if (len(prev) >= 3 and prev.isalpha()
                        and prev not in _particles
                        and prev != core_firstname):
                    return False
            # Nom seul, pas de prenom contradictoire → OK (prudent)
            return True

        # Prenom present. Le nom est sans particule ("boiteux" sans "le").
        # Si le prenom est adjacent au nom SANS la particule entre eux,
        # c'est probablement un homonyme ("Marie Boiteux" ≠ "Marie Le Boiteux").
        for fp in firstname_positions:
            if core_match_pos > fp:
                between = words[fp + 1:core_match_pos]
                if len(between) == 0:
                    # "marie boiteux" → particule manquante → REJET
                    return False
                if all(w in _particles for w in between):
                    # "marie de boiteux" → particules entre → OK
                    return True
                # Mots non-particule entre prenom et nom → rejet
                return False
            elif core_match_pos < fp:
                between = words[core_match_pos + 1:fp]
                if len(between) == 0:
                    # "boiteux marie" (ordre inverse, rare) → REJET
                    return False
                # Mots entre → ambigu → rejet
                return False

        # Prenom present mais pas adjacent → OK (prudent)
        return True

    # -----------------------------------------------------------
    # CAS C : Seul le core_lastname est trouve, nom SANS particule
    #         → logique standard v4.9f (prenoms intercales, composes)
    # -----------------------------------------------------------
    if not firstname_positions:
        # Prenom absent — verifier absence de prenom contradictoire
        adj = []
        if lastname_position > 0:
            adj.append(lastname_position - 1)
        if lastname_position > 1:
            adj.append(lastname_position - 2)
        for idx in adj:
            if idx < 0 or idx >= len(words):
                continue
            w = words[idx]
            if (len(w) >= 3 and w.isalpha()
                    and w not in _particles):
                return False
        return True

    # Prenom present — logique v4.9f standard (prenoms composes,
    # prenoms intercales comme herve_elie-bokobza)
    for fp in firstname_positions:
        nxt = fp + 1
        if nxt < len(words):
            nw = words[nxt]
            if (nw == core_lastname
                    or nw == lastname_parts[0]):
                return True
            if nw in _particles:
                after = fp + 2
                if (after < len(words)
                        and words[after] == core_lastname):
                    return True
                continue
            if (len(nw) >= 3 and nw.isalpha()
                    and nw not in _particles
                    and nw not in firstname_parts):
                return False

        if fp + 1 == lastname_position:
            return True
        if (fp + 2 == lastname_position
                and len(words) > fp + 1
                and words[fp + 1] in _particles):
            return True

    return True
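
# Exemples illustratifs (noms de fichiers hypothetiques) pour le cas des
# particules (CORRECTION 2) :
#   _validate_portrait_name("marie-le-boiteux-portrait.jpg",
#                           "Marie", "Le Boiteux")  -> True
#   _validate_portrait_name("marie-boiteux-natation.jpg",
#                           "Marie", "Le Boiteux")  -> False
#       (particule absente entre prenom et nom -> homonyme probable)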


def _classify_url_type(url):
    """v4.9e — Classifier le type d'un resultat Serper par son URL.
    Retourne: 'linkedin', 'social', 'personal_site', 'press',
              'academic', 'noise', 'other'
    """
    try:
        domain = urllib.parse.urlparse(url).netloc.lower()
        domain = domain.replace("www.", "")
    except Exception:
        return "other"

    if "linkedin.com" in domain:
        return "linkedin"
    if any(domain == sd or domain.endswith("." + sd)
           for sd in _NOISE_DOMAINS):
        return "noise"
    if any(domain == sd or domain.endswith("." + sd)
           for sd in _SOCIAL_DOMAINS):
        return "social"

    # Presse / medias : domaines de la whitelist
    if any(d in domain for d in MEDIA_WHITELIST):
        return "press"

    # Academique
    academic_markers = (
        "hal.science", "hal.archives", "persee.fr",
        "erudit.org", "cairn.info", "theses.fr",
        "scholar.google",
    )
    if any(m in domain for m in academic_markers):
        return "academic"

    return "other"


def _extract_activity_domains(text):
    """v4.9e — Extraire les domaines d'activite d'un texte.
    Retourne un set de noms de domaines d'activite.
    """
    text_norm = _strip_accents(text.lower())
    found = set()
    for domain_name, keywords in _ACTIVITY_DOMAINS.items():
        for kw in keywords:
            if kw in text_norm:
                found.add(domain_name)
                break
    return found


def _extract_identity_signals(result, query_parts, query_ascii):
    """v4.9e — Extraire les signaux d'identite d'un resultat Serper.
    Args:
        result: dict Serper {url, title, snippet}
        query_parts: liste de mots du nom (lowercase, sans accents)
        query_ascii: nom complet lowercase sans accents
    Retourne un dict signal ou None si resultat bruit.
    """
    url = result.get("url", "")
    title = result.get("title", "")
    snippet = result.get("snippet", "")
    if not url:
        return None

    url_type = _classify_url_type(url)
    if url_type == "noise":
        return None

    # Verifier presence du nom dans le contenu
    combined = _strip_accents((title + " " + snippet).lower())
    name_in_title = all(p in _strip_accents(title.lower())
                        for p in query_parts)
    name_in_snippet = all(p in _strip_accents(snippet.lower())
                          for p in query_parts)

    # Au moins le nom de famille doit etre present
    lastname = query_parts[-1] if query_parts else ""
    if lastname and lastname not in combined:
        return None

    # Extraire domaines d'activite du titre + snippet
    activity = _extract_activity_domains(title + " " + snippet)

    # Detecter nom dans le domaine
    try:
        domain = urllib.parse.urlparse(url).netloc.lower()
        domain = domain.replace("www.", "").replace("-", "")
        name_in_domain = (
            query_ascii.replace(" ", "") in domain
            or (lastname and lastname in domain)
        )
    except Exception:
        name_in_domain = False

    # Extraire profession depuis snippet LinkedIn
    profession = None
    if url_type == "linkedin" and " - " in snippet:
        parts = snippet.split(" - ")
        if len(parts) >= 2:
            profession = parts[0].strip()[:100]

    return {
        "url": url,
        "title": title,
        "snippet": snippet,
        "url_type": url_type,
        "activity_domains": activity,
        "name_in_title": name_in_title,
        "name_in_snippet": name_in_snippet,
        "name_in_domain": name_in_domain,
        "profession": profession,
    }


def _signals_compatible(sig_a, sig_b):
    """v4.9e — Verifier si deux signaux sont compatibles (meme personne).
    Compatibles si :
    - Ils partagent au moins un domaine d'activite
    - OU l'un est un site personnel et mentionne le nom
    - OU l'un est LinkedIn (unicite forte du profil)
    """
    # Domaines d'activite en commun
    if (sig_a["activity_domains"]
            and sig_b["activity_domains"]
            and sig_a["activity_domains"] & sig_b["activity_domains"]):
        return True

    # LinkedIn est un signal fort : compatible avec tout resultat
    # qui mentionne le nom
    strong_types = ("linkedin", "social")
    if (sig_a["url_type"] in strong_types
            and (sig_b["name_in_title"] or sig_b["name_in_snippet"])):
        return True
    if (sig_b["url_type"] in strong_types
            and (sig_a["name_in_title"] or sig_a["name_in_snippet"])):
        return True

    # Site personnel : compatible avec tout resultat pertinent
    if sig_a["name_in_domain"] and sig_b["name_in_title"]:
        return True
    if sig_b["name_in_domain"] and sig_a["name_in_title"]:
        return True

    return False


def _cluster_signals(signals):
    """v4.9e — Regrouper les signaux par compatibilite (union-find).
    Retourne une liste de clusters (listes de signaux),
    triee par taille decroissante.
    """
    n = len(signals)
    if n == 0:
        return []

    # Union-Find
    parent = list(range(n))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    # Comparer chaque paire
    for i in range(n):
        for j in range(i + 1, n):
            if _signals_compatible(signals[i], signals[j]):
                union(i, j)

    # Construire les clusters
    from collections import defaultdict
    clusters_map = defaultdict(list)
    for i in range(n):
        clusters_map[find(i)].append(signals[i])

    # Trier par taille decroissante
    clusters = sorted(clusters_map.values(),
                      key=len, reverse=True)
    return clusters


def _decide_discrimination(clusters, query):
    """v4.9e — Decider si la personne est identifiable.
    Retourne un dict:
        status: 'identifiable' | 'partial' | 'non_discriminable'
        dominant_cluster: list de signaux | None
        confidence: 'high' | 'medium' | 'low'
        profession: str | None
        activity_domain: str | None
    """
    if not clusters:
        return {
            "status": "non_discriminable",
            "dominant_cluster": None,
            "confidence": "low",
            "profession": None,
            "activity_domain": None,
        }

    dominant = clusters[0]
    dom_size = len(dominant)
    total = sum(len(c) for c in clusters)

    # Extraire profession du cluster dominant
    profession = None
    for sig in dominant:
        if sig.get("profession"):
            profession = sig["profession"]
            break

    # Extraire domaine d'activite dominant
    from collections import Counter
    all_domains = Counter()
    for sig in dominant:
        for ad in sig["activity_domains"]:
            all_domains[ad] += 1
    activity_domain = (
        all_domains.most_common(1)[0][0] if all_domains else None
    )

    # Seuils de decision
    # LinkedIn dans le cluster dominant → boost confiance
    has_linkedin = any(
        s["url_type"] == "linkedin" for s in dominant)
    has_social = any(
        s["url_type"] in ("linkedin", "social")
        for s in dominant)
    has_personal = any(s["name_in_domain"] for s in dominant)

    if dom_size >= 4:
        status = "identifiable"
        confidence = "high" if has_linkedin else "medium"
    elif dom_size >= 2:
        status = "partial"
        if has_linkedin or has_personal:
            confidence = "medium"
        else:
            confidence = "low"
    else:
        # Cluster dominant = 1 resultat
        if has_linkedin or has_personal:
            status = "partial"
            confidence = "medium" if has_linkedin else "low"
        else:
            status = "non_discriminable"
            confidence = "low"

    # Bonus : si le ratio dominant/total est fort
    if total > 0 and dom_size / total >= 0.8 and dom_size >= 3:
        if confidence == "low":
            confidence = "medium"

    _log("INFO",
        f"[DISCRIMINATION] {query}: status={status}, "
        f"cluster={dom_size}/{total}, "
        f"confidence={confidence}, "
        f"profession={profession}, "
        f"activity={activity_domain}, "
        f"linkedin={has_linkedin}")

    return {
        "status": status,
        "dominant_cluster": dominant,
        "confidence": confidence,
        "profession": profession,
        "activity_domain": activity_domain,
    }
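
# Esquisse du pipeline de discrimination v4.9e (requete hypothetique) :
# extraction des signaux depuis les resultats Serper, clustering par
# compatibilite, puis decision sur l'identifiabilite de la personne.
#   query = "Pascal Hérard"
#   query_ascii = _strip_accents(query.lower())
#   query_parts = query_ascii.split()
#   bruts = _serper_search(f'"{query}"', num=10)
#   signaux = [s for s in (
#       _extract_identity_signals(r, query_parts, query_ascii)
#       for r in bruts) if s]
#   decision = _decide_discrimination(_cluster_signals(signaux), query)
#   # decision["status"] vaut 'identifiable', 'partial' ou 'non_discriminable'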


def _parse_ouvrage_template(line):
    """v4.8g — Parser un template {{Ouvrage|titre=...|année=...}}
    Retourne un dict publication ou None.
    """
    # Extraire les parametres cle=valeur
    params = {}
    # Trouver le contenu entre {{ et }}
    match = re.search(r'\{\{[Oo]uvrage\s*\|(.+?)\}\}', line,
                      re.DOTALL)
    if not match:
        return None
    content = match.group(1)
    # Splitter sur | en ignorant les {{ imbriques
    depth = 0
    current = ""
    for ch in content:
        if ch == '{':
            depth += 1
            current += ch
        elif ch == '}':
            depth -= 1
            current += ch
        elif ch == '|' and depth == 0:
            if '=' in current:
                key, _, val = current.partition('=')
                params[key.strip().lower()] = val.strip()
            current = ""
        else:
            current += ch
    if current and '=' in current:
        key, _, val = current.partition('=')
        params[key.strip().lower()] = val.strip()

    title = params.get("titre", "").strip()
    if not title or len(title) < 4:
        return None

    # Nettoyer le titre (wiki markup residuel)
    title = re.sub(r"'{2,}", "", title)
    title = re.sub(r"\[\[([^\]|]+\|)?([^\]]+)\]\]", r"\2",
                   title)

    year = 0
    for year_key in ("année", "annee", "date", "year"):
        if year_key in params:
            ym = re.search(r'(19[4-9]\d|20[0-2]\d)',
                           params[year_key])
            if ym:
                year = int(ym.group(1))
                break

    publisher = ""
    for pub_key in ("éditeur", "editeur", "publisher",
                    "edition"):
        if pub_key in params:
            publisher = params[pub_key].strip()[:50]
            break

    isbn = ""
    for isbn_key in ("isbn", "isbn1"):
        if isbn_key in params:
            isbn = params[isbn_key].strip()
            break

    return {
        "title": title[:120],
        "author": "",
        "publisher": publisher,
        "year": year,
        "isbn": isbn,
        "cover_url": "",
        "link": "",
        "source": "wikipedia_biblio"
    }
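
# Exemple illustratif (ligne wiki hypothetique) :
#   ligne = ("{{Ouvrage|titre=La Grande Surveillance|année=2021"
#            "|éditeur=Seuil|isbn=978-2-00-000000-0}}")
#   _parse_ouvrage_template(ligne)
#       -> {"title": "La Grande Surveillance", "year": 2021,
#           "publisher": "Seuil", "isbn": "978-2-00-000000-0", ...}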


class MediaFetcher:
    """Recherche et extraction de contenu depuis les medias francophones"""

    def search_media(self, query, max_results=5):
        """v4.9c — Rechercher des articles via Serper API.
        Remplace DDG Lite (rate-limit).
        """
        site_parts = " OR ".join(f"site:{d}" for d in MEDIA_WHITELIST)
        full_query = f"{query} ({site_parts})"

        results = _serper_search(full_query, num=max_results * 3)

        # Filtrer par liste blanche stricte + rejeter blacklist
        filtered = []
        for r in results:
            domain = self._extract_domain(r["url"])
            is_whitelisted = any(d in domain for d in MEDIA_WHITELIST)
            is_blacklisted = any(d in domain for d in DOMAIN_BLACKLIST)
            if is_whitelisted and not is_blacklisted:
                r["domain"] = domain
                filtered.append(r)
                if len(filtered) >= max_results:
                    break

        _log("INFO", f"Recherche pour '{query}': {len(filtered)} resultats filtres sur {len(results)} bruts")
        return filtered
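
    # Esquisse d'utilisation (requete hypothetique, cle Serper requise) :
    #   fetcher = MediaFetcher()
    #   articles = fetcher.search_media("Fabrice Epelboin", max_results=3)
    #   for a in articles:
    #       print(a["domain"], "-", a["title"][:60])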

    def fetch_article_text(self, url, max_chars=MAX_CHARS_PER_ARTICLE):
        """Extraire le texte des paragraphes d'un article
        Regle : uniquement <p>, ignorer nav/footer/aside/scripts
        Limite : 3000 chars max par article
        Timeout : 8 secondes
        Echec silencieux (regle prompt_architect_v1.1)
        v4.6 : fallback meta description si paragraphes insuffisants
        v4.9f : skip domaines bloques, detection paywall
        """
        # v4.9f — Skip domaines connus pour bloquer
        if _should_skip_scraping(url):
            _log("INFO",
                f"[BLOCKED] Domaine bloque, skip: {url[:60]}")
            return None

        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as e:
            _log("WARN", f"Echec fetch article (skip): {url} -- {e}")
            return None

        text = self._extract_paragraphs(html)

        # v4.6 — Fallback meta description si contenu trop court
        # (paywall, page dynamique, contenu JS-only)
        if not text or len(text) < 80:
            meta_desc = self._extract_meta_description(html)
            if meta_desc and len(meta_desc) > 50:
                _log("INFO",
                    f"Fallback meta description: "
                    f"{len(meta_desc)} chars pour {url[:60]}")
                text = meta_desc

        # v4.9f — Detecter contenu paywall
        if text and _detect_paywall(text):
            _log("WARN",
                f"[PAYWALL] Contenu paywall detecte: {url[:60]}")
            return None

        if text and len(text) > max_chars:
            text = text[:max_chars - 3] + "..."
        return text

    def fetch_media_sources(self, query, max_results=5):
        """Pipeline complet : recherche DDG + extraction de contenu
        Retourne une liste de sources exploitables pour la synthese
        Si aucun contenu exploitable, retourne les snippets DDG bruts (regle prompt_architect)
        """
        results = self.search_media(query, max_results)
        sources = []
        snippets_fallback = []

        for r in results:
            # Conserver le snippet comme fallback
            if r.get("snippet") and len(r["snippet"]) > 30:
                snippets_fallback.append({
                    "type": "media",
                    "domain": r["domain"],
                    "url": r["url"],
                    "title": r["title"],
                    "text": r["snippet"],
                    "snippet": r["snippet"]
                })

            # Tenter le fetch complet
            text = self.fetch_article_text(r["url"])
            if text and len(text) > 100:
                sources.append({
                    "type": "media",
                    "domain": r["domain"],
                    "url": r["url"],
                    "title": r["title"],
                    "text": text,
                    "snippet": r.get("snippet", "")
                })
                _log("INFO", f"Source extraite: {r['domain']} ({len(text)} chars)")
            else:
                _log("WARN", f"Contenu insuffisant ou paywall: {r.get('domain', r['url'])}")

        # Si aucun article fetche mais des snippets disponibles : fallback snippets
        if not sources and snippets_fallback:
            _log("INFO", f"Fallback snippets DDG: {len(snippets_fallback)} snippets")
            return snippets_fallback

        return sources
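
    # Esquisse d'utilisation (requete hypothetique) : chaque source retournee
    # contient au minimum type/domain/url/title/text/snippet.
    #   sources = MediaFetcher().fetch_media_sources("cybersécurité France", 3)
    #   for s in sources:
    #       print(s["domain"], len(s["text"]), "caracteres")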

    # ── Fallback DDG ciblé (quand médias standards échouent) ────────────

    EXCLUDED_DOMAINS_DDG = {
        "facebook.com", "twitter.com", "instagram.com",
        "linkedin.com", "youtube.com", "amazon.fr",
        "amazon.com", "fnac.com", "x.com",
        "tiktok.com", "pinterest.com",
    }

    def fetch_ddg_targeted(self, query, max_results=5,
                           ddg_results=None):
        """Recherche DDG ciblée quand les médias standards échouent.
        Cherche des sources spécialisées sur la personne :
        interviews, tribunes, articles de fond.
        Exclut les sites purement commerciaux et les réseaux sociaux.

        ddg_results : list optionnelle de résultats DDG pré-parsés
                      (depuis resolve_person_identity) pour éviter
                      un appel DDG supplémentaire (rate-limit).
        """
        try:
            if ddg_results:
                # Reutiliser les resultats pre-parses
                parsed = ddg_results
                _log("INFO",
                    f"Serper cible '{query}': "
                    f"{len(parsed)} resultats (reutilises)")
            else:
                # v4.9c — Serper : recherche exacte
                parsed = _serper_search(f'"{query}"', num=10)
                # Fallback sans guillemets si 0 resultats
                if not parsed:
                    parsed = _serper_search(query, num=10)

            # v4.6 — matching sans accents + any() au lieu de all()
            query_ascii = _strip_accents(query.lower())
            name_parts = [
                p for p in query_ascii.split()
                if len(p) > 2
            ]
            results = []

            for r in parsed[:15]:
                result_url = r.get("url", "")
                title = r.get("title", "")
                snippet = r.get("snippet", "")

                if not result_url:
                    continue

                # Extraire domaine
                try:
                    domain = result_url.split("/")[2].replace("www.", "")
                except IndexError:
                    continue

                # Exclure domaines commerciaux/sociaux + blacklist
                if domain in self.EXCLUDED_DOMAINS_DDG:
                    continue
                if any(d in domain for d in DOMAIN_BLACKLIST):
                    continue

                # Vérifier que le nom apparaît dans titre ou snippet
                # v4.6 : any() + sans accents (le nom rare suffit)
                content = _strip_accents(
                    (title + " " + snippet).lower())
                if not any(
                        p in content for p in name_parts
                        if len(p) > 4):
                    continue

                # Scraper la page
                try:
                    content_text = self.fetch_article_text(result_url)
                    if content_text and len(content_text) > 80:
                        results.append({
                            "type": "media",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": content_text[:1500],
                            "snippet": snippet,
                        })
                        _log("INFO", f"DDG ciblé: OK {domain}")
                        if len(results) >= max_results:
                            break
                    else:
                        # Fallback snippet si contenu insuffisant
                        if snippet and len(snippet) > 50:
                            results.append({
                                "type": "media",
                                "domain": domain,
                                "url": result_url,
                                "title": title,
                                "text": snippet,
                                "snippet": snippet,
                            })
                            _log("INFO",
                                f"DDG ciblé: snippet fallback {domain}")
                            if len(results) >= max_results:
                                break
                except Exception:
                    continue

            _log("INFO",
                f"DDG ciblé: {len(results)} sources trouvées "
                f"pour '{query}'")
            return results

        except Exception as e:
            _log("WARN", f"fetch_ddg_targeted échec: {e}")
            return []

    # ── Notoriété Web : personnes SANS page Wikipedia (v4.5) ───────────

    # Domaines exclus de la notoriété web (déjà traités ou non pertinents)
    _NOTORIETE_EXCLUDED_DOMAINS = {
        "wikipedia.org", "wikidata.org", "wikimedia.org",
        "facebook.com", "twitter.com", "x.com", "instagram.com",
        "tiktok.com", "pinterest.com", "reddit.com",
        "youtube.com",
        "amazon.fr", "amazon.com", "fnac.com", "ebay.fr",
    }

    def fetch_notoriete_web(self, query):
        """Strategie notoriete web pour personnes.
        v4.5b : orchestre 5 sous-fonctions en parallele :
        - articles parlant DE la personne
        - site personnel / LinkedIn / portfolio
        - interviews / entretiens / podcasts
        - portrait presse (image)
        - medias publics (France Inter, France Culture, Arte, RFI, INA)
        Retourne un dict {sources: [...], portrait: {...}|None, meta: {...}}
        """
        import concurrent.futures
        import time as _time

        _log("INFO",
            f"[NOTORIETE-WEB] Debut pour '{query}'")

        start = _time.time()
        sources = []
        portrait = None

        try:
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=5) as exe:
                f_articles = exe.submit(
                    self._ddg_articles_about, query)
                f_site = exe.submit(
                    self._ddg_site_personnel, query)
                f_interviews = exe.submit(
                    self._ddg_interviews, query)
                f_portrait = exe.submit(
                    self._ddg_portrait_notoriete, query)
                f_medias = exe.submit(
                    self._fetch_medias_publics_guest, query)

                # Collecter avec timeout 15s (5 taches)
                deadline = _time.time() + 15
                for label, future in [
                    ("articles", f_articles),
                    ("site_perso", f_site),
                    ("interviews", f_interviews),
                    ("medias_publics", f_medias),
                ]:
                    try:
                        remaining = max(0.5,
                            deadline - _time.time())
                        result = future.result(
                            timeout=remaining)
                        if result:
                            sources.extend(result)
                            _log("INFO",
                                f"[NOTORIETE-WEB] {label}: "
                                f"{len(result)} sources")
                    except Exception as e:
                        _log("WARN",
                            f"[NOTORIETE-WEB] {label} echec: {e}")

                # Portrait
                try:
                    remaining = max(0.5,
                        deadline - _time.time())
                    portrait = f_portrait.result(
                        timeout=remaining)
                    if portrait:
                        _log("INFO",
                            f"[NOTORIETE-WEB] Portrait trouve: "
                            f"{portrait.get('source', '?')}")
                except Exception as e:
                    _log("WARN",
                        f"[NOTORIETE-WEB] portrait echec: {e}")

        except Exception as e:
            _log("ERROR",
                f"[NOTORIETE-WEB] Erreur orchestration: {e}")

        # Deduplique par URL normalisee (v4.9b — www/non-www)
        seen_urls = set()
        unique_sources = []
        for src in sources:
            url = src.get("url", "")
            if url:
                url_norm = _normalize_url(url)
                if url_norm not in seen_urls:
                    seen_urls.add(url_norm)
                    unique_sources.append(src)

        elapsed = round((_time.time() - start) * 1000)
        _log("INFO",
            f"[NOTORIETE-WEB] Termine: {len(unique_sources)} sources "
            f"uniques, portrait={'OUI' if portrait else 'NON'}, "
            f"{elapsed}ms")

        return {
            "sources": unique_sources,
            "portrait": portrait,
            "meta": {
                "strategy": "notoriete_web",
                "elapsed_ms": elapsed,
                "sub_counts": {
                    "articles": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "article"]),
                    "site_perso": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "site_perso"]),
                    "interview": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "interview"]),
                    "media_public": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "media_public"]),
                },
            },
        }
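
    # Esquisse d'utilisation (nom hypothetique) : le resultat agrege les
    # sources des sous-fonctions, un eventuel portrait et des compteurs.
    #   resultat = MediaFetcher().fetch_notoriete_web("Pascal Hérard")
    #   print(len(resultat["sources"]),
    #         resultat["portrait"] is not None,
    #         resultat["meta"]["sub_counts"])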

    def _ddg_articles_about(self, query, max_results=4):
        """Sous-fonction 1 : articles web parlant DE la personne.
        Recherche Serper avec nom entre guillemets.
        Exclut les reseaux sociaux et e-commerce.
        """
        try:
            parsed = _serper_search(f'"{query}"', num=15)
            _log("INFO",
                f"[NOTORIETE-ARTICLES] Serper brut: "
                f"{len(parsed)} resultats")

            # Fallback SANS guillemets si 0 resultats
            if not parsed:
                parsed = _serper_search(query, num=15)
                _log("INFO",
                    f"[NOTORIETE-ARTICLES] Serper SANS guillemets: "
                    f"{len(parsed)} resultats")

            name_parts = [p.lower() for p in query.split()
                          if len(p) > 2]
            results = []

            for r in parsed[:15]:
                result_url = r.get("url", "")
                title = r.get("title", "")
                snippet = r.get("snippet", "")
                if not result_url:
                    continue

                domain = self._extract_domain(result_url)
                if domain in self._NOTORIETE_EXCLUDED_DOMAINS:
                    continue
                if any(d in domain for d in DOMAIN_BLACKLIST):
                    continue

                # Le nom doit apparaitre dans titre ou snippet
                content_check = (title + " " + snippet).lower()
                if not any(p in content_check for p in name_parts):
                    continue

                # Scraper la page (v4.6 : seuil 150→80)
                try:
                    text = self.fetch_article_text(result_url)
                    if text and len(text) > 80:
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "article",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": text[:1500],
                            "snippet": snippet,
                        })
                        if len(results) >= max_results:
                            break
                    elif snippet and len(snippet) > 50:
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "article",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": snippet,
                            "snippet": snippet,
                        })
                        if len(results) >= max_results:
                            break
                except Exception:
                    continue

            return results

        except Exception as e:
            _log("WARN",
                f"[NOTORIETE-ARTICLES] Echec: {e}")
            return []
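
    # Exemple indicatif de retour de _ddg_articles_about (structure derivee
    # du code ci-dessus, valeurs purement illustratives) :
    #   [{"type": "notoriete_web", "sub_type": "article",
    #     "domain": "lemonde.fr", "url": "https://www.lemonde.fr/...",
    #     "title": "...", "text": "<= 1500 caracteres ou snippet",
    #     "snippet": "..."}]
    # En cas d'echec Serper, la fonction retourne une liste vide.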

    def _ddg_site_personnel(self, query, max_results=3):
        """Sous-fonction 2 : site personnel, LinkedIn, portfolio.
        v4.5b : detection nom de famille dans domaine + URLs directes.
        """
        # Domaines cibles pour profils
        profile_domains = {
            "linkedin.com", "muckrack.com", "babelio.com",
            "cairn.info", "scholar.google.com",
            "researchgate.net", "orcid.org",
            "about.me", "journaliste.com",
        }

        # Preparer variantes du nom pour detection domaine
        parts = query.strip().split()
        lastname = parts[-1].lower() if parts else ""
        firstname = parts[0].lower() if parts else ""
        lastname_ascii = _strip_accents(lastname)
        firstname_ascii = _strip_accents(firstname)
        fullname_ascii = _strip_accents(
            query.replace(" ", "").lower())

        name_parts = [p.lower() for p in parts if len(p) > 2]
        results = []

        # ── PHASE 1 : URLs directes previsibles ──────────────
        direct_urls = [
            f"https://www.{firstname_ascii}{lastname_ascii}.fr",
            f"https://{firstname_ascii}{lastname_ascii}.fr",
            f"https://www.{firstname_ascii}-{lastname_ascii}.fr",
            f"https://{firstname_ascii}-{lastname_ascii}.fr",
            f"https://www.{lastname_ascii}.fr",
        ]
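        # Exemple indicatif : pour query = "Marie Dupont" (nom fictif),
        # les URLs candidates generees ci-dessus sont :
        #   https://www.mariedupont.fr, https://mariedupont.fr,
        #   https://www.marie-dupont.fr, https://marie-dupont.fr,
        #   https://www.dupont.fr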
        # Dedoublonner
        seen = set()
        unique_direct = []
        for u in direct_urls:
            if u not in seen:
                seen.add(u)
                unique_direct.append(u)

        for direct_url in unique_direct:
            try:
                req = urllib.request.Request(direct_url, headers={
                    "User-Agent": USER_AGENT,
                })
                with urllib.request.urlopen(
                        req, timeout=4) as resp:
                    if resp.status == 200:
                        html = resp.read().decode(
                            "utf-8", errors="replace")
                        text = self._extract_paragraphs(html)
                        if text and len(text) > 80:
                            # v4.9b — Valider que c'est bien
                            # un site perso (pas entreprise)
                            if not _is_personal_site(
                                    direct_url, query, text):
                                continue
                            domain = self._extract_domain(direct_url)
                            results.append({
                                "type": "notoriete_web",
                                "sub_type": "site_perso",
                                "domain": domain,
                                "url": direct_url,
                                "title": (
                                    f"Site personnel — {query}"),
                                "text": (
                                    f"[Site officiel: {direct_url}]"
                                    f"\n{text[:1200]}"),
                                "snippet": "",
                                "is_profile": True,
                            })
                            _log("INFO",
                                f"[NOTORIETE-SITE] Site personnel "
                                f"trouve: {direct_url}")
                            if len(results) >= max_results:
                                return results
            except Exception:
                continue

        # ── PHASE 2 : Recherche Serper multi-requetes ─────────
        search_queries = [
            f'"{query}" site:linkedin.com OR biographie '
            f'OR site personnel OR portfolio',
        ]
        # Ajouter recherche domaine previsible
        if fullname_ascii:
            search_queries.append(
                f'site:{fullname_ascii}.fr OR '
                f'site:{fullname_ascii}.com')

        for sq in search_queries:
            if len(results) >= max_results:
                break
            try:
                parsed = _serper_search(sq, num=10)
                _log("INFO",
                    f"[NOTORIETE-SITE] Serper brut: "
                    f"{len(parsed)} resultats")

                for r in parsed[:10]:
                    if len(results) >= max_results:
                        break
                    result_url = r.get("url", "")
                    title = r.get("title", "")
                    snippet = r.get("snippet", "")
                    if not result_url:
                        continue

                    domain = self._extract_domain(result_url)
                    if domain in self._NOTORIETE_EXCLUDED_DOMAINS:
                        continue

                    # Eviter doublons (v4.9b — normalise www)
                    result_url_norm = _normalize_url(result_url)
                    if any(_normalize_url(x.get("url", ""))
                           == result_url_norm
                           for x in results):
                        continue

                    # Verifier pertinence : nom dans titre/snippet
                    content_check = (
                        title + " " + snippet).lower()
                    if not any(p in content_check
                               for p in name_parts):
                        continue

                    # Detection site personnel par domaine
                    domain_matches_name = (
                        lastname_ascii in domain
                        or fullname_ascii in domain.replace(
                            "-", "").replace(".", ""))
                    is_profile = (
                        domain_matches_name
                        or any(pd in domain
                               for pd in profile_domains))

                    # Scraper la page (ou snippet fallback)
                    text = ""
                    try:
                        fetched = self.fetch_article_text(
                            result_url)
                        if fetched and len(fetched) > 100:
                            text = fetched[:1500]
                    except Exception:
                        pass
                    if not text and snippet:
                        text = snippet

                    # v4.9b — Valider site perso si domaine
                    # match le nom (rejeter entreprises)
                    confirmed_personal = False
                    if domain_matches_name and text:
                        confirmed_personal = _is_personal_site(
                            result_url, query, text)
                        if not confirmed_personal:
                            continue

                    if text and len(text) > 30:
                        prefix = ""
                        if confirmed_personal:
                            prefix = (
                                f"[Site officiel: {result_url}]"
                                f"\n")
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "site_perso",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": prefix + text,
                            "snippet": snippet,
                            "is_profile": is_profile,
                        })

            except Exception as e:
                _log("WARN",
                    f"[NOTORIETE-SITE] Serper echec: {e}")

        return results
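
    # Note (exemple indicatif) : les sources retournees par
    # _ddg_site_personnel portent sub_type="site_perso" et un booleen
    # is_profile ; le texte n'est prefixe par "[Site officiel: <url>]"
    # que lorsque _is_personal_site (helper defini ailleurs dans ce
    # module) a confirme qu'il s'agit d'un site personnel.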

    def _ddg_interviews(self, query, max_results=3):
        """Sous-fonction 3 : interviews, entretiens, podcasts.
        Recherche Serper ciblee sur les apparitions medias.
        """
        search_query = (
            f'"{query}" interview OR entretien OR podcast '
            f'OR tribune OR conference'
        )
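        # Exemple indicatif : pour query = "Claire Durand" (nom fictif),
        # search_query vaut :
        #   '"Claire Durand" interview OR entretien OR podcast '
        #   'OR tribune OR conference'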

        try:
            parsed = _serper_search(search_query, num=12)
            _log("INFO",
                f"[NOTORIETE-INTERVIEWS] Serper brut: "
                f"{len(parsed)} resultats")

            # Fallback SANS guillemets si 0 resultats
            if not parsed:
                search_nq = (
                    f'{query} interview OR entretien '
                    f'OR podcast OR tribune')
                parsed = _serper_search(search_nq, num=12)
                _log("INFO",
                    f"[NOTORIETE-INTERVIEWS] Serper SANS "
                    f"guillemets: {len(parsed)} resultats")

            name_parts = [p.lower() for p in query.split()
                          if len(p) > 2]
            results = []

            for r in parsed[:12]:
                result_url = r.get("url", "")
                title = r.get("title", "")
                snippet = r.get("snippet", "")
                if not result_url:
                    continue

                domain = self._extract_domain(result_url)
                if domain in self._NOTORIETE_EXCLUDED_DOMAINS:
                    continue
                if any(d in domain for d in DOMAIN_BLACKLIST):
                    continue

                # Le nom doit apparaitre dans titre ou snippet
                content_check = (title + " " + snippet).lower()
                if not any(p in content_check for p in name_parts):
                    continue

                # Scraper la page (v4.6 : seuil 150→80)
                try:
                    text = self.fetch_article_text(result_url)
                    if text and len(text) > 80:
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "interview",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": text[:1500],
                            "snippet": snippet,
                        })
                        if len(results) >= max_results:
                            break
                    elif snippet and len(snippet) > 50:
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "interview",
                            "domain": domain,
                            "url": result_url,
                            "title": title,
                            "text": snippet,
                            "snippet": snippet,
                        })
                        if len(results) >= max_results:
                            break
                except Exception:
                    continue

            return results

        except Exception as e:
            _log("WARN",
                f"[NOTORIETE-INTERVIEWS] Echec: {e}")
            return []

    # Medias publics francais avec pages emissions / invites
    _MEDIAS_PUBLICS = [
        ("France Inter", "franceinter.fr"),
        ("France Culture", "radiofrance.fr"),
        ("France Culture", "franceculture.fr"),
        ("France Info", "francetvinfo.fr"),
        ("Arte", "arte.tv"),
        ("RFI", "rfi.fr"),
        ("INA", "ina.fr"),
    ]
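
    # Chaque couple (nom, domaine) ci-dessus est interroge via une requete
    # Serper restreinte au domaine, par exemple (nom fictif) :
    #   '"Claire Durand" site:franceinter.fr'
    # (voir _fetch_medias_publics_guest ci-dessous, max_per_media=1)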

    def _fetch_medias_publics_guest(self, query, max_per_media=1):
        """Sous-fonction 5 : apparitions sur medias publics francais.
        Cherche les pages emissions mentionnant la personne.
        Retourne une liste de sources notoriete_web.
        v4.5b
        """
        name_parts = [p.lower() for p in query.split()
                      if len(p) > 2]
        results = []

        for media_name, domain in self._MEDIAS_PUBLICS:
            if len(results) >= 5:
                break
            try:
                search_query = f'"{query}" site:{domain}'
                parsed = _serper_search(search_query, num=5)
                found_for_media = 0

                for r in parsed[:5]:
                    if found_for_media >= max_per_media:
                        break
                    result_url = r.get("url", "")
                    title = r.get("title", "")
                    snippet = r.get("snippet", "")
                    if not result_url:
                        continue

                    result_domain = self._extract_domain(
                        result_url)
                    # Verifier que l'URL est bien du bon domaine
                    if domain not in result_domain:
                        continue

                    # Le nom doit apparaitre dans titre ou snippet
                    content_check = (
                        title + " " + snippet).lower()
                    if not any(p in content_check
                               for p in name_parts):
                        continue

                    # Scraper la page emission
                    text = ""
                    try:
                        fetched = self.fetch_article_text(
                            result_url)
                        if fetched and len(fetched) > 80:
                            text = fetched[:600]
                    except Exception:
                        pass
                    if not text and snippet:
                        text = snippet

                    if text and len(text) > 30:
                        results.append({
                            "type": "notoriete_web",
                            "sub_type": "media_public",
                            "domain": result_domain,
                            "url": result_url,
                            "title": (
                                f"{media_name} -- {title}"),
                            "text": (
                                f"[{media_name} -- {title}]"
                                f"\n{text}"),
                            "snippet": snippet,
                        })
                        found_for_media += 1
                        _log("INFO",
                            f"[NOTORIETE-MEDIAS] {media_name} "
                            f"→ {result_url[:60]}")

            except Exception as e:
                _log("WARN",
                    f"[NOTORIETE-MEDIAS] {media_name} echec: {e}")

        _log("INFO",
            f"[NOTORIETE-MEDIAS] {len(results)} sources "
            f"medias publics pour '{query}'")
        return results

    def _ddg_portrait_notoriete(self, query, text_sources=None,
                                profession=None):
        """Sous-fonction 4 : portrait presse/web pour personnes sans Wikipedia.
        Utilise Serper Images avec filtre anti-homonymie.
        v4.9f : filtre prenom anti-homonyme renforce.
        v4.9f : source croisee — accepte les images dont le domaine est
        deja valide par le pipeline texte.
        v4.9f C3 : injection optionnelle de la profession dans la requete.
        Retourne un dict portrait ou None.
        """
        try:
            # v4.9f C3 — Enrichir la requete avec la profession
            query_parts = [query]
            if profession:
                # Nettoyer la profession (garder la partie avant separateur)
                prof = profession
                for sep in [" - ", " · ", " | ", " chez ", " at "]:
                    if sep in prof:
                        prof = prof.split(sep)[0]
                        break
                prof_clean = prof.strip()[:40]
                if prof_clean:
                    query_parts.append(prof_clean)
            query_parts.append("portrait photo")
            enriched_query = " ".join(query_parts)
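            # Exemple indicatif : query = "Jean Martin" (nom fictif),
            # profession = "Psychiatre - Hopital X"
            #   -> prof_clean = "Psychiatre"
            #   -> enriched_query = "Jean Martin Psychiatre portrait photo"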

            serper_results = _serper_image_search(
                enriched_query, num=12)
            _log("INFO",
                f"[NOTORIETE-PORTRAIT] Requete: "
                f"'{enriched_query}' → "
                f"{len(serper_results)} resultats bruts")

            # Tokens du nom pour anti-homonymie
            q_lower = query.lower()
            q_parts = q_lower.split()
            q_surname = (
                " ".join(q_parts[1:]) if len(q_parts) > 1
                else q_lower)
            particles = {"le", "la", "de", "du", "des",
                         "von", "van"}
            q_last = ""
            for p in reversed(q_parts):
                if p not in particles and len(p) > 2:
                    q_last = p
                    break

            # v4.9f — Extraire prenom et nom pour filtre renforce
            q_firstname = q_parts[0] if q_parts else ""
            q_lastname = (
                " ".join(q_parts[1:]) if len(q_parts) > 1
                else q_lower)

            for item in serper_results[:12]:
                img_url = item.get("imageUrl", "")
                item_title = item.get("title", "").lower()
                src_url = item.get("link", "").lower()

                if not img_url.startswith("https://"):
                    continue
                if self._is_google_proxy_url(img_url):
                    continue

                # Anti-homonymie : nom dans titre ou URL source
                check_text = item_title + " " + src_url
                name_match = (
                    q_lower in check_text
                    or (q_surname and q_surname in check_text)
                    or (q_last and q_last in check_text)
                )
                if not name_match:
                    # v4.9f — Source croisee : si le domaine de la page
                    # source est deja dans les sources texte validees,
                    # accepter quand meme (le site parle de la personne)
                    if (text_sources
                            and _is_trusted_source_domain(
                                src_url, text_sources)):
                        _log("INFO",
                            f"[NOTORIETE-PORTRAIT] Source croisee "
                            f"(domaine valide): {src_url[:60]}")
                    else:
                        continue

                # Exclure logos, icones
                if any(kw in img_url.lower()
                       for kw in ["logo", "icon", "banner",
                                  "favicon"]):
                    continue

                # v4.9f — Filtre anti-homonyme renforce (prenom)
                # Verifier l'URL de l'image ET le titre de la page
                # Ne s'applique pas aux images LinkedIn/licdn (pas de nom
                # dans les URLs CDN licdn.com ni les pages linkedin.com)
                if ("linkedin.com" not in src_url
                        and "licdn.com" not in img_url.lower()):
                    img_check = img_url + " " + item_title
                    if not _validate_portrait_name(
                            img_check, q_firstname, q_lastname):
                        _log("INFO",
                            f"[NOTORIETE-PORTRAIT] REJETE "
                            f"(homonyme prenom): "
                            f"{img_url[:60]}")
                        continue

                _log("INFO",
                    f"[NOTORIETE-PORTRAIT] ACCEPTE: "
                    f"{img_url[:60]}")
                return {
                    "url": img_url,
                    "caption": query,
                    "type": "person_portrait",
                    "source": "notoriete_web_serper",
                    "source_url": item.get("link", ""),
                }

            _log("INFO",
                "[NOTORIETE-PORTRAIT] Aucun portrait valide")
            return None

        except Exception as e:
            _log("WARN",
                f"[NOTORIETE-PORTRAIT] Echec: {e}")
            return None

    # ── Recherche d'images (Serper Images + Wikimedia Commons) ──────────

    def fetch_image(self, query, max_results=2):
        """Rechercher des images via DDG Images (priorite) + Wikimedia Commons (fallback)
        Filtre : HTTPS uniquement, min 200x200px
        Retourne une liste de dicts {url, caption, source, width, height}
        """
        # Priorite 1 : Serper Images
        images = self._fetch_ddg_images(query, max_results)
        if images:
            return images

        # Fallback : Wikimedia Commons
        _log("INFO", f"DDG Images: aucun resultat, fallback Wikimedia Commons")
        images = self._fetch_wikimedia_commons(query, max_results)
        return images
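
    # Exemple d'usage indicatif ("fetcher" designe une instance de cette
    # classe, sortie purement illustrative) :
    #   images = fetcher.fetch_image("Tour Eiffel", max_results=2)
    #   # -> [{"url": "https://...", "caption": "...",
    #   #      "source": "serper_images", "width": 0, "height": 0}, ...]
    # Les cles "width"/"height" valent 0 via Serper et sont renseignees
    # pour les resultats Wikimedia Commons.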

    def _fetch_ddg_images(self, query, max_results=2):
        """Recherche d'images via Serper Images API"""
        try:
            results = _serper_image_search(query, num=max_results * 3)
        except Exception as e:
            _log("WARN", f"Echec Serper Images: {e}")
            return []

        images = []
        for r in results:
            img_url = r.get("imageUrl", "")

            # Filtre : HTTPS uniquement
            if not img_url.startswith("https://"):
                continue

            images.append({
                "url": img_url,
                "caption": r.get("title", ""),
                "source": "serper_images",
                "width": 0,
                "height": 0,
            })
            if len(images) >= max_results:
                break

        _log("INFO", f"Serper Images pour '{query}': {len(images)} images filtrees sur {len(results)} brutes")
        return images

    def _get_ddg_vqd(self, query):
        """Obtenir le token vqd necessaire pour DDG Images"""
        url = f"https://duckduckgo.com/?q={urllib.parse.quote(query)}"
        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                html = resp.read().decode("utf-8", errors="replace")
            # Extraire vqd du HTML (format: vqd='4-123...' ou vqd="4-123...")
            match = re.search(r'vqd=["\x27]([\d-]+)["\x27]', html)
            if match:
                return match.group(1)
            # Pattern alternatif sans quotes
            match = re.search(r'vqd=([\d-]+)', html)
            if match:
                return match.group(1)
        except Exception as e:
            _log("WARN", f"Echec obtention vqd DDG: {e}")
        return None

    def _fetch_wikimedia_commons(self, query, max_results=2):
        """Fallback : rechercher des images sur Wikimedia Commons API"""
        # Etape 1 : Recherche de fichiers
        params = urllib.parse.urlencode({
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srnamespace": "6",
            "srlimit": str(max_results * 3),
            "format": "json"
        })
        url = f"https://commons.wikimedia.org/w/api.php?{params}"

        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": "CyberStrat/1.0"
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Wikimedia Commons recherche: {e}")
            return []

        results = data.get("query", {}).get("search", [])
        if not results:
            return []

        # Etape 2 : Obtenir les URLs d'images
        titles = "|".join(r["title"] for r in results)
        info_params = urllib.parse.urlencode({
            "action": "query",
            "titles": titles,
            "prop": "imageinfo",
            "iiprop": "url|size",
            "format": "json"
        })
        info_url = f"https://commons.wikimedia.org/w/api.php?{info_params}"

        try:
            req = urllib.request.Request(info_url, headers={
                "User-Agent": "CyberStrat/1.0"
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                info_data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Wikimedia Commons imageinfo: {e}")
            return []

        pages = info_data.get("query", {}).get("pages", {})
        images = []
        for page_id, page in pages.items():
            if page_id == "-1":
                continue
            imageinfo = page.get("imageinfo", [])
            if not imageinfo:
                continue
            img_url = imageinfo[0].get("url", "")
            width = imageinfo[0].get("width", 0)
            height = imageinfo[0].get("height", 0)

            # Filtre : HTTPS + min 200x200 + pas de SVG
            if not img_url.startswith("https://"):
                continue
            if width < 200 or height < 200:
                continue
            if img_url.lower().endswith(".svg"):
                continue

            images.append({
                "url": img_url,
                "caption": page.get("title", "").replace("File:", "").replace("Fichier:", ""),
                "source": "wikimedia_commons",
                "width": width,
                "height": height
            })
            if len(images) >= max_results:
                break

        _log("INFO", f"Wikimedia Commons pour '{query}': {len(images)} images")
        return images
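
    # Exemple indicatif des deux appels API construits ci-dessus pour
    # query = "Tour Eiffel" avec max_results=2 :
    #   1) .../w/api.php?action=query&list=search&srsearch=Tour+Eiffel
    #      &srnamespace=6&srlimit=6&format=json
    #   2) .../w/api.php?action=query&titles=File:...|File:...
    #      &prop=imageinfo&iiprop=url|size&format=json
    # (URLs repliees et tronquees ici pour la lisibilite)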

    # ── Resolution d'identite personne (DDG HTML) ──────────────────────

    # Plateformes d'identite professionnelle et sociale
    IDENTITY_SOURCES = [
        "linkedin.com", "senscritique.com", "babelio.com",
        "twitter.com", "instagram.com", "facebook.com",
        "youtube.com", "github.com", "about.me",
        "journaldesfemmes.fr", "allocine.fr", "imdb.com",
        "viadeo.com", "muckrack.com", "researchgate.net",
        "cairn.info", "scholar.google", "theses.fr",
    ]

    def _clean_linkedin_profession(self, raw):
        """Nettoyer un snippet LinkedIn brut pour extraire la profession.
        Supprime les tokens de navigation LinkedIn (Experience, Competences,
        Formation, etc.) et les marqueurs · en debut de chaine.
        """
        if not raw:
            return raw
        # Supprimer les marqueurs · en debut
        cleaned = raw.lstrip("· ").strip()
        # Tokens LinkedIn a supprimer (prefixes de rubrique)
        noise_prefixes = [
            "Expérience :", "Experience :", "Expérience:",
            "Compétences :", "Competences :", "Compétences:",
            "Formation :", "Formation:",
            "À propos :", "A propos :",
            "Licences et certifications :",
            "Bénévolat :", "Benevolat :",
        ]
        for prefix in noise_prefixes:
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()
        # Si le resultat est vide ou trop court, retourner None
        if not cleaned or len(cleaned) < 3:
            return None
        return cleaned[:100]
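
    # Exemples indicatifs :
    #   _clean_linkedin_profession("· Expérience : Journaliste independant")
    #   -> "Journaliste independant"
    #   _clean_linkedin_profession("· ·")  -> None (chaine vide apres nettoyage)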

    def resolve_person_identity(self, query):
        """Resoudre l'identite d'une personne depuis les premieres
        sources en ligne : LinkedIn, reseaux sociaux, pages perso, etc.
        Retourne un profil structure : nom confirme, metier,
        localisation, liens de profil, bio snippet.
        Utilise l'API Serper pour acceder aux reseaux sociaux.
        Zero import requests — urllib.request uniquement.
        """
        identity = {
            "confirmed_name": query,
            "profession": None,
            "location": None,
            "profile_links": [],
            "bio_snippet": None,
            "confidence": "low",
        }

        try:
            # Recherche Serper avec guillemets autour du nom exact
            results = _serper_search(f'"{query}"', num=10)
            _log("INFO",
                f"Identity Serper pour '{query}': "
                f"{len(results)} resultats bruts")

            # Analyser chaque resultat
            # v4.6 — matching sans accents ('hervé' == 'herve')
            query_ascii = _strip_accents(query.lower())
            name_parts = [
                p for p in query_ascii.split() if len(p) > 2
            ]

            for r in results[:10]:
                title = r.get("title", "")
                snippet = r.get("snippet", "")
                result_url = r.get("url", "")

                # Verifier que le nom est dans titre ou snippet
                # (sans accents pour robustesse)
                combined_text = _strip_accents(
                    (title + " " + snippet).lower())
                name_present = all(
                    p in combined_text for p in name_parts
                )
                if not name_present:
                    continue

                url_lower = result_url.lower()

                # Classer par type de source
                if "linkedin" in url_lower:
                    identity["profile_links"].insert(0, {
                        "platform": "LinkedIn",
                        "url": result_url,
                        "snippet": snippet,
                    })
                    # Extraire metier/localisation depuis le snippet
                    if " - " in snippet:
                        parts = snippet.split(" - ")
                        if len(parts) >= 2:
                            identity["profession"] = self._clean_linkedin_profession(
                                parts[0].strip()[:100])
                            identity["location"] = (
                                parts[1].strip()[:100])
                    elif " · " in snippet:
                        parts = snippet.split(" · ")
                        if len(parts) >= 2:
                            identity["profession"] = self._clean_linkedin_profession(
                                parts[0].strip()[:100])
                            identity["location"] = (
                                parts[1].strip()[:100])
                    identity["confidence"] = "high"

                elif any(s in url_lower
                         for s in self.IDENTITY_SOURCES):
                    platform = url_lower.split("/")[2] if (
                        "/" in url_lower) else url_lower
                    platform = platform.replace("www.", "")
                    identity["profile_links"].append({
                        "platform": platform,
                        "url": result_url,
                        "snippet": snippet,
                    })
                    if identity["confidence"] == "low":
                        identity["confidence"] = "medium"

                # Premier snippet pertinent comme bio
                if not identity["bio_snippet"] and name_present:
                    identity["bio_snippet"] = snippet[:300]

            # Extraction profession depuis bio_snippet si non trouvee
            # via LinkedIn (fallback par mots-cles)
            if not identity["profession"] and identity["bio_snippet"]:
                bio_lower = identity["bio_snippet"].lower()
                prof_keywords = [
                    "psychiatre", "psychanalyste", "psychologue",
                    "medecin", "médecin", "docteur", "chirurgien",
                    "journaliste", "professeur", "enseignant",
                    "chercheur", "directeur", "fondateur",
                    "ecrivain", "écrivain", "auteur", "avocat",
                    "ingenieur", "ingénieur", "sociologue",
                    "philosophe", "historien", "economiste",
                    "politologue", "anthropologue", "biologiste",
                ]
                found_profs = [
                    kw for kw in prof_keywords if kw in bio_lower
                ]
                if found_profs:
                    identity["profession"] = ", ".join(
                        found_profs[:3])
                    _log("INFO",
                        f"Profession extraite du bio_snippet: "
                        f"{identity['profession']}")

            # Stocker les resultats Serper bruts (cle historique
            # "ddg_raw_results") pour reutilisation par
            # fetch_ddg_targeted (evite un 2e appel API)
            identity["ddg_raw_results"] = results

            _log("INFO",
                f"Identite '{query}': "
                f"confidence={identity['confidence']}, "
                f"profession={identity['profession']}, "
                f"links={len(identity['profile_links'])}")

        except Exception as e:
            _log("WARN", f"resolve_person_identity echec: {e}")

        return identity
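
    # Exemple indicatif de parsing d'un snippet LinkedIn (valeurs fictives) :
    #   snippet = "Psychiatre - Paris et peripherie - 500 relations"
    #   -> profession = "Psychiatre", location = "Paris et peripherie",
    #      confidence = "high"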

    def discriminate_person(self, query):
        """v4.9e — Algorithme de discrimination de personnes.
        Remplace resolve_person_identity + fetch_notoriete_web
        en une seule methode avec 1-2 appels Serper.

        Etapes:
        1. Requete Serper unique "{prenom} {nom}" → 10 resultats
        2. Extraction signaux d'identite par resultat
        3. Clustering par compatibilite (union-find)
        4. Decision de discrimination (seuils)
        5. Scraping selectif du cluster dominant
        6. Construction identity + sources + portrait

        Retourne: {
            "identity": {resolve_person_identity format},
            "notoriete": {fetch_notoriete_web format},
        }
        """
        import time as _time
        start = _time.time()

        # ── ETAPE 1 : Requete Serper unique ──────────────────────
        initial_results = _serper_search(f'"{query}"', num=10)
        _log("INFO",
            f"[DISCRIM] Serper pour '{query}': "
            f"{len(initial_results)} resultats")

        # Fallback sans guillemets si 0 resultats
        if not initial_results:
            initial_results = _serper_search(query, num=10)
            _log("INFO",
                f"[DISCRIM] Serper SANS guillemets: "
                f"{len(initial_results)} resultats")

        # ── ETAPE 1A+1B : Dedup par domaine + compensation ──────
        results, dominant_domain = _search_with_compensation(
            query, initial_results)
        serper_credits = 1
        if len(results) != len(initial_results):
            serper_credits = 2  # compensation used

        # ── ETAPE 1C : Requete media SYSTEMATIQUE (v4.9g) ────────
        # Phase 2 : toujours lancee, pas conditionnelle
        media_extra = _search_national_media(
            query, initial_results, results, dominant_domain)
        if media_extra:
            combined = list(results) + list(media_extra)
            results, _ = _deduplicate_by_domain(
                combined, max_per_domain=2)
            serper_credits += 1
            _log("INFO",
                f"[DISCRIM] Phase 2 media: "
                f"+{len(media_extra)} → {len(results)} resultats")
        else:
            serper_credits += 1
            _log("INFO",
                "[DISCRIM] Phase 2 media: 0 nouvelle source")

        # ── ETAPE 1D : Recherche site perso (v4.9g) ─────────────
        # Phase 3 : si aucun site personnel dans les resultats
        query_ascii_check = _strip_accents(query.lower())
        name_parts_check = query_ascii_check.split()
        has_personal_site = False
        for r in results:
            try:
                r_domain = urllib.parse.urlparse(
                    r.get("url", "")).netloc.lower()
                r_domain = r_domain.replace("www.", "")
                r_domain_ascii = _strip_accents(r_domain)
                if any(p in r_domain_ascii for p in name_parts_check
                       if len(p) > 2):
                    has_personal_site = True
                    break
            except Exception:
                continue

        if not has_personal_site and serper_credits < 4:
            perso_query = f'"{query}" site personnel OR biographie'
            perso_results = _serper_search(perso_query, num=5)
            seen_urls = {r.get("url", "") for r in results}
            new_perso = [
                r for r in perso_results
                if r.get("url", "") not in seen_urls
            ]
            if new_perso:
                combined = list(results) + new_perso[:3]
                results, _ = _deduplicate_by_domain(
                    combined, max_per_domain=2)
                _log("INFO",
                    f"[DISCRIM] Phase 3 site perso: "
                    f"+{len(new_perso[:3])} → "
                    f"{len(results)} resultats")
            else:
                _log("INFO",
                    "[DISCRIM] Phase 3 site perso: "
                    "0 nouvelle source")
            serper_credits += 1
        elif has_personal_site:
            _log("INFO",
                "[DISCRIM] Phase 3 skip: "
                "site perso deja present")

        # Preparer tokens du nom
        query_ascii = _strip_accents(query.lower())
        query_parts = [
            p for p in query_ascii.split() if len(p) > 2
        ]

        # ── ETAPE 2 : Extraction des signaux ─────────────────────
        signals = []
        for r in results:
            sig = _extract_identity_signals(
                r, query_parts, query_ascii)
            if sig:
                signals.append(sig)

        _log("INFO",
            f"[DISCRIM] {len(signals)} signaux extraits "
            f"sur {len(results)} resultats")

        # ── ETAPE 3 : Clustering ─────────────────────────────────
        clusters = _cluster_signals(signals)
        _log("INFO",
            f"[DISCRIM] {len(clusters)} clusters "
            f"(tailles: {[len(c) for c in clusters]})")

        # ── ETAPE 4 : Decision ───────────────────────────────────
        decision = _decide_discrimination(clusters, query)
        dominant = decision["dominant_cluster"] or []

        # ── ETAPE 5 : Construction identity ──────────────────────
        identity = {
            "confirmed_name": query,
            "profession": decision["profession"],
            "location": None,
            "profile_links": [],
            "bio_snippet": None,
            "confidence": decision["confidence"],
            "discrimination_status": decision["status"],
            "activity_domain": decision["activity_domain"],
            "ddg_raw_results": results,
        }

        # Extraire profils et bio depuis le cluster dominant
        for sig in dominant:
            if sig["url_type"] == "linkedin":
                identity["profile_links"].insert(0, {
                    "platform": "LinkedIn",
                    "url": sig["url"],
                    "snippet": sig["snippet"],
                })
                # Extraire location depuis snippet LinkedIn
                if " - " in sig["snippet"]:
                    parts = sig["snippet"].split(" - ")
                    if len(parts) >= 2:
                        identity["location"] = (
                            parts[1].strip()[:100])
                # Nettoyer profession LinkedIn
                if sig.get("profession"):
                    identity["profession"] = (
                        self._clean_linkedin_profession(
                            sig["profession"]))

            elif sig["url_type"] == "social":
                try:
                    domain = urllib.parse.urlparse(
                        sig["url"]).netloc.lower()
                    domain = domain.replace("www.", "")
                except Exception:
                    domain = sig["url_type"]
                identity["profile_links"].append({
                    "platform": domain,
                    "url": sig["url"],
                    "snippet": sig["snippet"],
                })

            # Premier snippet pertinent comme bio
            if (not identity["bio_snippet"]
                    and sig["name_in_title"]):
                identity["bio_snippet"] = sig["snippet"][:300]

        # Fallback profession depuis mots-cles du bio_snippet
        if not identity["profession"] and identity["bio_snippet"]:
            bio_lower = _strip_accents(
                identity["bio_snippet"].lower())
            prof_keywords = [
                "psychiatre", "psychanalyste", "psychologue",
                "medecin", "docteur", "chirurgien",
                "journaliste", "professeur", "enseignant",
                "chercheur", "directeur", "fondateur",
                "ecrivain", "auteur", "avocat",
                "ingenieur", "sociologue", "philosophe",
                "historien", "economiste", "politologue",
            ]
            found_profs = [
                kw for kw in prof_keywords if kw in bio_lower
            ]
            if found_profs:
                identity["profession"] = ", ".join(
                    found_profs[:3])

        # ── ETAPE 6 : Scraping selectif + sources ────────────────
        sources = []
        if decision["status"] in ("identifiable", "partial"):
            # Scraper les meilleures sources du cluster dominant
            scrape_targets = []
            for sig in dominant:
                if sig["url_type"] in ("noise",):
                    continue
                # Privilegier presse, other, academic
                if sig["url_type"] in ("press", "other",
                                       "academic"):
                    scrape_targets.append(sig)
                elif sig["name_in_domain"]:
                    scrape_targets.append(sig)

            # Limiter a 4 scrapes max
            for sig in scrape_targets[:4]:
                try:
                    text = self.fetch_article_text(sig["url"])
                    if text and len(text) > 80:
                        domain = self._extract_domain(sig["url"])
                        # Classifier sub_type
                        sub_type = "article"
                        if sig["name_in_domain"]:
                            sub_type = "site_perso"
                        elif sig["url_type"] == "academic":
                            sub_type = "article"
                        sources.append({
                            "type": "notoriete_web",
                            "sub_type": sub_type,
                            "domain": domain,
                            "url": sig["url"],
                            "title": sig["title"],
                            "text": text[:1500],
                            "snippet": sig["snippet"],
                        })
                    elif sig["snippet"] and len(
                            sig["snippet"]) > 50:
                        domain = self._extract_domain(sig["url"])
                        sources.append({
                            "type": "notoriete_web",
                            "sub_type": "article",
                            "domain": domain,
                            "url": sig["url"],
                            "title": sig["title"],
                            "text": sig["snippet"],
                            "snippet": sig["snippet"],
                        })
                except Exception as e:
                    _log("WARN",
                        f"[DISCRIM] Scrape echec "
                        f"{sig['url'][:50]}: {e}")

            # Ajouter les profils LinkedIn/social comme sources
            for sig in dominant:
                if sig["url_type"] in ("linkedin", "social"):
                    domain = self._extract_domain(sig["url"])
                    sources.append({
                        "type": "notoriete_web",
                        "sub_type": "site_perso",
                        "domain": domain,
                        "url": sig["url"],
                        "title": sig["title"],
                        "text": sig["snippet"] or sig["title"],
                        "snippet": sig["snippet"],
                        "is_profile": True,
                    })

        # Deduplique par URL normalisee
        seen_urls = set()
        unique_sources = []
        for src in sources:
            url_norm = _normalize_url(src.get("url", ""))
            if url_norm and url_norm not in seen_urls:
                seen_urls.add(url_norm)
                unique_sources.append(src)

        # ── ETAPE 7 : Portrait (1 appel Serper Images) ───────────
        portrait = None
        portrait_attempted = False
        if decision["status"] in ("identifiable", "partial"):
            # L'appel Serper Images consomme un credit meme si
            # aucun portrait ne passe les filtres
            portrait_attempted = True
            portrait = self._ddg_portrait_notoriete(
                query, text_sources=unique_sources,
                profession=decision.get("profession"))

        elapsed = round((_time.time() - start) * 1000)

        _log("INFO",
            f"[DISCRIM] Termine '{query}': "
            f"status={decision['status']}, "
            f"confidence={decision['confidence']}, "
            f"{len(unique_sources)} sources, "
            f"portrait={'OUI' if portrait else 'NON'}, "
            f"{elapsed}ms")

        notoriete = {
            "sources": unique_sources,
            "portrait": portrait,
            "meta": {
                "strategy": "discrimination_v4.9g",
                "elapsed_ms": elapsed,
                "discrimination_status": decision["status"],
                "clusters_count": len(clusters),
                "dominant_cluster_size": len(dominant),
                "serper_credits": (
                    serper_credits + (1 if portrait else 0)),
                "sub_counts": {
                    "article": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "article"]),
                    "site_perso": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "site_perso"]),
                    "interview": len([
                        s for s in unique_sources
                        if s.get("sub_type") == "interview"]),
                    "media_public": len([
                        s for s in unique_sources
                        if s.get("sub_type") ==
                        "media_public"]),
                },
            },
        }

        return {
            "identity": identity,
            "notoriete": notoriete,
        }
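
    # Exemple indicatif de la structure retournee par discriminate_person :
    #   {"identity": {"confirmed_name": ..., "profession": ...,
    #                 "discrimination_status": "identifiable", "partial", etc.,
    #                 "ddg_raw_results": [...]},
    #    "notoriete": {"sources": [...], "portrait": dict ou None,
    #                  "meta": {"strategy": "discrimination_v4.9g", ...}}}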

    def _parse_ddg_html_results(self, html):
        """Parser les resultats de recherche DuckDuckGo HTML.
        Structure DDG HTML :
        - Liens : <a class="result__a" href="...">Title</a>
        - Snippets : <a class="result__snippet">...</a>
          ou <td class="result-snippet">...</td>
        - URLs : <span class="result__url">...</span>
        Zero BeautifulSoup — regex uniquement.
        """
        results = []

        # Pattern 1 : DDG HTML modern (class="result__a")
        links = re.findall(
            r'<a[^>]*class=["\']result__a["\'][^>]*'
            r'href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            html, re.DOTALL
        )
        if not links:
            # Pattern alternatif : href avant class
            links = re.findall(
                r'<a[^>]*href=["\']([^"\']*)["\'][^>]*'
                r'class=["\']result__a["\'][^>]*>(.*?)</a>',
                html, re.DOTALL
            )
        if not links:
            # Fallback : DDG Lite format (reutiliser le parser existant)
            links = re.findall(
                r'<a[^>]*class=["\']result-link["\'][^>]*'
                r'href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
                html, re.DOTALL
            )

        # Snippets : plusieurs patterns possibles
        snippets = re.findall(
            r'<a[^>]*class=["\']result__snippet["\'][^>]*>'
            r'(.*?)</a>',
            html, re.DOTALL
        )
        if not snippets:
            snippets = re.findall(
                r'<td[^>]*class=["\']result-snippet["\'][^>]*>'
                r'(.*?)</td>',
                html, re.DOTALL
            )

        # URLs affichees
        urls_display = re.findall(
            r'<span[^>]*class=["\']result__url["\'][^>]*>'
            r'(.*?)</span>',
            html, re.DOTALL
        )

        for i, (href, title_html) in enumerate(links):
            # Decoder l'URL DDG
            url = self._decode_ddg_url(href)
            if not url:
                # Essayer l'URL affichee
                if i < len(urls_display):
                    display_url = re.sub(
                        r"<[^>]+>", "", urls_display[i]
                    ).strip()
                    if display_url and not display_url.startswith(
                            "http"):
                        url = "https://" + display_url
                    elif display_url:
                        url = display_url
                if not url:
                    continue

            # Nettoyer le titre
            title = re.sub(r"<[^>]+>", "", title_html).strip()
            title = self._decode_html_entities(title)

            # Nettoyer le snippet
            snippet = ""
            if i < len(snippets):
                snippet = re.sub(
                    r"<[^>]+>", "", snippets[i]
                ).strip()
                snippet = self._decode_html_entities(snippet)

            results.append({
                "url": url,
                "title": title,
                "snippet": snippet,
            })

        return results
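
    # Exemple indicatif : pour un fragment HTML DDG de la forme
    #   <a class="result__a" href="...">Mon titre</a>
    #   <a class="result__snippet">Mon extrait</a>
    # le parser retourne [{"url": "<url decodee>", "title": "Mon titre",
    # "snippet": "Mon extrait"}] (l'URL passe par _decode_ddg_url,
    # definie ailleurs dans ce module).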

    def _decode_html_entities(self, text):
        """Decoder les entites HTML courantes.
        &amp; est decode en dernier pour ne pas transformer
        "&amp;lt;" en "<" (double decodage).
        """
        text = text.replace("&quot;", '"')
        text = text.replace("&#x27;", "'")
        text = text.replace("&#39;", "'")
        text = text.replace("&lt;", "<")
        text = text.replace("&gt;", ">")
        text = text.replace("&nbsp;", " ")
        text = text.replace("&amp;", "&")
        return text

    # ── Normalisation slug Wikipedia ──────────────────────────────────

    def _normalize_wikipedia_slug(self, query):
        """Genere plusieurs variantes de slug Wikipedia a essayer.
        Retourne une liste ordonnee du plus probable au moins probable.
        Gere : tirets (Saint-Paul-Lacoste), accents, suffixes geo.
        """
        import unicodedata

        def to_title_smart(s):
            """Met chaque mot en majuscule sauf articles/prepositions."""
            LOWER_WORDS = {
                "de", "du", "des", "le", "la", "les",
                "en", "sur", "sous", "et", "au", "aux",
                "d", "l",
            }
            words = s.split()
            result = []
            for i, w in enumerate(words):
                if i == 0 or w.lower() not in LOWER_WORDS:
                    result.append(w.capitalize())
                else:
                    result.append(w.lower())
            return " ".join(result)

        # Nettoyer : supprimer les suffixes geo parasites
        GEO_SUFFIXES = (
            r"\b(gard|herault|hérault|lozere|lozère|var|"
            r"bouches.du.rh[oô]ne|vaucluse|ard[eè]che|"
            r"dr[oô]me|aude|pyr[ée]n[ée]es|"
            r"france|d[ée]partement|r[ée]gion|commune|"
            r"canton|arrondissement)\b"
        )
        cleaned = query.strip()
        cleaned_lower = re.sub(
            GEO_SUFFIXES, "", cleaned.lower(),
            flags=re.IGNORECASE)
        cleaned_lower = re.sub(r"\s+", " ", cleaned_lower).strip()

        variants = []

        # Variante 1 : tirets entre mots (Saint-Paul-Lacoste)
        hyphenated = "-".join(
            w.capitalize() for w in cleaned_lower.split())
        variants.append(hyphenated)

        # Variante 2 : espaces avec capitalisation (Saint Paul Lacoste)
        titled = to_title_smart(cleaned_lower)
        variants.append(titled)

        # Variante 3 : requete originale, espaces remplaces par "_"
        variants.append(cleaned.replace(" ", "_"))

        # Variante 4 : sans accents (version tirets)
        def strip_accents(s):
            s = unicodedata.normalize("NFD", s)
            return "".join(
                c for c in s
                if unicodedata.category(c) != "Mn"
            )
        no_accent = strip_accents(hyphenated)
        if no_accent != hyphenated:
            variants.append(no_accent)

        # Dedupliquer en conservant l'ordre
        seen = set()
        unique = []
        for v in variants:
            if v and v not in seen:
                seen.add(v)
                unique.append(v)

        _log("INFO", f"Wikipedia slugs a essayer: {unique}")
        return unique
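
    # Exemple indicatif :
    #   _normalize_wikipedia_slug("saint paul lacoste gard")
    #   -> ["Saint-Paul-Lacoste", "Saint Paul Lacoste",
    #       "saint_paul_lacoste_gard"]
    # Le suffixe geo "gard" est retire des variantes 1 et 2 ; la
    # variante 3 conserve la requete d'origine.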

    # ── Recherche elu officiel (RNE data.gouv.fr) ─────────────────────

    # UUID de la ressource "Maires" du Repertoire National des Elus
    RNE_MAIRES_UUID = "2876a346-d50c-4911-934e-19ee07b0e503"
    RNE_BASE_URL = "https://tabular-api.data.gouv.fr/api/resources"
    RNE_TIMEOUT = 8

    # Noms de colonnes RNE (contiennent accents + apostrophes)
    # Definis ici pour eviter les backslashes dans les expressions
    # de f-strings (interdits avant Python 3.12)
    _COL_NOM = "Nom de l'\u00e9lu"
    _COL_PRENOM = "Pr\u00e9nom de l'\u00e9lu"
    _COL_COMMUNE = "Libell\u00e9 de la commune"
    _COL_DEPT = "Libell\u00e9 du d\u00e9partement"
    _COL_CODE_COMMUNE = "Code de la commune"
    _COL_CODE_DEPT = "Code du d\u00e9partement"
    _COL_DEBUT_MANDAT = "Date de d\u00e9but du mandat"
    _COL_DEBUT_FONCTION = "Date de d\u00e9but de la fonction"
    _COL_CSP = "Libell\u00e9 de la cat\u00e9gorie socio-professionnelle"
    _COL_SEXE = "Code sexe"

    def fetch_elu_officiel(self, query):
        """Recherche un elu dans le RNE (Repertoire National des Elus).

        Interroge l'API tabulaire data.gouv.fr sur le dataset Maires.
        Filtre par nom de famille (exact, uppercase) puis match prenom.

        Args:
            query: nom complet de la personne (ex: "Adrien Chapon")

        Returns:
            dict avec {nom, prenom, commune, departement, code_insee,
                       fonction, date_debut_mandat, date_debut_fonction,
                       csp} ou None si non trouve.
        """
        parts = query.strip().split()
        if len(parts) < 2:
            _log("INFO", "[RNE] Requete trop courte pour recherche elu: '%s'" % query)
            return None

        # Extraire prenom et nom — convention : dernier mot = nom
        surname = parts[-1].upper()
        firstname_query = " ".join(parts[:-1]).strip().lower()

        # Construction URL manuelle pour gerer l'apostrophe dans
        # "Nom de l'elu" — urllib.parse.urlencode encode ' en %27
        # ce qui casse le filtre __ilike, mais __exact fonctionne
        url = (
            self.RNE_BASE_URL + "/" + self.RNE_MAIRES_UUID + "/data/"
            "?Nom%20de%20l%27%C3%A9lu__exact="
            + urllib.parse.quote(surname)
            + "&page_size=20"
        )

        _log("INFO", "[RNE] Recherche elu: nom=%s, prenom=%s" % (surname, firstname_query))
        _log("INFO", "[RNE] URL: %s" % url)

        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": "CyberStrat/1.0 (contact@cevennes-web.com)",
                "Accept": "application/json",
            })
            with urllib.request.urlopen(req, timeout=self.RNE_TIMEOUT) as resp:
                raw = resp.read().decode("utf-8")
                data = json.loads(raw)
        except Exception as e:
            _log("WARN", "[RNE] Erreur API: %s" % str(e))
            return None

        results = data.get("data", [])
        if not results:
            _log("INFO", "[RNE] Aucun resultat pour nom=%s" % surname)
            return None

        _log("INFO", "[RNE] %d resultats pour nom=%s" % (len(results), surname))

        # Match exact sur prenom (insensible a la casse + accents)
        import unicodedata

        def normalize_str(s):
            """Normalise pour comparaison : minuscule + sans accents."""
            s = s.lower().strip()
            s = unicodedata.normalize("NFD", s)
            return "".join(c for c in s if unicodedata.category(c) != "Mn")

        fn_normalized = normalize_str(firstname_query)

        best_match = None
        for row in results:
            row_prenom = row.get(self._COL_PRENOM, "")
            row_nom = row.get(self._COL_NOM, "")
            row_commune = row.get(self._COL_COMMUNE, "?")
            row_fn_normalized = normalize_str(row_prenom)

            # Match exact prenom
            if row_fn_normalized == fn_normalized:
                best_match = row
                _log("INFO",
                    "[RNE] MATCH EXACT: %s %s -- %s"
                    % (row_prenom, row_nom, row_commune))
                break

            # Match partiel : le prenom de la requete est contenu
            # dans le prenom RNE (ex: "Jean" dans "Jean-Luc")
            if fn_normalized in row_fn_normalized:
                if best_match is None:
                    best_match = row
                    _log("INFO",
                        "[RNE] Match partiel prenom: %s %s"
                        % (row_prenom, row_nom))

        if not best_match:
            _log("INFO", "[RNE] Aucun match prenom pour '%s' parmi %d resultats"
                 % (firstname_query, len(results)))
            # Lister les resultats pour debug
            for row in results[:5]:
                rp = row.get(self._COL_PRENOM, "?")
                rn = row.get(self._COL_NOM, "?")
                rc = row.get(self._COL_COMMUNE, "?")
                _log("INFO", "[RNE]   -> %s %s -- %s" % (rp, rn, rc))
            return None

        # Construire le resultat
        elu = {
            "nom": best_match.get(self._COL_NOM, ""),
            "prenom": best_match.get(self._COL_PRENOM, ""),
            "commune": best_match.get(self._COL_COMMUNE, ""),
            "departement": best_match.get(self._COL_DEPT, ""),
            "code_insee": best_match.get(self._COL_CODE_COMMUNE, ""),
            "code_departement": best_match.get(self._COL_CODE_DEPT, ""),
            "fonction": "Maire",  # Dataset = maires uniquement
            "date_debut_mandat": best_match.get(self._COL_DEBUT_MANDAT, ""),
            "date_debut_fonction": best_match.get(self._COL_DEBUT_FONCTION, ""),
            "csp": best_match.get(self._COL_CSP, ""),
            "sexe": best_match.get(self._COL_SEXE, ""),
        }

        _log("INFO",
            "[RNE] Elu identifie: %s %s, Maire de %s (%s), depuis %s"
            % (elu["prenom"], elu["nom"], elu["commune"],
               elu["departement"], elu["date_debut_fonction"]))

        return elu
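
    # Exemple d'appel (esquisse — "fetcher" designe une instance de cette
    # classe, le nom est purement illustratif) :
    #
    #   elu = fetcher.fetch_elu_officiel("Jean Dupont")
    #   if elu:
    #       # ex: {"nom": "DUPONT", "prenom": "Jean", "commune": "...",
    #       #      "fonction": "Maire", "date_debut_fonction": "...", ...}
    #       print(elu["commune"], elu["departement"])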

    # ── Validation URLs portrait ─────────────────────────────────────

    def _is_google_proxy_url(self, url):
        """Detecte les URLs Google proxy / profile photo / YouTube.
        Ces URLs sont instables, expirent, et ne sont pas de vrais
        portraits exploitables. Patterns detectes :
        - googleusercontent.com (Google Photos, YouTube profiles)
        - ggpht.com (ancien CDN Google profiles)
        - =s900, =s800 etc. (redimensionnement Google)
        - no-rj (parametre Google profile)
        - blogspot.com (images hebergees, souvent recyclees)
        - gstatic.com (assets Google statiques)
        """
        if not url or not isinstance(url, str):
            return True  # URL vide/invalide = rejeter
        url_lower = url.lower()
        google_patterns = [
            "googleusercontent.com",
            "ggpht.com",
            "gstatic.com",
            "blogspot.com",
            "google.com/img",
            "google.com/maps",
            "=s900",
            "=s800",
            "=s700",
            "=s600",
            "no-rj",
            "c0x00ffffff",
        ]
        for pattern in google_patterns:
            if pattern in url_lower:
                _log("INFO",
                    f"[PORTRAIT-FILTER] URL Google proxy rejetee: "
                    f"{url[:80]}... (pattern: {pattern})")
                return True
        return False

    def _is_valid_portrait_url(self, url):
        """Valide qu'une URL portrait est exploitable.
        Rejette : Google proxy, URLs trop courtes, sans https,
        fragments sans domaine.
        """
        if not url or not isinstance(url, str):
            return False
        if not url.startswith("https://"):
            _log("INFO",
                f"[PORTRAIT-FILTER] URL non-HTTPS rejetee: "
                f"{url[:60]}")
            return False
        if len(url) < 20:
            return False
        if self._is_google_proxy_url(url):
            return False
        return True
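
    # Exemples illustratifs du filtre ci-dessus (URLs fictives) :
    #
    #   _is_valid_portrait_url("https://exemple.fr/photos/portrait-auteur.jpg")
    #       -> True   (HTTPS, longueur suffisante, pas un proxy Google)
    #   _is_valid_portrait_url("http://exemple.fr/photo.jpg")
    #       -> False  (non-HTTPS)
    #   _is_valid_portrait_url("https://lh3.googleusercontent.com/a/photo=s900")
    #       -> False  (patterns googleusercontent.com et =s900)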

    # ── Portrait depuis articles de presse (og:image) ──────────────────

    # v4.9d R3 — patterns URL typiques d'images paysage (og:image)
    _LANDSCAPE_URL_PATTERNS = (
        "1200x628", "1200x630", "1200x675",
        "16x9", "16-9", "banner", "hero",
        "cover", "header",
        "1280x720", "800x450", "600x338",
    )

    def _is_landscape_og_image(self, img_url, min_ratio=0.65):
        """v4.9d R3 — Detecte les og:image au format paysage.
        Etape 1 : heuristique URL (rapide, sans reseau).
        Etape 2 : PIL via Range request (8192 octets).
        Retourne True si paysage ou trop petit → rejeter.
        """
        # Etape 1 : heuristique URL
        url_lower = img_url.lower()
        if any(p in url_lower for p in self._LANDSCAPE_URL_PATTERNS):
            return True

        # Etape 2 : tenter PIL via Range request
        try:
            req = urllib.request.Request(img_url, headers={
                "User-Agent": USER_AGENT,
                "Range": "bytes=0-8191",
            })
            with urllib.request.urlopen(req, timeout=5) as resp:
                raw = resp.read(8192)

            from PIL import Image as _PILImage
            import io as _io
            img = _PILImage.open(_io.BytesIO(raw))
            w, h = img.size
            if w < 150 or h < 150:
                _log("INFO",
                    f"[R3] Image trop petite: {w}x{h}")
                return True
            if h < w * min_ratio:
                _log("INFO",
                    f"[R3] Image paysage detectee: "
                    f"{w}x{h} (ratio {h/w:.2f})")
                return True
        except Exception:
            # PIL non dispo ou Range non supporte
            # → ne pas rejeter sans preuve
            pass

        return False
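
    # Exemples illustratifs du filtre paysage ci-dessus (URLs fictives) :
    #
    #   "https://media.exemple.fr/une-1200x630.jpg"
    #       -> True des l'etape 1 (pattern "1200x630", aucun appel reseau)
    #   "https://media.exemple.fr/portrait-340x510.jpg"
    #       -> etape 2 : si PIL lit 340x510, h >= w * 0.65 donc False
    #          (image conservee) ; si PIL ou le Range echoue, False par defaut.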

    # v4.9d — R2 : classification URL pour portrait auteur vs sujet
    AUTHOR_PAGE_PATTERNS = (
        "/auteur/", "/journaliste/", "/contributeur/",
        "/redaction/", "/profil/", "/people/", "/author/",
        "/writers/", "/team/", "/equipe/",
    )

    INTERVIEW_PAGE_PATTERNS = (
        "/interview/", "/portrait/", "/rencontre/",
        "/parole/", "/temoignage/", "/a-la-une/rencontre",
    )

    # Domaines dont l'og:image d'article N'EST JAMAIS un portrait auteur
    PRESS_OG_UNRELIABLE = {
        "mediapart.fr",   # banner horizontal d'article
        "babelio.com",    # couverture de livre, pas l'auteur
    }

    def _fetch_portrait_from_press(self, press_urls, query):
        """Extraire un portrait depuis les og:image des articles de presse.
        press_urls : liste d'URLs d'articles (depuis media_sources).
        Anti-homonymie : le titre/h1 de la page doit contenir le nom.
        v4.9d R2 : rejette og:image si auteur non confirme sur article
                   thematique (photo du sujet, pas du journaliste).
        Retourne un dict portrait ou None.
        """
        if not press_urls:
            _log("INFO", "[PORTRAIT-PRESS] Aucune URL presse fournie")
            return None

        from bs4 import BeautifulSoup

        # Extraire nom de famille pour filtre pertinence
        q_parts = query.lower().split()
        particles = {"le", "la", "de", "du", "des", "von", "van"}
        surname = ""
        for p in reversed(q_parts):
            if p not in particles and len(p) > 2:
                surname = p
                break
        surname_ascii = _strip_accents(surname) if surname else ""

        LOGO_KEYWORDS = [
            "logo", "banner", "header", "favicon", "icon",
            "default", "placeholder", "share-", "social-",
            "og-default", "thumbnail-default",
        ]

        for url in press_urls[:5]:
            try:
                # v4.9d R2 — domaine og:image non fiable
                try:
                    press_domain = urllib.parse.urlparse(
                        url).netloc.lower().replace("www.", "")
                except Exception:
                    press_domain = ""
                if press_domain in self.PRESS_OG_UNRELIABLE:
                    _log("INFO",
                        f"[PORTRAIT-PRESS][R2] og:image non fiable "
                        f"(domaine {press_domain}): {url[:60]}")
                    continue

                # v4.9d R2 — classifier la page
                is_author_page = any(
                    p in url.lower()
                    for p in self.AUTHOR_PAGE_PATTERNS
                ) or any(
                    p in url.lower()
                    for p in self.INTERVIEW_PAGE_PATTERNS
                )

                req = urllib.request.Request(url, headers={
                    "User-Agent": USER_AGENT,
                    "Accept": "text/html",
                })
                with urllib.request.urlopen(
                        req, timeout=6) as resp:
                    html_bytes = resp.read(50000)
                    html = html_bytes.decode(
                        "utf-8", errors="replace")

                soup = BeautifulSoup(html, "html.parser")

                # Anti-homonymie : titre ou h1 doit contenir
                # le nom de famille
                page_title = ""
                title_tag = soup.find("title")
                if title_tag:
                    page_title = title_tag.get_text().lower()
                h1_text = ""
                h1 = soup.find("h1")
                if h1:
                    h1_text = h1.get_text().lower()

                name_in_page = (
                    (surname and surname in page_title)
                    or (surname and surname in h1_text)
                )
                if not name_in_page:
                    _log("INFO",
                        f"[PORTRAIT-PRESS] {url[:50]}: "
                        f"nom '{surname}' absent du titre/h1")
                    continue

                # Extraire og:image
                og_img = soup.find(
                    "meta", attrs={"property": "og:image"})
                if not og_img or not og_img.get("content"):
                    _log("INFO",
                        f"[PORTRAIT-PRESS] {url[:50]}: "
                        f"pas d'og:image")
                    continue

                img_url = og_img["content"]

                # Valider HTTPS
                if not img_url.startswith("https://"):
                    _log("INFO",
                        f"[PORTRAIT-PRESS] non-HTTPS: "
                        f"{img_url[:50]}")
                    continue

                # Rejeter logos / banners generiques
                img_lower = img_url.lower()
                if any(kw in img_lower for kw in LOGO_KEYWORDS):
                    _log("INFO",
                        f"[PORTRAIT-PRESS] logo/banner rejete: "
                        f"{img_url[:60]}")
                    continue

                # Rejeter Google proxy
                if self._is_google_proxy_url(img_url):
                    continue

                # v4.9d R3 — filtre ratio paysage
                if self._is_landscape_og_image(img_url):
                    _log("INFO",
                        f"[PORTRAIT-PRESS][R3] og:image rejetee "
                        f"(paysage): {img_url[:60]}")
                    continue

                # v4.9d R2 — verification auteur sur articles
                # thematiques (og:image = photo du sujet, pas auteur)
                if not is_author_page and surname_ascii:
                    author_meta = ""
                    # Pattern 1 : <meta name="author">
                    meta_match = re.search(
                        r'<meta[^>]+name=["\']author["\'][^>]+'
                        r'content=["\']([^"\']+)["\']',
                        html, re.I
                    )
                    if meta_match:
                        author_meta = meta_match.group(1).lower()
                    else:
                        # Pattern 2 : class="author/byline"
                        byline_match = re.search(
                            r'class=["\'][^"\']*'
                            r'(?:author|byline|signature)'
                            r'[^"\']*["\'][^>]*>'
                            r'([^<]{3,60})<',
                            html, re.I
                        )
                        if byline_match:
                            author_meta = (
                                byline_match.group(1).lower())

                    if surname_ascii and author_meta:
                        if surname_ascii not in _strip_accents(
                                author_meta):
                            _log("INFO",
                                f"[PORTRAIT-PRESS][R2] og:image "
                                f"rejetee — auteur "
                                f"'{author_meta.strip()[:30]}' "
                                f"ne correspond pas a "
                                f"'{surname}'")
                            continue
                    elif not author_meta and not is_author_page:
                        _log("INFO",
                            f"[PORTRAIT-PRESS][R2] og:image "
                            f"rejetee — auteur non confirme "
                            f"sur article: {url[:60]}")
                        continue

                _log("INFO",
                    f"[PORTRAIT-PRESS] ACCEPTE: "
                    f"{img_url[:60]} (source: {url[:40]})")
                return {
                    "url": img_url,
                    "caption": query,
                    "type": "person_portrait",
                    "source": "press_og_image",
                    "source_url": url
                }

            except Exception as e:
                _log("WARN",
                    f"[PORTRAIT-PRESS] Erreur {url[:40]}: {e}")
                continue

        _log("INFO",
            "[PORTRAIT-PRESS] Aucun portrait presse trouve")
        return None
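
    # Exemple d'appel (esquisse — URL d'article purement illustrative) :
    #
    #   portrait = self._fetch_portrait_from_press(
    #       ["https://www.liberation.fr/.../interview-jean-dupont"],
    #       "Jean Dupont")
    #   # -> {"url": ..., "caption": "Jean Dupont",
    #   #     "type": "person_portrait", "source": "press_og_image",
    #   #     "source_url": ...} si l'og:image passe tous les filtres
    #   #    (anti-homonymie, logo, paysage, auteur), sinon None.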

    # ── Portrait DDG enrichi (contexte identite + LinkedIn) ──────────

    def _fetch_ddg_portrait(self, query, identity=None,
                            text_sources=None):
        """v4.9c — Recherche portrait via Serper Images.
        Enrichit la requete avec le contexte d'identite,
        priorite LinkedIn. v4.9f : source croisee.
        Retourne dict portrait ou None.
        """
        # Enrichir la requete avec le contexte d'identite
        enriched_parts = [query]
        if identity and identity.get("profession"):
            prof = identity["profession"]
            for sep in [" - ", " · ", " | ", " chez ", " at "]:
                if sep in prof:
                    prof = prof.split(sep)[0]
                    break
            enriched_parts.append(prof.strip()[:40])
        enriched_parts.append("photo portrait")
        enriched_query = " ".join(enriched_parts)

        _log("INFO",
            f"[PORTRAIT-SERPER] Requete enrichie: "
            f"'{enriched_query}'")

        try:
            serper_results = _serper_image_search(
                enriched_query, num=15
            )

            BLOCKED_DOMAINS = [
                "twitter.com", "facebook.com",
                "instagram.com", "youtube.com",
                "pinterest.com",
                "amazon.", "fnac.", "babelio.",
                "goodreads.",
            ]

            _log("INFO",
                f"[PORTRAIT-SERPER] {len(serper_results)} "
                f"resultats bruts")

            # Tokens du nom pour anti-homonymie
            q_lower = _strip_accents(query.lower())
            q_surname = _strip_accents(
                _extract_lastname_mf(query).lower()
            )

            linkedin_candidate = None
            best_candidate = None

            for item in serper_results[:15]:
                img_url = item.get("imageUrl", "")
                title = _strip_accents(
                    item.get("title", "").lower()
                )
                src_url = item.get("link", "").lower()

                if not img_url or not img_url.startswith(
                        "https"):
                    continue

                blocked = any(
                    d in src_url for d in BLOCKED_DOMAINS
                )
                if blocked:
                    continue

                if self._is_google_proxy_url(img_url):
                    continue

                # Anti-homonymie : surname dans titre/URL
                check_text = title + " " + src_url
                check_ascii = _strip_accents(check_text)
                name_relevant = (
                    q_lower in check_ascii
                    or (q_surname and q_surname in check_ascii)
                )
                if not name_relevant:
                    # v4.9f — Source croisee
                    if (text_sources
                            and _is_trusted_source_domain(
                                src_url, text_sources)):
                        _log("INFO",
                            f"[PORTRAIT-SERPER] Source croisee "
                            f"(domaine valide): '{title[:40]}'")
                    else:
                        _log("INFO",
                            f"[PORTRAIT-SERPER] Rejete "
                            f"(nom absent): '{title[:40]}'")
                        continue

                # LinkedIn priority
                is_linkedin = (
                    "linkedin" in src_url
                    or "licdn.com" in img_url
                )
                if is_linkedin and not linkedin_candidate:
                    linkedin_candidate = {
                        "url": img_url,
                        "caption": query,
                        "type": "person_portrait",
                        "source": "duckduckgo_linkedin",
                        "source_url": item.get("link", "")
                    }
                    _log("INFO",
                        f"[PORTRAIT-SERPER] LinkedIn: "
                        f"{img_url[:60]}")
                elif not best_candidate:
                    best_candidate = {
                        "url": img_url,
                        "caption": query,
                        "type": "person_portrait",
                        "source": "duckduckgo_portrait",
                        "source_url": item.get("link", "")
                    }

            if linkedin_candidate:
                _log("INFO",
                    f"[PORTRAIT-SERPER] ACCEPTE (LinkedIn)")
                return linkedin_candidate

            if best_candidate:
                _log("INFO",
                    f"[PORTRAIT-SERPER] ACCEPTE: "
                    f"{best_candidate['url'][:60]}")
                return best_candidate

            _log("INFO",
                f"[PORTRAIT-SERPER] Aucun resultat valide")
        except Exception as e:
            _log("WARN",
                f"[PORTRAIT-SERPER] ECHEC: {e}")

        return None
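
    # Exemple de construction de la requete enrichie (valeurs illustratives) :
    #
    #   identity = {"profession": "Journaliste - Exemple Media"}
    #   self._fetch_ddg_portrait("Jean Dupont", identity)
    #   # requete envoyee a Serper Images :
    #   #   "Jean Dupont Journaliste photo portrait"
    #   # (profession tronquee au premier separateur " - ",
    #   #  puis limitee a 40 caracteres)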

    # ── Recherche portrait personne (Wikipedia + Wikimedia + Press + DDG) ──

    def fetch_person_portrait(self, query, skip_ddg=False,
                              press_urls=None, identity=None):
        """Cherche EXCLUSIVEMENT une photo portrait de la personne.
        Ordre de priorite (pipeline 4 sources v3.8) :
        1. Wikipedia API (portrait officiel si page existe)
        2. Wikimedia Commons (categorie portraits)
        3. Press og:image (articles de presse, anti-homonymie)
        4. DDG enrichi (contexte identite + LinkedIn priority)
        Rejette : logos, affiches, visuels d'emissions, couvertures.
        press_urls : liste d'URLs d'articles de presse.
        identity : dict de resolve_person_identity.
        Retourne un dict {url, caption, type, source} ou None.
        """
        _log("INFO",
            f"[PORTRAIT] Debut recherche portrait pour '{query}' "
            f"(skip_ddg={skip_ddg}, press={len(press_urls or [])}, "
            f"identity={bool(identity)})")

        # PRIORITE 1 — Wikipedia : image de la page si elle existe
        for lang in ["fr", "en"]:
            try:
                slug = urllib.parse.quote(query.replace(" ", "_"))
                url = (f"https://{lang}.wikipedia.org/api/rest_v1/"
                       f"page/summary/{slug}")
                req = urllib.request.Request(url, headers={
                    "User-Agent": USER_AGENT
                })
                with urllib.request.urlopen(req, timeout=6) as resp:
                    data = json.loads(resp.read().decode("utf-8"))

                thumb = data.get("thumbnail", {})
                original = data.get("originalimage", {})
                img_url = original.get("source") or thumb.get("source")
                w = original.get("width") or thumb.get("width", 0)
                h = original.get("height") or thumb.get("height", 0)

                if not img_url:
                    _log("INFO",
                        f"[PORTRAIT] Wikipedia ({lang}): "
                        f"page trouvee mais AUCUNE image")
                elif w < 150 or h < 150:
                    _log("INFO",
                        f"[PORTRAIT] Wikipedia ({lang}): "
                        f"image trop petite ({w}x{h})")
                elif h < w * 0.8:
                    _log("INFO",
                        f"[PORTRAIT] Wikipedia ({lang}): "
                        f"ratio non-portrait ({w}x{h})")
                elif self._is_google_proxy_url(img_url):
                    _log("WARN",
                        f"[PORTRAIT] Wikipedia ({lang}): "
                        f"Google proxy rejete")
                else:
                    _log("INFO",
                        f"[PORTRAIT] Wikipedia ({lang}): "
                        f"ACCEPTE {img_url[:60]}")
                    wiki_page = f"https://{lang}.wikipedia.org/wiki/{slug}"
                    return {
                        "url": img_url,
                        "caption": query,
                        "type": "person_portrait",
                        "source": f"wikipedia_{lang}",
                        "source_url": wiki_page
                    }
            except Exception as e:
                _log("WARN",
                    f"[PORTRAIT] Wikipedia ({lang}): ECHEC {e}")

        # PRIORITE 2 — Wikimedia Commons : recherche portrait
        try:
            params = urllib.parse.urlencode({
                "action": "query",
                "generator": "search",
                "gsrnamespace": "6",
                "gsrsearch": f"{query} portrait",
                "gsrlimit": "5",
                "prop": "imageinfo",
                "iiprop": "url|size|mime|thumburl",
                "iiurlwidth": "400",
                "format": "json"
            })
            url = f"https://commons.wikimedia.org/w/api.php?{params}"
            req = urllib.request.Request(url, headers={
                "User-Agent": "CyberStrat/1.0"
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))

            pages = data.get("query", {}).get("pages", {})
            _log("INFO",
                f"[PORTRAIT] Wikimedia Commons: "
                f"{len(pages)} resultats")

            # Extraire nom de famille pour filtre pertinence
            name_parts = query.lower().split()
            # Le nom de famille est generalement le dernier mot
            # (sauf particules : le, la, de, du, des, von, van)
            particles = {"le", "la", "de", "du", "des", "von",
                         "van", "di", "el", "al"}
            surname_parts = []
            for i, part in enumerate(name_parts):
                if i > 0:  # skip prenom
                    surname_parts.append(part)
            # Construire le nom de famille complet (ex: "le boiteux")
            surname_full = " ".join(surname_parts) if surname_parts else ""
            # Et juste le dernier mot non-particule
            surname_last = ""
            for part in reversed(name_parts):
                if part not in particles and len(part) > 2:
                    surname_last = part
                    break

            for page in pages.values():
                info = page.get("imageinfo", [{}])[0]
                mime = info.get("mime", "")
                img_url = info.get("thumburl") or info.get("url")
                w = info.get("thumbwidth") or info.get("width", 0)
                h = info.get("thumbheight") or info.get("height", 0)
                page_title = page.get("title", "?")

                # Filtre pertinence : titre doit contenir
                # le nom de famille
                title_lower = page_title.lower()
                name_match = (
                    (surname_full and surname_full in title_lower)
                    or (surname_last and surname_last in title_lower)
                )
                if not name_match:
                    _log("INFO",
                        f"[PORTRAIT] Wikimedia '{page_title[:40]}': "
                        f"REJETE (nom '{surname_last}' absent "
                        f"du titre)")
                    continue

                if not img_url or "jpeg" not in mime:
                    _log("INFO",
                        f"[PORTRAIT] Wikimedia '{page_title[:40]}': "
                        f"non-jpeg ou pas d'URL")
                    continue
                if w < 150 or h < 150:
                    _log("INFO",
                        f"[PORTRAIT] Wikimedia '{page_title[:40]}': "
                        f"trop petit ({w}x{h})")
                    continue
                if h < w * 0.8:
                    _log("INFO",
                        f"[PORTRAIT] Wikimedia '{page_title[:40]}': "
                        f"ratio non-portrait ({w}x{h})")
                    continue
                if self._is_google_proxy_url(img_url):
                    _log("WARN",
                        f"[PORTRAIT] Wikimedia '{page_title[:40]}': "
                        f"Google proxy rejete")
                    continue
                _log("INFO",
                    f"[PORTRAIT] Wikimedia Commons: "
                    f"ACCEPTE {img_url[:60]}")
                commons_page = f"https://commons.wikimedia.org/wiki/{page_title}"
                return {
                    "url": img_url,
                    "caption": query,
                    "type": "person_portrait",
                    "source": "wikimedia_commons",
                    "source_url": commons_page
                }
        except Exception as e:
            _log("WARN",
                f"[PORTRAIT] Wikimedia Commons: ECHEC {e}")

        # PRIORITE 3 — Press og:image (articles de presse)
        if press_urls:
            press_result = self._fetch_portrait_from_press(
                press_urls, query)
            if press_result:
                return press_result
        else:
            _log("INFO",
                "[PORTRAIT] Press: aucune URL presse fournie")

        # PRIORITE 4 — DDG enrichi (contexte identite + LinkedIn)
        if skip_ddg:
            _log("INFO",
                "[PORTRAIT] DDG SKIPPE (skip_ddg=True)")
            _log("WARN",
                f"[PORTRAIT] Aucun portrait trouve pour "
                f"'{query}' (Wikipedia + Wikimedia + Press)")
            return None

        ddg_result = self._fetch_ddg_portrait(
            query, identity, text_sources=press_urls)
        if ddg_result:
            return ddg_result

        _log("WARN",
            f"[PORTRAIT] Aucun portrait trouve pour '{query}'")
        return None
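
    # Exemple d'appel du pipeline complet (esquisse — "fetcher" designe une
    # instance de cette classe, valeurs illustratives) :
    #
    #   portrait = fetcher.fetch_person_portrait(
    #       "Jean Dupont",
    #       press_urls=["https://www.liberation.fr/.../portrait-jean-dupont"],
    #       identity={"profession": "Sociologue", "confidence": "high"})
    #   # -> dict {url, caption, type, source, source_url} issu de la
    #   #    premiere source (Wikipedia, Wikimedia, presse, Serper) qui
    #   #    fournit un portrait valide, ou None.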

    # ── Sélection featured publication ─────────────────────────────────

    def _select_featured(self, publications):
        """v4.8g — Selectionne la publication featured :
        La plus recente par annee. Si ex-aequo, prefere avec cover.
        """
        if not publications:
            return None

        # Trier par annee decroissante, cover en tiebreaker
        sorted_pubs = sorted(
            publications,
            key=lambda x: (
                x.get("year") or 0,
                1 if x.get("cover_url") else 0
            ),
            reverse=True
        )

        featured = sorted_pubs[0]
        _log("INFO",
            f"Featured selectionne: '{featured.get('title')}' "
            f"({featured.get('year')}) "
            f"cover={'OUI' if featured.get('cover_url') else 'NON'}")
        return featured
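
    # Exemple illustratif du critere de selection ci-dessus :
    #
    #   pubs = [
    #       {"title": "A", "year": 2021, "cover_url": ""},
    #       {"title": "B", "year": 2021, "cover_url": "https://exemple.fr/b.jpg"},
    #       {"title": "C", "year": 2019, "cover_url": "https://exemple.fr/c.jpg"},
    #   ]
    #   self._select_featured(pubs)  # -> publication "B" (2021 ex-aequo, cover)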

    # ── Publications connues (forçage manuel) ─────────────────────────

    KNOWN_WORKS = {
        "marie le boiteux": [
            {
                "title": "Le dernier juge de ma vie",
                "author": "Marie Le Boiteux",
                "year": 0,
                "publisher": "",
                "isbn": "",
                "cover_url": "",
                "link": "",
                "source": "known_works"
            }
        ],
    }

    # ── Vérification correspondance auteur ──────────────────────────

    @staticmethod
    def _author_name_matches(pub_author, expected_name):
        """v4.9c — Verifie que l'auteur de la publication correspond
        au nom recherche. Rejette les homonymes partiels.
        5 corrections : accents, noms intermediaires, initiales,
        ordre inverse, initiales nues.
        """
        if not pub_author or not expected_name:
            return False  # pas d'info → rejeter par precaution

        _PARTICLES = {"le", "la", "de", "du", "des", "von",
                      "van", "di", "el", "al", "ben", "ibn"}

        def _normalize(s):
            s = _strip_accents(s.lower().strip())
            s = s.replace(".", " ").replace(",", " ")
            return [t for t in s.split() if t and t not in _PARTICLES]

        pub_parts = _normalize(pub_author)
        exp_parts = _normalize(expected_name)

        if not pub_parts or not exp_parts:
            return False

        # Nom de famille attendu — mot final significatif
        # _extract_lastname_mf retourne "le boiteux" mais _normalize
        # supprime "le" → on utilise le dernier token normalise
        raw_surname = _extract_lastname_mf(expected_name)
        surname_tokens = _normalize(raw_surname)
        exp_surname = surname_tokens[-1] if surname_tokens else (
            exp_parts[-1] if exp_parts else ""
        )
        exp_firstname = exp_parts[0] if exp_parts else ""

        def _match(pp, ep):
            # Regle 1 : nom de famille doit etre present
            if exp_surname and not any(
                exp_surname == t for t in pp
            ):
                return False

            # Regle 2 : tokens supplementaires = homonyme
            # "herve elie bokobza" a 3 tokens,
            # "herve bokobza" en a 2 → rejet
            # Mais les initiales (1 char) matchant un token
            # attendu ne comptent pas comme extra
            pp_extra = [t for t in pp
                        if t != exp_surname
                        and t != exp_firstname
                        and not (len(t) == 1
                                 and any(e.startswith(t) for e in ep))]
            ep_extra = [t for t in ep
                        if t != exp_surname and t != exp_firstname]
            if len(pp_extra) > len(ep_extra):
                return False

            # Regle 3 : comparer prenom/initiale
            pp_first = next(
                (t for t in pp if t != exp_surname), None
            )
            ep_first = next(
                (t for t in ep if t != exp_surname), None
            )
            if pp_first and ep_first:
                if len(pp_first) == 1:
                    if pp_first[0] != ep_first[0]:
                        return False
                elif len(ep_first) == 1:
                    if ep_first[0] != pp_first[0]:
                        return False
                else:
                    if pp_first != ep_first:
                        return False

            return True

        # Test direct
        if _match(pub_parts, exp_parts):
            return True

        # Regle 4 : ordre inverse "Bokobza H." → swap
        if len(pub_parts) >= 2:
            swapped = pub_parts[1:] + [pub_parts[0]]
            if _match(swapped, exp_parts):
                return True

        return False
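
    # Exemples illustratifs (bases sur les cas cites dans les regles
    # ci-dessus) :
    #
    #   _author_name_matches("Hervé Bokobza", "Herve Bokobza")      -> True
    #       (accents et casse normalises)
    #   _author_name_matches("H. Bokobza", "Herve Bokobza")         -> True
    #       (initiale compatible avec le prenom attendu)
    #   _author_name_matches("Herve Elie Bokobza", "Herve Bokobza") -> False
    #       (token supplementaire "elie" -> homonyme probable)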

    def _fetch_by_title(self, title, author_name=None):
        """Cherche une publication specifique par son titre exact.
        Utilise Google Books + Open Library.
        """
        results = []

        # Google Books par titre
        try:
            params = urllib.parse.urlencode({
                "q": f'intitle:"{title}"',
                "maxResults": "3",
                "langRestrict": "fr"
            })
            url = f"https://www.googleapis.com/books/v1/volumes?{params}"
            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))

            for item in data.get("items", []):
                info = item.get("volumeInfo", {})
                pub_title = info.get("title", "")
                authors = info.get("authors", [])
                pub_author = authors[0] if authors else ""
                year_str = info.get("publishedDate", "")
                year = int(year_str[:4]) if len(year_str) >= 4 else 0
                cover = info.get("imageLinks", {}).get("thumbnail", "")
                if cover:
                    cover = cover.replace("http://", "https://")

                link = info.get("infoLink", "")

                results.append({
                    "title": pub_title,
                    "author": pub_author,
                    "publisher": info.get("publisher", ""),
                    "year": year,
                    "isbn": "",
                    "cover_url": cover,
                    "link": link,
                    "source": "google_books_title"
                })
        except Exception as e:
            _log("WARN", f"_fetch_by_title Google Books echec: {e}")

        # Open Library par titre
        try:
            params = urllib.parse.urlencode({
                "title": title,
                "limit": "3",
                "fields": "title,author_name,publisher,"
                          "first_publish_year,isbn,cover_i,key"
            })
            url = f"https://openlibrary.org/search.json?{params}"
            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))

            for doc in data.get("docs", []):
                pub_title = doc.get("title", "")
                authors_list = doc.get("author_name", [])
                pub_author = authors_list[0] if authors_list else ""
                year = doc.get("first_publish_year", 0) or 0
                cover_i = doc.get("cover_i")
                cover_url = (
                    f"https://covers.openlibrary.org/b/id/"
                    f"{cover_i}-L.jpg" if cover_i else ""
                )
                ol_key = doc.get("key", "")
                link = (f"https://openlibrary.org{ol_key}"
                        if ol_key else "")

                results.append({
                    "title": pub_title,
                    "author": pub_author,
                    "publisher": (doc.get("publisher", [None])[0]
                                  if doc.get("publisher") else ""),
                    "year": year,
                    "isbn": "",
                    "cover_url": cover_url,
                    "link": link,
                    "source": "openlibrary_title"
                })
        except Exception as e:
            _log("WARN", f"_fetch_by_title Open Library echec: {e}")

        _log("INFO",
            f"_fetch_by_title('{title}'): "
            f"{len(results)} resultats")
        return results
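
    # Exemple d'appel (esquisse — titre repris de KNOWN_WORKS ci-dessus) :
    #
    #   self._fetch_by_title("Le dernier juge de ma vie", "Marie Le Boiteux")
    #   # -> liste (0 a 6 entrees) de dicts {title, author, publisher, year,
    #   #    isbn, cover_url, link, source}, source valant
    #   #    "google_books_title" ou "openlibrary_title".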

    # ── Filtrage publications par domaine professionnel ────────────────

    DOMAIN_KEYWORDS = {
        "psychiatr": ["psychiatrie", "psy", "mental", "soin",
                      "thérapie", "therapie", "santé", "sante",
                      "clinique", "patient", "psychose",
                      "psychanalyse", "psychanalyste",
                      "psychiatre", "contention", "manifeste"],
        "juriste":   ["droit", "juridique", "loi", "justice",
                      "avocat", "tribunal", "code", "jurisprudence"],
        "économ":    ["économie", "economie", "marché", "marche",
                      "finance", "travail", "capital", "monnaie"],
        "politiqu":  ["politique", "élection", "election",
                      "parti", "gouvern", "démocratie", "democratie"],
        "journalist": ["journalisme", "presse", "media", "enquête",
                       "enquete", "reportage", "information"],
        "philosophe": ["philosophie", "pensée", "pensee",
                       "ethique", "éthique", "morale"],
        "sociologue": ["sociologie", "social", "société", "societe",
                       "classe", "inégalité", "inegalite"],
        "historien": ["histoire", "historique", "mémoire", "memoire",
                      "siècle", "siecle", "guerre", "révolution"],
    }

    def _filter_publications_by_domain(self, publications, identity):
        """Filtre les publications incohérentes avec la profession
        connue de la personne.
        """
        profession = (identity.get("profession") or "").lower()
        if not profession:
            return publications

        # Identifier le domaine de la personne
        person_keywords = []
        for domain_key, keywords in self.DOMAIN_KEYWORDS.items():
            if domain_key in profession:
                person_keywords = keywords
                break

        if not person_keywords:
            return publications  # domaine inconnu → pas de filtre

        filtered = []
        for pub in publications:
            title_lower = (pub.get("title") or "").lower()
            publisher_lower = (pub.get("publisher") or "").lower()
            pub_text = title_lower + " " + publisher_lower

            # Accepter si au moins 1 mot-clé du domaine présent
            if any(kw in pub_text for kw in person_keywords):
                filtered.append(pub)
            else:
                _log("INFO",
                    f"Publication filtree (hors domaine): "
                    f"'{pub.get('title')}'")

        # Si filtre trop agressif (rien gardé) → garder tout
        if not filtered:
            _log("WARN",
                f"Filtre domaine trop agressif pour '{profession}' "
                f"— publications conservees sans filtre")
            return publications

        return filtered
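
    # Exemple illustratif du filtre par domaine (titres fictifs) :
    #
    #   identity = {"profession": "Psychiatre"}
    #   pubs = [{"title": "Manifeste pour la psychiatrie", "publisher": ""},
    #           {"title": "Recettes de cuisine", "publisher": ""}]
    #   self._filter_publications_by_domain(pubs, identity)
    #   # -> seul le premier titre est conserve ("psychiatrie" correspond a
    #   #    la cle "psychiatr") ; si aucun titre ne correspondait, la liste
    #   #    complete serait rendue telle quelle (garde-fou anti sur-filtrage).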

    # ── Recherche de publications (Open Library + Google Books) ────────

    # ── v4.8g — Bibliographie Wikipedia via MediaWiki API ────────────

    def _fetch_wikipedia_bibliography(self, name, lang="fr"):
        """v4.8g — Extraire la bibliographie depuis Wikipedia
        via l'API MediaWiki (sections → wikitext → parse).
        Source la plus fiable pour les auteurs notoires.
        Retourne une liste de dicts publication.
        """
        slug = name.replace(" ", "_")
        ascii_slug = _strip_accents(name).replace(" ", "_")
        variants = [slug]
        if ascii_slug != slug:
            variants.append(ascii_slug)

        for variant in variants:
            for wiki_lang in [lang, "en"]:
                pubs = self._try_wiki_biblio(variant, wiki_lang,
                                             name)
                if pubs:
                    return pubs

        return []

    def _try_wiki_biblio(self, page_title, wiki_lang, author_name):
        """v4.8g — Tenter d'extraire la bibliographie d'une page
        Wikipedia donnee via l'API MediaWiki.
        """
        base_url = (f"https://{wiki_lang}.wikipedia.org"
                    f"/w/api.php")

        # Etape 1 : Obtenir l'index des sections
        params = urllib.parse.urlencode({
            "action": "parse",
            "page": page_title,
            "prop": "sections",
            "format": "json",
            "redirects": "1",
        })
        try:
            req = urllib.request.Request(
                f"{base_url}?{params}",
                headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(
                    req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN",
                f"Wikipedia sections echec ({wiki_lang}): {e}")
            return []

        sections = data.get("parse", {}).get("sections", [])
        if not sections:
            return []

        # Chercher la section bibliographie
        _BIBLIO_TITLES = {
            "bibliographie", "publications", "ouvrages",
            "livres", "œuvres", "oeuvres",
            "bibliography", "works", "books",
            "selected works", "selected bibliography",
        }
        biblio_index = None
        for s in sections:
            s_title = s.get("line", "").lower().strip()
            if s_title in _BIBLIO_TITLES:
                biblio_index = s.get("index")
                break
            # Match partiel : "ouvrages publiés", etc.
            if any(bt in s_title for bt in _BIBLIO_TITLES):
                biblio_index = s.get("index")
                break

        if biblio_index is None:
            return []

        # Etape 2 : Extraire le wikitext de la section
        params2 = urllib.parse.urlencode({
            "action": "parse",
            "page": page_title,
            "prop": "wikitext",
            "section": biblio_index,
            "format": "json",
            "redirects": "1",
        })
        try:
            req2 = urllib.request.Request(
                f"{base_url}?{params2}",
                headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(
                    req2, timeout=FETCH_TIMEOUT) as resp2:
                data2 = json.loads(
                    resp2.read().decode("utf-8"))
        except Exception as e:
            _log("WARN",
                f"Wikipedia wikitext echec ({wiki_lang}): {e}")
            return []

        wikitext = (data2.get("parse", {})
                    .get("wikitext", {}).get("*", ""))
        if not wikitext:
            return []

        pubs = self._parse_wikitext_bibliography(
            wikitext, author_name)
        if pubs:
            _log("INFO",
                f"Wikipedia biblio ({wiki_lang}): "
                f"{len(pubs)} publications pour '{page_title}'")
        return pubs
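
    # Esquisse des deux appels MediaWiki effectues ci-dessus (page fictive,
    # les deux requetes incluent aussi format=json et redirects=1) :
    #
    #   1) /w/api.php?action=parse&page=Jean_Dupont&prop=sections
    #      -> reperer l'index de la section "Bibliographie" / "Publications"
    #   2) /w/api.php?action=parse&page=Jean_Dupont&prop=wikitext&section=<i>
    #      -> wikitext brut passe a _parse_wikitext_bibliography()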

    @staticmethod
    def _parse_wikitext_bibliography(wikitext, author_name=""):
        """v4.8g — Parser le wikitext d'une section bibliographie.
        Extrait titre, annee, editeur.
        Formats courants Wikipedia :
        * ''Titre'', Editeur, 1998
        * [[Titre du livre]], Editeur, 2005, ISBN...
        # ''Titre'', coll. «...», Editeur, 2010
        {{Ouvrage|titre=...|année=...|éditeur=...}}
        """
        publications = []
        seen_titles = set()
        lines = wikitext.split("\n")

        for line in lines:
            line = line.strip()
            if not line or line.startswith("=="):
                continue

            # Tenter d'abord le template {{Ouvrage|...}}
            ouvrage_match = re.search(
                r'\{\{[Oo]uvrage\s*\|', line)
            if ouvrage_match:
                pub = _parse_ouvrage_template(line)
                if pub and pub["title"].lower() not in seen_titles:
                    seen_titles.add(pub["title"].lower())
                    if author_name:
                        pub["author"] = author_name
                    publications.append(pub)
                continue

            # Lignes de liste (* ou # ou -)
            if not (line.startswith("*") or
                    line.startswith("#") or
                    line.startswith("-")):
                continue

            # Nettoyer le wikitext
            clean = re.sub(
                r"\[\[([^\]|]+\|)?([^\]]+)\]\]",
                r"\2", line)
            clean = re.sub(r"\{\{[^}]*\}\}", "", clean)
            clean = re.sub(r"'{2,}", "", clean)
            clean = re.sub(r"^\s*[*#\-]+\s*", "", clean)
            clean = re.sub(r"<[^>]+>", "", clean)
            clean = re.sub(r"\s+", " ", clean).strip()

            if len(clean) < 5:
                continue

            # Extraire l'annee
            year = 0
            year_match = re.search(
                r"\b(19[4-9]\d|20[0-2]\d)\b", clean)
            if year_match:
                year = int(year_match.group())

            # Extraire le titre (avant la premiere virgule)
            parts = clean.split(",")
            title = parts[0].strip()
            # Nettoyer ISBN, coll., vol., etc.
            title = re.sub(
                r"\s*(ISBN|coll\.|vol\.|n°|avec |"
                r"preface|préface).*$",
                "", title, flags=re.IGNORECASE
            ).strip()

            if not title or len(title) < 4:
                continue
            # Rejeter les lignes descriptives
            # (commencent par un verbe conjugue)
            _DESC_STARTS = (
                "il ", "elle ", "en ", "avec ", "pour ",
                "dans ", "sur ", "sous la ", "sous le ",
                "c'est ", "ce ", "cette ",
            )
            if title.lower().startswith(_DESC_STARTS):
                continue
            if title.lower() in seen_titles:
                continue
            seen_titles.add(title.lower())

            # Editeur : mot apres virgule, pas une annee
            publisher = ""
            if len(parts) > 1:
                for p in parts[1:]:
                    p = p.strip()
                    if (not re.match(r"^\d{4}$", p)
                            and len(p) > 2
                            and "ISBN" not in p.upper()
                            and "coll." not in p.lower()
                            and "p." not in p[:3]):
                        publisher = p[:50]
                        break

            publications.append({
                "title": title[:120],
                "author": author_name,
                "publisher": publisher,
                "year": year,
                "isbn": "",
                "cover_url": "",
                "link": "",
                "source": "wikipedia_biblio"
            })

        return publications
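
    # Exemple illustratif, base sur les formats listes dans la docstring
    # (titre fictif) :
    #
    #   wikitext = "* ''Les reseaux du chaos'', Seuil, 1998"
    #   _parse_wikitext_bibliography(wikitext, "Jean Dupont")
    #   # -> [{"title": "Les reseaux du chaos", "author": "Jean Dupont",
    #   #      "publisher": "Seuil", "year": 1998, "isbn": "",
    #   #      "cover_url": "", "link": "", "source": "wikipedia_biblio"}]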

    def fetch_publications(self, author_name, max_results=10,
                            identity=None, wiki_found=True):
        """Rechercher les publications d'un auteur
        Sources : Wikipedia biblio + Open Library + Google Books
                  + Babelio (+ Amazon.fr si rien > 2020)
        Tri     : par annee de publication decroissante, year=0 en dernier
        Max     : 10 resultats (v4.8g)
        identity : dict optionnel (resolve_person_identity) pour variantes
        wiki_found : si True, tenter la bibliographie Wikipedia
                     (False = skip pour gagner du temps)
        """
        all_pubs = []

        # Source 0 (v4.8g) : Wikipedia Bibliographie — PRIORITAIRE
        # Skip si Wikipedia non trouvee en ETAPE 1 (evite 2-4
        # appels HTTP inutiles sur une page inexistante)
        if wiki_found:
            wiki_pubs = self._fetch_wikipedia_bibliography(
                author_name, lang="fr")
            if wiki_pubs:
                _log("INFO",
                    f"Wikipedia biblio: {len(wiki_pubs)} publications "
                    f"pour '{author_name}'")
                all_pubs.extend(wiki_pubs)
        else:
            _log("INFO",
                f"Wikipedia biblio SKIP (pas de page wiki) "
                f"pour '{author_name}'")

        # Construire la liste des requetes a essayer
        queries_to_try = [author_name]
        if (identity
                and identity.get("confidence") in ("medium", "high")
                and identity.get("profession")):
            # Ajouter variante avec premier mot de la profession
            prof_words = identity["profession"].strip().split()
            # Filtrer les mots parasites (bullet, Experience, etc.)
            clean_words = [
                w for w in prof_words
                if len(w) > 3 and w[0].isalpha()
                and w.lower() not in ("lieu", "experience")
            ]
            if clean_words:
                queries_to_try.append(
                    f"{author_name} {clean_words[0]}")

        for q in queries_to_try:
            # Source 1 : Open Library API
            ol_pubs = self._fetch_openlibrary(q, max_results)
            all_pubs.extend(ol_pubs)

            # Source 2 : Google Books API
            gb_pubs = self._fetch_google_books(q, max_results)
            all_pubs.extend(gb_pubs)

        # Source 3 : Babelio (toujours — meilleure couverture FR)
        bab_pubs = self._fetch_babelio(author_name, max_results)
        all_pubs.extend(bab_pubs)

        # Filtre correspondance auteur (rejeter homonymes)
        before_filter = len(all_pubs)
        all_pubs = [
            p for p in all_pubs
            if self._author_name_matches(
                p.get("author", ""), author_name)
        ]
        rejected = before_filter - len(all_pubs)
        if rejected > 0:
            _log("INFO",
                f"Filtre auteur: {rejected} publications "
                f"homonymes rejetees")

        # Deduplication par titre normalise (lowercase, 40 premiers chars)
        deduped = self._deduplicate_pubs(all_pubs)

        # Source 4 : Amazon.fr (seulement si aucune publication > 2020)
        has_recent = any(p["year"] > 2020 for p in deduped)
        if not has_recent:
            _log("INFO", "Aucune pub > 2020, fallback Amazon.fr")
            amz_pubs = self._fetch_amazon(author_name, max_results)
            # Filtre auteur sur Amazon aussi
            amz_pubs = [
                p for p in amz_pubs
                if self._author_name_matches(
                    p.get("author", ""), author_name)
            ]
            deduped.extend(amz_pubs)
            deduped = self._deduplicate_pubs(deduped)

        # Tri par annee decroissante, year=0 en dernier
        deduped.sort(key=lambda p: (0 if p["year"] > 0 else 1, -p["year"]))

        # Valider les cover_url (rejeter invalides/placeholders)
        for pub in deduped:
            if pub.get("cover_url"):
                if not self._is_valid_cover_url(pub["cover_url"]):
                    _log("WARN",
                        f"Cover URL invalide ignoree: "
                        f"{pub['cover_url'][:60]} [{pub['title'][:30]}]")
                    pub["cover_url"] = ""

        # Retry Google Books si aucune jaquette valide
        has_valid_cover = any(p.get("cover_url") for p in deduped)
        if not has_valid_cover and deduped:
            _log("INFO",
                "Aucune jaquette valide — retry Google Books avec titre")
            title_query = deduped[0]["title"]
            google_retry = self._fetch_google_books(title_query, max_results=3)
            for pub in google_retry:
                if pub.get("cover_url") and self._is_valid_cover_url(
                        pub["cover_url"]):
                    deduped[0]["cover_url"] = pub["cover_url"]
                    deduped[0]["link"] = (
                        deduped[0].get("link") or pub.get("link", ""))
                    _log("INFO",
                        f"Jaquette recuperee via Google Books retry: "
                        f"{pub['cover_url'][:60]}")
                    break

        # Fallback A : Open Library recherche generale (q= au lieu de author=)
        if not deduped:
            _log("INFO",
                f"Aucune publication par auteur '{author_name}' "
                f"— retry Open Library recherche generale")
            ol_broad = self._fetch_openlibrary_broad(
                author_name, max_results)
            # Filtre auteur sur fallback aussi
            ol_broad = [
                p for p in ol_broad
                if self._author_name_matches(
                    p.get("author", ""), author_name)
            ]
            deduped.extend(ol_broad)
            deduped = self._deduplicate_pubs(deduped)

        # Fallback B : Google Books recherche large (sans inauthor)
        if not deduped:
            _log("INFO",
                f"Toujours aucune publication "
                f"— retry Google Books recherche large")
            gb_broad = self._fetch_google_books_broad(
                author_name, max_results)
            # Filtre auteur sur fallback aussi
            gb_broad = [
                p for p in gb_broad
                if self._author_name_matches(
                    p.get("author", ""), author_name)
            ]
            deduped.extend(gb_broad)
            deduped = self._deduplicate_pubs(deduped)

        # Valider les covers des retries
        if deduped:
            for pub in deduped:
                if pub.get("cover_url"):
                    if not self._is_valid_cover_url(pub["cover_url"]):
                        pub["cover_url"] = ""

        # Filtrer publications hors domaine si identité connue
        if identity and identity.get("profession"):
            deduped = self._filter_publications_by_domain(
                deduped, identity)

        # Injecter KNOWN_WORKS si disponibles
        known_key = author_name.lower().strip()
        known_works = self.KNOWN_WORKS.get(known_key, [])
        if known_works:
            _log("INFO",
                f"KNOWN_WORKS: {len(known_works)} oeuvre(s) "
                f"connue(s) pour '{author_name}'")
            existing_titles = {
                p["title"].lower().strip()[:40] for p in deduped
            }
            for kw in known_works:
                kw_key = kw["title"].lower().strip()[:40]
                if kw_key not in existing_titles:
                    # Chercher dans les APIs pour enrichir
                    # (cover, year, link)
                    fetched = self._fetch_by_title(kw["title"],
                                                   author_name)
                    if fetched:
                        # Prendre le meilleur resultat
                        best = fetched[0]
                        for f in fetched:
                            if f.get("cover_url"):
                                best = f
                                break
                        # Merger avec les infos connues
                        merged = dict(kw)
                        if best.get("cover_url"):
                            merged["cover_url"] = best["cover_url"]
                        if best.get("year") and not merged.get("year"):
                            merged["year"] = best["year"]
                        if best.get("link"):
                            merged["link"] = best["link"]
                        if best.get("publisher"):
                            merged["publisher"] = best["publisher"]
                        deduped.append(merged)
                        _log("INFO",
                            f"KNOWN_WORKS injecte: "
                            f"'{merged['title']}' "
                            f"(year={merged.get('year')}, "
                            f"cover={'OK' if merged.get('cover_url') else 'NONE'})")
                    else:
                        # Injecter tel quel
                        deduped.append(dict(kw))
                        _log("INFO",
                            f"KNOWN_WORKS injecte (brut): "
                            f"'{kw['title']}'")

        # Limiter au max demande
        result = deduped[:max_results]

        _log("INFO", f"Publications pour '{author_name}': {len(result)} resultats "
             f"({len(all_pubs)} bruts, {len(deduped)} dedup)")
        for p in result:
            _log("INFO", f"  -> {p['year']} | {p['title']} [{p['source']}] "
                 f"cover={'OK' if p.get('cover_url') else 'NONE'}")
        return result
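
    # Exemple d'appel (esquisse — "fetcher" designe une instance de cette
    # classe, valeurs illustratives) :
    #
    #   pubs = fetcher.fetch_publications(
    #       "Jean Dupont", max_results=10,
    #       identity={"profession": "Sociologue", "confidence": "high"},
    #       wiki_found=True)
    #   # -> liste triee par annee decroissante (year=0 en dernier) de dicts
    #   #    {title, author, publisher, year, isbn, cover_url, link, source}.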

    def _fetch_openlibrary(self, author_name, max_results=5):
        """Rechercher les publications via Open Library API"""
        params = urllib.parse.urlencode({
            "author": author_name,
            "sort": "new",
            "limit": str(max_results * 2),
            "fields": "title,author_name,publisher,first_publish_year,isbn,cover_i,key",
            "lang": "fr"
        })
        url = f"https://openlibrary.org/search.json?{params}"

        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Open Library: {e}")
            return []

        docs = data.get("docs", [])
        pubs = []
        seen_titles = set()
        for doc in docs:
            title = doc.get("title", "")
            if not title or title.lower() in seen_titles:
                continue
            seen_titles.add(title.lower())

            year = doc.get("first_publish_year", 0) or 0
            cover_i = doc.get("cover_i")
            cover_url = f"https://covers.openlibrary.org/b/id/{cover_i}-L.jpg" if cover_i else ""
            ol_key = doc.get("key", "")
            link = f"https://openlibrary.org{ol_key}" if ol_key else ""

            publishers = doc.get("publisher", [])
            publisher = publishers[0] if publishers else ""

            isbns = doc.get("isbn", [])
            isbn = isbns[0] if isbns else ""

            authors = doc.get("author_name", [])
            author = authors[0] if authors else author_name

            pubs.append({
                "title": title,
                "author": author,
                "publisher": publisher,
                "year": year,
                "isbn": isbn,
                "cover_url": cover_url,
                "link": link,
                "source": "openlibrary"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO", f"Open Library pour '{author_name}': {len(pubs)} publications")
        return pubs
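
    # For reference, the request built above looks like this (illustrative
    # query; with the default max_results=5, limit is 10):
    #   https://openlibrary.org/search.json?author=Jane+Doe&sort=new&limit=10
    #       &fields=title,author_name,publisher,...&lang=fr
    # Only the fields listed in `fields` are read from each doc; everything
    # else in the Open Library response is ignored.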

    def _fetch_openlibrary_broad(self, author_name, max_results=5):
        """Recherche Open Library elargie (parametre q= general).
        Utile quand le nom d'auteur est un pseudonyme non indexe
        dans le champ author d'Open Library.
        """
        params = urllib.parse.urlencode({
            "q": author_name,
            "sort": "new",
            "limit": str(max_results * 2),
            "fields": "title,author_name,publisher,first_publish_year,isbn,cover_i,key"
        })
        url = f"https://openlibrary.org/search.json?{params}"

        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Open Library broad: {e}")
            return []

        docs = data.get("docs", [])
        pubs = []
        seen_titles = set()

        # Filter: keep only results where the name appears
        author_parts = author_name.lower().split()

        for doc in docs:
            title = doc.get("title", "")
            if not title or title.lower() in seen_titles:
                continue

            # Relevance check: the name must appear in authors or title
            authors = doc.get("author_name", [])
            all_text = " ".join(authors).lower() + " " + title.lower()
            if not any(part in all_text for part in author_parts if len(part) > 2):
                continue

            seen_titles.add(title.lower())
            year = doc.get("first_publish_year", 0) or 0
            cover_i = doc.get("cover_i")
            cover_url = (
                f"https://covers.openlibrary.org/b/id/{cover_i}-L.jpg"
                if cover_i else "")
            ol_key = doc.get("key", "")
            link = f"https://openlibrary.org{ol_key}" if ol_key else ""
            publishers = doc.get("publisher", [])
            publisher = publishers[0] if publishers else ""
            isbns = doc.get("isbn", [])
            isbn = isbns[0] if isbns else ""
            author = authors[0] if authors else author_name

            pubs.append({
                "title": title,
                "author": author,
                "publisher": publisher,
                "year": year,
                "isbn": isbn,
                "cover_url": cover_url,
                "link": link,
                "source": "openlibrary"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO",
            f"Open Library broad pour '{author_name}': {len(pubs)} publications")
        return pubs

    def _fetch_google_books(self, author_name, max_results=5):
        """Fallback : rechercher les publications via Google Books API"""
        params = urllib.parse.urlencode({
            "q": f'inauthor:"{author_name}"',
            "maxResults": str(max_results * 2),
            "orderBy": "newest",
            "printType": "books",
            "langRestrict": "fr"
        })
        url = f"https://www.googleapis.com/books/v1/volumes?{params}"

        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Google Books: {e}")
            return []

        items = data.get("items", [])
        pubs = []
        seen_titles = set()
        for item in items:
            info = item.get("volumeInfo", {})
            title = info.get("title", "")
            if not title or title.lower() in seen_titles:
                continue
            seen_titles.add(title.lower())

            # Year from publishedDate (YYYY or YYYY-MM-DD format)
            pub_date = info.get("publishedDate", "")
            year = 0
            if pub_date and len(pub_date) >= 4:
                try:
                    year = int(pub_date[:4])
                except ValueError:
                    year = 0

            # Cover (zoom=2 for better resolution)
            img_links = info.get("imageLinks", {})
            cover_url = img_links.get("thumbnail", "")
            if cover_url:
                cover_url = cover_url.replace("http://", "https://")
                cover_url = cover_url.replace("zoom=1", "zoom=2")

            # Link
            link = info.get("canonicalVolumeLink", "") or info.get("infoLink", "")

            authors = info.get("authors", [])
            author = authors[0] if authors else author_name
            publisher = info.get("publisher", "")

            isbns = info.get("industryIdentifiers", [])
            isbn = ""
            for ident in isbns:
                if ident.get("type") == "ISBN_13":
                    isbn = ident.get("identifier", "")
                    break
            if not isbn and isbns:
                isbn = isbns[0].get("identifier", "")

            pubs.append({
                "title": title,
                "author": author,
                "publisher": publisher,
                "year": year,
                "isbn": isbn,
                "cover_url": cover_url,
                "link": link,
                "source": "google_books"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO", f"Google Books pour '{author_name}': {len(pubs)} publications")
        return pubs
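
    # For reference, the request built above looks like this (illustrative,
    # un-encoded query; with the default max_results=5, maxResults is 10):
    #   https://www.googleapis.com/books/v1/volumes?q=inauthor:"Jane Doe"
    #       &maxResults=10&orderBy=newest&printType=books&langRestrict=fr
    # Among industryIdentifiers, ISBN_13 is preferred; any identifier is
    # accepted as a fallback.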

    def _fetch_google_books_broad(self, author_name, max_results=3):
        """Recherche Google Books elargie (sans restriction inauthor).
        Utile pour les pseudonymes non indexes dans les metadonnees Google.
        Cherche le nom comme terme libre dans tous les champs.
        """
        params = urllib.parse.urlencode({
            "q": f'"{author_name}" livre',
            "maxResults": str(max_results * 2),
            "orderBy": "relevance",
            "printType": "books"
        })
        url = f"https://www.googleapis.com/books/v1/volumes?{params}"

        try:
            req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except Exception as e:
            _log("WARN", f"Echec Google Books broad: {e}")
            return []

        items = data.get("items", [])
        pubs = []
        seen_titles = set()
        for item in items:
            info = item.get("volumeInfo", {})
            title = info.get("title", "")
            if not title or title.lower() in seen_titles:
                continue
            seen_titles.add(title.lower())

            # Check that the author is mentioned somewhere
            authors = info.get("authors", [])
            description = info.get("description", "")
            author_parts = author_name.lower().split()
            all_text = " ".join(authors).lower() + " " + description.lower()
            if not any(part in all_text for part in author_parts if len(part) > 2):
                continue

            pub_date = info.get("publishedDate", "")
            year = 0
            if pub_date and len(pub_date) >= 4:
                try:
                    year = int(pub_date[:4])
                except ValueError:
                    year = 0

            img_links = info.get("imageLinks", {})
            cover_url = img_links.get("thumbnail", "")
            if cover_url:
                cover_url = cover_url.replace("http://", "https://")

            link = info.get("canonicalVolumeLink", "") or info.get("infoLink", "")
            author = authors[0] if authors else author_name
            publisher = info.get("publisher", "")

            pubs.append({
                "title": title,
                "author": author,
                "publisher": publisher,
                "year": year,
                "isbn": "",
                "cover_url": cover_url,
                "link": link,
                "source": "google_books"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO",
            f"Google Books broad pour '{author_name}': {len(pubs)} publications")
        return pubs

    def _is_valid_cover_url(self, url):
        """Verifier qu'une URL de jaquette pointe vers une vraie image.
        Rejette : URL nulle, cover_id 0 ou -1 (Open Library), placeholders.
        Fait un HEAD HTTP pour verifier Content-Type image/*.
        Timeout 5s pour ne pas bloquer le pipeline.
        """
        if not url:
            return False

        # Open Library: cover_id 0 or -1 means no cover
        if "covers.openlibrary.org/b/id/0" in url:
            return False
        if "covers.openlibrary.org/b/id/-" in url:
            return False

        # Known placeholders
        placeholders = [
            "no-cover", "placeholder", "default-cover",
            "blank.gif", "1x1.gif", "spacer.gif"
        ]
        url_lower = url.lower()
        if any(p in url_lower for p in placeholders):
            return False

        # Check HTTP 200 and Content-Type image/*
        try:
            req = urllib.request.Request(url, method="HEAD", headers={
                "User-Agent": USER_AGENT
            })
            with urllib.request.urlopen(req, timeout=5) as resp:
                content_type = resp.headers.get("Content-Type", "")
                if resp.status == 200 and "image" in content_type:
                    return True
                _log("WARN",
                    f"Cover URL rejetee: status={resp.status} "
                    f"type={content_type} url={url[:60]}")
                return False
        except Exception as e:
            _log("WARN", f"Cover URL inaccessible: {url[:60]} ({e})")
            return False
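
    # Examples of the check above (hypothetical URLs, for illustration only):
    #   _is_valid_cover_url("")                                     -> False
    #   _is_valid_cover_url(
    #       "https://covers.openlibrary.org/b/id/-1-L.jpg")         -> False
    #   _is_valid_cover_url("https://site.tld/img/no-cover.png")    -> False
    #   any URL answering 200 with Content-Type: image/jpeg         -> True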

    def _deduplicate_pubs(self, pubs):
        """Deduplication par titre normalise (lowercase, 40 premiers chars).
        Normalise apostrophes/guillemets. Garde la version avec meilleure annee/cover."""
        seen = {}
        for p in pubs:
            # Normalize: lowercase + replace typographic apostrophes/quotes
            norm = p["title"].lower().strip()
            norm = norm.replace("\u2019", "'").replace("\u2018", "'")
            norm = norm.replace("\u201c", '"').replace("\u201d", '"')
            key = norm[:40]
            if key not in seen:
                seen[key] = p
            else:
                existing = seen[key]
                better = False
                if p["year"] > existing["year"]:
                    better = True
                elif p["year"] == existing["year"] and p.get("cover_url") and not existing.get("cover_url"):
                    better = True
                if better:
                    seen[key] = p
        return list(seen.values())
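
    # Example of the dedup key (first 40 chars of the normalized title):
    # "L'art de la guerre" and "L\u2019art de la guerre" collapse to the same
    # key; the entry with the more recent year wins, and at equal years the
    # one carrying a cover_url is kept.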

    def _fetch_babelio(self, author_name, max_results=5):
        """Scraper Babelio — meilleure couverture publications FR"""
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            _log("WARN", "bs4 non disponible, skip Babelio")
            return []

        query = urllib.parse.quote_plus(author_name)
        url = f"https://www.babelio.com/recherche.php?Recherche={query}&item_type=livres"

        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT,
                "Accept-Language": "fr-FR,fr;q=0.9",
                "Accept": "text/html,application/xhtml+xml"
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as e:
            _log("WARN", f"Echec Babelio: {e}")
            return []

        soup = BeautifulSoup(html, "html.parser")
        pubs = []
        seen = set()

        # Strategy 1: .livre_con containers
        containers = soup.select(".livre_con")
        if not containers:
            # Strategy 2: the parent of each /livres/ link
            for a_tag in soup.select('a[href*="/livres/"]'):
                parent = (a_tag.find_parent("div")
                          or a_tag.find_parent("td")
                          or a_tag.find_parent("li"))
                if parent and parent not in containers:
                    containers.append(parent)

        for block in containers:
            # Title: first /livres/ link inside the block
            title_link = block.select_one('a[href*="/livres/"]')
            if not title_link:
                continue
            title = title_link.get_text(strip=True)
            if not title or len(title) < 3:
                continue

            norm_key = title.lower().strip()[:40]
            if norm_key in seen:
                continue
            seen.add(norm_key)

            href = title_link.get("href", "")
            if href and not href.startswith("http"):
                href = "https://www.babelio.com" + href

            # Year: regex over the block text
            year = 0
            block_text = block.get_text()
            year_match = re.search(r'\b(19[5-9]\d|20[0-2]\d)\b', block_text)
            if year_match:
                year = int(year_match.group(1))

            # Cover: <img> inside the block
            cover_url = ""
            img_el = block.select_one("img")
            if img_el:
                src = img_el.get("src", "") or img_el.get("data-src", "")
                if src:
                    if not src.startswith("http"):
                        src = "https://www.babelio.com" + src
                    cover_url = src

            pubs.append({
                "title": title,
                "author": author_name,
                "publisher": "",
                "year": year,
                "isbn": "",
                "cover_url": cover_url,
                "link": href,
                "source": "babelio"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO", f"Babelio pour '{author_name}': {len(pubs)} publications")
        return pubs

    def _fetch_amazon(self, author_name, max_results=5):
        """Scraper Amazon.fr — fallback si aucune publication recente.
        Zero BeautifulSoup — parsing regex uniquement.
        """
        query = urllib.parse.quote_plus(f"{author_name} livre")
        url = (f"https://www.amazon.fr/s?k={query}"
               f"&i=stripbooks&s=date-desc-rank")

        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT,
                "Accept-Language": "fr-FR,fr;q=0.9",
                "Accept": "text/html,application/xhtml+xml"
            })
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT) as resp:
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as e:
            _log("WARN", f"Echec Amazon.fr: {e}")
            return []

        # Parse Amazon result blocks via regex
        # Each result sits under data-component-type="s-search-result"
        block_pattern = re.compile(
            r'data-component-type="s-search-result"[^>]*>(.*?)'
            r'(?=data-component-type="s-search-result"|$)',
            re.DOTALL
        )
        blocks = block_pattern.findall(html)
        if not blocks:
            _log("WARN", "Amazon.fr: aucun resultat parse (anti-bot ?)")
            return []

        pubs = []
        seen = set()
        author_parts = [
            p.lower() for p in author_name.split() if len(p) > 2
        ]

        for block in blocks:
            # Title: <h2 ...><a ...><span ...>TITLE</span></a></h2>
            title_match = re.search(
                r'<h2[^>]*>.*?<a[^>]*>.*?<span[^>]*>([^<]+)</span>',
                block, re.DOTALL
            )
            if not title_match:
                continue
            title = self._decode_html_entities(
                title_match.group(1).strip())
            if not title or len(title) < 3:
                continue

            # Skip the block if the author does not appear in it
            block_text = re.sub(r'<[^>]+>', ' ', block).lower()
            if not any(p in block_text for p in author_parts):
                continue

            norm_key = title.lower().strip()[:40]
            if norm_key in seen:
                continue
            seen.add(norm_key)

            # Link: href of the <a> inside the <h2>
            href = ""
            link_match = re.search(
                r'<h2[^>]*>.*?<a[^>]*href="([^"]+)"',
                block, re.DOTALL
            )
            if link_match:
                href = link_match.group(1)
                if href and not href.startswith("http"):
                    href = "https://www.amazon.fr" + href

            # Date: look for a 20xx year in the text
            year = 0
            year_match = re.search(r'\b(20[0-2]\d)\b', block_text)
            if year_match:
                year = int(year_match.group(1))

            # Cover: img class="s-image" src="..."
            cover_url = ""
            img_match = re.search(
                r'class="s-image"[^>]*src="([^"]+)"', block)
            if not img_match:
                img_match = re.search(
                    r'src="([^"]+)"[^>]*class="s-image"', block)
            if img_match:
                cover_url = img_match.group(1)

            pubs.append({
                "title": title,
                "author": author_name,
                "publisher": "",
                "year": year,
                "isbn": "",
                "cover_url": cover_url,
                "link": href,
                "source": "amazon"
            })
            if len(pubs) >= max_results:
                break

        _log("INFO",
            f"Amazon.fr pour '{author_name}': {len(pubs)} publications")
        return pubs
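
    # Note on the block regex above: the lookahead
    # (?=data-component-type="s-search-result"|$) marks the start of the next
    # result without consuming it, so findall yields one chunk per result and
    # the DOTALL .*? cannot swallow several results into a single block.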

    # ── DuckDuckGo Lite parsing ──────────────────────────────────────────

    def _parse_ddg_lite(self, html):
        """Parser les resultats de recherche DuckDuckGo Lite
        Structure : <a class='result-link' href="//duckduckgo.com/l/?uddg=...">Title</a>
        Snippets  : <td class='result-snippet'>texte</td>
        """
        results = []

        # Extract the links (class='result-link')
        links = re.findall(
            r"""<a[^>]*class=['"]result-link['"][^>]*href=["']([^"']*)["'][^>]*>(.*?)</a>""",
            html, re.DOTALL
        )
        if not links:
            # Alternative attribute order
            links = re.findall(
                r"""<a[^>]*href=["']([^"']*)["'][^>]*class=['"]result-link['"][^>]*>(.*?)</a>""",
                html, re.DOTALL
            )

        # Extract the snippets
        snippets = re.findall(
            r"""<td[^>]*class=['"]result-snippet['"][^>]*>(.*?)</td>""",
            html, re.DOTALL
        )

        for i, (href, title_html) in enumerate(links):
            url = self._decode_ddg_url(href)
            if not url:
                continue

            title = re.sub(r"<[^>]+>", "", title_html).strip()
            # Decode HTML entities
            title = title.replace("&amp;", "&").replace("&quot;", '"')
            title = title.replace("&#x27;", "'").replace("&#39;", "'")

            snippet = ""
            if i < len(snippets):
                snippet = re.sub(r"<[^>]+>", "", snippets[i]).strip()
                snippet = snippet.replace("&amp;", "&").replace("&quot;", '"')
                snippet = snippet.replace("&#x27;", "'").replace("&#39;", "'")

            results.append({
                "url": url,
                "title": title,
                "snippet": snippet
            })

        return results
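
    # Illustrative DDG Lite markup handled above (simplified, hypothetical):
    #   <a class='result-link'
    #      href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.org%2Fpage">
    #     Titre du resultat</a>
    #   <td class='result-snippet'>Extrait du resultat...</td>
    # Links and snippets are paired by position (index i), which assumes the
    # page lists them in the same order.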

    def _decode_ddg_url(self, href):
        """Decoder l'URL de redirection DuckDuckGo"""
        if "uddg=" in href:
            match = re.search(r"uddg=([^&]+)", href)
            if match:
                return urllib.parse.unquote(match.group(1))
        elif href.startswith("http"):
            return href
        return None
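
    # Example (hypothetical redirect):
    #   href = "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.org%2Fa&rut=x"
    #   -> "https://example.org/a" after unquoting the uddg parameter.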

    # ── Content extraction ───────────────────────────────────────────────

    def _extract_meta_description(self, html):
        """v4.6 — Extraire la meta description d'une page HTML.
        Fallback quand _extract_paragraphs retourne trop peu de texte
        (paywall, page dynamique, contenu JS-only).
        """
        import re
        # Regex pour <meta name="description" content="...">
        # et <meta property="og:description" content="...">
        patterns = [
            r'<meta\s+name=["\']description["\']\s+'
            r'content=["\']([^"\']{20,})["\']',
            r'<meta\s+content=["\']([^"\']{20,})["\']\s+'
            r'name=["\']description["\']',
            r'<meta\s+property=["\']og:description["\']\s+'
            r'content=["\']([^"\']{20,})["\']',
            r'<meta\s+content=["\']([^"\']{20,})["\']\s+'
            r'property=["\']og:description["\']',
        ]
        for pat in patterns:
            match = re.search(pat, html, re.IGNORECASE)
            if match:
                desc = match.group(1).strip()
                # Clean basic HTML entities
                desc = (desc.replace("&amp;", "&")
                        .replace("&lt;", "<")
                        .replace("&gt;", ">")
                        .replace("&#039;", "'")
                        .replace("&quot;", '"'))
                return desc
        return None
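
    # Illustrative tag matched by the first pattern above (hypothetical):
    #   <meta name="description" content="Au moins vingt caracteres ici...">
    # The {20,} quantifier rejects very short, uninformative descriptions.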

    def _extract_paragraphs(self, html):
        """Extraire le texte des balises <p> uniquement
        Regle : ignorer nav, footer, aside, scripts (prompt_architect_v1.1)
        Tente bs4, fallback regex (decision D3)
        """
        try:
            from bs4 import BeautifulSoup
            return self._extract_paragraphs_bs4(html)
        except ImportError:
            return self._extract_paragraphs_regex(html)

    def _extract_paragraphs_bs4(self, html):
        """Extraction <p> avec BeautifulSoup"""
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        # Drop irrelevant elements (prompt_architect rule)
        for tag in soup.select(
            "nav, footer, aside, script, style, header, "
            ".nav, .footer, .sidebar, .comments, .social, "
            ".ad, .pub, .newsletter, .cookie"
        ):
            tag.decompose()

        paragraphs = []
        for p in soup.find_all("p"):
            text = p.get_text(strip=True)
            if len(text) > 30:
                paragraphs.append(text)

        return "\n\n".join(paragraphs)

    def _extract_paragraphs_regex(self, html):
        """Fallback extraction <p> avec regex"""
        # Supprimer scripts et styles
        html = re.sub(
            r"<(script|style)[^>]*>.*?</\1>", "",
            html, flags=re.DOTALL | re.IGNORECASE
        )
        paragraphs = re.findall(
            r"<p[^>]*>(.*?)</p>",
            html, re.DOTALL | re.IGNORECASE
        )
        texts = []
        for p in paragraphs:
            text = re.sub(r"<[^>]+>", "", p).strip()
            text = re.sub(r"\s+", " ", text)
            if len(text) > 30:
                texts.append(text)

        return "\n\n".join(texts)

    # ── Utilities ────────────────────────────────────────────────────────

    def _extract_domain(self, url):
        """Extraire le domaine d'une URL"""
        try:
            parsed = urllib.parse.urlparse(url)
            domain = parsed.netloc.lower()
            if domain.startswith("www."):
                domain = domain[4:]
            return domain
        except Exception:
            return ""

    # ── Career detection by article volume (v4.2) ───────────────────────

    # Media domains with indexed contributor pages
    _CAREER_MEDIA_DOMAINS = [
        "franceinter.fr", "franceculture.fr", "francetvinfo.fr",
        "rfi.fr", "france24.com", "tv5monde.com",
        "ina.fr", "arte.tv",
        "lemonde.fr", "lefigaro.fr", "liberation.fr",
        "mediapart.fr", "lexpress.fr", "lepoint.fr",
        "nouvelobs.com", "slate.fr", "huffingtonpost.fr",
        "reflets.info", "nextinpact.com", "numerama.com",
    ]

    # Minimum article count to infer an editorial link
    _CAREER_MIN_ARTICLES = 3

    def fetch_author_profile(self, author_name, media_sources=None):
        """Detecter si une personne est journaliste/contributeur regulier
        d'un ou plusieurs medias, en comptant les articles par domaine.

        Strategie :
        1. Parcourir les sources medias deja collectees (pipeline)
        2. Recherche DDG ciblee sur les domaines medias cles
        3. Compter les articles par domaine
        4. Deduire career_start / career_end depuis les dates trouvees

        Retourne un dict :
            {
                "is_media_contributor": bool,
                "domains": [
                    {"domain": str, "nb_articles": int,
                     "career_start": str|None, "career_end": str|None,
                     "sample_titles": list[str]},
                ],
                "primary_domain": str|None,
                "summary": str,   # "Contributeur France Inter (2015-2023, 12 articles)"
            }
        ou None si aucun profil editorial detecte.
        """
        import re as _re

        _log("INFO",
            "[AUTHOR-PROFILE] Recherche profil editorial "
            "pour '%s'" % author_name)

        domain_data = {}  # domain -> {urls: set, titles: [], years: []}

        # ── Phase 1: count from sources already collected ────────────
        if media_sources:
            for src in media_sources:
                domain = src.get("domain", "")
                url = src.get("url", "")
                title = src.get("title", "")
                if not domain:
                    continue
                # Normalize the domain
                clean_dom = domain.lower()
                if clean_dom.startswith("www."):
                    clean_dom = clean_dom[4:]
                # Check whether this is a relevant media domain
                matched = None
                for md in self._CAREER_MEDIA_DOMAINS:
                    if md in clean_dom or clean_dom in md:
                        matched = md
                        break
                if not matched:
                    continue
                if matched not in domain_data:
                    domain_data[matched] = {
                        "urls": set(), "titles": [], "years": []
                    }
                if url and url not in domain_data[matched]["urls"]:
                    domain_data[matched]["urls"].add(url)
                    if title:
                        domain_data[matched]["titles"].append(title)
                    # Extract a year from the URL
                    year = self._extract_year_from_url(url)
                    if year:
                        domain_data[matched]["years"].append(year)

        # ── Phase 2: targeted Serper search on key domains ───────────
        # Target the 5 biggest FR media (not all of them, for speed)
        priority_domains = [
            "franceinter.fr", "lemonde.fr", "mediapart.fr",
            "liberation.fr", "lefigaro.fr",
        ]
        quoted_name = '"%s"' % author_name

        for target_domain in priority_domains:
            # Skip if we already have enough data for this domain
            if (target_domain in domain_data
                    and len(domain_data[target_domain]["urls"]) >= 5):
                continue
            try:
                ddg_query = "%s site:%s" % (quoted_name, target_domain)
                results = _serper_search(ddg_query, num=10)
                if not results:
                    continue

                if target_domain not in domain_data:
                    domain_data[target_domain] = {
                        "urls": set(), "titles": [], "years": []
                    }
                dd = domain_data[target_domain]
                for r in results:
                    r_url = r.get("url", "")
                    r_title = r.get("title", "")
                    if r_url and r_url not in dd["urls"]:
                        dd["urls"].add(r_url)
                        if r_title:
                            dd["titles"].append(r_title)
                        year = self._extract_year_from_url(r_url)
                        if year:
                            dd["years"].append(year)

                _log("INFO",
                    "[AUTHOR-PROFILE] Serper site:%s → %d resultats"
                    % (target_domain, len(results)))
            except Exception as e:
                _log("WARN",
                    "[AUTHOR-PROFILE] Serper site:%s erreur: %s"
                    % (target_domain, e))

        # ── Phase 3: analysis and profile construction ───────────────
        domains_result = []
        for domain, dd in sorted(
            domain_data.items(),
            key=lambda x: len(x[1]["urls"]),
            reverse=True
        ):
            nb = len(dd["urls"])
            if nb < self._CAREER_MIN_ARTICLES:
                continue

            years = sorted(set(dd["years"]))
            career_start = str(years[0]) if years else None
            career_end = str(years[-1]) if years else None
            sample = dd["titles"][:5]

            domains_result.append({
                "domain": domain,
                "nb_articles": nb,
                "career_start": career_start,
                "career_end": career_end,
                "sample_titles": sample,
            })

        if not domains_result:
            _log("INFO",
                "[AUTHOR-PROFILE] Aucun profil editorial detecte "
                "pour '%s'" % author_name)
            return None

        primary = domains_result[0]
        period = ""
        if primary["career_start"] and primary["career_end"]:
            if primary["career_start"] == primary["career_end"]:
                period = primary["career_start"]
            else:
                period = "%s-%s" % (
                    primary["career_start"], primary["career_end"])
        elif primary["career_start"]:
            period = "depuis %s" % primary["career_start"]

        summary_parts = []
        for d in domains_result[:3]:
            label = d["domain"].split(".")[0].replace("france", "France ")
            p = ""
            if d["career_start"] and d["career_end"]:
                if d["career_start"] == d["career_end"]:
                    p = d["career_start"]
                else:
                    p = "%s-%s" % (d["career_start"], d["career_end"])
            entry = "%s (%d articles" % (label, d["nb_articles"])
            if p:
                entry += ", %s" % p
            entry += ")"
            summary_parts.append(entry)

        profile_summary = "Contributeur %s" % " / ".join(summary_parts)

        result = {
            "is_media_contributor": True,
            "domains": domains_result,
            "primary_domain": primary["domain"],
            "summary": profile_summary,
        }

        _log("INFO",
            "[AUTHOR-PROFILE] Profil detecte: %s" % profile_summary)

        return result

    @staticmethod
    def _extract_year_from_url(url):
        """Extraire une annee (2000-2039) depuis une URL.
        Patterns : /2023/04/, /2021-06-15, /article-2019-
        """
        import re as _re
        match = _re.search(r'/(\d{4})[-/]', url)
        if match:
            year = int(match.group(1))
            if 2000 <= year <= 2039:
                return year
        return None
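
    # Examples (hypothetical URLs):
    #   /2023/04/un-article   -> 2023
    #   /dossier-2021-06-15   -> 2021
    #   /page/1999/old        -> None (outside the 2000-2039 window)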


# ══════════════════════════════════════════════════════════════════════════
# RefletsScraper — authenticated scraping of reflets.info
# Themes: cybersecurity, influence, propaganda, encryption, hackers
# Credentials: REFLETS_EMAIL / REFLETS_PASSWORD environment variables
# ══════════════════════════════════════════════════════════════════════════

class RefletsScraper:
    """Scraper authentifie pour reflets.info (media investigation cyber)
    Reflets.info utilise Devise (Ruby on Rails) pour l'authentification.
    Login : POST /users/sign_in avec CSRF token (authenticity_token)
    Recherche : GET /articles?search=QUERY (fonctionne SANS auth)
    Articles complets : necessite auth (paywall)
    """

    LOGIN_URL = "https://reflets.info/users/sign_in"
    SEARCH_URL = "https://reflets.info/articles"
    BASE_URL = "https://reflets.info"

    def __init__(self):
        self.cookie_jar = http.cookiejar.CookieJar()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(self.cookie_jar)
        )
        self.logged_in = False
        self.email = os.environ.get("REFLETS_EMAIL", "")
        self.password = os.environ.get("REFLETS_PASSWORD", "")

    def _get_csrf_token(self):
        """Extraire le token CSRF (authenticity_token) depuis la page login.
        Devise Rails genere un token CSRF dans un champ hidden du formulaire
        et/ou dans une balise <meta name='csrf-token'>.
        """
        try:
            req = urllib.request.Request(
                self.LOGIN_URL,
                headers={
                    "User-Agent": USER_AGENT,
                    "Accept": "text/html"
                }
            )
            resp = self.opener.open(req, timeout=10)
            html = resp.read().decode("utf-8", errors="replace")

            # Method 1: hidden authenticity_token field in the form
            m = re.search(
                r'<input[^>]+name="authenticity_token"[^>]+value="([^"]+)"',
                html
            )
            if m:
                _log("INFO", "Reflets: CSRF token extrait (hidden input)")
                return m.group(1)

            # Method 2: csrf-token meta tag
            m = re.search(
                r'<meta\s+name="csrf-token"\s+content="([^"]+)"',
                html
            )
            if m:
                _log("INFO", "Reflets: CSRF token extrait (meta tag)")
                return m.group(1)

            _log("WARN", "Reflets: CSRF token introuvable sur /users/sign_in")
            return None

        except Exception as e:
            _log("WARN", f"Reflets: echec GET /users/sign_in: {e}")
            return None
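
    # Illustrative hidden field matched by method 1 (the token value here is
    # made up):
    #   <input type="hidden" name="authenticity_token" value="AbC123==" />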

    def login(self):
        """Authentification a reflets.info via Devise (Ruby on Rails).
        Etape 1 : GET /users/sign_in → extraire authenticity_token
        Etape 2 : POST /users/sign_in avec user[email], user[password], token
        """
        if not self.email or not self.password:
            _log("WARN",
                "Reflets: credentials non configurees "
                "(REFLETS_EMAIL / REFLETS_PASSWORD)")
            return False

        # Step 1: fetch the CSRF token
        csrf_token = self._get_csrf_token()
        if not csrf_token:
            _log("WARN", "Reflets: impossible de recuperer CSRF token")
            return False

        # Step 2: POST the Devise login form (field values stay French)
        try:
            data = urllib.parse.urlencode({
                "authenticity_token": csrf_token,
                "user[email]": self.email,
                "user[password]": self.password,
                "user[remember_me]": "1",
                "commit": "Se connecter"
            }).encode("utf-8")

            req = urllib.request.Request(
                self.LOGIN_URL,
                data=data,
                headers={
                    "User-Agent": USER_AGENT,
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Referer": self.LOGIN_URL,
                    "Origin": self.BASE_URL,
                    "Accept": "text/html,application/xhtml+xml"
                }
            )
            resp = self.opener.open(req, timeout=10)
            final_url = resp.geturl()

            # Devise redirects to / or /articles after a successful login.
            # Staying on /users/sign_in means failure.
            if "/users/sign_in" in final_url:
                _log("WARN",
                    "Reflets: echec auth "
                    "(redirige vers sign_in, credentials invalides?)")
                return False

            # Check for Rails session cookies
            cookie_names = [c.name for c in self.cookie_jar]
            _log("INFO",
                f"Reflets: cookies apres login: {cookie_names}")

            # Rails session cookie = _reflets_session or remember_user_token
            self.logged_in = any(
                "_session" in n or "remember" in n or "user" in n
                for n in cookie_names
            )

            if not self.logged_in:
                # Fallback: redirected away from sign_in, so probably logged in
                self.logged_in = True
                _log("INFO",
                    "Reflets: auth presumee reussie "
                    f"(redirect vers {final_url})")
            else:
                _log("INFO",
                    "Reflets: authentification reussie "
                    f"(cookies session detectes)")

            return self.logged_in

        except Exception as e:
            _log("WARN", f"Reflets: echec login: {e}")
            return False

    def search_articles(self, query, max_results=3):
        """Rechercher des articles sur reflets.info.
        La recherche fonctionne SANS authentification.
        URL : /articles?search=QUERY
        """
        try:
            params = urllib.parse.urlencode({"search": query})
            url = f"{self.SEARCH_URL}?{params}"

            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT,
                "Accept": "text/html"
            })
            # Use the opener (with cookies) in case we are logged in
            resp = self.opener.open(req, timeout=FETCH_TIMEOUT)
            html = resp.read().decode("utf-8", errors="replace")

            return self._parse_search_results(html, max_results)

        except Exception as e:
            _log("WARN", f"Reflets: echec recherche: {e}")
            return []

    def fetch_article(self, url, max_chars=3000):
        """Extraire le contenu complet d'un article (avec auth paywall).
        Retourne un dict {"text": str|None, "author": str} avec le texte
        et le nom de l'auteur/signataire de l'article.
        """
        if not self.logged_in and not self.login():
            _log("WARN",
                "Reflets: impossible d'acceder a l'article "
                "(auth requise pour contenu complet)")
            return {"text": None, "author": ""}

        try:
            req = urllib.request.Request(url, headers={
                "User-Agent": USER_AGENT,
                "Accept": "text/html",
                "Referer": self.BASE_URL + "/"
            })
            resp = self.opener.open(req, timeout=FETCH_TIMEOUT)
            html = resp.read().decode("utf-8", errors="replace")

            text = self._extract_article_text(html)
            if text and len(text) > max_chars:
                text = text[:max_chars - 3] + "..."

            author = self._extract_article_author(html)

            return {"text": text, "author": author}

        except Exception as e:
            _log("WARN", f"Reflets: echec fetch article {url}: {e}")
            return {"text": None, "author": ""}

    def fetch_sources(self, query, max_results=3):
        """Pipeline complet : recherche + extraction pour synthese.
        Retourne un dict structuré :
            {
                "sources": list[dict] — sources compatibles pipeline,
                "author_articles": list[dict] — articles signes par la personne,
                "is_contributor": bool — True si la personne est auteur
            }
        Pipeline :
        1. Recherche articles (sans auth)
        2. Login si pas deja fait
        3. Extraction contenu complet + auteur (avec auth)
        4. Detection role auteur
        """
        # Step 1: search (no auth needed)
        articles = self.search_articles(query, max_results)

        if not articles:
            _log("INFO", "Reflets: aucun article trouve pour la recherche")
            return {
                "sources": [],
                "author_articles": [],
                "is_contributor": False
            }

        _log("INFO",
            f"Reflets: {len(articles)} articles trouves, "
            "extraction du contenu...")

        # Step 2: login to access the full content
        if not self.logged_in:
            if not self.login():
                _log("WARN",
                    "Reflets: auth echouee, "
                    "tentative extraction sans auth")

        # Query words used to match the author
        query_words = [
            w.lower() for w in query.split() if len(w) > 2
        ]

        # Step 3: content extraction + author detection
        sources = []
        author_articles = []
        for art in articles:
            result = self.fetch_article(art["url"])
            text = result.get("text")
            author_name = result.get("author", "")

            if text and len(text) > 100:
                # Detect whether the searched person is the author
                is_author = False
                if author_name:
                    author_lower = author_name.lower()
                    is_author = any(
                        w in author_lower for w in query_words
                    )

                source_entry = {
                    "type": "media",
                    "domain": "reflets.info",
                    "url": art["url"],
                    "title": art["title"],
                    "text": text,
                    "snippet": text[:200],
                    "author": author_name,
                    "is_author": is_author
                }
                sources.append(source_entry)

                if is_author:
                    author_articles.append({
                        "title": art["title"],
                        "url": art["url"],
                        "author": author_name
                    })
                    _log("INFO",
                        f"Reflets: article SIGNE par '{query}': "
                        f"{art['title'][:50]} (auteur: {author_name})")
                else:
                    _log("INFO",
                        f"Reflets source: {art['title'][:50]} "
                        f"({len(text)} chars)"
                        f"{f' (auteur: {author_name})' if author_name else ''}")

        is_contributor = len(author_articles) > 0

        _log("INFO",
            f"Reflets: {len(sources)} sources exploitables "
            f"sur {len(articles)} articles, "
            f"auteur={is_contributor} ({len(author_articles)} signes)")

        return {
            "sources": sources,
            "author_articles": author_articles,
            "is_contributor": is_contributor
        }

    def _parse_search_results(self, html, max_results):
        """Parser les resultats de recherche Reflets.info.
        Structure confirmee : liens <a href='/articles/SLUG'> dans la page.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            _log("WARN", "bs4 non disponible, fallback regex Reflets")
            return self._parse_search_results_regex(html, max_results)

        soup = BeautifulSoup(html, "html.parser")
        articles = []
        seen_urls = set()

        # Method 1: links to /articles/SLUG
        for link in soup.find_all("a", href=True):
            href = link.get("href", "")

            # Keep only article links
            if "/articles/" not in href:
                continue

            # Skip navigation/pagination links
            if href.endswith("/articles") or href.endswith("/articles/"):
                continue

            # Extract the slug
            slug_match = re.search(r"/articles/([a-z0-9\-]+)", href)
            if not slug_match:
                continue

            # Build the full URL
            if not href.startswith("http"):
                href = self.BASE_URL + href

            # Dedup by URL
            if href in seen_urls:
                continue
            seen_urls.add(href)

            # Title: link text, else parent heading, else humanized slug
            title = link.get_text(strip=True)
            if not title or len(title) < 5:
                # Try the parent h2/h3
                parent = link.find_parent(["h2", "h3", "h4"])
                if parent:
                    title = parent.get_text(strip=True)
            if not title or len(title) < 5:
                # Humanize the slug
                title = slug_match.group(1).replace("-", " ").title()

            articles.append({"title": title, "url": href})
            if len(articles) >= max_results:
                break

        _log("INFO", f"Reflets search: {len(articles)} articles trouves")
        return articles

    def _parse_search_results_regex(self, html, max_results):
        """Fallback regex si bs4 indisponible"""
        articles = []
        seen_urls = set()

        for m in re.finditer(
            r'<a[^>]+href="(/articles/[a-z0-9\-]+)"[^>]*>([^<]+)</a>',
            html
        ):
            href = self.BASE_URL + m.group(1)
            title = m.group(2).strip()

            if href in seen_urls or not title or len(title) < 5:
                continue
            seen_urls.add(href)

            articles.append({"title": title, "url": href})
            if len(articles) >= max_results:
                break

        return articles
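
    # Illustrative anchor matched by the regex above (hypothetical slug):
    #   <a href="/articles/un-slug-d-exemple">Titre de l'article</a>
    # -> url  = "https://reflets.info/articles/un-slug-d-exemple"
    #    title = "Titre de l'article"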

    def _extract_article_text(self, html):
        """Extraire le texte d'un article Reflets.info.
        Gere le cas paywall (contenu partiel) et contenu complet (auth).
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            return self._extract_article_text_regex(html)

        soup = BeautifulSoup(html, "html.parser")

        # Strip the noise
        for tag in soup.select(
            "nav, footer, aside, .comments, script, style, "
            ".sidebar, .widget, .related-posts, .share-buttons, "
            ".paywall-teaser, .subscription-cta, header"
        ):
            tag.decompose()

        # Selectors for Reflets article content,
        # tried from most specific to most general
        selectors = [
            ".article-content",
            ".article-body",
            ".post-content",
            ".entry-content",
            "article .content",
            "article",
            "main .content",
            "main"
        ]

        for sel in selectors:
            content = soup.select_one(sel)
            if content:
                paragraphs = content.find_all("p")
                text = "\n".join(
                    p.get_text(strip=True) for p in paragraphs
                    if len(p.get_text(strip=True)) > 30
                )
                if len(text) > 100:
                    return text

        # Fallback: every <p> on the page
        all_p = soup.find_all("p")
        text = "\n".join(
            p.get_text(strip=True) for p in all_p
            if len(p.get_text(strip=True)) > 50
        )
        if len(text) > 100:
            return text

        return None

    def _extract_article_text_regex(self, html):
        """Fallback regex si bs4 indisponible"""
        # Extraire tous les paragraphes
        paragraphs = re.findall(r"<p[^>]*>(.+?)</p>", html, re.DOTALL)
        texts = []
        for p in paragraphs:
            clean = re.sub(r"<[^>]+>", "", p).strip()
            if len(clean) > 30:
                texts.append(clean)
        text = "\n".join(texts)
        return text if len(text) > 100 else None

    def _extract_article_author(self, html):
        """Extraire le nom de l'auteur/signataire d'un article Reflets.info.
        Cherche dans les selecteurs byline courants + meta author.
        Retourne le nom de l'auteur ou chaine vide.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            return self._extract_article_author_regex(html)

        soup = BeautifulSoup(html, "html.parser")

        # Common byline selectors (most specific first)
        selectors = [
            ".author-name", ".author a", ".byline a",
            "[rel='author']", ".entry-author a",
            ".author", ".byline", ".entry-author",
            ".post-author", ".article-author",
            "span.author", "a.author"
        ]
        for sel in selectors:
            el = soup.select_one(sel)
            if el:
                name = el.get_text(strip=True)
                if name and len(name) > 2 and len(name) < 60:
                    return name

        # Fallback: the author meta tag
        meta_author = soup.find("meta", attrs={"name": "author"})
        if meta_author:
            content = meta_author.get("content", "").strip()
            if content and len(content) > 2:
                return content

        return ""

    def _extract_article_author_regex(self, html):
        """Fallback regex pour extraction auteur"""
        # Chercher class="author" ou class="byline"
        m = re.search(
            r'class="(?:author|byline)[^"]*"[^>]*>([^<]+)<',
            html
        )
        if m:
            name = m.group(1).strip()
            if name and len(name) > 2 and len(name) < 60:
                return name

        # Meta author
        m = re.search(
            r'<meta\s+name="author"\s+content="([^"]+)"',
            html
        )
        if m:
            return m.group(1).strip()

        return ""
