
    ~iT                       d Z ddlZddlZddlZddlZddlZddlZg dZ	g dZ
dZdZdZg dZh d	Zd
 Zd Zd Zd Zd Zd Zg dZd ZddgZd Zg dZd Zg dZd Zg dZdtdZd Z  ej!        dd          Z"dZ#dZ$dud"Z%dvd$Z&d% Z'g d&g d'g d(g d)g d*g d+g d,g d-g d.d/	Z(h d0Z)h d1Z*g d2Z+i d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdLdNdLdOdPdQdRdSdTdUdVdWdLdXdYdZd[d\d]Z,g d^Z-dwd`Z.da Z/	 dxdbZ0dc Z1	 dydeZ2 e3h df          Z4dg Z5dh Z6di Z7dj Z8dk Z9dl Z:dm Z;dn Z<do Z= G dp dq          Z> G dr ds          Z?dS )zz
[CYBER-STRAT] Fetcher de sources medias
Recherche DuckDuckGo HTML + extraction de contenu d'articles
Agent responsable : DATA_SCRAPER
    N)-
lemonde.frlefigaro.frliberation.frrfi.frfrance24.comtv5monde.com
lepoint.frlexpress.frmediapart.frzcourrierinternational.com	bfmtv.comfranceinfo.fr20minutes.frstrategies.frzlesinrocks.comtelerama.frnouvelobs.comleparisien.frouest-france.frsudouest.frzlagazettedescommunes.comradiofrance.frfranceculture.frfranceinter.fr
cairn.info	persee.frhal.sciencezhal.archives-ouvertes.fr
erudit.orgzcollectifpsychiatrie.frz
psycom.orgzsantementale.frzblogs.mediapart.frzclub.mediapart.frbabelio.com
decitre.frzffdn.orgzlaquadrature.netnextinpact.comnumerama.comabout.mef6s.commuckrack.comcrunchbase.com
github.com)wikipedia.orgfacebook.comtwitter.comx.cominstagram.comlinkedin.com
reddit.comyoutube.com
tiktok.com	amazon.frebay.frforum     zuMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36)cybersecuriteu   cybersécuritézcyber securiteu   cyber sécuritécyberattaquecyberguerrezcyber guerrecyberdefense	influence
propagandedesinformationu   désinformationchiffrement
encryptioncryptographiehackerhackershackingpiratage
ransomwaremalwarephishingsurveillancez
vie priveeu   vie privéedarknetzdark webpegasuspredatorz	nso group>   reflets inforadio refletsyovan menkevickfabrice epelboinantoine champagnele pistolet et la piocheshamandrapherkiketoa	bluetouffreflets.infoc                 l    |                                  t          fdt          D                       S )z9Verifier si la requete correspond aux themes Reflets.infoc              3       K   | ]}|v V  	d S N ).0themeqs     3/var/www/cyber-strat/agents_python/media_fetcher.py	<genexpr>z&is_reflets_relevant.<locals>.<genexpr>Z   s'      66euz666666    )loweranyREFLETS_THEMESqueryrY   s    @rZ   is_reflets_relevantrb   W   s3    A6666~666666r\   c                     |                                                                  t          v rdS t          fdt          D                       S )zVerifier si la requete correspond a un nom/pseudo prioritaire Reflets.
    Detection insensible a la casse, partielle (mot dans la requete).
    Tc              3       K   | ]}|v V  	d S rU   rV   )rW   rrY   s     rZ   r[   z&is_reflets_priority.<locals>.<genexpr>f   s'      88!qAv888888r\   )r]   stripREFLETS_PRIORITY_QUERIESr^   r`   s    @rZ   is_reflets_priorityrh   ]   sS     	A$$$t88887888888r\   c                 0    t          d|  d|            dS )z Log prefixe pour le module mediaz[CYBER-STRAT][MEDIA][z] N)print)levelmessages     rZ   _logrm   i   s'    	
4%
4
47
4
455555r\   c                 z    ddl                     d|           }d                    fd|D                       S )u<   Retirer les accents d'un texte (ex: 'hérard' → 'herard').r   NNFKD c              3   F   K   | ]}                     |          |V  d S rU   )	combiningrW   cunicodedatas     rZ   r[   z!_strip_accents.<locals>.<genexpr>r   s6      CC+*?*?*B*BC1CCCCCCr\   )ru   	normalizejoin)textnfkdru   s     @rZ   _strip_accentsrz   n   sK      ..D77CCCCdCCCCCCr\   c                    h d}|                                                                  }t          |          dk     r3t          |                                                                            S |d         g}t          t          |          dz
  dd          D ];}||                                         |v r|                    d||                    ; t          d                    |                                                    S )u   v4.9b — Extraire le nom de famille (gere particules).
    Version locale media_fetcher (miroir de server._extract_lastname).
    'Marie Le Boiteux' → 'le boiteux'
    'Fabrice Epelboin'  → 'epelboin'
    >   aldediduellalebendesibnvanvon   r    )rf   splitlenrz   r]   rangeinsertrw   )name
_PARTICLESpartslastname_partsis        rZ   _extract_lastname_mfr   u   s      J JJLL  E
5zzA~~djjll0022333Bi[N3u::>1b))  8>>z))!!!U1X....#((>2288::;;;r\   c                 ,   |                                                                  } t          j        dd|           } t          j        dd|           } |                     d          } t          j        dd|           } t          j        dd|           } | S )u  v4.9b — Normalise une URL pour deduplication.
    Supprime : protocole, www., trailing /, tracking params.

    https://www.pascalherard.fr/  → pascalherard.fr
    http://pascalherard.fr        → pascalherard.fr
    https://pascalherard.fr?utm=x → pascalherard.fr
    z
^https?://rp   z^www\./z
\?utm[^#]*z\?ref=[^#]*)r]   rf   resubrstripurls    rZ   _normalize_urlr      s}     ))++



C
&C
(
(C
&B
$
$C
**S//C
&C
(
(C
&S
)
)CJr\   ) zreserve aux abonneszarticle reservezacces reservezabonnez-vouszpour lire cet articlezlire sur un seul appareilz/lecture du monde en cours sur un autre appareilzcontenu premiumzoffre d'abonnementzdecouvrez nos offreszconnectez-vous pour accederzcree ton compte pour lirezvous devez etre connectezalready a subscriberzsubscribe to readzpremium contentzvous pouvez lire le mondezcet article est reservezbloqueur de publiciteadblockz
ad blockerzdesactivez votre bloqueurzdesactiver votre bloqueurzdisable your ad blockerzturn off your ad blockerzveuillez desactiverzplease disable your adblockerzdetecte un bloqueurz%contenu indisponible avec un bloqueurz#accepter les cookies pour continuerzaccept cookies to continuezconsentement requisc                    | r%t          |                                           dk     rdS t          |                                           t	          fdt
          D                       }|dk    r't          |                                           dk     rdS |dk    r't          |                                           dk     rdS dS )	uj   v4.9f — Detecte si le texte scrape est un message de paywall.
    Retourne True si paywall detecte.
    d   Tc              3   $   K   | ]
}|v d V  dS )   NrV   )rW   p
text_lowers     rZ   r[   z"_detect_paywall.<locals>.<genexpr>   s0       9 9Z9 9r\   r   i  r      F)r   rf   rz   r]   sum_PAYWALL_PATTERNS)rx   paywall_hitsr   s     @rZ   _detect_paywallr      s      3tzz||$$s**t

--J 9 9 9 9$9 9 9 9 9L qS..44tqS..44t5r\   r   z	jstor.orgc                     	 t           j                            |           j                            dd          n# t
          $ r Y dS w xY wt          fdt          D                       S )uC   v4.9f — Verifie si le domaine est connu pour bloquer le scraping.www.rp   Fc              3       K   | ]}|v V  	d S rU   rV   )rW   blockeddomains     rZ   r[   z(_should_skip_scraping.<locals>.<genexpr>   s(      JJWw& JJJJJJr\   )urllibparseurlparsenetlocreplace	Exceptionr^   _BLOCKED_SCRAPING_DOMAINS)r   r   s    @rZ   _should_skip_scrapingr      sx    &&s++2::62FF   uuJJJJ0IJJJJJJs   8< 
A
	A
)myheritage.commyheritage.frzgeneanet.orgz	filae.comzancestry.comzancestry.frzfamilysearch.orgzgeni.comzavis-de-deces.netzdansnoscoeurs.frzlibramemoria.comznecrologie.frzfrancearchives.gouv.frcopainsdavant.linternaute.compagesjaunes.fr	118712.frz	118218.frzkompass.comzcarnet.sudouest.frc                     	 t           j                            |           j                            dd                                          n# t          $ r Y dS w xY wt          fdt          D                       S )u   v4.9g FIX-A — Rejette les domaines qui produisent du bruit
    (genealogie, avis de deces, annuaires, archives historiques).
    Distinct de _NOISE_DOMAINS (clustering) : cible le scraping.
    r   rp   Fc              3       K   | ]}|v V  	d S rU   rV   )rW   noiser   s     rZ   r[   z#_is_noise_domain.<locals>.<genexpr>  s'      DD5uDDDDDDr\   )	r   r   r   r   r   r]   r   r^   _SCRAPING_NOISE_DOMAINSr   r   s    @rZ   _is_noise_domainr      s    
&&s++2::B  	   uuDDDD,CDDDDDD   A
A 
AA)zbooknode.comr   r.   
amazon.comz	amazon.dezamazon.co.ukfnac.comr   zcultura.comzchapitre.comzlalibrairie.comzplacedeslibraires.frzleslibraires.frzgoodreads.comzlibrarything.comc                     	 t           j                            |           j                            dd                                          n# t          $ r Y dS w xY wt          fdt          D                       S )u   v4.9g FIX-C — Detecte les domaines de vente/catalogue de livres.
    Les snippets de ces sites decrivent des FICTIONS (synopsis de romans),
    pas la biographie reelle de la personne.
    r   rp   Fc              3       K   | ]}|v V  	d S rU   rV   )rW   bdr   s     rZ   r[   z"_is_book_domain.<locals>.<genexpr>!  s'      44rV|444444r\   )	r   r   r   r   r   r]   r   r^   _BOOK_DOMAINSr   s    @rZ   _is_book_domainr     s    
&&s++2::B  	   uu4444m444444r   )	/pub/dir//directory//search/results/personnes nommeesr   zpeople namedzprofils professionnelszprofiles of professionalsz"voir les profils de professionnelszvoir les profils de personnesz"view the profiles of professionalszview the profiles of peoplezconsultez les profilszprofils sur linkedinzprofiles on linkedinrp   c                    	 t           j                            |           j                            dd                                          }n# t          $ r Y dS w xY wd|vrdS |                                 }dD ]	}||v r dS 
t          |dz   |z                                             }t          D ]}|	                    d          r||v r dS  dS )	u   v4.9g FIX-E — Detecte les pages LinkedIn generiques (listings).
    Ces pages ne contiennent pas de profil reel mais des listes
    de personnes portant le meme nom.

    Retourne True si c'est une page generique a ignorer.
    r   rp   Fr*   )r   r   r   Tr   r   )
r   r   r   r   r   r]   r   rz   _LINKEDIN_GENERIC_PATTERNS
startswith)r   titlesnippetr   	url_lowerpatterncombineds          rZ   _is_linkedin_genericr   <  s   &&s++2::B  	   uu V##u 		I(  i44   	w	%%'') )H-  c"" 	h44  5s   A
A 
AAc                    | r|sdS t          |                                           }t          |                                          }||v rdS |                                }t          |          dk     r||v S h d}|d         }d}t	          |dd         d          D ]\  }}	|	|v r|} n|} d                    ||d                   }
|
r|
|v rdS |d	         }t          |          d
k    r
||v r||v rdS dS )u   v4.9g FIX-D — Verifie que le texte scrape mentionne la personne.
    Filtre les pages d'homonymes (ex: Damien Boiteux quand on cherche
    Marie Le Boiteux).

    Retourne True si le texte est pertinent (mentionne le nom).
    FTr   >   r|   dar}   r~   r   r   r   r   r   r   r   r   r   r   r   Nr   r      )rz   r]   r   r   	enumeraterw   )rx   ra   r   query_lowerr   
particulesprenom	nom_startr   r   nom_de_familledernier_mots               rZ   _is_content_relevantr   b  sQ     u u

--J //K j  t E
5zzA~~j((3 3 3J1XFI%)Q''  1
??IEIXXeIJJ/00N  .J66t )KK1z))*$$t5r\   SERPER_API_KEYz https://google.serper.dev/searchz https://google.serper.dev/images   frc           	         t           st          dd           g S t          j        | |||d                              d          }t
          j                            t          |t           ddd          }	 t
          j        	                    |d	
          5 }t          j
        |                                                    d                    }ddd           n# 1 swxY w Y   g }|                    dg           d|         D ][}	|	                    dd          }
|
rA|                    |
|	                    dd          |	                    dd          d           \t          dd| dd          dt          |           d           |S # t           $ r*}t          dd| dd          d|            g cY d}~S d}~ww xY w)u   v4.9c — Recherche web via Serper API.
    Remplace _ddg_search_lite / _parse_ddg_lite.
    Retourne le meme format : [{"url":, "title":, "snippet":}, ...]
    WARNzSERPER_API_KEY manquante)rY   glhlnumutf-8application/jsonz	X-API-KEYContent-TypePOSTdataheadersmethod
   timeoutNorganiclinkrp   r   r   r   r   r   INFOz[SERPER] q='2      ' → 
 resultatsERRORz[SERPER] Erreur pour '': )r   rm   jsondumpsencoder   requestRequestSERPER_ENDPOINTurlopenloadsreaddecodegetappendr   r   )ra   r   r   r   payloadreqrespr   resultsitemr   es               rZ   _serper_searchr    sI   
  V/000	j"Bs   vg  .
 
 '.
 
  !  C^##C#44 	;:diikk0099::D	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	;HHY++DSD1 	 	D((62&&C !XXgr22#xx	266      
 	VE5":EES\\EEE	G 	G 	G   WAuSbSzAAaAABBB						sC   7!F :CF C""F %C"&B,F 
GG<GGr   c           	         t           sg S t          j        | ||d                              d          }t          j                            t          |t           ddd          }	 t          j                            |d          5 }t          j	        |
                                                    d                    }d	d	d	           n# 1 swxY w Y   |                    d
g           d	|         }t          dd| d	d          dt          |           d           |S # t          $ r}t          dd|            g cY d	}~S d	}~ww xY w)uq   v4.9c — Recherche images via Serper API.
    Retourne : [{"imageUrl":, "title":, "link":, "source":}, ...]
    )rY   r   r   r   r   r   r   r   r   r   Nimagesr   z[SERPER-IMG] q='r   r    imagesr   z[SERPER-IMG] Erreur: )r   r   r   r   r   r  r  SERPER_IMG_ENDPOINTr  r  r  r  r  rm   r   r   )	ra   r   r   r
  r  r  r   r  r  s	            rZ   _serper_image_searchr    s     	j"S   vg  .
 
 '.
 
  !  C	^##C#44 	;:diikk0099::D	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	;(B''-VEuSbSzEEVEEE	G 	G 	G   W1a11222						sC   &!D$ :CD$ CD$ CAD$ $
E.EEEc           
         g d}|                                 dd         }|D ]'}||v r!t          dd| d| dd                      dS (t          |                                           }|                                }|r|d	         nd
}t	          |          }	t          |          }
t          |	o|	|
v           }t          |ot          |          dk    o||
v           }|r|rdS t          dd| d| dd                     dS )u  v4.9b — Verifie qu'un site est bien le site personnel
    de la personne (pas une entreprise homonyme).

    Rejette si :
    - Le prenom+nom complets sont absents du contenu
    - Le contenu contient des marqueurs d'entreprise

    Args:
        url: URL du site
        name: nom complet de la personne
        html_text: texte extrait de la page (deja scrape)

    Retourne True si site personnel valide, False sinon.
    )sarlzsas eurlzsci btpconstructionzdevis gratuitu   nos réalisationsznos realisationssiretz	tva intrau   conditions généraleszconditions generalescgvu   société spécialiséezsociete specialiseeu   entreprise spécialiséezentreprise specialiseeNr1   r   z![SITE-PERSO] Rejete (entreprise ''): r   Fr   rp   r   Tz[SITE-PERSO] Rejete ('z' absent): )r]   rm   rz   r   r   boolr   )r   r   	html_text_ENTREPRISE_MARKERSr   marker
name_asciir   	firstnamelastname
text_asciihas_lastnamehas_firstnames                rZ   _is_personal_siter(    s      ""5D5)J &  Z++ + #CRC+ +, , , 55	    

--JE!)arI#D))H
++J;X%;<<L 	$c)nnq( 	$#% %M   t	 	 	ss8	 	   5r\   )
psychiatrepsychanalystepsychologuemedecin
chirurgiendocteur
pharmacien	infirmier
therapeutecardiologuedermatologue	oncologuepediatregeneralistehopitalcliniquesoignant)	journalistereporterchroniqueureditorialiste	redacteurcorrespondantpresentateur	animateur	redaction)avocat	magistratjugenotairejuristebarreautribunal	juridique)
deputesenateurmaireministre
conseiller	politique	assemblee	parlementgouvernementzsecretaire d'etat)	chercheur
professeur
enseignantuniversitaire
academiquetheselaboratoirecnrsinriainserm
universite	doctorantzmaitre de conferences)ecrivainauteur	romancierpoeteartistemusicienacteurcomedienrealisateurcineastepeintre	sculpteurphotographe
dramaturgecompositeur)	ingenieurdeveloppeurinformatiquestartupentrepreneurctoceo	fondateurzco-fondateur	numeriquedigitalsoftwarer=   r3   )sportifathletejoueur
entraineurchampioncompetitionfootballrugbytennisbasketcyclisme	olympique)
economiste	financierbanquiertrader	directeurpdg	presidentgerantmanager
consultantauditeur)	santejournalismedroitrP  	rechercheculturetechsporteconomie>   dnb.comebay.com	verif.com
manageo.fr
pappers.frsociete.comleboncoin.frsocietes.cominfogreffe.frcopainsdavant.comsociete.leparisien.frentreprises.lefigaro.fr!annuaire-entreprises.data.gouv.frr(   r/   r   r   r.   r   r+   r-   r'   r,   r&   wikidata.orgr)   pinterest.comwikimedia.orgr%   r   >   r!   r    imdb.com	orcid.org	theses.frr   r$   
viadeo.comallocine.frr   r*   r"   r#   researchgate.netsenscritique.comscholar.google.com)r   r   r   r   r   r
   r	   r   r   humanite.frr   r   r   r   r   francetvinfo.frr   
tf1info.frr   lci.frarte.tvr   r   r   r   numerama.frr   zdnet.frr   lemonder   lefigaror   
liberationr   
leparisienr   	20minutesr
   lexpressr	   lepointr   	nouvelobsr   	mediapartr  humaniter   zouest-francer   sudouestr   radiofrancer   r   r  francetvr   
franceinfotf1bfmtvlciarterfi
strategiesteleramanumerama
nextinpactzdnet)r  r   r  r  r   r   r   r   r  r   r  )r  r  r  r  r   c                 ^   i }g }d}| D ]}	 t           j                            |                    dd                    j                            dd          }n# t          $ r d}Y nw xY w|                    |d          }||k     r|                    |           |dz   ||<   |dz  }||fS )u   v4.9f — Limite a max_per_domain resultats par domaine.
    Conserve l'ordre original (premiers resultats Serper prioritaires).
    Retourne (deduplicated, removed_count).
    r   r   rp   r   r   )r   r   r   r  r   r   r   r	  )r  max_per_domaindomain_countdeduplicatedremovedre   r   counts           rZ   _deduplicate_by_domainr    s    
 LLG  	\**eR  " ""()<)< F 	 	 	FFF	  ++>!!"""#(19L  qLGG  s   AAA('A(c                 <   i }| D ]~}	 t           j                            |                    dd                    j                            dd          }n# t          $ r d}Y nw xY w|                    |d          dz   ||<   |sdS t          ||j                  S )uC   v4.9f — Identifie le domaine le plus frequent dans les resultats.r   rp   r   r   r   key)r   r   r   r  r   r   r   max)r  domain_countsre   r   s       rZ   _get_dominant_domainr    s    M A A	\**eR  " ""()<)< F 	 	 	FFF	 - 1 1&! < <q @f r}-"34444s   AAA$#A$c           	         t          ||          \  }}t          |          }|dk     st          |          |k    r(t          dd| dt          |           d           ||fS d|  d| }t          dd| d|d	d
          d           t	          |d          }d |D             }	|D ]X}
|
                    dd          |	vr>|                    |
           |	                    |
                    dd                     Yt          ||          \  }}t          ddt          |           d           |d	|         |fS )uq   v4.9f — Deduplique + lance une requete de compensation si besoin.
    Retourne (results, dominant_domain).
    r   r   z[DEDUP] z doublons supprimes, z restants (compensation: NON)"z" -site:z$ doublons supprimes, compensation: 'N<   'r   r   c                 :    h | ]}|                     d d          S r   rp   r  rW   re   s     rZ   	<setcomp>z,_search_with_compensation.<locals>.<setcomp>  s&    888aub!!888r\   r   rp   z[DEDUP] Apres compensation: r   )r  r  r   rm   r  r  r	  add)ra   initial_resultsr  target_resultsr  r  dominant_domain
comp_queryextra_results	seen_urlsre   final_s                rZ   _search_with_compensationr    s   
 3) )L' +?;;O{{c,''>99V@w @ @<  @ @ @	A 	A 	A _,, 6U55O55J	-7 	- 	-$SbS/	- 	- 	-. . . #:2666M 98<888I , ,559,,"""MM!%%r**+++%lNCCHE1=s5zz===? ? ?.!?22r\   c                     | D ]r}	 t           j                            |                    dd                    j                            dd          }n# t          $ r Y \w xY wt          D ]
}||v r  dS sdS )uF   v4.9f — Detecte si au moins 1 resultat provient d'un media national.r   rp   r   TF)r   r   r   r  r   r   r   _NATIONAL_MEDIA_DOMAINS)r  re   r   medias       rZ   _has_press_footprintr    s      	\**eR  " ""()<)< F 	 	 	H	, 	 	Ettt 	 5s   AA
A A    c                 |   t                      }|D ]}	 t          j                            |                    dd                    j                            dd          }n# t          $ r Y \w xY wt          	                                D ]\  }}	||v r|
                    |	           t                              |d          fd|D             }
|
sfdt          D             }
|
sg S |
d         }d|  d| }t          d	d
|dd          d           t          |d          }d |D             }g }|D ]X}|                    dd          |vr>|                    |           |
                    |                    dd                     Yt          d	dt          |           d           |d|         S )uv   v4.9f — Requete complementaire pour presse nationale.
    Utilise le nom de marque media comme mot-cle naturel.
    r   rp   r   c                      g | ]
}|k    |S rV   rV   rW   bdominant_brands     rZ   
<listcomp>z*_search_national_media.<locals>.<listcomp>  s#    DDD^0C0C!0C0C0Cr\   c                      g | ]
}|k    |S rV   rV   r  s     rZ   r  z*_search_national_media.<locals>.<listcomp>  s.     
 
 
N"" """r\   r   r  z" r   z[MEDIA-COMP] Requete media: 'Nr  r  r   r  c                 :    h | ]}|                     d d          S r  r  r  s     rZ   r  z)_search_national_media.<locals>.<setcomp>%  s&    <<<aub!!<<<r\   z[MEDIA-COMP] z nouvelles sources)setr   r   r   r  r   r   r   _DOMAIN_TO_BRANDitemsr  _FALLBACK_MEDIA_KEYWORDSrm   r  r	  r   )ra   r  existing_resultsr  max_resultsdetected_brandsre   r   media_domainbrand
candidatesmedia_keywordsearch_querymedia_resultsr  new_resultsr  s                   @rZ   _search_national_mediar    sB    eeO + +	\**eR  " ""()<)< F 	 	 	H	#3#9#9#;#; 	+ 	+L%v%%##E***	+
 &))/2>>NDDDD_DDDJ 

 
 
 
/
 
 


  	qMM 0u////L<SbS(9<<<> > >"<R888M =<+;<<<IK , ,559,,q!!!MM!%%r**+++<K((<<<> > >||$$s   AA""
A/.A/>   	google.fr
google.comfr.linkedin.comr(   r.   r   r+   r-   r'   r,   r&   r*   r)   r   r  r   r   c                    |r| sdS 	 t           j                            |           j                                                            dd          }n# t          $ r Y dS w xY w|sdS |t          v rdS |D ]}t          |t                    r|
                    dd          n|}|s2	 t           j                            |          j                                                            dd          }n# t          $ r Y w xY w||k    r dS dS )u  v4.9f — Verifie si le domaine de la page source de l'image
    est deja present dans les sources texte validees du pipeline.

    Exclut les plateformes generiques (LinkedIn, Facebook, etc.) qui
    hebergent des profils de personnes differentes sur le meme domaine.

    text_sources : liste d'URLs (str) ou de dicts avec cle "url".
    Retourne True si le domaine de image_page_url matche un domaine
    de text_sources ET n'est pas une plateforme generique.
    Fr   rp   r   T)r   r   r   r   r]   r   r   _GENERIC_PLATFORM_DOMAINS
isinstancedictr  )image_page_urltext_sources
img_domainsrcsrc_url
src_domains         rZ   _is_trusted_source_domainr  =  sO     ~ u\** "55777762+>+> 	
   uu u...u 
 
(23(=(=F#''%$$$3 		.. (;(; J 	 	 	H	##44 $5s%   A
A 
A! A!(A
C33
D ?D c                 
   t          |                                           }t          j        dd|          }|                                }t          |                                                                          }t          j        dd|          }t          |                                                                          }h d|                                }|d         }t          |          dk    o"t          fd|dd         D                       }	d	}
d}t          t          |          t          |          z
  dz             D ]&}|||t          |          z            |k    rd
}
|} n'd	}d}|
s!t          |          D ]\  }}||k    rd
}|} n|
s|sd	S |
r|n|}|                                }|d         fdt          |          D             }|
r7|sg }|dk    r|
                    |dz
             |dk    r|
                    |dz
             |D ]R}|dk     s|t          |          k    r||         }t          |          dk    r|                                r|vr d	S Sd
S |D ]}|dz   }|t          |          k     ru||         }||d         k    r d
S |v r(|dz   }|t          |          k     r||         |k    r d
S ]t          |          dk    r|                                r|vr||vr d	S |dz   |k    r d
S d
S |r|	r|sF|dk    r>||dz
           }t          |          dk    r |                                r|vr|k    rd	S d
S |D ]x}||k    rD||dz   |         }t          |          dk    r d	S t          fd|D                       r d
S  d	S ||k     r&||dz   |         }t          |          dk    r d	S  d	S yd
S |sg }|dk    r|
                    |dz
             |dk    r|
                    |dz
             |D ]R}|dk     s|t          |          k    r||         }t          |          dk    r|                                r|vr d	S Sd
S |D ]}|dz   }|t          |          k     r{||         }||k    s||d         k    r d
S |v r(|dz   }|t          |          k     r||         |k    r d
S ct          |          dk    r|                                r|vr||vr d	S |dz   |k    r d
S |dz   |k    r&t          |          |dz   k    r||dz            v r d
S d
S )u  v4.9f — Verifie qu'un portrait correspond a la bonne personne.
    Gere les noms composes (Le Boiteux), les prenoms composes
    (Marie-Therese), et les prenoms intercales (herve_elie-bokobza).
    v4.9f CORRECTION 2 : gestion stricte des particules de nom.

    Retourne True si l'image correspond probablement a la bonne personne.
    Retourne False si un homonyme est detecte.
    z[-_./\\]r   z[-_]>   r|   r}   r   r   r   r   r   r   r   lesr   r   r   r   c              3       K   | ]}|v V  	d S rU   rV   )rW   r   
_particless     rZ   r[   z*_validate_portrait_name.<locals>.<genexpr>~  s'      ==AZ======r\   NFTr   c                 &    g | ]\  }}|k    |S rV   rV   )rW   r   wcore_firstnames      rZ   r  z+_validate_portrait_name.<locals>.<listcomp>  s.     ? ? ?a!~*=*=*=*=*=r\   r   r   c              3       K   | ]}|v V  	d S rU   rV   )rW   r   r  s     rZ   r[   z*_validate_portrait_name.<locals>.<genexpr>  s'      881qJ888888r\   )rz   r]   r   r   r   rf   r   r^   r   r   r	  isalphaall)filename_or_urlr#  r$  rx   wordsfirstname_lowerlastname_lowerr   core_lastnamehas_particle
full_matchfull_match_posr   
core_matchcore_match_posr   lastname_positionfirstname_partsfirstname_positionsadjidxfpnxtnwafterprevbetweenr  r!  s                              @@rZ   _validate_portrait_namer:  b  s    ///1122D6+sD))DJJLLE$Y__%6%6%<%<%>%>??OfWc?;;O#HNN$4$4$:$:$<$<==N  J $))++N"2&MNa 	>====)<=====  JN3u::N 3 33a788  1s>****+~==JNE > JN e$$ 	 	DAqM!!!
!" "
  j u +5H. &++--O$Q'N? ? ? ?&&? ? ?  $" 	C!!

>A-...!!

>A-... ! !77cSZZ//#JFFaKKAIIKKKZ// 554 & 	 	Bq&CSZZ3Z***44##FEE

** %e = =#ttGGqLLRZZ\\Lj00o55 55Av''tt (t  #l #" 		!!^a/0IINNt||~~N 
22 N22 54
 & 	 	B""Q~ 56w<<1$$ 55888888888  44uu"$$ 22 56w<<1$$ 55uu % t  q  JJ(1,---q  JJ(1,--- 	 	CQww#U++c
AA!		++uut "  1fUsBm##^A...ttZQCJJ&&!%LM9944B1*,,/11uu6&&&44F'''JJa''"q&MZ//444r\   c                    	 t           j                            |           j                                                            dd          n# t          $ r Y dS w xY wdv rdS t          fdt          D                       rdS t          fdt          D                       rd	S t          fd
t          D                       rdS d}t          fd|D                       rdS dS )u   v4.9e — Classifier le type d'un resultat Serper par son URL.
    Retourne: 'linkedin', 'social', 'personal_site', 'press',
              'academic', 'noise', 'other'
    r   rp   otherr*   linkedinc              3   T   K   | ]"}|k    p                     d |z             V  #dS .NendswithrW   sdr   s     rZ   r[   z%_classify_url_type.<locals>.<genexpr>5  sP       % % R<46??3844 % % % % % %r\   r   c              3   T   K   | ]"}|k    p                     d |z             V  #dS r?  rA  rC  s     rZ   r[   z%_classify_url_type.<locals>.<genexpr>8  sP       & & R<46??3844 & & & & & &r\   socialc              3       K   | ]}|v V  	d S rU   rV   rW   dr   s     rZ   r[   z%_classify_url_type.<locals>.<genexpr>=  s'      
0
011;
0
0
0
0
0
0r\   press)r   zhal.archivesr   r   r   r  scholar.googlec              3       K   | ]}|v V  	d S rU   rV   )rW   mr   s     rZ   r[   z%_classify_url_type.<locals>.<genexpr>F  s'      
1
111;
1
1
1
1
1
1r\   academic)r   r   r   r   r]   r   r   r^   _NOISE_DOMAINS_SOCIAL_DOMAINSMEDIA_WHITELIST)r   academic_markersr   s     @rZ   _classify_url_typerS  (  sU   
&&s++288::++   ww z
 % % % %#% % % % % w
 & & & &$& & & & & x 
0
0
0
0
0
0
000 w
 
1
1
1
1 0
1
1
111 z7s   AA 
AAc                     t          |                                           }t                      }t                                          D ]%\  }}|D ]}||v r|                    |            n&|S )uo   v4.9e — Extraire les domaines d'activite d'un texte.
    Retourne un set de noms de domaines d'activite.
    )rz   r]   r  _ACTIVITY_DOMAINSr   r  )rx   	text_normfounddomain_namekeywordskws         rZ   _extract_activity_domainsr[  L  s~     tzz||,,IEEE!2!8!8!:!:  X 	 	BY		+&&&  Lr\   c           
         |                      dd          }|                      dd          |                      dd          |sdS t          |          }|dk    rdS t          dz   z                                             }t	          fd|D                       }t	          fd	|D                       }|r|d
         nd}|r||vrdS t          dz   z             }		 t          j                            |          j	                                        }
|

                    dd          
                    dd          }
|
                    dd          |
v p|o||
v }n# t          $ r d}Y nw xY wd}|dk    rNdv rJ                    d          }t          |          dk    r"|d                                         dd         }|||	||||d	S )u3  v4.9e — Extraire les signaux d'identite d'un resultat Serper.
    Args:
        result: dict Serper {url, title, snippet}
        query_parts: liste de mots du nom (lowercase, sans accents)
        query_ascii: nom complet lowercase sans accents
    Retourne un dict signal ou None si resultat bruit.
    r   rp   r   r   Nr   r   c              3   ^   K   | ]'}|t                                                    v V  (d S rU   rz   r]   )rW   r   r   s     rZ   r[   z,_extract_identity_signals.<locals>.<genexpr>n  sL       . . ^EKKMM::: . . . . . .r\   c              3   ^   K   | ]'}|t                                                    v V  (d S rU   r^  )rW   r   r   s     rZ   r[   z,_extract_identity_signals.<locals>.<genexpr>p  sL       0 0 ~gmmoo>>> 0 0 0 0 0 0r\   r   r   -Fr=   - r   r   r   )	r   r   r   url_typeactivity_domainsname_in_titlename_in_snippetname_in_domain
profession)r  rS  rz   r]   r$  r[  r   r   r   r   r   r   r   r   rf   )resultquery_partsquery_asciir   rb  r   rd  re  r$  activityr   rf  rg  r   r   r   s                 @@rZ   _extract_identity_signalsrl  Z  sa    **UB

CJJw##EjjB''G t!#&&H7t us{W4;;==>>H . . . .!,. . . . .M 0 0 0 0#.0 0 0 0 0O #.5{22H HH,,t )w)>??H&&s++288::++33C<<R((F2 1/X/ 	     J:%7"2"2e$$u::??q))$3$/J $&*( 
 
 
s   'A>E& &E54E5c                    | d         r|d         r| d         |d         z  rdS d}| d         |v r|d         s|d         rdS |d         |v r| d         s| d         rdS | d         r
|d         rdS |d         r
| d         rdS dS )	u   v4.9e — Verifier si deux signaux sont compatibles (meme personne).
    Compatibles si :
    - Ils partagent au moins un domaine d'activite
    - OU l'un est un site personnel et mentionne le nom
    - OU l'un est LinkedIn (unicite forte du profil)
    rc  Tr=  rF  rb  rd  re  rf  FrV   )sig_asig_bstrong_typess      rZ   _signals_compatiblerr    s     	 ! ()()E2D,EE t *Lj\))' 	*+01B+C 	*tj\))' 	*+01B+C 	*t  5#9 t 5#9 t5r\   c                   	 t          |           }|dk    rg S t          t          |                    		fd	fd}t          |          D ]@}t          |dz   |          D ]*}t          | |         | |                   r |||           +Addlm}  |t                    }t          |          D ],}| |                                       | |                    -t          |                                t           d          }|S )u   v4.9e — Regrouper les signaux par compatibilite (union-find).
    Retourne une liste de clusters (listes de signaux),
    triee par taille decroissante.
    r   c                 j    |          | k    r%|                   | <   |          } |          | k    %| S rU   rV   )xparents    rZ   findz_cluster_signals.<locals>.find  sA    Qi1nnvay)F1Iq	A Qi1nn r\   c                 N     |            |          }}||k    r||<   d S d S rU   rV   )ar  rarbrw  rv  s       rZ   unionz_cluster_signals.<locals>.union  s9    a$$q''B88F2JJJ 8r\   r   )defaultdictTr  reverse)	r   listr   rr  collectionsr}  r	  sortedvalues)
signalsnr|  r   jr}  clusters_mapclustersrw  rv  s
           @@rZ   _cluster_signalsr    sX   
 	GAAvv	 %((^^F          1XX  q1ua 	 	A"71:wqz:: a	
 (''''';t$$L1XX 1 1TT!WW$$WQZ0000 l))++t- - -HOr\   c                     | sddddddS | d         }t          |          }t          d | D                       }d}|D ]!}|                    d          r
|d         } n"ddlm}  |            }|D ]}|d	         D ]}	||	xx         d
z  cc<   |r!|                    d
          d         d         nd}
t          d |D                       }t          d |D                       }t          d |D                       }|dk    r	d}|rdnd}n#|dk    rd}|s|rd}nd}n|s|r	d}|rdnd}nd}d}|dk    r||z  dk    r|dk    r|dk    rd}t          dd| d| d| d| d| d| d|
 d|            |||||
dS )u0  v4.9e — Decider si la personne est identifiable.
    Retourne un dict:
        status: 'identifiable' | 'partial' | 'non_discriminable'
        dominant_cluster: list de signaux | None
        confidence: 'high' | 'medium' | 'low'
        profession: str | None
        activity_domain: str | None
    non_discriminableNlow)statusdominant_cluster
confidencerg  activity_domainr   c              3   4   K   | ]}t          |          V  d S rU   r   rW   rt   s     rZ   r[   z)_decide_discrimination.<locals>.<genexpr>  s(      ))1A))))))r\   rg  )Counterrc  r   c              3   .   K   | ]}|d          dk    V  dS )rb  r=  NrV   rW   ss     rZ   r[   z)_decide_discrimination.<locals>.<genexpr>  s<       7 7()*#7 7 7 7 7 7r\   c              3   *   K   | ]}|d          dv V  dS )rb  rn  NrV   r  s     rZ   r[   z)_decide_discrimination.<locals>.<genexpr>  s@         	
*//     r\   c              3   &   K   | ]}|d          V  dS )rf  NrV   r  s     rZ   r[   z)_decide_discrimination.<locals>.<genexpr>  s(      ==qq)*======r\      identifiablehighmediumr   partial皙?r   r   z[DISCRIMINATION] z	: status=z
, cluster=r   , confidence=, profession=z, activity=z, linkedin=)r   r   r  r  r  most_commonr^   rm   )r  ra   dominantdom_sizetotalrg  sigr  all_domainsadr  has_linkedin
has_socialhas_personalr  r  s                   rZ   _decide_discriminationr    s     
) $#
 
 	
 {H8}}H)))))))E J  77<   	\*JE	
 $#####'))K ! !() 	! 	!BOOOq OOOO	! -8A""1%a((T   7 7-57 7 7 7 7L      J ==H=====L1}}+9VV

	Q 	< 	!JJJJ  	< 	F%1<uJJ(FJ qyyX%,,Q!J	#E 	# 	#F 	# 	#	# 	##	# 	# 	# 	# !	# 	# $		# 	#
 !	# 	#$ $ $ $  *  r\   c           	         i }t          j        d| t           j                  }|sdS |                    d          }d}d}|D ]}|dk    r|dz  }||z  }|dk    r|dz  }||z  }$|dk    ra|dk    r[d	|v rT|                    d	          \  }}}	|	                                ||                                                                <   d}||z  }|rXd	|v rT|                    d	          \  }}}	|	                                ||                                                                <   |                    d
d                                          }
|
rt          |
          dk     rdS t          j	        dd|
          }
t          j	        dd|
          }
d}dD ]G}||v rAt          j        d||                   }|r$t          |                    d                    } nHd}dD ]*}||v r$||                                         dd         } n+d}dD ]"}||v r||                                         } n#|
dd         d|||ddddS )ul   v4.8g — Parser un template {{Ouvrage|titre=...|année=...}}
    Retourne un dict publication ou None.
    z\{\{[Oo]uvrage\s*\|(.+?)\}\}Nr   r   rp   {}|=titrer  '{2,}\[\[([^\]|]+\|)?([^\]]+)\]\]\2)u   annéeanneedateyearz(19[4-9]\d|20[0-2]\d))u   éditeurediteur	publishereditionr   )isbnisbn1x   wikipedia_biblior   authorr  r  r  	cover_urlr   source)r   searchDOTALLgroup	partitionrf   r]   r  r   r   int)lineparamsmatchcontentdepthcurrentchr  r  valr   r  year_keyymr  pub_keyr  isbn_keys                     rZ   _parse_ouvrage_templater  ;  s   
 FI5ti! !E tkk!nnGEG  99QJErMGG3YYQJErMGG3YY5A::g~~%//44Q.1iikksyy{{((**+GGrMGG 23'>>'',,Q&)iikksyy{{  ""#JJw##))++E CJJNNt F8R''EF2E E D7  v3!(+- -B 288A;;''I  fw--//4IE  D%  v(#))++DE 
 tt$	 	 	r\   c                      e Zd ZdZdpdZefdZdpdZh dZ	 	 dqdZ	h d	Z
d
 ZdrdZdsdZdsdZg dZdtdZ	 	 dudZdvdZdvdZd ZdvdZg dZd Zd Zd Zd Zd Zd Zd Zd!Zd"Zd#Z d$Z!d%Z"d&Z#d'Z$d(Z%d)Z&d*Z'd+Z(d,Z)d- Z*d. Z+d/ Z,d0Z-dwd2Z.d3Z/d4Z0d5d6hZ1d7 Z2	 	 dud8Z3	 	 dxd:Z4d; Z5d<d=d>d?d?d?d?d@dAgdBdCdDdEdFd?d?d@dAdGdCdHdId?d?d?d@dAgdJZ6e7dK             Z8dydLZ9g dMg dNg dOg dPg dQg dRg dSg dTdUZ:dV Z;dzdXZ<dY Z=e7d{dZ            Z>	 	 d|d]Z?dpd^Z@dpd_ZAdpd`ZBdsdaZCdb ZDdc ZEdpddZFdpdeZGdf ZHdg ZIdh ZJdi ZKdj ZLdk ZMdl ZNg dmZOdZPdydnZQe7do             ZRdS )}MediaFetcherzARecherche et extraction de contenu depuis les medias francophonesr   c                   
 d                     d t          D                       }| d| d}t          ||dz            }g }|D ]}|                     |d                   
t	          
fdt          D                       }t	          
fd	t
          D                       }	|r1|	s/
|d
<   |                    |           t          |          |k    r nt          dd| dt          |           dt          |           d           |S )ub   v4.9c — Rechercher des articles via Serper API.
        Remplace DDG Lite (rate-limit).
        z OR c              3       K   | ]	}d | V  
dS )site:NrV   )rW   rI  s     rZ   r[   z,MediaFetcher.search_media.<locals>.<genexpr>  s(       F F F F F F F Fr\    ()r   r  r   c              3       K   | ]}|v V  	d S rU   rV   rH  s     rZ   r[   z,MediaFetcher.search_media.<locals>.<genexpr>  s'       F Ff F F F F F Fr\   c              3       K   | ]}|v V  	d S rU   rV   rH  s     rZ   r[   z,MediaFetcher.search_media.<locals>.<genexpr>  s'       G Gf G G G G G Gr\   r   r   zRecherche pour 'r   z resultats filtres sur z bruts)	rw   rQ  r  _extract_domainr^   DOMAIN_BLACKLISTr	  r   rm   )selfra   r  
site_parts
full_queryr  filteredre   is_whitelistedis_blacklistedr   s             @rZ   search_mediazMediaFetcher.search_media  sE    [[ F Fo F F FFF
.....
 qAAA  	 	A))!E(33F  F F F Fo F F FFFN  G G G G6F G G GGGN n $("""x==K//EVlll#h--llX[\cXdXdlllmmmr\   c           	         t          |          rt          dd|dd                     dS t          |          rt          dd|dd                     dS 	 t          j                            |dt          i          }t          j                            |t                    5 }|	                                
                    d	d
          }ddd           n# 1 swxY w Y   n.# t          $ r!}t          dd| d|            Y d}~dS d}~ww xY w|                     |          }|rt          |          dk     rW|                     |          }|r@t          |          dk    r-t          ddt          |           d|dd                     |}|r,t          |          rt          dd|dd                     dS |r#t          |          |k    r|d|dz
           dz   }|S )av  Extraire le texte des paragraphes d'un article
        Regle : uniquement <p>, ignorer nav/footer/aside/scripts
        Limite : 3000 chars max par article
        Timeout : 8 secondes
        Echec silencieux (regle prompt_architect_v1.1)
        v4.6 : fallback meta description si paragraphes insuffisants
        v4.9f : skip domaines bloques, detection paywall
        r   z [BLOCKED] Domaine bloque, skip: Nr  z[NOISE] Domaine bruit, skip: 
User-Agentr   r   r   r   errorsr   zEchec fetch article (skip):  -- P   r   zFallback meta description: z chars pour z#[PAYWALL] Contenu paywall detecte: r   ...)r   rm   r   r   r  r  
USER_AGENTr  FETCH_TIMEOUTr  r  r   _extract_paragraphsr   _extract_meta_descriptionr   )	r  r   	max_charsr  r  htmlr  rx   	meta_descs	            rZ   fetch_article_textzMediaFetcher.fetch_article_text  s    !%% 	=3ss8==? ? ?4 C   	:CRC::< < <4	.((|Z6P(QQC'']'CC Etyy{{))'))DDE E E E E E E E E E E E E E E 	 	 	DDDDDEEE44444	 ''--  	!s4yy2~~66t<<I !S^^b00V>9~~> >36ss8> >? ? ? !  	OD)) 	@c#2#h@@B B B4 	0CII	))Q'%/Ds=   AC* (*CC* C""C* %C"&C* *
D4DDc                    |                      ||          }g }g }|D ]6}|                    d          rSt          |d                   dk    r:|                    d|d         |d         |d         |d         |d         d           |                     |d                   }|rt          |          dk    rn|                    d|d         |d         |d         ||                    dd	          d           t          d
d|d          dt          |           d           	t          dd|                    d|d                               8|s%|r#t          d
dt          |           d           |S |S )zPipeline complet : recherche DDG + extraction de contenu
        Retourne une liste de sources exploitables pour la synthese
        Si aucun contenu exploitable, retourne les snippets DDG bruts (regle prompt_architect)
        r      r  r   r   r   typer   r   r   rx   r   r   rp   r   zSource extraite: r   chars)r   z Contenu insuffisant ou paywall: zFallback snippets DDG: z	 snippets)r  r  r   r	  r  rm   )r  ra   r  r  sourcessnippets_fallbackre   rx   s           rZ   fetch_media_sourcesz MediaFetcher.fetch_media_sources  s   
 ##E;77 	] 	]AuuY C)$5$5$:$:!((#kU8wZiL |* *    **1U844D ]D		C#kU8wZ  uuY33       VR8RRD		RRRSSSSV[hPQRWPX@Y@Y[[\\\\  	%, 	%T37H3I3ITTTUUU$$r\   >   r(   r   r.   r   r-   r'   r,   r&   r*   r)   r  Nc           
         	 |r'|}t          dd| dt          |           d           n(t          d| dd          }|st          |d          }t          |                                          }d |                                D             }g }|d	d
         D ]}|                    dd          }	|                    dd          }
|                    dd          }|	sH	 |	                    d          d                             dd          n# t          $ r Y w xY w| j	        v rt          fdt          D                       rt          |
dz   |z                                             t          fd|D                       s	 |                     |	          }|r`t          |          dk    rM|                    d|	|
|d	d         |d           t          dd            t          |          |k    r nmnY|rWt          |          dk    rD|                    d|	|
||d           t          dd            t          |          |k    r n# t          $ r Y w xY wt          ddt          |           d| d           |S # t          $ r}t          dd |            g cY d	}~S d	}~ww xY w)!u  Recherche DDG ciblée quand les médias standards échouent.
        Cherche des sources spécialisées sur la personne :
        interviews, tribunes, articles de fond.
        Exclut les sites purement commerciaux et les réseaux sociaux.

        ddg_results : list optionnelle de résultats DDG pré-parsés
                      (depuis resolve_person_identity) pour éviter
                      un appel DDG supplémentaire (rate-limit).
        r   zSerper cible 'r   z resultats (reutilises)r  r   r  c                 8    g | ]}t          |          d k    |S r   r  rW   r   s     rZ   r  z3MediaFetcher.fetch_ddg_targeted.<locals>.<listcomp>$  s/       q66A:: ::r\   N   r   rp   r   r   r   r   r   c              3       K   | ]}|v V  	d S rU   rV   rH  s     rZ   r[   z2MediaFetcher.fetch_ddg_targeted.<locals>.<genexpr>;  '      ==qqF{======r\   r   c              3   F   K   | ]}t          |          d k    |v V  dS )r  Nr  )rW   r   r  s     rZ   r[   z2MediaFetcher.fetch_ddg_targeted.<locals>.<genexpr>B  s?       ' ')*q66A:: W%:::' 'r\   r  r    r  u   DDG ciblé: OK r   u   DDG ciblé: snippet fallback u   DDG ciblé: u    sources trouvées pour 'r  r   u   fetch_ddg_targeted échec: )rm   r   r  rz   r]   r   r  r   
IndexErrorEXCLUDED_DOMAINS_DDGr^   r  r  r	  r   )r  ra   r  ddg_resultsparsedrj  
name_partsr  re   
result_urlr   r   content_textr  r  r   s                 @@rZ   fetch_ddg_targetedzMediaFetcher.fetch_ddg_targeted	  s   [	 ;$V<U < <6{{< < <= = = =
 (E"=== ;+Er:::F )77K &,,..  J GCRC[ < <UU5"--
gr**%%	2..! '--c2215==fbIIFF!   H T666====,<=====  )S[7*11335 5 ' ' ' '.8' ' ' ' '  #'#:#::#F#FL# &L(9(9B(>(>$+&,#-%*$0$$7'.( (    V%?v%?%?@@@w<<;66!E 7 # &s7||b'8'8#NN(/*0'1).(/+2, ,    ! H H HJ J J"7||{:: %    H "s7|| " "" " "# # # N 	 	 	:q::;;;IIIIII	su   C$J! )/DJ! 
D&#J! %D&&A1J! A4I*J! AI*&J! *
I84J! 7I88(J! !
K
+K?K
K
>   r(   r/   r   r.   r   r+   r-   r'   r,   r&   r  r)   r  r  r%   c                 B   ddl }ddl}t          dd| d           |                                }g }d}	 |j                            d          5 }|                    | j        |          }|                    | j        |          }	|                    | j        |          }
|                    | j	        |          }|                    | j
        |          }|                                dz   }d	|fd
|	fd|
fd|ffD ]\  }}	 t          d||                                z
            }|                    |          }|r9|                    |           t          dd| dt          |           d           }# t          $ r }t          dd| d|            Y d}~d}~ww xY w	 t          d||                                z
            }|                    |          }|r't          dd|                    dd                      n*# t          $ r}t          dd|            Y d}~nd}~ww xY wddd           n# 1 swxY w Y   n*# t          $ r}t          dd|            Y d}~nd}~ww xY wt#                      }g }|D ]W}|                    dd          }|r=t%          |          }||vr*|                    |           |                    |           Xt+          |                                |z
  dz            }t          ddt          |           d|rdnd  d!| d"           ||d#|t          d$ |D                       t          d% |D                       t          d& |D                       t          d' |D                       d(d)d*S )+a  Strategie notoriete web pour personnes.
        v4.5b : orchestre 5 sous-fonctions en parallele :
        - articles parlant DE la personne
        - site personnel / LinkedIn / portfolio
        - interviews / entretiens / podcasts
        - portrait presse (image)
        - medias publics (France Inter, France Culture, Arte, RFI, INA)
        Retourne un dict {sources: [...], portrait: {...}|None, meta: {...}}
        r   Nr   z[NOTORIETE-WEB] Debut pour 'r  r   )max_workersr  articles
site_perso
interviewsmedias_publicsg      ?r   z[NOTORIETE-WEB] : z sourcesr    echec: z![NOTORIETE-WEB] Portrait trouve: r  ?z [NOTORIETE-WEB] portrait echec: r   z&[NOTORIETE-WEB] Erreur orchestration: r   rp     z[NOTORIETE-WEB] Termine: z sources uniques, portrait=OUINON, msnotoriete_webc                 D    g | ]}|                     d           dk    |S sub_typearticler  r  s     rZ   r  z4MediaFetcher.fetch_notoriete_web.<locals>.<listcomp>  s9     %; %; %;55,,	99 999r\   c                 D    g | ]}|                     d           dk    |S r"  r  r  r  s     rZ   r  z4MediaFetcher.fetch_notoriete_web.<locals>.<listcomp>  9     '> '> '>55,,<< <<<r\   c                 D    g | ]}|                     d           dk    |S r"  	interviewr  r  s     rZ   r  z4MediaFetcher.fetch_notoriete_web.<locals>.<listcomp>  9     &= &= &=55,,;; ;;;r\   c                 D    g | ]}|                     d           dk    |S r"  media_publicr  r  s     rZ   r  z4MediaFetcher.fetch_notoriete_web.<locals>.<listcomp>  s<     )@ )@ )@55,,>> >>>r\   )r  r  r)  r-  )strategy
elapsed_ms
sub_countsr  portraitmeta)concurrent.futurestimerm   futuresThreadPoolExecutorsubmit_ddg_articles_about_ddg_site_personnel_ddg_interviews_ddg_portrait_notoriete_fetch_medias_publics_guestr  rh  extendr   r   r  r  r   r  r	  round)r  ra   
concurrent_timestartr  r2  exe
f_articlesf_sitef_interviews
f_portraitf_mediasdeadlinelabelfuture	remainingrh  r  r  unique_sourcesr  r   url_normelapseds                            rZ   fetch_notoriete_webz MediaFetcher.fetch_notoriete_web|  s    	"!!!V35333	5 	5 	5 

4	>#66 ! 7 # # /@&) ZZ,e5 5
,e5 5"zz(% 1  1 ZZ0%9 9
::4e= = !::<<",,!6*!<0%x0	& C CME6C$'$uzz||3%5 %5	!'$- "/ "/ "/! :#NN6222 !95 !9 !9#&v;;!9 !9 !9: : : % C C CVAuAAaAAC C C C C C C CC
@ #C 5::<</!1 !1I)00 )  1  +  +H >V='||Hc::= => > > ! @ @ @>1>>@ @ @ @ @ @ @ @@[/@ /@ /@ /@ /@ /@ /@ /@ /@ /@ /@ /@ /@ /@ /@b  	> 	> 	><<<> > > > > > > >	>
 EE	 	/ 	/C''%$$C /)#..9,,MM(+++"))#...-566VN(;(;  *2!=   	 	 	 & +% # %; %;#1%; %; %; !< !< #& '> '>#1'> '> '> #? #? "% &= &=#1&= &= &= "> "> %( )@ )@#1)@ )@ )@ %A %A  
 
 	
s   I B2IA6E;:I;
F%F I F%%I)A$HI
H5H0+I0H55I8I II II 
I7I22I7r  c                    	 t          d| dd          }t          ddt          |           d           |s2t          |d          }t          ddt          |           d           d |                                D             }g }|d	d         D ]|}|                    d
d          }|                    dd          }|                    dd          }	|sH|                     |          | j        v rgt          fdt          D                       r|dz   |	z   	                                t          fd|D                       s	 | 
                    |          }
|
rNt          |
          dk    r;|                    dd|||
d	d         |	d           t          |          |k    r n[nG|	rEt          |	          dk    r2|                    dd|||	|	d           t          |          |k    r nm# t          $ r Y zw xY w|S # t          $ r}t          dd|            g cY d	}~S d	}~ww xY w)zSous-fonction 1 : articles web parlant DE la personne.
        Recherche Serper avec nom entre guillemets.
        Exclut les reseaux sociaux et e-commerce.
        r  r  r  r   z"[NOTORIETE-ARTICLES] Serper brut: r   z-[NOTORIETE-ARTICLES] Serper SANS guillemets: c                 \    g | ])}t          |          d k    |                                *S r  r   r]   r  s     rZ   r  z4MediaFetcher._ddg_articles_about.<locals>.<listcomp>   6     ) ) ) VVaZZ ''))'ZZr\   Nr   rp   r   r   c              3       K   | ]}|v V  	d S rU   rV   rH  s     rZ   r[   z3MediaFetcher._ddg_articles_about.<locals>.<genexpr>  r  r\   r   c              3       K   | ]}|v V  	d S rU   rV   rW   r   content_checks     rZ   r[   z3MediaFetcher._ddg_articles_about.<locals>.<genexpr>  (      BB!1-BBBBBBr\   r  r  r#  r  r  r"  r   r   r   rx   r   r   r   z[NOTORIETE-ARTICLES] Echec: r  rm   r   r   r  r  _NOTORIETE_EXCLUDED_DOMAINSr^   r  r]   r  r	  r   )r  ra   r  r  r  r  re   r  r   r   rx   r  rX  r   s               @@rZ   r9  z MediaFetcher._ddg_articles_about  s   
F	#LLLLb999F+v;;+ + +, , ,
  0'2666V/6{{/ / /0 0 0) )U[[]] ) ) )JGCRC[ . .UU5"--
gr**%%	2..! --j99T=======,<=====  "'w!6 = = ? ?BBBBzBBBBB 22:>>D "D		B$3(1&,#-%*$($K'.( (    w<<;66!E 7  "S\\B%6%6$3(1&,#-%*$+'.( (    w<<;66!E    H N 	 	 	2q224 4 4IIIIII	sP   EH A"H9H ;AHH 
HH HH 
I"H<6I<Ir   c                 J
  "#$ h d}|                                                                 }|r|d                                         nd}|r|d                                         nd}t          |          }t          |          }t          |                    dd                                                    }	d |D             }
g }d| | dd	| | dd| d
| dd	| d
| dd| dg}t                      }g }|D ]0}||vr*|                    |           |                    |           1|D ]l}	 t          j	        
                    |dt          i          }t          j	                            |d          5 }|j        dk    r|                                                    dd          }|                     |          }|rt#          |          dk    rt%          |||          s	 ddd           |                     |          #|                    dd#|d| d| d|dd          ddd           t)          dd|            t#          |          |k    r|cddd           c S ddd           n# 1 swxY w Y   ]# t*          $ r Y jw xY wd| d g}|	r|                    d!|	 d"|	 d#           |D ]W}t#          |          |k    r n@	 t-          |d$%          }t)          dd&t#          |           d'           |dd$         D ]}t#          |          |k    r n|                    d(d          }|                    d)d          }|                    d*d          }|s^|                     |          ##| j        v r}t3          |          $t5          $fd+|D                       r|dz   |z                                   "t5          "fd,|
D                       s|#v p+|	#                    d
d                              d-d          v }|pt5          #fd.|D                       }d}	 |                     |          }|rt#          |          d/k    r
|dd0         }n# t*          $ r Y nw xY w|s|r|}d1}|r|rt%          |||          }|s|r>t#          |          d2k    r+d} |rd| d} |                    dd#||| |z   ||d           ֐.# t*          $ r}!t)          d3d4|!            Y d}!~!Qd}!~!ww xY w|S )5zSous-fonction 2 : site personnel, LinkedIn, portfolio.
        v4.5b : detection nom de famille dans domaine + URLs directes.
        >	   journaliste.comr    r  r   r   r*   r"   r  r  r   rp   r   r   c                 \    g | ])}t          |          d k    |                                *S r  rS  r  s     rZ   r  z4MediaFetcher._ddg_site_personnel.<locals>.<listcomp>P  s+    ===A#a&&1**aggii***r\   zhttps://www.z.frhttps://r`  r  r  r  r   r   r   r   r  r  Nr  r  u   Site personnel — z[Site officiel: ]
i  Tr  r"  r   r   r   rx   r   
is_profiler   z([NOTORIETE-SITE] Site personnel trouve: r  z@" site:linkedin.com OR biographie OR site personnel OR portfolior  z.fr OR site:z.comr   r  z[NOTORIETE-SITE] Serper brut: r   r   r   r   c              3   f   K   | ]+}t          |                    d d                    k    V  ,dS )r   rp   N)r   r  )rW   ru  result_url_norms     rZ   r[   z3MediaFetcher._ddg_site_personnel.<locals>.<genexpr>  sU       - -  *!%%r*:*:;;-. - - - - - -r\   c              3       K   | ]}|v V  	d S rU   rV   rW  s     rZ   r[   z3MediaFetcher._ddg_site_personnel.<locals>.<genexpr>  <       4 4#$  !M1 4 4 4 4 4 4r\   r@  c              3       K   | ]}|v V  	d S rU   rV   )rW   pdr   s     rZ   r[   z3MediaFetcher._ddg_site_personnel.<locals>.<genexpr>  s;       : :#%  "V| : : : : : :r\   r   r  Fr  r   z[NOTORIETE-SITE] Serper echec: )rf   r   r]   rz   r   r  r  r	  r   r  r  r  r  r  r  r  r  r   r(  r  rm   r   r  r  r\  r   r^   r  )%r  ra   r  profile_domainsr   r$  r#  lastname_asciifirstname_asciifullname_asciir  r  direct_urlsseenunique_directu
direct_urlr  r  r  rx   search_queriessqr  re   r  r   r   domain_matches_namerc  fetchedconfirmed_personalprefixr  rX  r   re  s%                                     @@@rZ   r:  z MediaFetcher._ddg_site_personnel;  s   

 
 
 ##%%(-559??$$$2(-5E!HNN$$$2	'11(33'MM#r""((**, , >====
 @??N???;;;;;@?@@^@@@<<<<<<.>...
 uu 	( 	(A}}$$Q'''' %	 %	J$n,,Z *B,   ^++Q , ( ( /+/{c))#yy{{11#I  2  7  7#77== /CIINN $5$.t$= $= ) (/ / / / / / / &*%9%9*%E%EF#NN(7,8*0'1$A%$A$A%7z %7 %7)-ete%7 %7+-.2, ,    !!8+5!8 !89 9 9  #7||{::'.=/ / / / / / / / / / / / / / / / / / / / / / / / />    
. . . .

  	.!!- - -&- - -. . . ! V	; V	;B7||{**S;'333V/6{{/ / /0 0 0   I IA7||{22!"ub!1!1JEE'2..EeeIr22G% ! !11*==F!AAA  '5Z&@&@O - - - -$+- - - - - ! ! g-uuww " 4 4 4 4(24 4 4 4 4 !  '&0 6)V^^.% .%%,WS"%5%56 (
 , : : : : :)8: : : : :  D"&"9"9&#( #(" 2s7||c'9'9#*5D5>D$    'G '& */&* %t %->&t.5 .5*1 %$ D		B!#- '!&: !& !& !& #  $3(4&,#-%*$*TM'.*4	( 	( 	 	 	  ; ; ;V9a99; ; ; ; ; ; ; ;; s   9A	JA1J3J?A+J*J9JJ			JJ		J
J J $E)S84RS8
RS8RA&S88
T TT c                    d| d}	 t          |d          }t          ddt          |           d           |s7| d}t          |d          }t          dd	t          |           d           d
 |                                D             }g }|dd         D ]|}|                    dd          }	|                    dd          }
|                    dd          }|	sH|                     |	          | j        v rgt          fdt          D                       r|
dz   |z   	                                t          fd|D                       s	 | 
                    |	          }|rNt          |          dk    r;|                    dd|	|
|dd         |d           t          |          |k    r n[nG|rEt          |          dk    r2|                    dd|	|
||d           t          |          |k    r nm# t          $ r Y zw xY w|S # t          $ r}t          dd|            g cY d}~S d}~ww xY w)zxSous-fonction 3 : interviews, entretiens, podcasts.
        Recherche Serper ciblee sur les apparitions medias.
        r  z<" interview OR entretien OR podcast OR tribune OR conference   r  r   z$[NOTORIETE-INTERVIEWS] Serper brut: r   z- interview OR entretien OR podcast OR tribunez/[NOTORIETE-INTERVIEWS] Serper SANS guillemets: c                 \    g | ])}t          |          d k    |                                *S r  rS  r  s     rZ   r  z0MediaFetcher._ddg_interviews.<locals>.<listcomp>  rT  r\   Nr   rp   r   r   c              3       K   | ]}|v V  	d S rU   rV   rH  s     rZ   r[   z/MediaFetcher._ddg_interviews.<locals>.<genexpr>  r  r\   r   c              3       K   | ]}|v V  	d S rU   rV   rW  s     rZ   r[   z/MediaFetcher._ddg_interviews.<locals>.<genexpr>  rY  r\   r  r  r)  r  rZ  r   r   z[NOTORIETE-INTERVIEWS] Echec: r[  )r  ra   r  r	  r  	search_nqr  r  re   r  r   r   rx   r  rX  r   s                 @@rZ   r;  zMediaFetcher._ddg_interviews  sF   
( ( ( ( 	
I	#Lb999F+v;;+ + +, , ,
  < - - -  (	r:::V;#&v;;; ; ;< < <) )U[[]] ) ) )JGCRC[ . .UU5"--
gr**%%	2..! --j99T=======,<=====  "'w!6 = = ? ?BBBBzBBBBB 22:>>D "D		B$3(3&,#-%*$($K'.( (    w<<;66!E 7  "S\\B%6%6$3(3&,#-%*$+'.( (    w<<;66!E    H N 	 	 	4446 6 6IIIIII	sP   EH A"H H AHH 
HH HH 
I)I=II))zFrance Interr   )France Culturer   )r  r   )zFrance Infor  )Arter  )RFIr   )INAina.frr   c                 *   d |                                 D             }g }| j        D ]\  }}t          |          dk    r n	 d| d| }t          |d          }d}	|dd         D ]R}
|	|k    r nH|
                    dd	          }|
                    d
d	          }|
                    dd	          }|sQ|                     |          }||vrk|dz   |z                                   t          fd|D                       sd	}	 |                     |          }|rt          |          dk    r
|dd         }n# t          $ r Y nw xY w|s|r|}|rat          |          dk    rN|
                    dd||| d| d| d| d| |d           |	dz  }	t          dd| d|dd                     T# t          $ r!}t          dd| d|            Y d}~d}~ww xY wt          ddt          |           d| d           |S ) zSous-fonction 5 : apparitions sur medias publics francais.
        Cherche les pages emissions mentionnant la personne.
        Retourne une liste de sources notoriete_web.
        v4.5b
        c                 \    g | ])}t          |          d k    |                                *S r  rS  r  s     rZ   r  z<MediaFetcher._fetch_medias_publics_guest.<locals>.<listcomp>T  s6     % % %AQ! ggii#r\   r   r  z" site:r  r   Nr   rp   r   r   r   c              3       K   | ]}|v V  	d S rU   rV   rW  s     rZ   r[   z;MediaFetcher._fetch_medias_publics_guest.<locals>.<genexpr>r  rg  r\   r  iX  r  r  r-  r  [ra  rZ  r   r   z[NOTORIETE-MEDIAS]     → r  r   r  z sources medias publics pour 'r  )r   _MEDIAS_PUBLICSr   r  r  r  r]   r^   r  r   r	  rm   )r  ra   max_per_mediar  r  
media_namer   r	  r  found_for_mediare   r  r   r   result_domainrx   rv  r  rX  s                     @rZ   r=  z(MediaFetcher._fetch_medias_publics_guestN  sp   % % % % %
"&"6 >	C >	CJ7||q  ;C959999'!<<<"# 26 26A&-77!"ub!1!1JEE'2..EeeIr22G% ! $($8$8"%$ %$M ]22  g-uuww " 4 4 4 4(24 4 4 4 4 !  D"&"9"9&#( #(" 1s7||b'8'8#*4C4=D$    'G '& 6D		B$3(6&3#-#- : :5 : :!,J !, !,E !, !,%)!, !,'.( (    (1,V5* 5 5#-crc?5 56 6 6  C C CVA*AAaAAC C C C C C C CC 	V-#g,, - -$)- - -	. 	. 	. s>   CG4EG
EGEA.G
G,G''G,c           	         	 |g}|r[|}dD ]#}||v r|                     |          d         } n$|                                dd         }|r|                    |           |                    d           d                    |          }t	          |d          }	t          d	d
| dt          |	           d           |                                }
|
                                 }t          |          dk    rd                    |dd                   n|
}h d}d}t          |          D ]}||vrt          |          dk    r|} n|r|d         nd}t          |          dk    rd                    |dd                   n|
}|	dd         D ]}|	                    dd          |	                    dd                                          }|	                    dd                                          }
                    d          s|                               r|dz   |z   }|
|v p|o||v p|o||v }|s/|r,t          ||          rt          d	d|dd                     nt          fddD                       rd|vrLd                                vr6dz   |z   }t          |||          st          d	ddd                     Jt          d	ddd                     |dd|	                    dd          dc S t          d	d            dS # t          $ r}t          d!d"|            Y d}~dS d}~ww xY w)#u  Sous-fonction 4 : portrait presse/web pour personnes sans Wikipedia.
        Utilise Serper Images avec filtre anti-homonymie.
        v4.9f : filtre prenom anti-homonyme renforce.
        v4.9f : source croisee — accepte les images dont le domaine est
        deja valide par le pipeline texte.
        v4.9f C3 : injection optionnelle de la profession dans la requete.
        Retourne un dict portrait ou None.
        ra      ·  | z chez z at r   N(   zportrait photor   rz  r  r   z[NOTORIETE-PORTRAIT] Requete: 'r    resultats brutsr   >   r}   r   r   r   r   r   r   rp   r   imageUrlr   r   r`  z6[NOTORIETE-PORTRAIT] Source croisee (domaine valide): r  c              3   D   K   | ]}|                                 v V  d S rU   r]   )rW   rZ  img_urls     rZ   r[   z7MediaFetcher._ddg_portrait_notoriete.<locals>.<genexpr>  sD       . . W]]__, . . . . . .r\   )logoiconbannerfaviconr*   	licdn.comz/[NOTORIETE-PORTRAIT] REJETE (homonyme prenom): z[NOTORIETE-PORTRAIT] ACCEPTE: person_portraitnotoriete_web_serperr   captionr  r  
source_urlz*[NOTORIETE-PORTRAIT] Aucun portrait valider   z[NOTORIETE-PORTRAIT] Echec: )r   rf   r	  rw   r  rm   r   r]   reversedr  r   _is_google_proxy_urlr  r^   r:  r   )r  ra   r  rg  ri  profsep
prof_cleanenriched_queryserper_resultsq_lowerq_parts	q_surname	particlesq_lastr   q_firstname
q_lastnamer  
item_titler  
check_text
name_match	img_checkr  r  s                            @rZ   r<  z$MediaFetcher._ddg_portrait_notoriete  s   p	 'K 	3!C  Cd{{#zz#q1 # "ZZ\\#2#.
 3&&z222/000 XXk22N1B( ( (N9"9 9~&&9 9 9: : : kkmmGmmooG),W)9)9%%% ' ' 'IFg&&  I%%#a&&1**FE )07'!**RK),W)9)9%%%  'ss+ < <((:r22!XXgr2288::
((62..4466))*55 ,,W55  (#-7
z) 9!=i:&=976Z#7 
 " ! % ! 9 '!7 !7! V@18"@ @A A A A !  . . . ."-. . . . .   #'11'w}}>> '#
 :I2%{J@ @ !V.&ss|. ./ / / !V&ss|& &' ' ' #$-4"&((62"6"6     <> > >4 	 	 	2q224 4 444444	s   LL, L, ,
M6MMr   c                     |                      ||          }|r|S t          dd           |                     ||          }|S )zRechercher des images via DDG Images (priorite) + Wikimedia Commons (fallback)
        Filtre : HTTPS uniquement, min 200x200px
        Retourne une liste de dicts {url, caption, source, width, height}
        r   z6DDG Images: aucun resultat, fallback Wikimedia Commons)_fetch_ddg_imagesrm   _fetch_wikimedia_commons)r  ra   r  r  s       rZ   fetch_imagezMediaFetcher.fetch_image	  sS     ''{;; 	M 	VNOOO..ukBBr\   c                    	 t          ||dz            }n,# t          $ r}t          dd|            g cY d}~S d}~ww xY wg }|D ]r}|                    dd          }|                    d          s.|                    ||                    d	d          d
ddd           t          |          |k    r nst          dd| dt          |           dt          |           d           |S )z(Recherche d'images via Serper Images APIr   r  r   zEchec Serper Images: Nr  rp   r`  r   serper_imagesr   r   r  r  widthheightr   zSerper Images pour 'r   z images filtrees sur z brutes)r  r   rm   r  r   r	  r   )r  ra   r  r  r  r  re   r  s           rZ   r  zMediaFetcher._fetch_ddg_images*	  sG   	*5kAoFFFGG 	 	 	444555IIIIII	  	 	AeeJ++G %%j11 MM55"--)     6{{k)) * 	VmEmmc&kkmmX[\cXdXdmmmnnns    
A ;A A c                 p   dt           j                            |           }	 t           j                            |dt
          i          }t           j                            |t                    5 }|                                	                    dd          }ddd           n# 1 swxY w Y   t          j        d	|          }|r|                    d
          S t          j        d|          }|r|                    d
          S n*# t          $ r}t          dd|            Y d}~nd}~ww xY wdS )z/Obtenir le token vqd necessaire pour DDG Imageszhttps://duckduckgo.com/?q=r  r  r   r   r   r  Nzvqd=["\x27]([\d-]+)["\x27]r   zvqd=([\d-]+)r   zEchec obtention vqd DDG: )r   r   quoter  r  r  r  r  r  r  r   r  r  r   rm   )r  ra   r   r  r  r  r  r  s           rZ   _get_ddg_vqdzMediaFetcher._get_ddg_vqdG	  s   F6<+=+=e+D+DFF	:.((|Z6P(QQC'']'CC Etyy{{))'))DDE E E E E E E E E E E E E E E I;TBBE &{{1~~%Iot44E &{{1~~%& 	: 	: 	:8Q8899999999	:tsB   AD 2*B(D (B,,D /B,0.D +D 
D3D..D3c           
         t           j                            dd|dt          |dz            dd          }d| }	 t           j                            |dd	i
          }t           j                            |t                    5 }t          j	        |
                                                    d                    }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY w|                    di                               dg           }	|	sg S d                    d |	D                       }
t           j                            d|
dddd          }d| }	 t           j                            |dd	i
          }t           j                            |t                    5 }t          j	        |
                                                    d                    }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY w|                    di                               di           }g }|                                D ]1\  }}|dk    r|                    dg           }|s&|d                             dd          }|d                             dd          }|d                             dd          }|                    d          s|dk     s|dk     r|                                                    d          r|                    ||                    d d                              d!d                              d"d          d#||d$           t-          |          |k    r n3t          d%d&| d't-          |           d(           |S ))z:Fallback : rechercher des images sur Wikimedia Commons APIra   r  6r   r   )actionr  srsearchsrnamespacesrlimitformat(https://commons.wikimedia.org/w/api.php?r  CyberStrat/1.0r  r   r   Nr   z#Echec Wikimedia Commons recherche: r  c              3   &   K   | ]}|d          V  dS )r   NrV   r  s     rZ   r[   z8MediaFetcher._fetch_wikimedia_commons.<locals>.<genexpr>v	  s&      66!G*666666r\   	imageinfozurl|size)r  titlespropiipropr  z#Echec Wikimedia Commons imageinfo: pagesz-1r   r   rp   r  r  r`  r   z.svgr   zFile:zFichier:wikimedia_commonsr  r   zWikimedia Commons pour 'r   r  )r   r   	urlencodestrr  r  r  r  r   r  r  r  r   rm   r  rw   r   r   r]   rB  r	  r   r   )r  ra   r  r  r   r  r  r   r  r  r  info_paramsinfo_url	info_datar  r  page_idpager  r  r  r  s                         rZ   r  z%MediaFetcher._fetch_wikimedia_commonsZ	  s    '';?++)
 )
   BAA	.((.7(  C '']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 	 	 	BqBBCCCIIIIII	 ((7B''++Hb99 	I 66g66666l,, .
 .
   LkKK	.((.<(  C '']'CC Dt Jtyy{{'9'9''B'BCC	D D D D D D D D D D D D D D D 	 	 	BqBBCCCIIIIII	 gr**..w;;"[[]] 	 	MGT$b11I l&&ub11GaL$$Wa00Eq\%%h22F %%j11 s{{fsll}}''// MM88GR0088"EEMMjZ\]]-      6{{k)) * 	VNNN#f++NNNOOOs   A	C :C C CC CC 
D"C<6D<D<A	H :H?H HH HH 
I !H;5I ;I )r*   r  r   r'   r)   r&   r,   r$   r    zjournaldesfemmes.frr  r  r  r"   r  r   rK  r  c                 (   |s|S |                     d                                          }g d}|D ]@}|                    |          r)|t          |          d                                         }A|rt          |          dk     rdS |dd         S )u   Nettoyer un snippet LinkedIn brut pour extraire la profession.
        Supprime les tokens de navigation LinkedIn (Experience, Competences,
        Formation, etc.) et les marqueurs · en debut de chaine.
        u   · )u   Expérience :zExperience :u   Expérience:u   Compétences :zCompetences :u   Compétences:zFormation :z
Formation:u   À propos :z
A propos :zLicences et certifications :u   Bénévolat :zBenevolat :Nr   r   )lstriprf   r   r   )r  rawcleanednoise_prefixesrx  s        rZ   _clean_linkedin_professionz'MediaFetcher._clean_linkedin_profession	  s    
  	J**U##))++
 
 
 % 	8 	8F!!&)) 8!#f++,,/5577 	#g,,**4tt}r\   c                    |ddg ddd}	 t          d| dd          }t          dd| d	t          |           d
           t          |                                          }d |                                D             }|dd         D ]}|                    dd          }|                    dd          }|                    dd          }	t          |dz   |z                                             t          fd|D                       }
|
s|	                                dv r9|d                             dd|	|d           d|v r|                    d          }t          |          dk    r]| 	                    |d         
                                dd                   |d<   |d         
                                dd         |d<   nd|v r|                    d          }t          |          dk    r]| 	                    |d         
                                dd                   |d<   |d         
                                dd         |d<   d|d<   nt          fd | j        D                       rgd!v r                    d!          d         n}|                    d"d          }|d                             ||	|d           |d         dk    rd#|d<   |d$         s|
rt          |	          s|dd%         |d$<   |d         so|d$         rg|d$                                         g d&}fd'|D             }|r9d(                    |dd)                   |d<   t          dd*|d                     ||d+<   t          dd,| d-|d          d.|d          d/t          |d                               n*# t"          $ r}t          d0d1|            Y d}~nd}~ww xY w|S )2u{  Resoudre l'identite d'une personne depuis les premieres
        sources en ligne : LinkedIn, reseaux sociaux, pages perso, etc.
        Retourne un profil structure : nom confirme, metier,
        localisation, liens de profil, bio snippet.
        Utilise DDG HTML (pas Lite) pour acceder aux reseaux sociaux.
        Zero import requests — urllib.request uniquement.
        Nr  )confirmed_namerg  locationprofile_linksbio_snippetr  r  r   r  r   zIdentity Serper pour 'r   r  c                 8    g | ]}t          |          d k    |S r  r  r  s     rZ   r  z8MediaFetcher.resolve_person_identity.<locals>.<listcomp>	  s*       #a&&1*****r\   r   rp   r   r   r   c              3       K   | ]}|v V  	d S rU   rV   )rW   r   combined_texts     rZ   r[   z7MediaFetcher.resolve_person_identity.<locals>.<genexpr>	  s9       # #+,A&# # # # # #r\   r=  r  r   LinkedInplatformr   r   ra  r   r   rg  r   r  r  r  r  c              3       K   | ]}|v V  	d S rU   rV   )rW   r  r   s     rZ   r[   z7MediaFetcher.resolve_person_identity.<locals>.<genexpr>
  s;       9 9 i 9 9 9 9 9 9r\   r   r   r  r  ,  )r)  r*  r+  r,  u   médecinr.  r-  r:  rU  rV  rT  r  rv  r`  u	   écrivainra  rC  ro  u
   ingénieur
sociologue
philosophe	historienr  politologueanthropologue
biologistec                     g | ]}|v |	S rV   rV   rW   rZ  	bio_lowers     rZ   r  z8MediaFetcher.resolve_person_identity.<locals>.<listcomp>:
  s%       "	//B///r\   r  r   z$Profession extraite du bio_snippet: ddg_raw_resultsz
Identite 'z': confidence=r  z, links=r   zresolve_person_identity echec: )r  rm   r   rz   r]   r   r  r$  r   r  rf   r^   IDENTITY_SOURCESr   r	  r   rw   r   )r  ra   identityr  rj  r  re   r   r   r  name_presentr   r  prof_keywordsfound_profsr  r  r  r   s                   @@@rZ   resolve_person_identityz$MediaFetcher.resolve_person_identity	  s{    $
 
m	@$\\\\r:::G2 2 2w<<2 2 23 3 3 )77K &,,..  J SbS\ :< :<gr**%%	2..UU5"--
 !/S[7*1133!5 !5" # # # #0:# # #     $ &,,..	 **_-44Q$.)#*9 9    '' 'e 4 4u::??595T5T %a 0 0# 668 68H\2 !&a 0 0# 6 %Z07** 'f 5 5u::??595T5T %a 0 0# 668 68H\2 !&a 0 0# 6 %Z0-3H\** 9 9 9 9"&"79 9 9 9 9 : y((  )s33A66/8 '//;;H_-44$,)#*6 6   
  -6619. !/ <L < /
 ; ;<.5dsdmH]+ L) 5h}.E 5$]399;;		! 	! 	!   !.    5-1YY#BQB.) .)H\*4#L14 45 5 5 +2H&':U : :&|4: :&|4: : Xo677: :; ; ; ;  	@ 	@ 	@>1>>????????	@ s   OO 
O:O55O:c                   +,- ddl }|                                 }t          d| dd          }t          dd| dt          |           d	           |s2t          |d          }t          dd
t          |           d	           t	          ||          \  }}d}t          |          t          |          k    rd}t          ||||          }|rjt          |          t          |          z   }	t          |	d          \  }}
|dz  }t          ddt          |           dt          |           d	           n|dz  }t          dd           t          |	                                          }|
                                }d}|D ]}	 t          j                            |                    dd                    j        	                                }|                    dd          }t          |          ,t#          ,fd|D                       rd} n# t$          $ r Y w xY w|s|dk     rd| d}t          |d          }d |D             --fd|D             }|rht          |          |dd         z   }	t          |	d          \  }}
t          ddt          |dd                    dt          |           d	           nt          dd           |dz  }n|rt          dd           t          |	                                          }d  |
                                D             }g }|D ]*}t'          |||          }|r|                    |           +t          dd!t          |           d"t          |           d	           t+          |          }t          dd!t          |           d#d$ |D              d%           t-          ||          }|d&         pg }||d'         dg d|d(         |d)         |d*         |d+	}|D ]}|d,         d-k    r|d.                             dd/|d         |d0         d1           d2|d0         v rS|d0         
                    d2          }t          |          dk    r%|d                                         dd3         |d4<   |                    d'          r|                     |d'                   |d'<   n|d,         d5k    r	 t          j                            |d                   j        	                                }|                    dd          }n# t$          $ r |d,         }Y nw xY w|d.                             ||d         |d0         d1           |d6         s0|d7         r(t5          |d                   s|d0         dd8         |d6<   |d'         sc|d6         r[t          |d6         	                                          +g d9}+fd:|D             }|r d;                    |dd                   |d'<   g }|d)         d<v r2d= |D             } t          |           t          |          k     r1t          dd>t          |          t          |           z
   d?           g }!| D ]J}|d,         d@v r|d,         dAv r|!                    |           -|dB         r|!                    |           K|!dd         D ]}	 |                     |d                   }"|"r2t;          |"          r#t          ddC|d         ddD                     d}"|"rH|                    dB          s3t=          |"|          s#t          ddE|d         ddD                     d}"|"rt          |"          dFk    rn|                     |d                   }dG}#|dB         rdH}#n|d,         dIk    rdG}#|                    dJ|#||d         |dK         |"ddL         |d0         dM           nq|d0         rit          |d0                   dDk    rP|                     |d                   }|                    dJdG||d         |dK         |d0         |d0         dM           # t$          $ r/}$t          dNdO|d         ddD          dP|$            Y d}$~$d}$~$ww xY w| D ]}|d,         dQv rtA          |d         |dK         |d0                   r"t          ddR|d         ddD                     Q|                     |d                   }|                    dJdH||d         |dK         |d0         p|dK         |d0         ddS           tC                      -g }%|D ]U}&tE          |&                    dd                    }'|'r.|'-vr*-#                    |'           |%                    |&           Vd}(|d)         d<v r+| $                    ||%|                    d'          T          }(tK          |                                 |z
  dUz            })t          ddV| dW|d)          dX|d(          d;t          |%           dY|(rdZnd[ d;|) d\           |%|(d]|)|d)         t          |          t          |          ||(rdndz   t          d^ |%D                       t          d_ |%D                       t          d` |%D                       t          da |%D                       dbdcdd}*||*deS )fu  v4.9e — Algorithme de discrimination de personnes.
        Remplace resolve_person_identity + fetch_notoriete_web
        en une seule methode avec 1-2 appels Serper.

        Etapes:
        1. Requete Serper unique "{prenom} {nom}" → 10 resultats
        2. Extraction signaux d'identite par resultat
        3. Clustering par compatibilite (union-find)
        4. Decision de discrimination (seuils)
        5. Scraping selectif du cluster dominant
        6. Construction identity + sources + portrait

        Retourne: {
            "identity": {resolve_person_identity format},
            "notoriete": {fetch_notoriete_web format},
        }
        r   Nr  r   r  r   z[DISCRIM] Serper pour 'r   r   z"[DISCRIM] Serper SANS guillemets: r   r   )r  z[DISCRIM] Phase 2 media: +r  z*[DISCRIM] Phase 2 media: 0 nouvelle sourceFr   rp   r   c              3   F   K   | ]}t          |          d k    |v V  dS r   Nr  )rW   r   r_domain_asciis     rZ   r[   z3MediaFetcher.discriminate_person.<locals>.<genexpr>
  s?       & &qa&&1** N*$***& &r\   Tr  z" site personnel OR biographier   c                 :    h | ]}|                     d d          S r  r  r  s     rZ   r  z3MediaFetcher.discriminate_person.<locals>.<setcomp>
  s&    ;;;aub));;;r\   c                 D    g | ]}|                     d d          v|S r  r  )rW   re   r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>
  s<       55##944 444r\   r   z[DISCRIM] Phase 3 site perso: +z/[DISCRIM] Phase 3 site perso: 0 nouvelle sourcez/[DISCRIM] Phase 3 skip: site perso deja presentc                 8    g | ]}t          |          d k    |S r  r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>
  s*     
 
 
c!ffqjjAjjjr\   z
[DISCRIM] z signaux extraits sur z clusters (tailles: c                 ,    g | ]}t          |          S rV   r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>
  s    333Q#a&&333r\   r  r  rg  r  r  r  )	r  rg  r  r  r  r  discrimination_statusr  r  rb  r=  r  r  r   r  ra  r   r  rF  r  rd  r  )r)  r*  r+  r,  r.  r-  r:  rU  rV  rT  r  rv  r`  ra  rC  ro  r  r  r  r  r  c                     g | ]}|v |	S rV   rV   r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>  s%       bIooooor\   r  )r  r  c                 <    g | ]}t          |d                    |S r   )r   )rW   r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>"  s:     ! ! !'E
33!! ! !r\   z[DISCRIM] FIX-A: z domaines bruit filtres)r   )rJ  r<  rN  rf  z)[DISCRIM] FIX-B: paywall/adblock detecte r   z'[DISCRIM] FIX-D: contenu non pertinent r  r#  r  rN  r  r   r  rZ  r   z[DISCRIM] Scrape echec r  rn  z+[DISCRIM] FIX-E: LinkedIn generique ignore rb  )r  rg  r  z[DISCRIM] Termine 'z
': status=r  z sources, portrait=r  r  r  zdiscrimination_v4.9gc                 D    g | ]}|                     d           dk    |S r!  r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>  s9     $; $; $;55,,	99 999r\   c                 D    g | ]}|                     d           dk    |S r%  r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>  r&  r\   c                 D    g | ]}|                     d           dk    |S r(  r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>  r*  r\   c                 D    g | ]}|                     d           dk    |S r,  r  r  s     rZ   r  z4MediaFetcher.discriminate_person.<locals>.<listcomp>  sD     )( )( )(55,,&' ' ' ' 'r\   )r#  r  r)  r-  )r.  r/  r  clusters_countdominant_cluster_sizeserper_creditsr0  r1  )r  	notoriete)&r5  r  rm   r   r  r  r  r  rz   r]   r   r   r   r   r  r   r   r^   r   rl  r	  r  r  r   rf   r  r   rw   r  r   r   r  r   r  r   r  r<  r?  ).r  ra   rA  rB  r  r  r  r  media_extrar   r  query_ascii_checkname_parts_checkhas_personal_sitere   r_domainperso_queryperso_results	new_persorj  ri  r  r  r  decisionr  r  r   r   r  r  r  filtered_dominantscrape_targetsrx   r"  r  rM  r  rN  r2  rO  r  r  r  r  s.                                              @@@rZ   discriminate_personz MediaFetcher.discriminate_personS
  se   $ 	

 )U2>>>V0e 0 0?##0 0 0	1 	1 	1
  	5,U;;;O4''4 4 45 5 5
 $=?$$ $$ w<<3////N -?G_> > 	>G}}tK'8'88H/, , ,JGQaND$$D D+.w<<D D DE E E E aN<> > >
 +5;;==99,2244! 	 	A
!<00EE%$$& &&,UUWW #++FB77!/!9!9 & & & &4D & & & & & (,%E     ! 	+^a%7%7CeCCCK*;A>>>M;;7;;;I   (  I  )==9RaR=83Q0 0 0
V0IbqbM**0 07||0 0 01 1 1 1
 V() ) ) aNN 	+*+ + +
 %U[[]]33
 
"((**
 
 

  	$ 	$A+;- -C $s###V,W , ,w<<, , ,	- 	- 	-
 $G,,V7X 7 733(3337 7 7	8 	8 	8
 *(E::./52 $"<0"<0%-h%7'(9:&

 

  %	? %	?C:*,,)00 *u:"9~5 5    C	N**	N0077E5zzQ!!HNN,,TcT2 !, 77<(( 077-/ / \* ZH,,-#\22E
$ $$*5577 #^^FB77FF  - - - _FFF-)00 &u:"9~2 2    ]+ ?O,?+CJ77? +.i.#*>' % 	%(=*A 	%&'--//1 1I  M   *  K  %)-O*% *%& H!<<<! !'! ! ! $%%H55V/8}}s+<'='==/ / /0 0 0  N( / /z?j00z? '3 3 3"))#....)* /"))#... &bqb) 43 4333223u:>>D $ 5 5 $V9'*5z#2#9 9: : :  $  $$'GG,<$=$=$$8 $e%- %-$ V;),UCRC; ;< < <  $ D		B!%!5!5c%j!A!A#,/0 1'3HH _
::'0H$3(0&,#&u:%(\$($K'*9~( (     Y C	N-, -,.0-1 -1!%!5!5c%j!A!A$3(1&,#&u:%(\$'	N'*9~( (    ! 3 3 32u:crc?2 2./2 23 3 3 3 3 3 3 33 )  z?&<<<+JG	N, , ! V1"5z#2#1 12 2 2 !!11#e*==FNN /$0"("5z!$W #I >#g,#&y>&*	$ 	$ 	 	 	 EE	 	+ 	+C%cggeR&8&899H +HI55h'''%%c*** H!<<<33N#<<55 4 7 7H -566V%  x( "<0  >""  "*4u	 
   	 	 	 & 2%)1();"%h--),X"8&:aa;" $; $;#1$; $; $;  <  < #& '> '>#1'> '> '> #? #? "% &= &=#1&= &= &= "> "> %( )( )(#1)( )( )( %) %)  
 
	< !"
 
 	
s?   BH
H+*H+.AUUU0Fa??
b8	$b33b8c                    g }t          j        d|t           j                  }|s t          j        d|t           j                  }|s t          j        d|t           j                  }t          j        d|t           j                  }|s t          j        d|t           j                  }t          j        d|t           j                  }t          |          D ]3\  }\  }}|                     |          }	|	se|t          |          k     rOt          j        dd||                                                   }
|
r|
                    d	          sd
|
z   }	n|
r|
}	|	st          j        dd|                                          }| 	                    |          }d}|t          |          k     rCt          j        dd||                                                   }| 	                    |          }|
                    |	||d           5|S )ud  Parser les resultats de recherche DuckDuckGo HTML.
        Structure DDG HTML :
        - Liens : <a class="result__a" href="...">Title</a>
        - Snippets : <a class="result__snippet">...</a>
          ou <td class="result-snippet">...</td>
        - URLs : <span class="result__url">...</span>
        Zero BeautifulSoup — regex uniquement.
        zL<a[^>]*class=["\']result__a["\'][^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>zL<a[^>]*href=["\']([^"\']*)["\'][^>]*class=["\']result__a["\'][^>]*>(.*?)</a>zN<a[^>]*class=["\']result-link["\'][^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>z5<a[^>]*class=["\']result__snippet["\'][^>]*>(.*?)</a>z6<td[^>]*class=["\']result-snippet["\'][^>]*>(.*?)</td>z7<span[^>]*class=["\']result__url["\'][^>]*>(.*?)</span><[^>]+>rp   httpr`  r   )r   findallr  r   _decode_ddg_urlr   r   rf   r   _decode_html_entitiesr	  )r  r  r  linkssnippetsurls_displayr   href
title_htmlr   display_urlr   r   s                rZ   _parse_ddg_html_resultsz$MediaFetcher._parse_ddg_html_results  sZ     
7")
 

  	J<bi E
  	J;bi E :")
 

  	zbi H z")
 
 &/u%5%5 !	 !	!A!j&&t,,C s<(((("$&"BQ# #egg   # *;+A+A",$ ,$ *(;6$ *)  F:r:66<<>>E..u55E G3x==  &HQK %''  44W==NN"      r\   c                 :   |                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd	          }|                     d
d          }|                     dd          }|S )z"Decoder les entites HTML courantes&amp;&&quot;r  &#x27;r  &#39;&lt;<&gt;>z&nbsp;r   )r   )r  rx   s     rZ   r  z"MediaFetcher._decode_html_entities  s    ||GS))||Hc**||Hc**||GS))||FC((||FC((||Hc**r\   c                 "   ddl d }d}|                                }t          j        |d|                                t          j                  }t          j        dd|                                          }g }d	                    d
 |                                D                       }|                    |            ||          }|                    |           |                    |	                    dd                     fd}	 |	|          }
|
|k    r|                    |
           t                      }g }|D ]2}|r.||vr*|                    |           |                    |           3t          dd|            |S )zGenere plusieurs variantes de slug Wikipedia a essayer.
        Retourne une liste ordonnee du plus probable au moins probable.
        Gere : tirets (Saint-Paul-Lacoste), accents, suffixes geo.
        r   Nc                 `   h d}|                                  }g }t          |          D ]p\  }}|dk    s|                                |vr(|                    |                                           I|                    |                                           qd                    |          S )z7Met chaque mot en majuscule sauf articles/prepositions.>   rI  laur}   r   enetr   r   auxr   r  sursousr   r   )r   r   r]   r	  
capitalizerw   )r  LOWER_WORDSr&  rh  r   r   s         rZ   to_title_smartz>MediaFetcher._normalize_wikipedia_slug.<locals>.to_title_smart0  s      K
 GGIIEF!%(( - -166QWWYYk99MM!,,..1111MM!'')),,,,88F###r\   u   \b(gard|herault|hérault|lozere|lozère|var|bouches.du.rh[oô]ne|vaucluse|ard[eè]che|dr[oô]me|aude|pyr[ée]n[ée]es|france|d[ée]partement|r[ée]gion|commune|canton|arrondissement)\brp   flags\s+r   r`  c              3   >   K   | ]}|                                 V  d S rU   )r9  rW   r   s     rZ   r[   z9MediaFetcher._normalize_wikipedia_slug.<locals>.<genexpr>Q  s;       ; ; ALLNN; ; ; ; ; ;r\   r  c                 r                         d|           } d                    fd| D                       S )NNFDrp   c              3   N   K   | ]}                     |          d k    |V   dS MnNcategoryrs   s     rZ   r[   zPMediaFetcher._normalize_wikipedia_slug.<locals>.strip_accents.<locals>.<genexpr>_  sH        ''**d22 2222 r\   )rv   rw   r  ru   s    rZ   strip_accentsz=MediaFetcher._normalize_wikipedia_slug.<locals>.strip_accents]  sT    %%eQ//A77         r\   r   zWikipedia slugs a essayer: )ru   rf   r   r   r]   
IGNORECASErw   r   r	  r   r  r  rm   )r  ra   r;  GEO_SUFFIXESr  cleaned_lowervariants
hyphenatedtitledrI  	no_accentro  uniquevru   s                 @rZ   _normalize_wikipedia_slugz&MediaFetcher._normalize_wikipedia_slug)  s   
 		$ 	$ 	$$( 	 ++--"gmmoo-! ! ! vsM::@@BB XX ; ;$1$7$7$9$9; ; ; ; ;

###  .. 	S11222	 	 	 	 	 "M*--	
""OOI&&& uu 	! 	!A !Qd]]a   V;6;;<<<r\   z$2876a346-d50c-4911-934e-19ee07b0e503z.https://tabular-api.data.gouv.fr/api/resourcesr2   u   Nom de l'éluu   Prénom de l'éluu   Libellé de la communeu   Libellé du départementzCode de la communeu   Code du départementu   Date de début du mandatu   Date de début de la fonctionu/   Libellé de la catégorie socio-professionnellez	Code sexec                 	   |                                                                 }t          |          dk     rt          dd|z             dS |d                                         }d                    |dd                                                                                    }| j        dz   | j        z   dz   t          j
                            |          z   d	z   }t          dd
|d|           t          dd|z             	 t          j                            |ddd          }t          j                            || j                  5 }|                                                    d          }t%          j        |          }	ddd           n# 1 swxY w Y   n8# t(          $ r+}
t          ddt+          |
          z             Y d}
~
dS d}
~
ww xY w|	                    dg           }|st          dd|z             dS t          ddt          |          |fz             ddlfd} ||          }d}|D ]}|                    | j        d          }|                    | j        d          }|                    | j        d          } ||          }||k    r|}t          dd|d|d|            n||v r||}t          dd|d|           |st          dd|t          |          fz             |dd          D ]l}|                    | j        d          }|                    | j        d          }|                    | j        d          }t          dd!|d|d|           mdS |                    | j        d          |                    | j        d          |                    | j        d          |                    | j        d          |                    | j        d          |                    | j        d          d"|                    | j        d          |                    | j        d          |                    | j         d          |                    | j!        d          d#}t          dd$|d%         d|d&         d'|d(         d)|d*         d+|d,         
           |S )-a  Recherche un elu dans le RNE (Repertoire National des Elus).

        Interroge l'API tabulaire data.gouv.fr sur le dataset Maires.
        Filtre par nom de famille (exact, uppercase) puis match prenom.

        Args:
            query: nom complet de la personne (ex: "Adrien Chapon")

        Returns:
            dict avec {nom, prenom, commune, departement, code_insee,
                       fonction, date_debut_mandat, date_debut_fonction,
                       csp} ou None si non trouve.
        r   r   z2[RNE] Requete trop courte pour recherche elu: '%s'Nr   r   r   z&/data/?Nom%20de%20l%27%C3%A9lu__exact=z&page_size=20z[RNE] Recherche elu: nom=z	, prenom=z[RNE] URL: %sz)CyberStrat/1.0 (contact@cevennes-web.com)r   r  Acceptr  r   r   r   z[RNE] Erreur API: %sr   z [RNE] Aucun resultat pour nom=%sz[RNE] %d resultats pour nom=%sr   c                     |                                                                  }                     d|           } d                    fd| D                       S )z6Normalise pour comparaison : minuscule + sans accents.rB  rp   c              3   N   K   | ]}                     |          d k    |V   dS rD  rF  rs   s     rZ   r[   zIMediaFetcher.fetch_elu_officiel.<locals>.normalize_str.<locals>.<genexpr>  s;      KK;+?+?+B+Bd+J+J1+J+J+J+JKKr\   )r]   rf   rv   rw   rH  s    rZ   normalize_strz6MediaFetcher.fetch_elu_officiel.<locals>.normalize_str  sV    		!!A%%eQ//A77KKKKaKKKKKKr\   rp   r  z[RNE] MATCH EXACT: r  z[RNE] Match partiel prenom: z5[RNE] Aucun match prenom pour '%s' parmi %d resultatsr   z[RNE]   -> Maire)nomr   communedepartement
code_inseecode_departementfonctiondate_debut_mandatdate_debut_fonctioncspsexez[RNE] Elu identifie: r   r[  z, Maire de r\  r  r]  z
), depuis rb  )"rf   r   r   rm   upperrw   r]   RNE_BASE_URLRNE_MAIRES_UUIDr   r   r  r  r  r  RNE_TIMEOUTr  r  r   r  r   r  r  ru   _COL_PRENOM_COL_NOM_COL_COMMUNE	_COL_DEPT_COL_CODE_COMMUNE_COL_CODE_DEPT_COL_DEBUT_MANDAT_COL_DEBUT_FONCTION_COL_CSP	_COL_SEXE)r  ra   r   surnamefirstname_queryr   r  r  r  r   r  r  rY  fn_normalized
best_matchrow
row_prenomrow_nomrow_communerow_fn_normalizedrprnrceluru   s                           @rZ   fetch_elu_officielzMediaFetcher.fetch_elu_officiel  s    ##%%u::>>MPUUVVV4 )//##((5":..4466<<>> #d&:: >/ /l  ))*  	 	VV//Z[[[V_s*+++
	.((I,7 7(  C ''T5E'FF '$iikk((11z#' ' ' ' ' ' ' ' ' ' ' ' ' ' '  	 	 	/#a&&899944444	 ((62&& 	;gEFFF4V5Ww8OOPPP 		L 	L 	L 	L 	L &o66
 	1 	1C!1266JggdmR00G''$"3S99K -j 9 9 !M11 
VV!zz777KK9: : :   111%!$J%::ww01 1 1  		P#S\\23 4 4 4 rr{ F FWWT-s33WWT]C00WWT.44VVRRRDEEEE4 >>$-44 nnT%5r::!~~d&7<<%>>$."==$..)?DD *t/BB G G!+0F!K!K#->>$2JB#O#O>>$-44NN4>266
 
 	VV8}}}c%jjj#i...=!!!3'<#=#=?	@ 	@ 	@
 
s=   A
F+ <FF+ F##F+ &F#'F+ +
G 5 GG c           	          |rt          |t                    sdS |                                }g d}|D ](}||v r"t          dd|dd          d| d            dS )d	S )
a  Detecte les URLs Google proxy / profile photo / YouTube.
        Ces URLs sont instables, expirent, et ne sont pas de vrais
        portraits exploitables. Patterns detectes :
        - googleusercontent.com (Google Photos, YouTube profiles)
        - ggpht.com (ancien CDN Google profiles)
        - =s900, =s800 etc. (redimensionnement Google)
        - no-rj (parametre Google profile)
        - blogspot.com (hosted images, souvent recycled)
        - gstatic.com (assets Google statiques)
        T)zgoogleusercontent.comz	ggpht.comzgstatic.comzblogspot.comzgoogle.com/imgzgoogle.com/mapsz=s900z=s800z=s700z=s600zno-rjc0x00ffffffr   z,[PORTRAIT-FILTER] URL Google proxy rejetee: Nr  z... (pattern: r  F)r  r  r]   rm   )r  r   r   google_patternsr   s        rZ   r  z!MediaFetcher._is_google_proxy_url  s      	*S#.. 	4IIKK	
 
 
 ' 	 	G)##V:3B3x: :/6: : :; ; ; tt	 $
 ur\   c                     |rt          |t                    sdS |                    d          st          dd|dd                     dS t	          |          dk     rdS |                     |          rdS dS )	zValide qu'une URL portrait est exploitable.
        Rejette : Google proxy, URLs trop courtes, sans https,
        fragments sans domaine.
        Fr`  r   z)[PORTRAIT-FILTER] URL non-HTTPS rejetee: Nr     T)r  r  r   rm   r   r  )r  r   s     rZ   _is_valid_portrait_urlz#MediaFetcher._is_valid_portrait_url(  s    
  	*S#.. 	5~~j)) 	ss8    5s88b==5$$S)) 	5tr\   )1200x6281200x6301200x67516x9z16-9r  herocoverheader1280x720800x450600x338?c           
         |                                 t          fd| j        D                       rdS 	 t          j                            |t          dd          }t          j                            |d          5 }|                    d          }d	d	d	           n# 1 swxY w Y   d
dl	m
} d
d	l}|                    |                    |                    }|j        \  }	}
|	dk     s|
dk     rt          dd|	 d|
            dS |
|	|z  k     r t          dd|	 d|
 d|
|	z  dd           dS n# t           $ r Y nw xY wdS )u   v4.9d R3 — Detecte les og:image au format paysage.
        Etape 1 : heuristique URL (rapide, sans reseau).
        Etape 2 : PIL via Range request (4096 octets).
        Retourne True si paysage ou trop petit → rejeter.
        c              3       K   | ]}|v V  	d S rU   rV   rW   r   r   s     rZ   r[   z6MediaFetcher._is_landscape_og_image.<locals>.<genexpr>L  s'      DD!qI~DDDDDDr\   Tzbytes=0-8191)r  Ranger  r   r   i    Nr   )Image   r   z[R3] Image trop petite: ru  z[R3] Image paysage detectee: z (ratio z.2fr  F)r]   r^   _LANDSCAPE_URL_PATTERNSr   r  r  r  r  r  PILr  ioopenBytesIOsizerm   r   )r  r  	min_ratior  r  r  	_PILImage_ioimgr   hr   s              @rZ   _is_landscape_og_imagez#MediaFetcher._is_landscape_og_imageD  s    MMOO	DDDDt'CDDDDD 	4	.((('; ;(  C ''Q'77 &4iioo& & & & & & & & & & & & & & & /.......S!1!122C8DAq3ww!c''V6q661668 8 8t1y=  V11 11 1&'c01 1 12 2 2 t	 !
  	 	 	 D	
 us=   A
D: B%D: %B))D: ,B)-A!D: 'D: :
EE)
z/auteur/z/journaliste/z/contributeur/z/redaction/z/profil/z/people/z/author/z	/writers/z/team/z/equipe/)z/interview/z
/portrait/z/rencontre/z/parole/z/temoignage/z/a-la-une/rencontrer   r   c           
         ! |st          dd           dS ddlm} |                                                                }h d}d}t          |          D ]}||vrt          |          dk    r|} n|rt          |          nd}t          |                                                                          }	|	r|	d         nd}
t          |	          d	k    rd
                    |	d	d                   n t          |                                          }g d}|dd         D ]ӊ!	 	 t          j
                            !          j                                                            dd          }n# t          $ r d}Y nw xY w|| j        v rt          dd| d!dd                     t!          !fd| j        D                       pt!          !fd| j        D                       }t          j                            !t*          dd          }t          j                            |d          5 }|                    d          }|                    dd          }ddd           n# 1 swxY w Y    ||d          }d}|                    d          }|r&|                                                                }d}|                    d          }|r&|                                                                }|r||v p|o||v }|s!t          dd!dd           d!| d"           |                    d#d$d%i&          }|r|                    d'          st          dd!dd           d(           _|d'         }|                    d)          st          dd*|dd                      |                                 t!           fd+|D                       rt          dd,|dd                     |                     |          r|                     |          rt          dd-|dd                     .|s|rd}t?          j         d.|t>          j!                  }|r(|"                    d	                                          }nIt?          j         d/|t>          j!                  }|r'|"                    d	                                          }|rG|rE|t          |          vr3t          dd0|#                                dd1          d2| d3           n!|s|st          dd4!dd                     2|d
z   |z   d
z   |z   }tI          ||
|          st          dd5|dd                     nt          dd6|dd          d7!dd8          d9           ||d:d;!d<c S # t          $ r)}t          d=d>!dd8          d?|            Y d}~d}~ww xY wt          dd@           dS )Aa  Extraire un portrait depuis les og:image des articles de presse.
        press_urls : liste d'URLs d'articles (depuis media_sources).
        Anti-homonymie : le titre/h1 de la page doit contenir le nom.
        v4.9d R2 : rejette og:image si auteur non confirme sur article
                   thematique (photo du sujet, pas du journaliste).
        Retourne un dict portrait ou None.
        r   z*[PORTRAIT-PRESS] Aucune URL presse fournieNr   BeautifulSoup>   r}   r   r   r   r   r   r   rp   r   r   r   )r  r  r  r  r  defaultplaceholderzshare-zsocial-z
og-defaultzthumbnail-defaultr   r   z2[PORTRAIT-PRESS][R2] og:image non fiable (domaine ): r  c              3   D   K   | ]}|                                 v V  d S rU   r  rW   r   r   s     rZ   r[   z:MediaFetcher._fetch_portrait_from_press.<locals>.<genexpr>  sD       % % $% % % % % %r\   c              3   D   K   | ]}|                                 v V  d S rU   r  r  s     rZ   r[   z:MediaFetcher._fetch_portrait_from_press.<locals>.<genexpr>  sD         $     r\   	text/htmlrU  r  r  r   iP  r   r   r  html.parserr   h1z[PORTRAIT-PRESS] r   z: nom 'z' absent du titre/h1r3  propertyzog:imageattrsr  z: pas d'og:imager`  z[PORTRAIT-PRESS] non-HTTPS: c              3       K   | ]}|v V  	d S rU   rV   )rW   rZ  	img_lowers     rZ   r[   z:MediaFetcher._fetch_portrait_from_press.<locals>.<genexpr>  s'      ??2rY??????r\   z%[PORTRAIT-PRESS] logo/banner rejete: z1[PORTRAIT-PRESS][R3] og:image rejetee (paysage): z?<meta[^>]+name=["\']author["\'][^>]+content=["\']([^"\']+)["\']zLclass=["\'][^"\']*(?:author|byline|signature)[^"\']*["\'][^>]*>([^<]{3,60})<u2   [PORTRAIT-PRESS][R2] og:image rejetee — auteur 'r  z' ne correspond pas a 'r  uK   [PORTRAIT-PRESS][R2] og:image rejetee — auteur non confirme sur article: z+[PORTRAIT-PRESS] FIX-E2 rejete (homonyme): z[PORTRAIT-PRESS] ACCEPTE: z
 (source: r  r  r  press_og_imager  r   z[PORTRAIT-PRESS] Erreur r  z-[PORTRAIT-PRESS] Aucun portrait presse trouve)%rm   bs4r  r]   r   r  r   rz   rw   r   r   r   r   r   r   PRESS_OG_UNRELIABLEr^   AUTHOR_PAGE_PATTERNSINTERVIEW_PAGE_PATTERNSr  r  r  r  r  r  rw  get_textr  r   r  r  r   r  Ir  rf   r:  )"r  
press_urlsra   r  r  r  rs  r   surname_asciiq_parts_asciir  r  LOGO_KEYWORDSpress_domainis_author_pager  r  
html_bytesr  soup
page_title	title_tagh1_textr  name_in_pageog_imgr  author_meta
meta_matchbyline_matchr  r  r  r   s"                                   @@rZ   _fetch_portrait_from_pressz'MediaFetcher._fetch_portrait_from_press~  s     	EFFF4%%%%%% ++--%%''AAA	'"" 	 	A	!!c!ffqjj3:Bw/// 'u{{}}55;;==*7?mA&&R+.}+=+=+A+ACHH]122&'''.. 	
 
 
 bqb> b	 b	Ca&#)<#8#8$ $#EEGGGGFB,?,? !L  & & &#%LLL&4#;;;@$0@ @58"X@ @A A A  "% % % % %!6% % % " " "     !9      n,,S",); ;,   ^++Q , ( ( 3+/!%5!1!1J%,,	 - 3 3D3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 %}T=99  
 IIg..	 >!*!3!3!5!5!;!;!=!=JYYt__ 4 kkmm1133G 6J!6 86Gw$6  $ >CH > > '> > >? ? ?  :z": # < < VZZ	%:%: *CH * * *+ + +  + ))*55 *"3B3<* *+ + +  $MMOO	????????? *"3B3<* *+ + +  ,,W55  ..w77 5&-crcl5 56 6 6  & &!- &!"$K!#7bd" "J
 " ?&0&6&6q&9&9&?&?&A&A (*y- !"$( ( ( ? , 2 21 5 5 ; ; = = ( % ! !( +1- 1- - - !/$/$5$5$7$7$<!/ !/ %,	!/ !/ !/0 0 0 %- ) ! !V7,/H7 78 8 8 ! cMJ.#$  /!;
< < 6'.ss|6 67 7 7 V;ss|; ;/23B3x; ; ;< < < #$-."%        V>s3B3x>>1>>@ @ @
 	V;	= 	= 	=ts   ,A
E76W7FWF*W1B
W;-I4(W4I8	8W;I8	<B:W8AW8W A
WW#0WC W7 W:W.W
W9W44W9c           	         |g}|rr|                     d          r]|d         }dD ]#}||v r|                    |          d         } n$|                    |                                dd                    |                    d           d                    |          }t          dd	| d
           	 t          |d          }g d}	t          ddt          |           d           t          |	                                          }
t          t          |          	                                          }|
                                }|r|d         nd}t          |          dk    rd                    |dd                   n|
}d}d}|dd         D ]}|                     dd          }t          |                     dd          	                                          }|                     dd          	                                |r|                    d          st          fd|	D                       }|r|                     |          r|dz   z   }t          |          }|
|v p|o||v }|sM|r-t          |          rt          dd|dd          d
           nt          dd|dd          d
           2dvrSd|	                                vr=|dz   |z   dz   z   }t          |||          st          dd|dd          d
           dv pd|v }|r;|s9||dd|                     dd          d}t          dd |dd!                     |s||dd"|                     dd          d}|rt          dd#           |S |r#t          dd$|d%         dd!                     |S t          dd&           n*# t           $ r}t          d'd(|            Y d}~nd}~ww xY wdS ))u   v4.9c — Recherche portrait via Serper Images.
        Enrichit la requete avec le contexte d'identite,
        priorite LinkedIn. v4.9f : source croisee.
        Retourne dict portrait ou None.
        rg  r  r   Nr  zphoto portraitr   r   z%[PORTRAIT-SERPER] Requete enrichie: 'r  r  r  )	r'   r&   r)   r,   r  zamazon.zfnac.zbabelio.z
goodreads.z[PORTRAIT-SERPER] r  rp   r   r  r   r   httpsc              3       K   | ]}|v V  	d S rU   rV   )rW   rI  r  s     rZ   r[   z3MediaFetcher._fetch_ddg_portrait.<locals>.<genexpr>  s8        %&AL     r\   z4[PORTRAIT-SERPER] Source croisee (domaine valide): 'z([PORTRAIT-SERPER] Rejete (nom absent): 'r*   r  z-[PORTRAIT-SERPER] FIX-E2 rejete (homonyme): 'r=  r  duckduckgo_linkedinr  z[PORTRAIT-SERPER] LinkedIn: r  duckduckgo_portraitz$[PORTRAIT-SERPER] ACCEPTE (LinkedIn)z[PORTRAIT-SERPER] ACCEPTE: r   z'[PORTRAIT-SERPER] Aucun resultat valider   z[PORTRAIT-SERPER] ECHEC: )r  r   r	  rf   rw   rm   r  r   rz   r]   r   r   r^   r  r  r:  r   )r  ra   r  r  enriched_partsr  r  r  r  BLOCKED_DOMAINSr  r  r  r  r  linkedin_candidatebest_candidater  r  r   r   r  check_asciiname_relevantr  is_linkedinr  r  s                              @rZ   _fetch_ddg_portraitz MediaFetcher._fetch_ddg_portraitM  s      	5\22 	5L)D?  $;;::c??1-DE  !!$**,,ss"3444.///.11V"" " "	# 	# 	#	11B  N  O #S%8%8 # # #$ $ $
 %U[[]]33G&$U++1133 I
 mmooG(/7'!**RK),W)9)9%%%  "&!N&ss+ M M((:r22&HHWb))//11  ((62..4466 g&8&8'! '!     *9      ,,W55  #S[72
,Z88{* @!>i;&>  % !$ 
! 9 '!7 !7
! V@27*@ @ @A A A A V<.3CRCj< < <= = = ! #'11'w}}>>#-3g= 2%{J@ @ !V:,1#2#J: : :; ; ; ! ') ."g-   '9 &#( 1"7&*hhvr&:&:* *& *"3B3<* *+ + + + ( &#( 1"7&*hhvr&:&:& &N " *V;= = =)) &V4%e,SbS14 45 5 5 &%:< < < < 	1 	1 	1/A//1 1 1 1 1 1 1 1	1 ts%   8K&O $O O 
O<O77O<Fc                 <   t          dd| d| dt          |pg            dt          |           d	           dD ]r}	 t          j                            |                    dd	                    }d
| d| }t          j                            |dt          i          }t          j        
                    |d          5 }	t          j        |	                                                    d                    }
ddd           n# 1 swxY w Y   |
                    di           }|
                    di           }|                    d          p|                    d          }|                    d          p|                    dd          }|                    d          p|                    dd          }|st          dd| d           n|dk     s|dk     rt          dd| d| d| d           n||dz  k     rt          dd| d| d| d           n]|                     |          rt          dd| d            n3t          dd| d!|dd"                     d
| d#| }||d$d%| |d&c S F# t"          $ r!}t          dd| d'|            Y d}~ld}~ww xY w	 t          j                            d(d)d*| d+d,d-d.d/d0d1	          }d2| }t          j                            |dd3i          }t          j        
                    |t&                    5 }	t          j        |	                                                    d                    }
ddd           n# 1 swxY w Y   |
                    d(i                               d4i           }t          dd5t          |           d6           |                                                                }h d7}g }t-          |          D ] \  }}|dk    r|                    |           !|rd                    |          nd8}d8}t3          |          D ]}||vrt          |          d9k    r|} n|                                D ]}|                    d-i g          d         }|                    d:d8          }|                    d;          p|                    d<          }|                    d=          p|                    dd          }|                    d>          p|                    dd          }|                    d?d@          }|                                }|r||v p|o||v } | s!t          ddA|ddB          dC| dD           |rdE|vrt          ddA|ddB          dF           3|dk     s|dk     r$t          ddA|ddB          dG| d| d           c||dz  k     r$t          ddA|ddB          dH| d| d           |                     |          rt          ddA|ddB          dI           t          ddJ|dd"                     dK| }!||d$dL|!d&c S n*# t"          $ r}t          ddM|            Y d}~nd}~ww xY w|r|                     ||          }"|"r|"S nt          ddN           |r&t          ddO           t          ddP| dQ           dS |                     |||R          }#|#r|#S t          ddP| dS           dS )TaT  Cherche EXCLUSIVEMENT une photo portrait de la personne.
        Ordre de priorite (pipeline 5 sources v3.8) :
        1. Wikipedia API (portrait officiel si page existe)
        2. Wikimedia Commons (categorie portraits)
        3. Press og:image (articles de presse, anti-homonymie)
        4. DDG enrichi (contexte identite + LinkedIn priority)
        Rejette : logos, affiches, visuels d'emissions, couvertures.
        press_urls : liste d'URLs d'articles de presse.
        identity : dict de resolve_person_identity.
        Retourne un dict {url, caption, type, source} ou None.
        r   z*[PORTRAIT] Debut recherche portrait pour 'z' (skip_ddg=z, press=z, identity=r  )r   r4  r   r  r`  z(.wikipedia.org/api/rest_v1/page/summary/r  r  r  r   r   N	thumbnailoriginalimager  r  r   r  z[PORTRAIT] Wikipedia (z!): page trouvee mais AUCUNE imager  z): image trop petite (ru  r  z): ratio non-portrait (r   z): Google proxy rejetez): ACCEPTE r  z.wikipedia.org/wiki/r  
wikipedia_r  z	): ECHEC ra   r  r  z	 portrait5r  zurl|size|mime|thumburl400r   )	r  	generatorgsrnamespace	gsrsearchgsrlimitr  r  
iiurlwidthr  r  r  r  z[PORTRAIT] Wikimedia Commons: r   >
   r|   r}   r~   r   r   r   r   r   r   r   rp   r   mimethumburlr   
thumbwidththumbheightr   r  z[PORTRAIT] Wikimedia 'r  z': REJETE (nom 'z' absent du titre)jpegz': non-jpeg ou pas d'URLz': trop petit (z': ratio non-portrait (z': Google proxy rejetez&[PORTRAIT] Wikimedia Commons: ACCEPTE z#https://commons.wikimedia.org/wiki/r  z$[PORTRAIT] Wikimedia Commons: ECHEC z+[PORTRAIT] Press: aucune URL presse fourniez%[PORTRAIT] DDG SKIPPE (skip_ddg=True)z'[PORTRAIT] Aucun portrait trouve pour 'z!' (Wikipedia + Wikimedia + Press))r  r  )rm   r   r  r   r   r  r   r  r  r  r  r   r  r  r  r  r  r   r  r  r]   r   r   r	  rw   r  r  r  r  )$r  ra   skip_ddgr  r  langslugr   r  r  r   thumboriginalr  r   r  	wiki_pager  r  r  r  r  surname_partsr   partsurname_fullsurname_lastr  infor  r  title_lowerr  commons_pagepress_result
ddg_results$                                       rZ   fetch_person_portraitz"MediaFetcher.fetch_person_portrait  s
    	V* * *!* *+.z/?R+@+@* *X* * *	+ 	+ 	+ ! /	A /	AD.A|))%--S*A*ABB.$ . .'+. .n,,S *;,   ^++C+;; Ct:diikk&8&8&A&ABBDC C C C C C C C C C C C C C C b1188OR88",,x00GEIIh4G4GLL))BUYYw-B-BLL**Deii!.D.D : : : :; ; ; ; WWC7 7 7./7 7237 7 78 8 8 8 S[[8 8 8/08 8348 8 89 9 9 9 ..w77 / / / /0 0 0 0 2 2 2#*3B3<2 23 3 3 !L4 K KT K KI&#( 1"5t"5"5&/      A A AV?T??A??A A A A A A A AA
b	<\++!% # %000#2# 
- 
- 
 
F FVEEC.((.7(  C '']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? HHWb))--gr::E*u::* * *+ + +
 ,,..J2 2 2IM$Z00 / /4q55!((...6CK388M222LL ,,  y((SYY]]#'LE 4 4xxbT2215xx++((:..A$((5//HH\**Bdhhw.B.BHH]++Dtxx!/D/D!XXgs33
 )..00!Alk&A F$D)D  " %CRC % %(4% % %& & &  &"4"41CRC 1 1 12 2 2 s77a#gg0CRC 0 0'(0 0+,0 0 01 1 1 q3w;;8CRC 8 8/08 8348 8 89 9 9 ,,W55 /CRC / / /0 0 0 V.&ss|. ./ / /  RZQQ"$-1".    ]4j  	< 	< 	<:q::< < < < < < < <	<
  	?::E# #L $##$ =? ? ?  	79 9 9== = => > > 4--8* . 6 6
 	V>e>>>	@ 	@ 	@ts|    BJ:D
>J
D	JD	E+J
J-J((J-1A:Y- +:M1%Y- 1M55Y- 8M59K2Y- -
Z7ZZc                     |sdS t          |d d          }|d         }t          dd|                    d           d	|                    d
           d|                    d          rdnd            |S )u|   v4.8g — Selectionne la publication featured :
        La plus recente par annee. Si ex-aequo, prefere avec cover.
        Nc                 b    |                      d          pd|                      d          rdndfS )Nr  r   r  r   r  ru  s    rZ   <lambda>z/MediaFetcher._select_featured.<locals>.<lambda>  s3    f"UU;''.Q r\   Tr~  r   r   zFeatured selectionne: 'r   z' (r  z) cover=r  r  r  )r  rm   r  )r  publicationssorted_pubsfeatureds       rZ   _select_featuredzMediaFetcher._select_featured  s      	4   
 
 
 q>VEhll7&;&; E EV$$E E&ll;77BUUUE E	F 	F 	F r\   zLe dernier juge de ma viezMarie Le Boiteuxr   rp   known_works)r   r  r  r  r  r  r   r  z!Manifeste d'un psychiatre outragezHerve Bokobzai  zChamp social9791034606740zLa psychiatrie en perili  Eres)zmarie le boiteuxzherve bokobzac                 l  	
 | r|sdS h d		fd} ||           } ||          }|r|sdS t          |          } ||          }|r|d         n|r|d         nd|r|d         nd

fd} |||          rdS t          |          d	k    r"|d
d         |d         gz   } |||          rdS dS )u   v4.9c — Verifie que l'auteur de la publication correspond
        au nom recherche. Rejette les homonymes partiels.
        5 corrections : accents, noms intermediaires, initiales,
        ordre inverse, initiales nues.
        F>   r|   r}   r~   r   r   r   r   r   r   r   r   r   c                     t          |                                                                           } |                     dd                              dd          } fd|                                 D             S )Nr@  r   ,c                      g | ]
}||v|S rV   rV   )rW   tr   s     rZ   r  zIMediaFetcher._author_name_matches.<locals>._normalize.<locals>.<listcomp>  s(    FFF!AF!:2E2EA2E2E2Er\   )rz   r]   rf   r   r   )r  r   s    rZ   
_normalizez5MediaFetcher._author_name_matches.<locals>._normalize  sg    qwwyy0011A		#s##++C55AFFFFqwwyyFFFFr\   r   rp   r   c                    rt          fd| D                       sdS fd| D             }fdD             }t          |          t          |          k    rdS t          fd| D             d           }t          fdD             d           }|rZ|rXt          |          dk    r|d         |d         k    rdS n0t          |          dk    r|d         |d         k    rdS n||k    rdS d	S )
Nc              3   $   K   | ]
}|k    V  d S rU   rV   rW   r	  exp_surnames     rZ   r[   zDMediaFetcher._author_name_matches.<locals>._match.<locals>.<genexpr>  s9       ' '%&q ' ' ' ' ' 'r\   Fc                     g | ]>k    r6k    r0t                    d k    rt          fdD                       <?S )r   c              3   B   K   | ]}|                               V  d S rU   )r   )rW   r  r	  s     rZ   r[   zOMediaFetcher._author_name_matches.<locals>._match.<locals>.<listcomp>.<genexpr>$  s-      (E(EQa(E(E(E(E(E(Er\   )r   r^   )rW   r	  epexp_firstnamer  s    @rZ   r  zEMediaFetcher._author_name_matches.<locals>._match.<locals>.<listcomp>   sp     H H Ha++..!$Q1%((E(E(E(E"(E(E(E%E%E "-  "-r\   c                 ,    g | ]}|k    |k    |S rV   rV   )rW   r	  r  r  s     rZ   r  zEMediaFetcher._author_name_matches.<locals>._match.<locals>.<listcomp>%  s:     D D Da++]0B0B 0B0B0Br\   c              3   (   K   | ]}|k    |V  d S rU   rV   r  s     rZ   r[   zDMediaFetcher._author_name_matches.<locals>._match.<locals>.<genexpr>,  -      33q!{"2"2"2"2"2"233r\   c              3   (   K   | ]}|k    |V  d S rU   rV   r  s     rZ   r[   zDMediaFetcher._author_name_matches.<locals>._match.<locals>.<genexpr>/  r  r\   r   r   T)r^   r   next)ppr  pp_extraep_extrapp_firstep_firstr  r  s    `    rZ   _matchz1MediaFetcher._author_name_matches.<locals>._match  s    3 ' ' ' '*,' ' ' $ $  uH H H H H H2 H H HH
D D D D D2 D D DH8}}s8}},,u 3333B333T H 3333B333T H  	%H 	%x==A%%{hqk11$u 2]]a''{hqk11$u 2  8++$u4r\   Tr   r   N)r   r   )
pub_authorexpected_namer
  	pub_parts	exp_partsraw_surnamesurname_tokensr  swappedr   r  r  s            @@@rZ   _author_name_matchesz!MediaFetcher._author_name_matches  sZ     	 	5= = =
	G 	G 	G 	G 	G
 Jz**	J}--	 		 	5
 +=99#K00,: 
nR((&.IbMMB 	 )29	!r(	 (	 (	 (	 (	 (	V 6)Y'' 	4 y>>Qmy|n4Gvgy)) tur\   c                 ~   g }	 t           j                            d| dddd          }d| }t           j                            |dt
          i          }t           j                            |t          	          5 }t          j	        |
                                                    d
                    }ddd           n# 1 swxY w Y   |                    dg           D ]}	|	                    di           }
|
                    dd          }|
                    dg           }|r|d         nd}|
                    dd          }t          |          dk    rt          |dd                   nd}|
                    di                               dd          }|r|                    dd          }|
                    dd          }|                    |||
                    dd          |d||dd           n*# t"          $ r}t%          dd|            Y d}~nd}~ww xY w	 t           j                            |ddd          }d | }t           j                            |dt
          i          }t           j                            |t          	          5 }t          j	        |
                                                    d
                    }ddd           n# 1 swxY w Y   |                    d!g           D ]}|                    dd          }|                    d"g           }|r|d         nd}|                    d#d          pd}|                    d$          }|rd%| d&nd}|                    d'd          }|rd(| nd}|                    |||                    d          r|                    ddg          d         nd|d||d)d           n*# t"          $ r}t%          dd*|            Y d}~nd}~ww xY wt%          d+d,| d-t          |           d.           |S )/zmCherche une publication specifique par son titre exact.
        Utilise Google Books + Open Library.
        z	intitle:"r  3r   )rY   
maxResultslangRestrict,https://www.googleapis.com/books/v1/volumes?r  r  r   r   Nr   
volumeInfor   rp   authorsr   publishedDater  
imageLinksr  http://r`  infoLinkr  google_books_titler  r   z$_fetch_by_title Google Books echec: ?title,author_name,publisher,first_publish_year,isbn,cover_i,key)r   limitfields$https://openlibrary.org/search.json?docsauthor_namefirst_publish_yearcover_i$https://covers.openlibrary.org/b/id/-L.jpgr  https://openlibrary.orgopenlibrary_titlez$_fetch_by_title Open Library echec: r   z_fetch_by_title('r  r   )r   r   r  r  r  r  r  r  r   r  r  r  r  r   r  r   r	  r   rm   )r  r   r7  r  r  r   r  r  r   r  r  	pub_titler,  r  year_strr  r  r   r  docauthors_listr9  r  ol_keys                           rZ   _fetch_by_titlezMediaFetcher._fetch_by_titleJ  s    %	E\++))))! $- -  F
 JIIC.((j7(  C '']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? "--  xxb11 HHWb11	((9b11+2:WQZZ
88OR88,/MMQ,>,>s8BQB<(((Ar2266{BGG A!MM)Z@@Exx
B//&(!%+r!:!: !& 2	  	  	 	 	 	.  	E 	E 	ECCCDDDDDDDD	E(	E\++@- -  F BAAC.((j7(  C '']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? xx++  GGGR00	"ww}b990<D\!__"
ww3Q77<1''),, +2: '' ' ' '79  ++!*:&:::')  &(%(WW[%9%9#B#''+v">">q"A"A?A !* 1
  
  
 
 
 
2  	E 	E 	ECCCDDDDDDDD	E 	V( ( (7||( ( (	) 	) 	) s{   A:H >:C8H CH CD8H 
H,H''H,0A6O/ &:K, O/ ,K00O/ 3K04C:O/ /
P9PP)psychiatriepsymentalsoinu	   thérapietherapieu   santér  r8  patientpsychosepsychanalyser*  r)  
contention	manifeste)r  rJ  loijusticerC  rI  codejurisprudence)u	   économier  u   marchémarchefinancetravailcapitalmonnaie)rP  u	   électionelectionpartigouvernu   démocratie
democratie)r  presser  u   enquêteenquete	reportageinformation)philosophieu   penséepenseeethiqueu   éthiquemorale)
sociologierF  u	   sociétésocieteclasseu   inégalité	inegalite)histoire
historiqueu   mémoirememoireu   sièclesiecleguerreu   révolution)	psychiatrrG  u   économpolitiqu
journalistr  r  r  c                 ^   |                     d          pd                                }|s|S g }| j                                        D ]\  }}||v r|} n|s|S g }|D ]}|                     d          pd                                }	|                     d          pd                                }
|	dz   |
z   t	          fd|D                       r|                    |           t          dd|                     d           d	           |st          d
d| d           |S |S )u`   Filtre les publications incohérentes avec la profession
        connue de la personne.
        rg  rp   r   r  r   c              3       K   | ]}|v V  	d S rU   rV   )rW   rZ  pub_texts     rZ   r[   z>MediaFetcher._filter_publications_by_domain.<locals>.<genexpr>  s'      <<b2><<<<<<r\   r   z%Publication filtree (hors domaine): 'r  r   z#Filtre domaine trop agressif pour 'u)   ' — publications conservees sans filtre)r  r]   DOMAIN_KEYWORDSr   r^   r	  rm   )r  r  r  rg  person_keywords
domain_keyrY  r  pubr  publisher_lowerrq  s              @rZ   _filter_publications_by_domainz+MediaFetcher._filter_publications_by_domain  s    ll<006B==??
 	  $($8$>$>$@$@ 	 	 JZ''"* (  	  	- 	-C777++1r88::K"ww{339r@@BBO"S(?:H <<<<O<<<<< -$$$$V,((, , ,- - - -
  	 ;j ; ; ;< < <  r\   r   c                 
   |                     dd          }t          |                               dd          }|g}||k    r|                    |           |D ](}|dfD ]!}|                     |||          }|r|c c S ")g S )u   v4.8g — Extraire la bibliographie depuis Wikipedia
        via l'API MediaWiki (sections → wikitext → parse).
        Source la plus fiable pour les auteurs notoires.
        Retourne une liste de dicts publication.
        r   r  r4  )r   rz   r	  _try_wiki_biblio)	r  r   r  r  
ascii_slugrM  variant	wiki_langpubss	            rZ   _fetch_wikipedia_bibliographyz*MediaFetcher._fetch_wikipedia_bibliography  s     ||C%%#D))11#s;;
6OOJ''' 	  	 G"D\    	,,Wi-13 3  KKKKK   	r\   c           
         d| d}t           j                            d|dddd          }	 t           j                            | d| d	t
          i
          }t           j                            |t                    5 }t          j	        |
                                                    d                    }ddd           n# 1 swxY w Y   n/# t          $ r"}	t          dd| d|	            g cY d}	~	S d}	~	ww xY w|                    di                               dg           }
|
sg S h d}d}|
D ]}|                    dd                                                                          |v r|                    d          } n3t#          fd|D                       r|                    d          } n|g S t           j                            d|d|ddd          }	 t           j                            | d| d	t
          i
          }t           j                            |t                    5 }t          j	        |
                                                    d                    }ddd           n# 1 swxY w Y   n/# t          $ r"}	t          dd| d|	            g cY d}	~	S d}	~	ww xY w|                    di                               di                               dd          }|sg S |                     ||          }|r't          dd| dt'          |           d| d           |S )un   v4.8g — Tenter d'extraire la bibliographie d'une page
        Wikipedia donnee via l'API MediaWiki.
        r`  z.wikipedia.org/w/api.phpr   sectionsr   1)r  r  r  r  	redirectsr  r  r  r   r   Nr   zWikipedia sections echec (r  >      œuvresselected worksselected bibliographybooksworkslivresoeuvresouvragesbibliographyr  bibliographier  rp   indexc              3       K   | ]}|v V  	d S rU   rV   )rW   bts_titles     rZ   r[   z0MediaFetcher._try_wiki_biblio.<locals>.<genexpr>/  s'      ::R2=::::::r\   wikitext)r  r  r  sectionr  r  zWikipedia wikitext echec (*r   zWikipedia biblio ( publications pour 'r  )r   r   r  r  r  r  r  r  r   r  r  r  r   rm   r  r]   rf   r^   _parse_wikitext_bibliographyr   )r  r  r|  r7  base_urlr  r  r  r   r  r  _BIBLIO_TITLESbiblio_indexr  params2req2resp2data2r  r}  r  s                       @rZ   ry  zMediaFetcher._try_wiki_biblio  s   "y " " " '')
 )
  
	.((&&f&&%z2 ) 4 4C '' ( 0 0 ?37z$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ?  	 	 	>Y>>1>>@ @ @IIIIII	
 88GR((,,Z<< 	I
 
 
  	 	AeeFB''--//5577G.(( uuW~~::::>:::::  uuW~~ I ,((#*
 *
  	>))''g''%z2 * 4 4D ''- ( 1 1 249
JJLL''002 22 2 2 2 2 2 2 2 2 2 2 2 2 2 2  	 	 	>Y>>1>>@ @ @IIIIII	
 IIgr**SR((S" 	 	I00k# # 	A@Y @ @t99@ @2<@ @ @A A A s   AC :C;C CC CC 
C?C:4C?:C?-AJ  :J:J J

J J
J 
J>J93J>9J>c                 t   g }t                      }|                     d          }|D ]}|                                }|r|                    d          r/t	          j        d|          }|rwt          |          }|re|d                                         |vrI|                    |d                                                    |r||d<   |	                    |           |                    d          s+|                    d          s|                    d          st	          j
        d	d
|          }t	          j
        dd|          }t	          j
        dd|          }t	          j
        dd|          }t	          j
        dd|          }t	          j
        dd|                                          }t          |          dk     rd}	t	          j        d|          }
|
r!t          |
                                          }	|                    d          }|d                                         }t	          j
        dd|t          j                                                  }|rt          |          dk     r\d}|                                                    |          r|                                |v r|                    |                                           d}t          |          dk    r|dd         D ]}|                                }t	          j        d|          sWt          |          dk    rDd|                                vr.d|                                vrd |dd!         vr|dd"         } n|	                    |dd#         |||	dddd$d%           |S )&uN  v4.8g — Parser le wikitext d'une section bibliographie.
        Extrait titre, annee, editeur.
        Formats courants Wikipedia :
        * ''Titre'', Editeur, 1998
        * [[Titre du livre]], Editeur, 2005, ISBN...
        # ''Titre'', coll. «...», Editeur, 2010
        {{Ouvrage|titre=...|année=...|éditeur=...}}
        
z==z\{\{[Oo]uvrage\s*\|r   r  r  #r`  r  r  z\{\{[^}]*\}\}rp   r  z^\s*[*#\-]+\s*r  r>  r   r   r   z\b(19[4-9]\d|20[0-2]\d)\br  u4   \s*(ISBN|coll\.|vol\.|n°|avec |preface|préface).*$r<  r  )zil zelle zen zavec zpour zdans zsur zsous la zsous le zc'est zce zcette r   Nz^\d{4}$r   ISBNzcoll.zp.r   r   r  r  r  )r  r   rf   r   r   r  r  r]   r  r	  r   r   r  r  rJ  r  re  )r  r7  r  seen_titleslinesr  ouvrage_matchru  cleanr  
year_matchr   r   _DESC_STARTSr  r   s                   rZ   r  z)MediaFetcher._parse_wikitext_bibliographyY  s    eet$$ Z	 Z	D::<<D 4??400  I&. .M -d33 -3w<--//{BBOOCL$6$6$8$8999" 4(3H '',,, OOC(( OOC((OOC((  F/t E F+R77EF8R//EF,b%88EF:r511EF63..4466E5zzA~~ D,e5 5J /:++--.. KK$$E!HNN$$EF(E   egg	   CJJNNL
 {{}}''55 {{}}++OOEKKMM*** I5zzA~~qrr  A		AHZ33  #A

 &aggii 7 7 'qwwyy 8 8 $AbqbE 1 1$%crcF	tt%&,	! 	! 	 	 	 	 r\   r   Tc                     g }|rS                      d          }|r9t          ddt          |           d d           |                    |           nt          dd d           g}|r|                    d          d	v ro|                    d
          rZ|d
                                                                         }d |D             }	|	r |                     d|	d                     |D ]X}
                     |
|          }|                    |            	                    |
|          }|                    |           Y 
                    |          }|                    |           t          |          } fd|D             }|t          |          z
  }|dk    rt          dd| d                                |          }t          d |D                       }|s_t          dd                                |          } fd|D             }|                    |                                |          }|                    d            |D ]j}|                    d          rS                     |d                   s8t          dd|d         dd          d|d         dd          d           d|d<   kt          d  |D                       }|s|rt          dd!           |d         d         } 	                    |d"#          }|D ]}|                    d          r                     |d                   rn|d         |d         d<   |d                             d$          p|                    d$d          |d         d$<   t          dd%|d         dd                      n|sct          dd& d'                                |          } fd(|D             }|                    |                                |          }|s_t          dd)                                |          } fd*|D             }|                    |                                |          }|r:|D ]7}|                    d          r                      |d                   sd|d<   8|r+|                    d
          r                     ||          }                                                                } j                            |g           }|rt          dd+t          |           d, d           d- |D             }|D ]}|d                                                                         dd.         }||vr                     |d                   }|r+|d         }|D ]} |                     d          r| } nt-          |          }!|                    d          r|d         |!d<   |                    d/          r |!                    d/          s|d/         |!d/<   |                    d$          r|d$         |!d$<   |                    d0          r|d0         |!d0<   |                    |!           t          dd1|!d          d2|!                    d/           d3|!                    d          rd4nd5 d6           |                    t-          |                     t          dd7|d          d           |d|         }"t          dd8 d9t          |"           d:t          |           d;t          |           d<	           |"D ]G}#t          dd=|#d/          d>|#d          d|#d?          d@|#                    d          rd4nd5            H|"S )Aa  Rechercher les publications d'un auteur
        Sources : Wikipedia biblio + Open Library + Google Books
                  + Babelio (+ Amazon.fr si rien > 2020)
        Tri     : par annee de publication decroissante, year=0 en dernier
        Max     : 10 resultats (v4.8g)
        identity : dict optionnel (resolve_person_identity) pour variantes
        wiki_found : si True, tenter la bibliographie Wikipedia
                     (False = skip pour gagner du temps)
        r   )r  r   zWikipedia biblio: r  r  z/Wikipedia biblio SKIP (pas de page wiki) pour 'r  )r  r  rg  c                     g | ]G}t          |          d k    r2|d                                         r|                                dvE|HS )r   r   )lieu
experience)r   r#  r]   r@  s     rZ   r  z3MediaFetcher.fetch_publications.<locals>.<listcomp>  sU       q66A::!A$,,..:GGII%;;; ;;;r\   r   r   c                 h    g | ].}                     |                    d d                    ,|/S r  rp   r%  r  rW   r   r7  r  s     rZ   r  z3MediaFetcher.fetch_publications.<locals>.<listcomp>  sR     
 
 
((h##[2 2

 
 
r\   zFiltre auteur: z  publications homonymes rejeteesc              3   .   K   | ]}|d          dk    V  dS )r  i  NrV   r  s     rZ   r[   z2MediaFetcher.fetch_publications.<locals>.<genexpr>  s+      ;;a6T);;;;;;r\   z%Aucune pub > 2020, fallback Amazon.frc                 h    g | ].}                     |                    d d                    ,|/S r  r  r  s     rZ   r  z3MediaFetcher.fetch_publications.<locals>.<listcomp>  R       ,,EE(B''6 6  r\   c                 4    | d         dk    rdnd| d          fS )Nr  r   r   rV   )r   s    rZ   r  z1MediaFetcher.fetch_publications.<locals>.<lambda>  s!    6QAAA&	z#J r\   r  r  r   zCover URL invalide ignoree: Nr  z [r   r  ]rp   c              3   @   K   | ]}|                     d           V  dS )r  Nr  r  s     rZ   r[   z2MediaFetcher.fetch_publications.<locals>.<genexpr>+  s.      BBQaeeK00BBBBBBr\   u8   Aucune jaquette valide — retry Google Books avec titrer   )r  r   z+Jaquette recuperee via Google Books retry: zAucune publication par auteur 'u+   ' — retry Open Library recherche generalec                 h    g | ].}                     |                    d d                    ,|/S r  r  r  s     rZ   r  z3MediaFetcher.fetch_publications.<locals>.<listcomp>D  r  r\   uB   Toujours aucune publication — retry Google Books recherche largec                 h    g | ].}                     |                    d d                    ,|/S r  r  r  s     rZ   r  z3MediaFetcher.fetch_publications.<locals>.<listcomp>T  r  r\   zKNOWN_WORKS: z oeuvre(s) connue(s) pour 'c                 v    h | ]6}|d                                                                           dd         7S )r   Nr  )r]   rf   r  s     rZ   r  z2MediaFetcher.fetch_publications.<locals>.<setcomp>o  sJ       45'
  ""((**3B3/  r\   r  r  r  zKNOWN_WORKS injecte: 'z' (year=z, cover=OKNONEr  zKNOWN_WORKS injecte (brut): 'zPublications pour 'r   z resultats (z bruts, z dedup)z  -> r  r  z] cover=)r~  rm   r   r>  r  rf   r   r	  _fetch_openlibrary_fetch_google_books_fetch_babelio_deduplicate_pubsr^   _fetch_amazonsort_is_valid_cover_url_fetch_openlibrary_broad_fetch_google_books_broadrw  r]   KNOWN_WORKSrC  r  )$r  r7  r  r  
wiki_foundall_pubs	wiki_pubsqueries_to_try
prof_wordsclean_wordsrY   ol_pubsgb_pubsbab_pubsbefore_filterrejecteddeduped
has_recentamz_pubsru  has_valid_covertitle_querygoogle_retryol_broadgb_broad	known_keyr  existing_titlesrZ  kw_keyrv  bestfmergedrh  r   s$   ``                                  rZ   fetch_publicationszMediaFetcher.fetch_publications  s
    
  	)::$ ; ( (I +V,Y , ,(, , ,- - - 	***($( ( () ) )
 & 	7LL..2DDDLL.. E ",/5577==??J %  K
  7%%"55[^557 7 7   	% 	%A--a==GOOG$$$ ..q+>>GOOG$$$$ &&{K@@!!! H
 
 
 
 

 
 

 !3x==0a<<&( & & &' ' '
 ((22 ;;7;;;;;
 
	6@AAA))+{CCH    #  H
 NN8$$$,,W55G 	JJKKK  	* 	*Cww{## *//K0@AA *I{+CRC0I I47L"4EI I IJ J J (*C$ BB'BBBBB 	7 	JL L L!!*W-K33KQ3OOL# 	 	77;'' D,D,DK(-* -* .1+.>GAJ{+
v..E#''&"2E2E AJv&3{+CRC03 34 4 4 E  	6=+ = = => > > 44[* *H    #  H
 NN8$$$,,W55G  	6:; ; ; 55[* *H    #  H
 NN8$$$,,W55G  	. . .77;'' .33C4DEE .+-K(  	#\22 	#99# #G  %%''--//	&**9b99 *	02K 0 0 2 2#.2 2 23 3 3 9@  O " #0 #0G**,,2244SbS900 #222g;3>@ @G 0&qz!( & &A uu[11 &'( %& "&b88K00 D26{2CF;/88F++ :FJJv4F4F :-1&\F6N88F++ :-1&\F6N88K00 D26{2CF;/v...VT &wT T%+ZZ%7%7T T .4ZZ-D-D%PTT&T T TU U U U  tBxx000V/ "7/ / /0 0 0
 +&V >; > >3v;; > >X> >(+G> > > 	? 	? 	? 	D 	DA C6 C Cqz C CQx[ C C"#%%"4"4@$$&C C D D D Dr\   c                 @   t           j                            |dt          |dz            ddd          }d| }	 t           j                            |dt          i          }t           j                            |t          	          5 }t          j
        |                                                    d
                    }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY w|                    dg           }	g }
t!                      }|	D ]H}|                    dd          }|r|                                |v r2|                    |                                           |                    dd          pd}|                    d          }|rd| dnd}|                    dd          }|rd| nd}|                    dg           }|r|d         nd}|                    dg           }|r|d         nd}|                    dg           }|r|d         n|}|
                    |||||||dd           t)          |
          |k    r nJt          dd| dt)          |
           d            |
S )!z0Rechercher les publications via Open Library APInewr   r2  r   )r  r  r3  r4  r  r5  r  r  r   r   Nr   zEchec Open Library: r6  r   rp   r8  r   r9  r:  r;  r  r<  r  r  r7  openlibraryr  r   zOpen Library pour 'r    publications)r   r   r  r  r  r  r  r  r  r   r  r  r  r   rm   r  r  r]   r  r	  r   )r  r7  r  r  r   r  r  r   r  r6  r}  r  r@  r   r  r9  r  rB  r   
publishersr  isbnsr  r,  r  s                            rZ   r  zMediaFetcher._fetch_openlibrary  s-   ''!q))W)
 )
   >V==	.((|Z6P(QQC'']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 	 	 	333444IIIIII	 xx##ee  	  	CGGGR((E EKKMM[88OOEKKMM***77/338qDggi((GRYaNwNNNN_aIWWUB''F9?G5V555RDb11J)3;
1IGGFB''E$,588"DggmR00G#*;WQZZFKK &&'	 	 	 	 	 4yyK'' ( 	VS;SS3t99SSSTTTB   AC 
:CC CC CC 
D&D :D Dc                 B   t           j                            |dt          |dz            dd          }d| }	 t           j                            |dt          i          }t           j                            |t                    5 }t          j
        |                                                    d	                    }d
d
d
           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d
}~S d
}~ww xY w|                    dg           }	g }
t!                      }|                                                                }|	D ]}|                    dd          }|r|                                |v r2|                    dg           }d                    |                                          dz   |                                z   t)          fd|D                       s|                    |                                           |                    dd          pd}|                    d          }|rd| dnd}|                    dd          }|rd| nd}|                    dg           }|r|d         nd}|                    dg           }|r|d         nd}|r|d         n|}|
                    |||||||dd           t/          |
          |k    r nt          dd| d t/          |
           d!           |
S )"zRecherche Open Library elargie (parametre q= general).
        Utile quand le nom d'auteur est un pseudonyme non indexe
        dans le champ author d'Open Library.
        r  r   r2  )rY   r  r3  r4  r5  r  r  r   r   Nr   zEchec Open Library broad: r6  r   rp   r7  r   c              3   F   K   | ]}t          |          d k    |v V  dS r  r  rW   r  all_texts     rZ   r[   z8MediaFetcher._fetch_openlibrary_broad.<locals>.<genexpr>   3      RRDCIIPQMMtx'MMMMRRr\   r8  r   r9  r:  r;  r  r<  r  r  r  r  r   zOpen Library broad pour 'r   r  )r   r   r  r  r  r  r  r  r  r   r  r  r  r   rm   r  r  r]   r   rw   r^   r  r	  r   )r  r7  r  r  r   r  r  r   r  r6  r}  r  author_partsr@  r   r,  r  r9  r  rB  r   r  r  r  r  r  r  s                             @rZ   r  z%MediaFetcher._fetch_openlibrary_broad  s   
 ''q))W	)
 )
   >V==	.((|Z6P(QQC'']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 	 	 	9a99:::IIIIII	 xx##ee #((**0022 $	 $	CGGGR((E EKKMM[88 ggmR00Gxx((..0036FHRRRRLRRRRR OOEKKMM***77/338qDggi((G #FwFFFF "  WWUB''F9?G5V555RDb11J)3;
1IGGFB''E$,588"D#*;WQZZFKK &&'	 	 	 	 	 4yyK'' ( 	VPPPD		PPP	R 	R 	Rr  c                    t           j                            d| dt          |dz            dddd          }d| }	 t           j                            |d	t          i
          }t           j                            |t                    5 }t          j
        |                                                    d                    }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY w|                    dg           }	g }
t!                      }|	D ]&}|                    di           }|                    dd          }|r|                                |v rH|                    |                                           |                    dd          }d}|r>t'          |          dk    r+	 t)          |dd                   }n# t*          $ r d}Y nw xY w|                    di           }|                    dd          }|r,|                    dd          }|                    dd          }|                    dd          p|                    dd          }|                    dg           }|r|d         n|}|                    d d          }|                    d!g           }d}|D ]3}|                    d"          d#k    r|                    d$d          } n4|s|r|d                             d$d          }|
                    |||||||d%d&           t'          |
          |k    r n(t          d'd(| d)t'          |
           d*           |
S )+z;Fallback : rechercher les publications via Google Books APIz
inauthor:"r  r   newestr  r   )rY   r(  orderBy	printTyper)  r*  r  r  r   r   Nr   zEchec Google Books: r   r+  r   rp   r-  r   r  r.  r  r/  r`  zzoom=1zzoom=2canonicalVolumeLinkr0  r,  r  industryIdentifiersr  ISBN_13
identifiergoogle_booksr  r   zGoogle Books pour 'r   r  )r   r   r  r  r  r  r  r  r  r   r  r  r  r   rm   r  r  r]   r  r   r  
ValueErrorr   r	  )r  r7  r  r  r   r  r  r   r  r   r}  r  r  r  r   pub_dater  	img_linksr  r   r,  r  r  r  r  idents                             rZ   r  z MediaFetcher._fetch_google_books"  s   '',k,,,kAo..  )
 )
   FVEE	.((|Z6P(QQC'']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 	 	 	333444IIIIII	 "%%ee 2	 2	D88L"--DHHWb))E EKKMM[88OOEKKMM*** xx44HD CMMQ..x|,,DD!   DDD r22I!k266I B%--iDD	%--hAA	 881266R$((:r:R:RDhhy"--G#*;WQZZFb11IHH2B77ED  99V$$	11 99\266DE 2  6E 6Qx||L"55KK &&(	 	 	 	 	 4yyK'' ( 	VS;SS3t99SSSTTTsU    AC  :CC  CC  CC   
D	*D>D	D	G))G87G8c                 ,   t           j                            d| dt          |dz            ddd          }d| }	 t           j                            |dt          i	          }t           j                            |t          
          5 }t          j
        |                                                    d                    }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY w|                    dg           }	g }
t!                      }|	D ]:}|                    di           }|                    dd          }|r|                                |v rH|                    |                                           |                    dg           }|                    dd          }|                                                                }d                    |                                          dz   |                                z   t+          fd|D                       s|                    dd          }d}|r>t-          |          dk    r+	 t/          |dd                   }n# t0          $ r d}Y nw xY w|                    di           }|                    dd          }|r|                    dd          }|                    dd          p|                    dd          }|r|d         n|}|                    d d          }|
                    ||||d||d!d"           t-          |
          |k    r n<t          d#d$| d%t-          |
           d&           |
S )'zRecherche Google Books elargie (sans restriction inauthor).
        Utile pour les pseudonymes non indexes dans les metadonnees Google.
        Cherche le nom comme terme libre dans tous les champs.
        r  z" livrer   	relevancer  )rY   r(  r  r  r*  r  r  r   r   Nr   zEchec Google Books broad: r   r+  r   rp   r,  descriptionr   c              3   F   K   | ]}t          |          d k    |v V  dS r  r  r  s     rZ   r[   z9MediaFetcher._fetch_google_books_broad.<locals>.<genexpr>  r  r\   r-  r   r  r.  r  r/  r`  r  r0  r  r  r  r   zGoogle Books broad pour 'r   r  )r   r   r  r  r  r  r  r  r  r   r  r  r  r   rm   r  r  r]   r  r   rw   r^   r   r  r  r   r	  )r  r7  r  r  r   r  r  r   r  r   r}  r  r  r  r   r,  r  r  r  r  r  r  r   r  r  r  s                            @rZ   r  z&MediaFetcher._fetch_google_books_broado  s   
 '')[)))kAo.." 	)
 )
   FVEE	.((|Z6P(QQC'']'CC ?tz$))++"4"4W"="=>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 	 	 	9a99:::IIIIII	 "%%ee +	 +	D88L"--DHHWb))E EKKMM[88OOEKKMM*** hhy"--G((="55K&,,..4466Lxx((..00369J9J9L9LLHRRRRLRRRRR xx44HD CMMQ..x|,,DD!   DDD r22I!k266I E%--iDD	881266R$((:r:R:RD#*;WQZZFb11IKK &&(	 	 	 	 	 4yyK'' ( 	VPPPD		PPP	R 	R 	RsU    AC  :CC  CC  CC   
D	*D>D	D	?JJ&%J&c                    |sdS d|v rdS d|v rdS g d}|                                 t          fd|D                       rdS 	 t          j                            |ddt
          i          }t          j                            |d	
          5 }|j                            dd          }|j	        dk    rd|v r	 ddd           dS t          dd|j	         d| d|dd                     	 ddd           dS # 1 swxY w Y   dS # t          $ r*}t          dd|dd          d| d           Y d}~dS d}~ww xY w)a  Verifier qu'une URL de jaquette pointe vers une vraie image.
        Rejette : URL nulle, cover_id 0 ou -1 (Open Library), placeholders.
        Fait un HEAD HTTP pour verifier Content-Type image/*.
        Timeout 5s pour ne pas bloquer le pipeline.
        Fzcovers.openlibrary.org/b/id/0zcovers.openlibrary.org/b/id/-)zno-coverr  zdefault-coverz	blank.gifz1x1.gifz
spacer.gifc              3       K   | ]}|v V  	d S rU   rV   r  s     rZ   r[   z3MediaFetcher._is_valid_cover_url.<locals>.<genexpr>  s'      44!qI~444444r\   HEADr  )r   r   r   r   r   rp   r   imageNTr   zCover URL rejetee: status=z type=z url=r  zCover URL inaccessible: r  r  )r]   r^   r   r  r  r  r  r   r  r  rm   r   )r  r   placeholdersr  r  content_typer  r   s          @rZ   r  z MediaFetcher._is_valid_cover_url  s-     	5 +c115*c115
 
 
 IIKK	4444|44444 	5	.((VjF(  C ''Q'77 4#|//CC;#%%'\*A*A        V: : :(: :/23B3x: :; ; ;                    	 	 	DCHDDDDDEEE55555	sI   A
D ,C?>D 'C?2D ?DD DD 
E D;;E c                 B   i }|D ]}|d                                                                          }|                    dd                              dd          }|                    dd                              dd          }|dd	         }||vr|||<   ||         }d
}|d         |d         k    rd}n>|d         |d         k    r,|                    d          r|                    d          sd}|r|||<   t	          |                                          S )zDeduplication par titre normalise (lowercase, 40 premiers chars).
        Normalise apostrophes/guillemets. Garde la version avec meilleure annee/cover.r   u   ’r  u   ‘u   “r  u   ”Nr  Fr  Tr  )r]   rf   r   r  r  r  )r  r}  ro  r   normr  existingbetters           rZ   r  zMediaFetcher._deduplicate_pubs  s0     	" 	"AW:##%%++--D<<#..66xEED<<#..66xEEDss)C$S		9V9x///!FFvY(6"222quu[7I7I2RZR^R^_jRkRk2!F " !DIDKKMM"""r\   c                    	 ddl m} n"# t          $ r t          dd           g cY S w xY wt          j                            |          }d| d}	 t          j                            |t          ddd	
          }t          j        
                    |t                    5 }|                                                    dd          }ddd           n# 1 swxY w Y   n,# t          $ r}	t          dd|	            g cY d}	~	S d}	~	ww xY w ||d          }
g }t                      }|
                    d          }|sr|
                    d          D ]\}|                    d          p)|                    d          p|                    d          }|r||vr|                    |           ]|D ]}|                    d          }|s|                    d          }|rt+          |          dk     rG|                                                                dd         }||v rz|                    |           |                    dd          }|r|                    d          sd|z   }d}|                                }t7          j        d|          }|r"t;          |                    d                     }d}|                    d!          }|rJ|                    d"d          p|                    d#d          }|r|                    d          sd|z   }|}|                    ||d|d||d$d%           t+          |          |k    r nt          d&d'| d(t+          |           d)           |S )*u8   Scraper Babelio — meilleure couverture publications FRr   r  r   z bs4 non disponible, skip Babelioz0https://www.babelio.com/recherche.php?Recherche=z&item_type=livresfr-FR,fr;q=0.9text/html,application/xhtml+xmlr  zAccept-LanguagerV  r  r   r   r   r  NzEchec Babelio: r  z
.livre_conza[href*="/livres/"]divtdliTrf   r   r  r"  rp   r  zhttps://www.babelio.comz\b(19[5-9]\d|20[0-2]\d)\br   r  r  zdata-srcbabelior  r   zBabelio pour 'r   r  )r  r  ImportErrorrm   r   r   
quote_plusr  r  r  r  r  r  r  r   r  selectfind_parentr	  
select_oner  r   r]   rf   r  r  r   r   r  r  r  )r  r7  r  r  ra   r   r  r  r  r  r  r}  ro  
containersa_tagrv  block
title_linkr   norm_keyr"  r  
block_textr  r  img_elr  s                              rZ   r  zMediaFetcher._fetch_babelio  sq   	))))))) 	 	 	;<<<III	 ''44YYYY
	.(((#3;7 7(  C
 '']'CC Etyy{{))'))DDE E E E E E E E E E E E E E E 	 	 	.1..///IIIIII	 }T=11uu [[..
 	.%:;; . .++E22 5"..t445"..t44   .fJ66%%f--- .	 .	E))*?@@J ''d'33E CJJNN{{}}**,,SbS1H4HHX>>&"--D 8DOOF33 8047 D))J#?LLJ 0:++A..// I%%e,,F $jj++Ivzz*b/I/I $>>&11 >7#= #IKK%&#	 	 	 	 	 4yyK'' ( 	VNkNNc$iiNNNOOOsO   	 ((AC# !*CC# CC# CC# #
D-DDDc                    t           j                            | d          }d| d}	 t           j                            |t
          ddd          }t           j                            |t                    5 }|                                	                    d	d
          }ddd           n# 1 swxY w Y   n,# t          $ r}t          dd|            g cY d}~S d}~ww xY wt          j        dt          j                  }	|	                    |          }
|
st          dd           g S g }t!                      }d |                                D             }|
D ]f}t          j        d|t          j                  }|s&|                     |                    d                                                    }|rt-          |          dk     rv|                                                                g d}t1          fd|D                       rt-          |          dk     rt          j        dd|                                          t1          fd|D                       s|                                                                dd         }||v rI|                    |           d}t          j        d|t          j                  }|r1|                    d          }|r|                    d          sd|z   }d }t          j        d!          }|r"t9          |                    d                    }d}t          j        d"|          }|st          j        d#|          }|r|                    d          }|                    ||d|d||d$d%           t-          |          |k    r nht          d&d'| d(t-          |           d)           |S )*u   Scraper Amazon.fr — fallback si aucune publication recente.
        Zero BeautifulSoup — parsing regex uniquement.
        z livrezhttps://www.amazon.fr/s?k=z&i=stripbooks&s=date-desc-rankr  r  r	  r  r   r   r   r  Nr   zEchec Amazon.fr: z[data-component-type="s-search-result"[^>]*>(.*?)(?=data-component-type="s-search-result"|$)z,Amazon.fr: aucun resultat parse (anti-bot ?)c                 \    g | ])}t          |          d k    |                                *S r  rS  r  s     rZ   r  z.MediaFetcher._fetch_amazon.<locals>.<listcomp>s  s2     
 
 
c!ffqjjAGGIIjjjr\   z0<h2[^>]*>.*?<a[^>]*>.*?<span[^>]*>([^<]+)</span>r   r   )zactuellement indisponiblezlivres associezet al.zsur 5 etoile
sponsorisezajouter au panierzvoir les offresc              3   :   K   | ]}|t                    v V  d S rU   )rz   )rW   gr  s     rZ   r[   z-MediaFetcher._fetch_amazon.<locals>.<genexpr>  sD       * * {333 * * * * * *r\   r   r  r   c              3       K   | ]}|v V  	d S rU   rV   )rW   r   r  s     rZ   r[   z-MediaFetcher._fetch_amazon.<locals>.<genexpr>  s'      ==1qJ======r\   r  rp   z!<h2[^>]*>.*?<a[^>]*href="([^"]+)"r  zhttps://www.amazon.frr   z\b(20[0-2]\d)\bz!class="s-image"[^>]*src="([^"]+)"z!src="([^"]+)"[^>]*class="s-image"amazonr  r   zAmazon.fr pour 'r   r  )r   r   r  r  r  r  r  r  r  r  r   rm   r   compiler  r  r  r   r  r  r  rf   r   r]   r^   r   r  r   r  r	  )r  r7  r  ra   r   r  r  r  r  block_patternblocksr}  ro  r  r  title_matchr   _AMZ_GARBAGEr  r"  
link_matchr  r  r  	img_matchr  r  s                            @@rZ   r  zMediaFetcher._fetch_amazonQ  s    '';(>(>(>??1E 1 1 1
	.(((#3;7 7(  C
 '']'CC Etyy{{))'))DDE E E E E E E E E E E E E E E 	 	 	0Q00111IIIIII	 
;I
 

 &&t,, 	GHHHIuu
 
*0022
 
 
  L	 L	E)Cry K  ..!!!$$**,,. .E CJJNN  ++----//K  L  * * * *(* * * * * 5zzA~~ 
C77==??J========= {{}}**,,SbS1H4HHX D4ry J  :!''** : 7 7 :2T9D D#5zBBJ 0:++A..// I	4e= =I AI8%A A	 /%OOA..	KK%&"	 	 	 	 	 4yyK'' ( 	VG{GGs4yyGGG	I 	I 	IsB   AB> <*B2&B> 2B66B> 9B6:B> >
C'C"C'"C'c                    g }t          j        d|t           j                  }|s t          j        d|t           j                  }t          j        d|t           j                  }t          |          D ]M\  }\  }}|                     |          }|s!t          j        dd|                                          }	|	                    dd                              dd	          }	|	                    d
d                              dd          }	d}
|t          |          k     rt          j        dd||                                                   }
|
                    dd                              dd	          }
|
                    d
d                              dd          }
|	                    ||	|
d           O|S )zParser les resultats de recherche DuckDuckGo Lite
        Structure : <a class='result-link' href="//duckduckgo.com/l/?uddg=...">Title</a>
        Snippets  : <td class='result-snippet'>texte</td>
        zI<a[^>]*class=['"]result-link['"][^>]*href=["']([^"']*)["'][^>]*>(.*?)</a>zI<a[^>]*href=["']([^"']*)["'][^>]*class=['"]result-link['"][^>]*>(.*?)</a>z4<td[^>]*class=['"]result-snippet['"][^>]*>(.*?)</td>r  rp   r'  r(  r)  r  r*  r  r+  r   )
r   r  r  r   r  r   rf   r   r   r	  )r  r  r  r  r   r   r"  r#  r   r   r   s              rZ   _parse_ddg_litezMediaFetcher._parse_ddg_lite  s   
  
\")
 
  	J`bi E :G")
 

 &/u%5%5 	 	!A!j&&t,,C F:r:66<<>>EMM'3//77#FFEMM(C0088#FFEG3x==  &R!==CCEE!//'377??#NN!//(C88@@#NNNN"      r\   c                     d|v rJt          j        d|          }|r2t          j                            |                    d                    S n|                    d          r|S dS )z'Decoder l'URL de redirection DuckDuckGozuddg=zuddg=([^&]+)r   r  N)r   r  r   r   unquoter  r   )r  r"  r  s      rZ   r  zMediaFetcher._decode_ddg_url  sg    d??Iot44E <|++EKKNN;;;<__V$$ 	Ktr\   c                 t   ddl }g d}|D ]} |j        |||j                  }|r|                    d                                          }|                    dd                              dd                              d	d
                              dd                              dd          }|c S dS )u   v4.6 — Extraire la meta description d'une page HTML.
        Fallback quand _extract_paragraphs retourne trop peu de texte
        (paywall, page dynamique, contenu JS-only).
        r   N)zD<meta\s+name=["\']description["\']\s+content=["\']([^"\']{20,})["\']zD<meta\s+content=["\']([^"\']{20,})["\']\s+name=["\']description["\']zK<meta\s+property=["\']og:description["\']\s+content=["\']([^"\']{20,})["\']zK<meta\s+content=["\']([^"\']{20,})["\']\s+property=["\']og:description["\']r   r'  r(  r,  r-  r.  r/  z&#039;r  r)  r  )r   r  rJ  r  rf   r   )r  r  r   patternspatr  descs          rZ   r  z&MediaFetcher._extract_meta_description  s    
 					
 	
 	
  
	 
	CBIc477E {{1~~++--Wc22 -- -- 3// 3//	 
  tr\   c                     	 ddl m} |                     |          S # t          $ r |                     |          cY S w xY w)zExtraire le texte des balises <p> uniquement
        Regle : ignorer nav, footer, aside, scripts (prompt_architect_v1.1)
        Tente bs4, fallback regex (decision D3)
        r   r  )r  r  _extract_paragraphs_bs4r  _extract_paragraphs_regex)r  r  r  s      rZ   r  z MediaFetcher._extract_paragraphs'  sb    
	8))))))//555 	8 	8 	811$77777	8s    ??c                 X   ddl m}  ||d          }|                    d          D ]}|                                 g }|                    d          D ]@}|                    d          }t          |          dk    r|                    |           Ad	                    |          S )
z!Extraction <p> avec BeautifulSoupr   r  r  zwnav, footer, aside, script, style, header, .nav, .footer, .sidebar, .comments, .social, .ad, .pub, .newsletter, .cookier   Tr  r  

)	r  r  r  	decomposefind_allr  r   r	  rw   )r  r  r  r  tag
paragraphsr   rx   s           rZ   r2  z$MediaFetcher._extract_paragraphs_bs42  s    %%%%%%}T=11 ;;.
 
 	 	C
 MMOOOO
s## 	( 	(A::D:))D4yy2~~!!$'''{{:&&&r\   c                    t          j        dd|t           j        t           j        z            }t          j        d|t           j        t           j        z            }g }|D ]h}t          j        dd|                                          }t          j        dd|          }t          |          dk    r|                    |           id	                    |          S )
z"Fallback extraction <p> avec regexz<(script|style)[^>]*>.*?</\1>rp   r<  z<p[^>]*>(.*?)</p>r  r>  r   r  r5  )	r   r   r  rJ  r  rf   r   r	  rw   )r  r  r9  textsr   rx   s         rZ   r3  z&MediaFetcher._extract_paragraphs_regexG  s     v,b	BM1
 
 
 Z ")bm+
 

  	# 	#A6*b!,,2244D6&#t,,D4yy2~~T"""{{5!!!r\   c                     	 t           j                            |          }|j                                        }|                    d          r
|dd         }|S # t          $ r Y dS w xY w)zExtraire le domaine d'une URLr   r  Nrp   )r   r   r   r   r]   r   r   )r  r   r  r   s       rZ   r  zMediaFetcher._extract_domain]  sy    	\**3//F]((**F  (( $M 	 	 	22	s   AA 
A)(A))r   r   r  r   r   r   r  r  r   r   r   r   r
   r	   r   zslate.frzhuffingtonpost.frrR   r   r   c           	      	   ddl }t          dd|z             i }|rA|D ]=}|                    dd          }|                    dd          }|                    dd          }|sH|                                }	|	                    d	          r
|	d
d         }	d}
| j        D ]}||	v s|	|v r|}
 n|
s|
|vrt                      g g d||
<   |r|||
         d         vr|||
         d                             |           |r!||
         d                             |           | 	                    |          }|r!||
         d                             |           ?g d}d|z  }|D ][}||v r t          ||         d                   dk    r'	 |d|}t          |d          }|sC||vrt                      g g d||<   ||         }|D ]}|                    dd          }|                    dd          }|rt||d         vrj|d                             |           |r|d                             |           | 	                    |          }|r|d                             |           t          dd|t          |          fz             /# t          $ r!}t          dd|d|           Y d}~Ud}~ww xY wg }t          |                                d d          D ]\  }}t          |d                   }|| j        k     r&t          t          |d                             }|rt!          |d                   nd}|rt!          |d                   nd}|d         dd         }|                    |||||d           |st          dd|z             dS |d         }d}|d         r7|d          r/|d         |d          k    r	|d         }n'|d         d!|d          }n|d         rd"|d         z  }g }|dd#         D ]} | d                             d$          d                             d%d&          }!d}"| d         r6| d          r.| d         | d          k    r	| d         }"n| d         d!| d          }"d'|!| d(         fz  }#|"r|#d)|"z  z  }#|#d*z  }#|                    |#           d+d,                    |          z  }$d||d         |$d-}%t          dd.|$z             |%S )/a  Detecter si une personne est journaliste/contributeur regulier
        d'un ou plusieurs medias, en comptant les articles par domaine.

        Strategie :
        1. Parcourir les sources medias deja collectees (pipeline)
        2. Recherche DDG ciblee sur les domaines medias cles
        3. Compter les articles par domaine
        4. Deduire career_start / career_end depuis les dates trouvees

        Retourne un dict :
            {
                "is_media_contributor": bool,
                "domains": [
                    {"domain": str, "nb_articles": int,
                     "career_start": str|None, "career_end": str|None,
                     "sample_titles": list[str]},
                ],
                "primary_domain": str|None,
                "summary": str,   # "Contributeur France Inter (2015-2023, 12 articles)"
            }
        ou None si aucun profil editorial detecte.
        r   Nr   z5[AUTHOR-PROFILE] Recherche profil editorial pour '%s'r   rp   r   r   r   r  )urlsr  yearsr>  r  r?  )r   r   r   r   r   z"%s"r   z site:r   r  u0   [AUTHOR-PROFILE] Serper site:%s → %d resultatsr   z[AUTHOR-PROFILE] Serper site:z	 erreur: c                 8    t          | d         d                   S )Nr   r>  r  r  s    rZ   r  z3MediaFetcher.fetch_author_profile.<locals>.<lambda>  s    #ad6l++ r\   Tr~  r   )r   nb_articlescareer_start
career_endsample_titlesz9[AUTHOR-PROFILE] Aucun profil editorial detecte pour '%s'rB  rC  r`  z	depuis %sr   r@  francezFrance z%s (%d articlesrA  z, %sr  zContributeur %sz / )is_media_contributordomainsprimary_domainsummaryz#[AUTHOR-PROFILE] Profil detecte: %s)r   rm   r  r]   r   _CAREER_MEDIA_DOMAINSr  r  r	  _extract_year_from_urlr   r  r   r  r   _CAREER_MIN_ARTICLESr  r   r   rw   )&r  r7  media_sources_redomain_datar  r   r   r   	clean_dommatchedmdr  priority_domainsquoted_nametarget_domain	ddg_queryr  ddre   r_urlr_titler  domains_resultnbr?  rB  rC  sampleprimaryperiodsummary_partsrI  rJ  r   entryprofile_summaryrh  s&                                         rZ   fetch_author_profilez!MediaFetcher.fetch_author_profilex  s   . 	V%&	' 	' 	'   	C$ C C2..ggeR((,, "LLNN	''// . )!""I4  BY)r//"$ +:  +-- #b, ,K(  C3k'&:6&BBB(044S999 E#G,X6==eDDD66s;;D C#G,W5<<TBBB
 
 
 {*- !	* !	*M,,K6v>??1DD*,7KKG	(;;;  33 #b2 2K. !/  	5 	5AEE%,,EeeGR00G 5bj!8!86
u---" 9xL//888#::5AA 5wK..t444VF$c'll345 5 5 5  * * *VV$}}aa)* * * * * * * **  ++
 
 
 	 	JFB
 RZBD---3r'{++,,E,1;3uQx===tL+0:U2YdJ\"1"%F!! ! ,(!'# #      	)*+ + + 4 #>" 	;w|'< 	;~&',*??? 0 N+++W\-B-BD^$ 	; 7>#::F# 	( 	(AhK%%c**1-55h	JJEA  GQ|_ G^$,77.)AA#$^#4#4#4aooFA%-0@(AAE $!#SLE  ''''+ejj.G.GG %)%%h/&	
 
 	V1OC	E 	E 	E s   J,C*J
K"J>>Kc                     ddl }|                    d|           }|r4t          |                    d                    }d|cxk    rdk    rn n|S dS )zqExtraire une annee (2000-2039) depuis une URL.
        Patterns : /2023/04/, /2021-06-15, /article-2019-
        r   Nz/(\d{4})[-/]r   i  i  )r   r  r  r  )r   rN  r  r  s       rZ   rK  z#MediaFetcher._extract_year_from_url)  sn    
 	

?C00 	u{{1~~&&Dt####t#####tr\   )r   )r   N)r  r   )r   )NNr  )r  )FNNrU   )r   )rp   )r   NT)S__name__
__module____qualname____doc__r  MAX_CHARS_PER_ARTICLEr  r  r
  r  r\  rP  r9  r:  r;  r  r=  r<  r  r  r  r  r  r  r  r  r%  r  rS  rg  rf  rh  rj  ri  rk  rl  rm  rn  ro  rp  rq  rr  r  r  r  r  r  r  r  r  r  r  r  r  r  staticmethodr%  rC  rr  rw  r~  ry  r  r  r  r  r  r  r  r  r  r  r*  r  r  r  r2  r3  r  rJ  rL  rb  rK  rV   r\   rZ   r  r    sd       KK   0 1F 1 1 1 1f) ) ) )Z   56'+f f f fV# # #p
 p
 p
dK K K KZr r r rhR R R Rj  OM M M M^ ;?+/z z z z|      :  &O O O Oh    4@ @ @Dn
 n
 n
`W W Wr	 	 	G G GX =OCLK
 #H-K/L2I,/N6=HHIy y yz" " "H  *& & & &R 	
K K K^ 37)-X X X Xx 5:8<C C C CN  : 5,'	 	
 =)+''	 	 3)#'	 	
$ $KP U U \Un\ \ \ \B? ? ?
E E EB B BG G G> > >8 8 8= = =D D D% O,' ' 'Z   *U U Un i i i \iV ;=6:Z Z Z Zx9 9 9 9vD D D DLK K K KZG G G GR' ' 'R# # #.W W W Wrv v v vt/ / /b    >	8 	8 	8' ' '*" " ",	 	 	   o o o ob 
 
 \
 
 
r\   r  c                   l    e Zd ZdZdZdZdZd Zd Zd Z	dd	Z
ddZddZd Zd Zd Zd Zd Zd ZdS )RefletsScraperaG  Scraper authentifie pour reflets.info (media investigation cyber)
    Reflets.info utilise Devise (Ruby on Rails) pour l'authentification.
    Login : POST /users/sign_in avec CSRF token (authenticity_token)
    Recherche : GET /articles?search=QUERY (fonctionne SANS auth)
    Articles complets : necessite auth (paywall)
    z"https://reflets.info/users/sign_inzhttps://reflets.info/articleszhttps://reflets.infoc                 z   t           j                                        | _        t          j                            t          j                            | j                            | _        d| _	        t          j                            dd          | _        t          j                            dd          | _        d S )NFREFLETS_EMAILrp   REFLETS_PASSWORD)r  	cookiejar	CookieJar
cookie_jarr   r  build_openerHTTPCookieProcessoropener	logged_inosenvironr  emailpassword)r  s    rZ   __init__zRefletsScraper.__init__I  s    .2244n11N..t??
 
 Z^^OR88

'92>>r\   c                 T   	 t           j                            | j        t          dd          }| j                            |d          }|                                                    dd          }t          j
        d	|          }|r%t          d
d           |                    d          S t          j
        d|          }|r%t          d
d           |                    d          S t          dd           dS # t          $ r}t          dd|            Y d}~dS d}~ww xY w)zExtraire le token CSRF (authenticity_token) depuis la page login.
        Devise Rails genere un token CSRF dans un champ hidden du formulaire
        et/ou dans une balise <meta name='csrf-token'>.
        r  rU  r  r   r   r   r   r  z8<input[^>]+name="authenticity_token"[^>]+value="([^"]+)"r   z*Reflets: CSRF token extrait (hidden input)r   z-<meta\s+name="csrf-token"\s+content="([^"]+)"z&Reflets: CSRF token extrait (meta tag)r   z2Reflets: CSRF token introuvable sur /users/sign_inNz#Reflets: echec GET /users/sign_in: )r   r  r  	LOGIN_URLr  ru  r  r  r  r   r  rm   r  r   )r  r  r  r  rM  r  s         rZ   _get_csrf_tokenzRefletsScraper._get_csrf_tokenR  sW   
"	.((",)  )  C ;##C#44D99;;%%gi%@@D 	K A  "VIJJJwwqzz! 	@ A  "VEFFFwwqzz!MNNN4 	 	 	BqBBCCC44444	s$   B.C? 1;C? -C? ?
D'	D""D'c           
      j   | j         r| j        st          dd           dS |                                 }|st          dd           dS 	 t          j                            || j         | j        ddd                              d          }t          j        	                    | j
        |t          d	| j
        | j        d
d          }| j                            |d          }|                                }d|v rt          dd           dS d | j        D             }t          dd|            t#          d |D                       | _        | j        sd| _        t          dd| d           nt          dd           | j        S # t&          $ r}t          dd|            Y d}~dS d}~ww xY w)u   Authentification a reflets.info via Devise (Ruby on Rails).
        Etape 1 : GET /users/sign_in → extraire authenticity_token
        Etape 2 : POST /users/sign_in avec user[email], user[password], token
        r   zGReflets: credentials non configurees (REFLETS_EMAIL / REFLETS_PASSWORD)Fz+Reflets: impossible de recuperer CSRF tokenr  zSe connecter)authenticity_tokenzuser[email]zuser[password]zuser[remember_me]commitr   z!application/x-www-form-urlencodedr  )r  r   RefererOriginrV  )r   r   r   r   z/users/sign_inzCReflets: echec auth (redirige vers sign_in, credentials invalides?)c                     g | ]	}|j         
S rV   )r   r  s     rZ   r  z(RefletsScraper.login.<locals>.<listcomp>  s    <<<qAF<<<r\   r   zReflets: cookies apres login: c              3   .   K   | ]}d |v pd|v pd|v V  dS )_sessionrememberuserNrV   )rW   r  s     rZ   r[   z'RefletsScraper.login.<locals>.<genexpr>  sL       ! ! aA:?Afk! ! ! ! ! !r\   Tz.Reflets: auth presumee reussie (redirect vers r  z<Reflets: authentification reussie (cookies session detectes)zReflets: echec login: N)ry  rz  rm   r~  r   r   r  r   r  r  r}  r  BASE_URLru  r  geturlrr  r^   rv  r   )r  
csrf_tokenr   r  r  	final_urlcookie_namesr  s           rZ   loginzRefletsScraper.login{  sc   
 z 	 	56 6 6 5 ))++
 	FGGG59	<))&0#z"&-%((+ +   vg  .((",$G#~"m?  ) 
 
C ;##C#44DI  9,,VFG G G u =<DO<<<L???A A A ! ! !%! ! !  DN
 > 	3!%V3&/3 3 34 4 4 4 V23 3 3 >! 	 	 	5!5566655555	s    
CF
 A;F
 

F2F--F2r   c                    	 t           j                            d|i          }| j         d| }t           j                            |t          dd          }| j                            |t                    }|
                                                    dd	          }|                     ||          S # t          $ r}t          d
d|            g cY d}~S d}~ww xY w)zRechercher des articles sur reflets.info.
        La recherche fonctionne SANS authentification.
        URL : /articles?search=QUERY
        r  r  r  rU  r  r   r   r   r  r   zReflets: echec recherche: N)r   r   r  
SEARCH_URLr  r  r  ru  r  r  r  r  _parse_search_resultsr   rm   )	r  ra   r  r  r   r  r  r  r  s	            rZ   search_articleszRefletsScraper.search_articles  s    
	\++Xu,=>>F_//v//C.(((%7 7(  C
 ;##C#??D99;;%%gi%@@D--dK@@@ 	 	 	9a99:::IIIIII	s   B5B8 8
C!CC!C!r1   c                 h   | j         s)|                                 st          dd           dddS 	 t          j                            |t          d| j        dz   d	          }| j        	                    |t          
          }|                                                    dd          }|                     |          }|r#t          |          |k    r|d|dz
           dz   }|                     |          }||dS # t           $ r%}t          dd| d|            dddcY d}~S d}~ww xY w)zExtraire le contenu complet d'un article (avec auth paywall).
        Retourne un dict {"text": str|None, "author": str} avec le texte
        et le nom de l'auteur/signataire de l'article.
        r   zMReflets: impossible d'acceder a l'article (auth requise pour contenu complet)Nrp   )rx   r  r  r   )r  rV  r  r  r   r   r   r  r   r  zReflets: echec fetch article r  )rv  r  rm   r   r  r  r  r  ru  r  r  r  r  _extract_article_textr   _extract_article_authorr   )	r  r   r  r  r  r  rx   r  r  s	            rZ   fetch_articlezRefletsScraper.fetch_article  s}   
 ~ 	0djjll 	067 7 7 !B///	0.(((%=3.7 7(  C
 ;##C#??D99;;%%gi%@@D--d33D 4D		I--NY]N+e311$77F F333 	0 	0 	0CCCCCDDD B////////	0s   CD 
D1D,&D1,D1c                    |                      ||          }|st          dd           g g ddS t          ddt          |           d           | j        s$|                                 st          dd           d	 |                                D             }g }g }|D ]S}|                     |d
                   }|                    d          }	|                    dd          }
|	rt          |	          dk    rd}|
r/|
                                t          fd|D                       }dd|d
         |d         |	|	dd         |
|d}|
                    |           |rO|
                    |d         |d
         |
d           t          dd| d|d         dd          d|
 d           t          dd|d         dd          dt          |	           d|
rd|
 dnd            Ut          |          dk    }t          ddt          |           d t          |           d!| dt          |           d"	           |||dS )#u#  Pipeline complet : recherche + extraction pour synthese.
        Retourne un dict structuré :
            {
                "sources": list[dict] — sources compatibles pipeline,
                "author_articles": list[dict] — articles signes par la personne,
                "is_contributor": bool — True si la personne est auteur
            }
        Pipeline :
        1. Recherche articles (sans auth)
        2. Login si pas deja fait
        3. Extraction contenu complet + auteur (avec auth)
        4. Detection role auteur
        r   z/Reflets: aucun article trouve pour la rechercheF)r  author_articlesis_contributorz	Reflets: z+ articles trouves, extraction du contenu...r   z5Reflets: auth echouee, tentative extraction sans authc                 \    g | ])}t          |          d k    |                                *S r  rS  r@  s     rZ   r  z0RefletsScraper.fetch_sources.<locals>.<listcomp>$  s2     
 
 
A

AGGII


r\   r   rx   r  rp   r   c              3       K   | ]}|v V  	d S rU   rV   )rW   r   author_lowers     rZ   r[   z/RefletsScraper.fetch_sources.<locals>.<genexpr>5  s9       $ $./\)$ $ $ $ $ $r\   r  rR   r   Nr   )r  r   r   r   rx   r   r  	is_author)r   r   r  zReflets: article SIGNE par 'r   r   z
 (auteur: r  zReflets source: r  r  r   z sources exploitables sur z articles, auteur=z signes))r  rm   r   rv  r  r   r  r  r]   r^   r	  )r  ra   r  r  query_wordsr  r  artrh  rx   r7  r  source_entryr  r  s                 @rZ   fetch_sourceszRefletsScraper.fetch_sources  s    ''{;; 	JKKK#%"'   	V'H ' ' '	( 	( 	(
 ~ 	6::<< 6V56 6 6

 
${{}}
 
 

  '	Q '	QC''E
33F::f%%D **Xr22K "QD		C!	 #.#4#4#6#6L # $ $ $ $3>$ $ $ ! !I
 $,u: \ #DSDz)!*	  	  |,,, Q#**!$W"5z"-, ,   
 Gu G Gw<,G G8CG G GH H H H P3w<+< P PIIP P:EM666662P PQ Q Q
 _--1VGG G Gx==G G$G G(+O(<(<G G G	H 	H 	H .,
 
 	
r\   c                    	 ddl m} n6# t          $ r) t          dd           |                     ||          cY S w xY w ||d          }g }t                      }|                    dd          D ]q}|                    d	d
          }d|vr|                    d          s|                    d          rIt          j
        d|          }	|	sa|                    d          s
| j        |z   }||v r|                    |           |                    d          }
|
rt          |
          dk     r/|                    g d          }|r|                    d          }
|
rt          |
          dk     r;|	                    d                              dd                                          }
|                    |
|d           t          |          |k    r nst          ddt          |           d           |S )zParser les resultats de recherche Reflets.info.
        Structure confirmee : liens <a href='/articles/SLUG'> dans la page.
        r   r  r   z*bs4 non disponible, fallback regex Refletsr  ry  T)r"  r"  rp   z
/articles/z	/articlesz/articles/([a-z0-9\-]+)r  r  r   )h2h3h4r   r`  r   r   r   r   zReflets search: z articles trouves)r  r  r  rm   _parse_search_results_regexr  r7  r  rB  r   r  r   r  r  r  r   r  r  r   r   r	  )r  r  r  r  r  r  r  r   r"  
slug_matchr   rv  s               rZ   r  z$RefletsScraper._parse_search_resultsa  sa   	G))))))) 	G 	G 	GEFFF33D+FFFFF	G }T=11EE	 MM#DM11 &	 &	D88FB''D 4'' }}[)) T]]<-H-H  #=tDDJ  ??6** ,}t+ y  MM$ MMM--E 8CJJNN))*<*<*<== 8"OO$O77E FCJJNN"((++33C==CCEEOOeD99:::8}}++ , 	VHHHHHIIIs   	 0<<c                    g }t                      }t          j        d|          D ]}| j        |                    d          z   }|                    d                                          }||v s|rt          |          dk     r`|                    |           |                    ||d           t          |          |k    r n|S )"Fallback regex si bs4 indisponiblez6<a[^>]+href="(/articles/[a-z0-9\-]+)"[^>]*>([^<]+)</a>r   r   r   r  )	r  r   finditerr  r  rf   r   r  r	  )r  r  r  r  r  rM  r"  r   s           rZ   r  z*RefletsScraper._parse_search_results_regex  s    EE	E
 
 	 	A =1771::-DGGAJJ$$&&Ey   UaMM$OOeD99:::8}}++ , r\   c                 8   	 ddl m} n%# t          $ r |                     |          cY S w xY w ||d          }|                    d          D ]}|                                 g d}|D ]d}|                    |          }|rK|                    d          }d                    d |D                       }	t          |	          d	k    r|	c S e|                    d          }
d                    d
 |
D                       }	t          |	          d	k    r|	S dS )z~Extraire le texte d'un article Reflets.info.
        Gere le cas paywall (contenu partiel) et contenu complet (auth).
        r   r  r  znav, footer, aside, .comments, script, style, .sidebar, .widget, .related-posts, .share-buttons, .paywall-teaser, .subscription-cta, header)z.article-contentz.article-bodyz.post-contentz.entry-contentzarticle .contentr#  zmain .contentmainr   r  c              3      K   | ]A}t          |                    d                     dk    )|                    d           V  BdS )Tr  r  Nr   r  r  s     rZ   r[   z7RefletsScraper._extract_article_text.<locals>.<genexpr>  s]       ! !/01::D:1122R77 JJTJ**7777! !r\   r   c              3      K   | ]A}t          |                    d                     dk    )|                    d           V  BdS )Tr  r   Nr  r  s     rZ   r[   z7RefletsScraper._extract_article_text.<locals>.<genexpr>  s]       
 
'(1::D:))**R// JJTJ""////
 
r\   N)
r  r  r  _extract_article_text_regexr  r6  r  r7  rw   r   )r  r  r  r  r8  	selectorsselr  r9  rx   all_ps              rZ   r  z$RefletsScraper._extract_article_text  s   	:))))))) 	: 	: 	:33D99999	: }T=11 ;;9
 
 	 	C
 MMOOOO	
 	
 	
	  		  		 Cooc**G  $--c22
yy ! !4>! ! !   t99s??KKK c""yy 
 
,1
 
 
 
 
 t99s??Kt   	 ++c                 H   t          j        d|t           j                  }g }|D ]R}t          j        dd|                                          }t          |          dk    r|                    |           Sd                    |          }t          |          dk    r|ndS )r  z<p[^>]*>(.+?)</p>r  rp   r  r  r   N)r   r  r  r   rf   r   r	  rw   )r  r  r9  r;  r   r  rx   s          rZ   r  z*RefletsScraper._extract_article_text_regex  s     Z 4dBIFF
 	$ 	$AF:r1--3355E5zzBU###yy4yy3ttD0r\   c                    	 ddl m} n%# t          $ r |                     |          cY S w xY w ||d          }g d}|D ][}|                    |          }|rB|                    d          }|r*t          |          dk    rt          |          dk     r|c S \|                    d	d
di          }|r?|                    dd          	                                }	|	rt          |	          dk    r|	S dS )zExtraire le nom de l'auteur/signataire d'un article Reflets.info.
        Cherche dans les selecteurs byline courants + meta author.
        Retourne le nom de l'auteur ou chaine vide.
        r   r  r  )z.author-namez	.author az	.byline az[rel='author']z.entry-author az.authorz.bylinez.entry-authorz.post-authorz.article-authorzspan.authorza.authorTr  r   r  r3  r   r  r  r  rp   )
r  r  r  _extract_article_author_regexr  r  r   rw  r  rf   )
r  r  r  r  r  r  r   r   meta_authorr  s
             rZ   r  z&RefletsScraper._extract_article_author  sE   
	<))))))) 	< 	< 	<55d;;;;;	< }T=11
 
 
	  	  	 C%%B  {{{..  CIIMMc$ii"nnKKK iivx.@iAA 	!ooi44::<<G 3w<<!++rr  c                 R   t          j        d|          }|rQ|                    d                                          }|r(t	          |          dk    rt	          |          dk     r|S t          j        d|          }|r'|                    d                                          S dS )z%Fallback regex pour extraction auteurz,class="(?:author|byline)[^"]*"[^>]*>([^<]+)<r   r   r  z)<meta\s+name="author"\s+content="([^"]+)"rp   )r   r  r  rf   r   )r  r  rM  r   s       rZ   r  z,RefletsScraper._extract_article_author_regex  s     I;
 
  	771::##%%D D		A#d))b.. I8
 
  	&771::##%%%rr\   Nrd  )r1   )re  rf  rg  rh  r}  r  r  r{  r~  r  r  r  r  r  r  r  r  r  r  rV   r\   rZ   rl  rl  =  s          5I0J%H? ? ?' ' 'RK K KZ   .0 0 0 0@`
 `
 `
 `
D8 8 8t  ,4 4 4l
1 
1 
1" " "H    r\   rl  )rp   rp   )r   r   r   )r   r   r  )r   r2   )r  )@rh  urllib.requestr   urllib.parsehttp.cookiejarr  r   r   rw  rQ  r  ri  r  r  r_   rg   rb   rh   rm   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   getenvr   r  r  r  r  r(  rU  rO  rP  r  r  r  r  r  r  r  r  	frozensetr  r  r:  rS  r[  rl  rr  r  r  r  r  rl  rV   r\   rZ   <module>r     s                 				 				  8     % 	 	 	   7 7 7	9 	9 	96 6 6
D D D< < <,  &$ $ $ N  *  K K K   (
E 
E 
E  
5 
5 
5   ,! ! ! !L0 0 0j +R0048 ( ( ( (V   B5 5 5x    
    
        
  [2 2 j       ): \ \	
 K : ) [ K : ~ :  m m  z!" \#$ !!"9   >   
! ! ! !.5 5 5  @A!3 !3 !3 !3H    ()1% 1% 1% 1%h &I 	' 	' 	' 	 	 " " "JC C CL! ! !H  = = =@  @& & &RU U UpI I IXmB mB mB mB mB mB mB mBlEn n n n n n n n n nr\   