
    ۬i1                         d Z ddlZddlZddlZddlmZ ddlmZ  ej        d          Z	dZ
dZdZd	Z G d
 d          ZdS )u}  
ICAC RAG Engine — local PDF indexing + semantic search
Session 4: PyMuPDF + sentence-transformers + ChromaDB

Pipeline:
  1. PDF → text extraction (PyMuPDF/fitz)
  2. Text → chunks (500 chars, 100 overlap)
  3. Chunks → embeddings (sentence-transformers, all-MiniLM-L6-v2)
  4. Embeddings → ChromaDB (local, persistent)
  5. Query → cosine similarity → top-5 chunks
    N)Path)Optionalzicac.ragi  d   zall-MiniLM-L6-v2icac_documentsc                       e Zd ZdZdedefdZd ZdedefdZd	ed
edefdZ	e
dedefd            Ze
dedefd            ZdedefdZ	 	 ddededededef
dZddededefdZdefdZdefdZdS )	RAGEnginez8Local RAG engine using ChromaDB + sentence-transformers.docs_dir
chroma_dirc                 &   t          |          | _        t          |          | _        | j                            dd           | j                            dd           d| _        d| _        d| _        d| _        |                                  dS )z&Initialize ChromaDB + embedding model.T)parentsexist_okNF)	r   r	   r
   mkdir	_embedder_chroma_client_collection_initialized
_lazy_init)selfr	   r
   s      )/var/www/icac/agents_python/rag_engine.py__init__zRAGEngine.__init__   s    Xz**D4888dT:::"!    c                    | j         rdS 	 ddl}ddlm} |                    t          | j                   |d                    | _        | j                            t          ddi	          | _
        t                              d
t          | j
                                                   nY# t          $ r t                              d           Y dS t           $ r&}t                              d|           Y d}~dS d}~ww xY w	 ddlm}  |t(                    | _        t                              dt(                     nY# t          $ r t                              d           Y dS t           $ r&}t                              d|           Y d}~dS d}~ww xY wd| _         t                              d| j        | j                   dS )zLazy-load heavy dependencies.Nr   )SettingsF)anonymized_telemetry)pathsettingsz
hnsw:spacecosine)namemetadatau,   ChromaDB ready — collection '%s' (%d docs)u'   chromadb not installed — RAG disabledzChromaDB init failed: %s)SentenceTransformerzEmbedding model loaded: %su4   sentence-transformers not installed — RAG disabledzEmbedding model failed: %sTu4   RAGEngine fully initialized — docs: %s, chroma: %s)r   chromadbchromadb.configr   PersistentClientstrr
   r   get_or_create_collectionCOLLECTION_NAMEr   loginfocountImportErrorwarning	Exceptionerrorsentence_transformersr    EMBED_MODELr   r	   )r   r!   r   er    s        r   r   zRAGEngine._lazy_init,   s#    	F	OOO000000"*";";))!u=== #< # #D  $2KK$&1  L    D HHC$d&6&<&<&>&>@ @ @ @ 	 	 	KKABBBFF 	 	 	II0!444FFFFF			AAAAAA00==DNHH1;???? 	 	 	KKNOOOFF 	 	 	II2A666FFFFF	 !G	1 	1 	1 	1 	1s<   B"B. .$D	DC??D;E $F+	F4FFpdf_pathreturnc                    	 ddl }|                    |          }g }t          t          |                    D ]P}||         }|                                }|                                r|                    d|dz    d|            Q|                                 d                    |          }t          
                    dt          |          t          |          t          |          j                   |S # t          $ r t                              d           Y d	S t          $ r'}	t                              d
||	           Y d}	~	d	S d}	~	ww xY w)z+Extract text from PDF using PyMuPDF (fitz).r   Nz[PAGE    z]


z$Extracted %d chars from %d pages: %su3   PyMuPDF (fitz) not installed — cannot extract PDF z PDF extraction failed for %s: %s)fitzopenrangelenget_textstripappendclosejoinr'   r(   r   r   r*   r-   r,   )
r   r1   r7   doc
text_partspage_numpagetext	full_textr0   s
             r   _extract_text_from_pdfz RAGEngine._extract_text_from_pdfU   s_   	KKK))H%%CJ!#c((OO H H8}}}::<< H%%&Fx!|&F&F&F&FGGGIIKKKJ//IHH;^^S__d8nn6IK K K 	 	 	IIKLLL22 	 	 	II8(AFFF22222	s   C;C> >$E%	E.EErD   sourcec                 b   t          j        dd|          }t          j        dd|          }g }d}d}|t          |          k     r1|t          z   }|t          |          k     rDdD ]A}|                    ||t          dz  z   |dz             }||k    r|t          |          z   } nB|||                                         }	t          |	          d	k    rit          j        | d
|                                           	                                dd         }
|
                    | d|
 |	||d           |dz  }|t          z
  }|dk    r|t          |          k    rn|t          |          k     1t                              dt          |          t          |          |           |S )z#Split text into overlapping chunks.z\n{3,}r5   z {2,} r   )z. z.
r5   
rI      2      :N   _)idrD   rG   	chunk_idxr4   u+   Chunked %d chars → %d chunks (source: %s))resubr:   
CHUNK_SIZErfindr<   hashlibmd5encode	hexdigestr=   CHUNK_OVERLAPr'   r(   )r   rD   rG   chunksposchunk_idendboundarybp
chunk_text
chunk_hashs              r   _chunk_textzRAGEngine._chunk_textl   s    vi..vhT**CIIoo
"C SYY @  HHcJ!O.CS2XNNBCxx 3x==0   c#g,,..J:##$[F)?)?X)?)?)F)F)H)HIISSUUVYWYVYZ
#22j22&$!)	     A%CaxxC3t99,,3 CIIoo6 	>TCKK	1 	1 	1r   filenamec                 R   |                                  t          fddD                       rdS t          fddD                       rdS t          fddD                       rd	S t          fd
dD                       rdS t          fddD                       rdS dS )z#Detect document type from filename.c              3       K   | ]}|v V  	d S N .0kfns     r   	<genexpr>z-RAGEngine._detect_doc_type.<locals>.<genexpr>   s'      HH1qBwHHHHHHr   )comptegestioncg_comptescompte_gestionc              3       K   | ]}|v V  	d S rh   ri   rj   s     r   rn   z-RAGEngine._detect_doc_type.<locals>.<genexpr>   s'      TTQbTTTTTTr   )pvprocesconseildeliberu   séance
pv_conseilc              3       K   | ]}|v V  	d S rh   ri   rj   s     r   rn   z-RAGEngine._detect_doc_type.<locals>.<genexpr>   s'      GGQbGGGGGGr   )budgetprimitifbp_bp2budget_primitifc              3       K   | ]}|v V  	d S rh   ri   rj   s     r   rn   z-RAGEngine._detect_doc_type.<locals>.<genexpr>   s'      NNQbNNNNNNr   )bulletinmagazineinfo_municipalebulletin_municipalc              3       K   | ]}|v V  	d S rh   ri   rj   s     r   rn   z-RAGEngine._detect_doc_type.<locals>.<genexpr>   s'      IIQbIIIIIIr   )marcheu   marchéappellot_marche_publicautre)lowerany)re   rm   s    @r   _detect_doc_typezRAGEngine._detect_doc_type   s     ^^HHHH GHHHHH 		###TTTT"STTTTT 	#<GGGG"FGGGGG 	#$$NNNN"MNNNNN 	#''IIII"HIIIII 	#"?wr   c                 h    ddl }|                    d|           }|r|                    d          ndS )zExtract year from filename.r   Nz20[12]\dr6   )rS   searchgroup)re   _rems      r   _detect_anneezRAGEngine._detect_annee   s<     	JJ{H--&qwwqzzzB&r   c                   
 | j         s7|                                  | j         st                              d           dS t	          |          j        }t	          |          j        | j                            d|i          }|rY|                    d          rDt          	                    d|t          |d                              t          |d                   S |                               |                               
t          	                    d
           |                     |          }|sdS |                     ||          }|sdS d |D             }d	 |D             }
fd
|D             }| j                            |d                                          }	| j                            ||	||           t          	                    d|t          |                     t          |          S )u=   Extract, chunk, vectorize a PDF → return nb chunks indexed.u&   RAG not initialized — skipping indexr   rG   )whereidsz(Document already indexed: %s (%d chunks)u%   Indexing %s → doc_type=%s, annee=%sc                     g | ]
}|d          S )rD   ri   rk   cs     r   
<listcomp>z,RAGEngine.index_document.<locals>.<listcomp>   s    +++q6+++r   c                     g | ]
}|d          S )rQ   ri   r   s     r   r   z,RAGEngine.index_document.<locals>.<listcomp>   s    '''1qw'''r   c                 b    g | ]+}|d          |d         |                     dd          d,S )rG   rR   rC   r4   )rG   rR   re   doc_typeanneerC   )get)rk   r   r   r   re   s     r   r   z,RAGEngine.index_document.<locals>.<listcomp>   sZ         k;  EE&!$$
 
   r   F)show_progress_bar)r   
embeddings	documents	metadatasz2Indexed %s: %d chunks stored in ChromaDB (type=%s))r   r   r'   r+   r   stemr   r   r   r(   r:   r   r   rF   rd   r   rY   tolistadd)r   r1   pdf_nameexistingrD   r\   textsr   r   r   r   r   re   s             @@@r   index_documentzRAGEngine.index_document   s?     	OO$ DEEEq>>&>>& #''x.B'CC 	(U++ 	(HH?s8E?335 5 5x''' ((22""8,,8(HeTTT **844 	1 !!$11 	1 ,+F+++'''''        	 ^**5E*JJQQSS
!	 	 	
 	
 	
 	ExQTU[Q\Q\^fggg6{{r   N   ffffff?queryr   top_k	min_scorec                    | j         s|                                  | j         sg S | j                                        dk    rg S 	 |rd|ind}| j                            |g                                          }| j                            |t          |dz  | j                                                  |g d          }g }|r|	                    d          rot          |d         d                   D ]R\  }	}
|	                    d          r|d         d         |	         ni }|	                    d	          r|d	         d         |	         nd}t          d
|z
  d          }||k     rGt                              d|||	                    dd          |	                    dd                     |                    |
|	                    dd          |	                    dd          |	                    dd          |	                    dd          |	                    dd
          |	                    dd          |d           Tt          |d d          d|         }t                              d|dd         |pdt!          |          |           |S # t"          $ r'}t                              d|           g cY d}~S d}~ww xY w)z
        Semantic search in ChromaDB.
        Only returns chunks with score >= min_score.
        Cosine score: 1.0 = identical, 0.0 = unrelated.
        0.35 = reasonable threshold for administrative text.
        r   r   NrK   )r   r   	distances)query_embeddings	n_resultsr   includer   r   r   r4      z(RAG ignored (score=%.3f < %.2f): %s p.%sre   ?rC   rG   r6   r   rR   )rD   rG   re   r   r   rC   rR   scorec                     | d         S )Nr   ri   )xs    r   <lambda>z"RAGEngine.search.<locals>.<lambda>  s
    !G* r   T)keyreverseuE   RAG search '%s' (doc_type=%s) → %d chunks relevant (threshold=%.2f)rL   allzSearch RAG: %s)r   r   r   r)   r   rY   r   r   minr   	enumerateroundr'   r(   r=   sortedr:   r,   r-   )r   r   r   r   r   r   query_embeddingresultsr\   ir@   metadistr   r0   s                  r   r   zRAGEngine.search   s      	OO$ 	!!##q((I2	.6@Z**DE"n33UG<<CCEEO&,,!0eai)9)?)?)A)ABB???	 -  G F 7;;{33 '(<Q(?@@  FAs9@[9Q9QY7;/2155WYD9@[9Q9QX7;/2155WXD!!d(A..E y((F!9 HHZ55 HHVS11	   !MM #"&((8S"9"9$(HHZ$<$<$(HHZ$<$<!%'2!6!6 $ 3 3%)XXk1%=%=!&	# 	# 	 	 	 	 F(<(<dKKKFUFSFHHWcrc
H-s6{{I   M 	 	 	II&***IIIIII	s   IJ 
K(K
K
Kc           	         |                      ||dd          }|s(t                              d|dd         |pd           dS d	g}t          |d
          D ]c\  }}|d          d|d          d|d          d|d         dd}|                    d| d| d           |                    |d                    dd                    |          S )z1Return formatted context for LLM from RAG search.r   r   )r   r   r   z-RAG: no relevant chunk for '%s' (doc_type=%s)N<   r   r6   zDOCUMENTS LOCAUX (RAG):r4   re   z p.rC   rI   r   z [score=r   z.2f]z
--- Extrait z (z) ---rD   rJ   )r   r'   r(   r   r=   r?   )r   r   r   r\   partsr   r   srcs           r   get_contextzRAGEngine.get_context)  s    UXQ$OO 	HHDeCRCjRZRc^cddd2*+fa(( 	$ 	$DAqz]XXqyXX1W:XXqzXXXXCLL9!99s999:::LL6####yyr   c                     | j         sddddS t          t          | j                            d                              }d| j        r| j                                        nd|t          t          dS )zReturn RAG engine stats.Fr   )initializedtotal_chunksdocs_on_disk*.pdfT)r   r   r   embed_model
chunk_size)	r   r:   listr	   globr   r)   r/   rU   )r   r   s     r   	get_statszRAGEngine.get_stats8  s}      	P#(!QOOO4 2 27 ; ;<<==8<8HOD,22444a(&$
 
 	
r   c                    d}d}g }t          | j                            d                    D ]}	 |                     t	          |                    }||z  }|dz  }t
                              d|j        |           Q# t          $ rU}t
          	                    d|j        |           |
                    |j        t	          |          d           Y d}~d}~ww xY w|||dS )	z%Index all PDFs in the docs directory.r   r   r4   u   Indexed %s → %d chunkszFailed to index %s: %s)filer-   N)indexedr   errors)r   r	   r   r   r$   r'   r(   r   r,   r-   r=   )r   r   r   r   pdf_filenr0   s          r   index_all_documentszRAGEngine.index_all_documentsF  s   t}11'::;; 	H 	HHH''H66!13X]AFFFF H H H		2HM1EEEx}s1vvFFGGGGGGGGH
 (
 
 	
s   AA??
C	ACC)Nr   r   rh   )__name__
__module____qualname____doc__r$   r   r   rF   r   rd   staticmethodr   r   intr   floatr   r   dictr   r   ri   r   r   r   r      s       BB #    '1 '1 '1Rs s    .' 'S 'T ' ' ' 'R 3 3    \ ' ' ' ' ' \'7s 7s 7 7 7 7r 2626B BC B3 BB*/B;?B B B BH       s        
4 
 
 
 

T 
 
 
 
 
 
r   r   )r   rW   loggingrS   pathlibr   typingr   	getLoggerr'   rU   r[   r/   r&   r   ri   r   r   <module>r      s   
 
   				            g
##
 "
 
 
 
 
 
 
 
 
 
r   