
    siP                         d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 ddlmZmZ  ej        e          Z G d de          ZdS )    N)IterableList)NLTK_IMPORT_ERRORis_nltk_available   )ENGLISH_STOP_WORDSWordTokenizerc                       e Zd ZdZg edddfdee         dee         deded	ef
d
Z	d Z
dee         fdZdedee         fdZdefdZedefd            ZdS )PhraseTokenizera~  Tokenizes the text with respect to existent phrases in the vocab.

    This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example,
    in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the])
    F_   vocab
stop_wordsdo_lower_casengram_separatormax_ngram_lengthc                     t                      s+t          t          j        | j        j                            t          |          | _        || _        || _	        || _
        |                     |           d S N)r   ImportErrorr   format	__class____name__setr   r   r   r   	set_vocab)selfr   r   r   r   r   s         i/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/models/tokenizer/PhraseTokenizer.py__init__zPhraseTokenizer.__init__   so     !"" 	Q/6t~7NOOPPPj//*. 0u    c                     | j         S r   )r   )r   s    r   	get_vocabzPhraseTokenizer.get_vocab'   s
    zr   c                    || _         t          j        d t          |          D                       | _        t                      | _        t                      | _        |D ]}| j        v| j        |v rm|	                    | j                  dz   }| j        | j        z   |vr?|| j
        k    r4| j                            |           | j                            |           t          |          dk    rst                              d                    | j                             t                              d                    t          | j                                       d S d S )Nc                     g | ]	\  }}||f
S  r#   ).0idxwords      r   
<listcomp>z-PhraseTokenizer.set_vocab.<locals>.<listcomp>,   s     0_0_0_d$0_0_0_r   r   r   z*PhraseTokenizer - Phrase ngram lengths: {}z!PhraseTokenizer - Num phrases: {})r   collectionsOrderedDict	enumerateword2idxr   ngram_lookupngram_lengthsr   countr   addlenloggerinfor   )r   r   r&   ngram_counts       r   r   zPhraseTokenizer.set_vocab*   sJ   
#/0_0_iX]N^N^0_0_0_``  EE UU 	8 	8D#/D4HD4P4P"jj)=>>B'$*>>dJJ{^b^sOsOs%))$///&**;777u::>>KKDKKDL^__```KK;BB3tGXCYCYZZ[[[[[ >r   textreturnc                 z   ddl m}  ||d          }t          | j        d          D ]}d}|t	          |          |z
  k    r| j                            ||||z                      }|| j        v r|g||||z   <   n8|                                | j        v r|                                g||||z   <   |dz  }|t	          |          |z
  k    g }|D ]}	|	| j	        v r|	| j
        v r!|                    | j
        |	                    6|	                                }	|	| j	        v rT|	| j
        v r!|                    | j
        |	                    ~|	                    t          j                  }	|	| j	        v rt	          |	          dk    r*|	| j
        v r!|                    | j
        |	                    |S )Nr   )word_tokenizeT)preserve_line)reverser   )nltkr7   sortedr-   r0   r   joinr,   lowerr   r+   appendstripstringpunctuation)
r   r4   kwargsr7   tokens	ngram_lenr%   ngramtokens_filteredtokens
             r   tokenizezPhraseTokenizer.tokenize=   s    &&&&&&t4888   2DAAA 	 	ICVy000,11&sY9N2OPPD---5:GF3y011[[]]d&7775:[[]]OF3y01q Vy000  	 	E''$-''&&t}U';<<<KKMME''$-''&&t}U';<<<KK 233E''UaET]$:$:&&t}U';<<<r   output_pathc           	      T   t          t          j                            |d          d          5 }t	          j        t          | j                                                  t          | j	                  | j
        | j        | j        d|           d d d            d S # 1 swxY w Y   d S )Nphrasetokenizer_config.jsonw)r   r   r   r   r   )openospathr<   jsondumplistr+   keysr   r   r   r   )r   rI   fOuts      r   savezPhraseTokenizer.savef   s    "',,{,IJJCPP 
	TXI!$-"4"4"6"677"&t"7"7%)%7'+';(,(=  	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	s   A!BB!$B!
input_pathc                     t          t          j                            | d          d          5 }t	          j        |          }d d d            n# 1 swxY w Y   t          di |S )NrK   rr#   )rM   rN   rO   r<   rP   loadr   )rV   fInconfigs      r   rY   zPhraseTokenizer.loads   s    "',,z+HII3OO 	$SVYs^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ (((((s   AAAN)r   
__module____qualname____doc__r   r   strboolintr   r    r   r   rH   rU   staticmethodrY   r#   r   r   r   r      s          "$6#" ! } SM 	
     "  \x} \ \ \ \&'S 'tCy ' ' ' 'R     ) ) ) ) \) ) )r   r   )r(   rP   loggingrN   r@   typingr   r   transformers.utils.import_utilsr   r   r	   r   	getLoggerr   r1   r   r#   r   r   <module>rg      s          				  ! ! ! ! ! ! ! ! P P P P P P P P < < < < < < < <		8	$	$i) i) i) i) i)m i) i) i) i) i)r   