
    sie#                         d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZmZmZ  G d de
j                  ZdS )    N)AnyDictListOptionalTupleUnion)nn)
AutoConfig	AutoModelAutoTokenizer	MT5ConfigT5Configc                       e Zd ZdZ	 	 	 	 	 	 	 ddedee         deeeef                  deeeef                  deeeef                  d	ee         d
e	deddf fdZ
d dZd dZd dZdefdZdeeej        f         deeej        f         fdZdefdZ	 d!deee         ee         eeeef                  f         deee	f         deeej        f         fdZdeeef         fdZd!dede	ddfdZededd fd            Z xZS )"Transformera  Huggingface AutoModel to generate token embeddings.
    Loads the correct class, e.g. BERT / RoBERTa etc.

    Args:
        model_name_or_path: Huggingface models name
            (https://huggingface.co/models)
        max_seq_length: Truncate any inputs longer than max_seq_length
        model_args: Keyword arguments passed to the Huggingface
            Transformers model
        tokenizer_args: Keyword arguments passed to the Huggingface
            Transformers tokenizer
        config_args: Keyword arguments passed to the Huggingface
            Transformers config
        cache_dir: Cache dir for Huggingface Transformers to store/load
            models
        do_lower_case: If true, lowercases the input (independent if the
            model is cased or not)
        tokenizer_name_or_path: Name or path of the tokenizer. When
            None, then model_name_or_path is used
    NFmodel_name_or_pathmax_seq_length
model_argstokenizer_argsconfig_args	cache_dirdo_lower_casetokenizer_name_or_pathreturnc	                 ^   t          t          |                                            ddg| _        || _        |i }|i }|i }t          j        |fi |d|i}	 | j        ||	|fi | |	d|vr||d<   t          j        ||n|fd|i|| _	        |mt          | j        d          rXt          | j        j        d          r>t          | j	        d          r)t          | j        j        j        | j	        j                  }|| _        |"| j	        j        j        | j        j        _        d S d S )Nr   r   r   model_max_lengthconfigmax_position_embeddings)superr   __init__config_keysr   r
   from_pretrained_load_modelr   	tokenizerhasattr
auto_modelr   minr   r   r   	__class____name__tokenizer_class)selfr   r   r   r   r   r   r   r   r   r'   s             [/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/models/Transformer.pyr   zTransformer.__init__    s    	k4  ))+++,o>*J!NK+,>cc+ccYbccc+VYMM*MMM%*<N*R*R1?N-.&6&<&H""N`
 

 
 
 !22vDO24MNNv DN,>??v
 "%T_%;%SUYUcUt!u!u,!-59^5M5VDO"222 .-    c                     t          |t                    r | j        |||fi | dS t          |t                    r | j        |||fi | dS t          j        |f||d|| _        dS )zLoads the transformer modelr   r   N)
isinstancer   _load_t5_modelr   _load_mt5_modelr   r!   r%   )r*   r   r   r   r   s        r+   r"   zTransformer._load_modelN   s    fh'' 	D 2FITTTTTTT	** 	 D !3VYUU*UUUUU'7"+1Y JT DOOOr,   c                 N    ddl m} dg|_         |j        |f||d|| _        dS )Loads the encoder model from T5r   )T5EncoderModel	decoder.*r.   N)transformersr4   "_keys_to_ignore_on_load_unexpectedr!   r%   )r*   r   r   r   r   r4   s         r+   r0   zTransformer._load_t5_modelY   sR    //////=HM98.8
'-
 
FP
 
r,   c                 N    ddl m} dg|_         |j        |f||d|| _        dS )r3   r   )MT5EncoderModelr5   r.   N)r6   r9   r7   r!   r%   )r*   r   r   r   r   r9   s         r+   r1   zTransformer._load_mt5_modelb   sR    000000>I]:9/9
'-
 
FP
 
r,   c                 p    d                     |                                 | j        j        j                  S )Nz+Transformer({}) with Transformer model: {} )formatget_config_dictr%   r'   r(   r*   s    r+   __repr__zTransformer.__repr__k   s3    <CC  ""DO$=$F
 
 	
r,   featuresc                 B   |d         |d         d}d|v r|d         |d<    | j         di |ddi}|d         }|                    ||d         d           | j         j        j        r6d	}t	          |          d
k     rd}||         }|                    d|i           |S )z#Returns token_embeddings, cls_token	input_idsattention_mask)rA   rB   token_type_idsreturn_dictFr   )token_embeddingsrB            all_layer_embeddings )r%   updater   output_hidden_stateslen)r*   r?   trans_featuresoutput_statesoutput_tokensall_layer_idxhidden_statess          r+   forwardzTransformer.forwardp   s    '/'<PXYiPjkkx''/78H/IN+,'LL.LLeLLL%a(]hWgNhiijjj?!6 	EM=!!A%% !)-8MOO3]CDDDr,   c                 $    | j         j        j        S )N)r%   r   hidden_sizer=   s    r+   get_word_embedding_dimensionz(Transformer.get_word_embedding_dimension   s    %11r,   Ttextspaddingc           
      t   i }t          |d         t                    r|g}nt          |d         t                    rqg }g |d<   |D ]c}t          t	          |                                                    \  }}|                    |           |d                             |           d|g}nCg g }	}|D ]8}
|                    |
d                    |	                    |
d                    9||	g}d |D             }| j        rd |D             }|                     | j	        ||dd| j
        d           |S )	z-Tokenizes a text and maps tokens to token-idsr   	text_keysrH   c                 &    g | ]}d  |D             S )c                 P    g | ]#}t          |                                          $S rJ   )strstrip.0ss     r+   
<listcomp>z3Transformer.tokenize.<locals>.<listcomp>.<listcomp>   s&    4441A444r,   rJ   r`   cols     r+   rb   z(Transformer.tokenize.<locals>.<listcomp>   s'    LLL44444LLLr,   c                 &    g | ]}d  |D             S )c                 6    g | ]}|                                 S rJ   )lowerr_   s     r+   rb   z3Transformer.tokenize.<locals>.<listcomp>.<listcomp>   s     333!AGGII333r,   rJ   rc   s     r+   rb   z(Transformer.tokenize.<locals>.<listcomp>   s'    KKK33s333KKKr,   longest_firstpt)rX   
truncationreturn_tensors
max_length)r/   r]   dictnextiteritemsappendr   rK   r#   r   )r*   rW   rX   outputto_tokenizelookuptext_keytextbatch1batch2
text_tuples              r+   tokenizezTransformer.tokenize   s    eAh$$ 	+ 'KKa$'' 	+K"$F; 5 5!%d6<<>>&:&:!;!;$""4((({#**84444&-KKFF# - -
jm,,,jm,,,,!6*K MLLLL  	LKK{KKKKDN*#.  	
 	
 	
 r,   c                 *      fd j         D             S )Nc                 ,    i | ]}|j         |         S rJ   )__dict__)r`   keyr*   s     r+   
<dictcomp>z/Transformer.get_config_dict.<locals>.<dictcomp>   s"    DDDCT]3'DDDr,   )r    r=   s   `r+   r<   zTransformer.get_config_dict   s     DDDD43CDDDDr,   output_pathsafe_serializationc                 R   | j                             ||           | j                            |           t          t          j                            |d          d          5 }t          j        | 	                                |d           d d d            d S # 1 swxY w Y   d S )N)r   sentence_bert_config.jsonwrF   )indent)
r%   save_pretrainedr#   openospathjoinjsondumpr<   )r*   r   r   fOuts       r+   savezTransformer.save   s    ''HZ'[[[&&{333"',,{,GHH#NN 	>RVId**,,d1====	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   %*BB #B 
input_pathc                    dD ]C}t           j                            | |          }t           j                            |          r nDt	          |          5 }t          j        |          }d d d            n# 1 swxY w Y   d|v r%d|d         v r|d                             d           d|v r%d|d         v r|d                             d           d|v r%d|d         v r|d                             d           t          dd| i|S )N)r   zsentence_roberta_config.jsonzsentence_distilbert_config.jsonzsentence_camembert_config.jsonzsentence_albert_config.jsonz sentence_xlm-roberta_config.jsonzsentence_xlnet_config.jsonr   trust_remote_coder   r   r   rJ   )	r   r   r   existsr   r   loadpopr   )r   config_namesbert_config_pathfInr   s        r+   r   zTransformer.load   sx   
 	 	K !#Z E Ew~~/00  #$$ 	$Ys^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 6!!&9VL=Q&Q&Q< $$%8999v%%*=HXAY*Y*Y#$(()<===F""':f]>S'S'S=!%%&9:::CCjCFCCCs   A77A;>A;)NNNNNFN)r   N)T)r(   
__module____qualname____doc__r]   r   intr   r   boolr   r"   r0   r1   r>   torchTensorrS   rV   r   r   r   rz   r<   r   staticmethodr   __classcell__)r'   s   @r+   r   r   
   s        0 )-/33704#'#&*,W ,W,W !,W T#s(^,	,W
 !c3h0,W d38n-,W C=,W ,W !$,W 
,W ,W ,W ,W ,W ,W\	 	 	 	
 
 
 

 
 
 

# 
 
 
 

S%,%6 7 DelAR<S    *2c 2 2 2 2 gk& &49d4j$uS#X2GGH&SXY\^bYbSc&	c5<	 & & & &PEc3h E E E E> > > > > > > > D D D D D \D D D D Dr,   r   )r   r   typingr   r   r   r   r   r   r   r	   r6   r
   r   r   r   r   Moduler   rJ   r,   r+   <module>r      s     				 : : : : : : : : : : : : : : : :        R R R R R R R R R R R R R RID ID ID ID ID") ID ID ID ID IDr,   