
    si`                         d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlmZm	Z	 ddl
mZ  ej        e          Z G d de	j                  ZdS )    N)DictListLiteral)Tensornn   )WhitespaceTokenizerc            	           e Zd ZdZi ddfdee         deeef         dedef fdZ	d	eee
f         fd
Zdee         dee         fdZd Z	 ddeee                  dedeed         ej
        f         fdZd Zd Zed             Z xZS )BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   Tvocabword_weightsunknown_word_weightcumulative_term_frequencyc                    t          t          |                                            t          t	          |                    }g d| _        || _        || _        || _        || _	        g | _
        d}|D ]a}|}||v r	||         }n6|                                |v r||                                         }n|dz  }| j
                            |           bt                              d                    |t!          |          |                     t#          |t	                      d          | _        t!          |          | _        d S )N)r   r   r   r   r   r   z>{} out of {} words without a weighting value. Set weight to {}F)
stop_wordsdo_lower_case)superr   __init__listsetconfig_keysr   r   r   r   weightslowerappendloggerinfoformatlenr	   	tokenizersentence_embedding_dimension)	selfr   r   r   r   num_unknown_wordswordweight	__class__s	           S/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/models/BoW.pyr   zBoW.__init__   sN    	c4!!###SZZ  hhh
(#6 )B&  	( 	(D(F|##%d+--%djjll3!Q&!L''''LSS!3u::/B 	
 	
 	
 -UsuuTYZZZ,/JJ)))    featuresc                     |S N )r!   r(   s     r&   forwardzBoW.forward9   s    r'   textsreturnc                 N      fd|D             }                      |          S )Nc                 6    g | ]} j         j        |fi S r+   )r   tokenize).0textkwargsr!   s     r&   
<listcomp>z BoW.tokenize.<locals>.<listcomp>>   s1    OOO,T^,T<<V<<OOOr'   )get_sentence_features)r!   r-   r4   	tokenizeds   ` ` r&   r1   zBoW.tokenize=   s4    OOOOOOOO	)))444r'   c                     | j         S r*   )r    r!   s    r&    get_sentence_embedding_dimensionz$BoW.get_sentence_embedding_dimensionA   s    00r'   r   tokenized_textspad_seq_lengthsentence_embeddingc                 :   g }|D ]}t          j        |                                 t           j                  }|D ]5}| j        r||xx         | j        |         z  cc<   %| j        |         ||<   6|                    |           dt          j        |          iS )N)dtyper=   )torchzerosr:   float32r   r   r   stack)r!   r;   r<   vectorstokensvectortokens          r&   r6   zBoW.get_sentence_featuresD   s     % 	# 	#F[!F!F!H!HPUP]^^^F 8 81 85MMMT\%%88MMMM$(L$7F5MMNN6""""$ek'&:&:;;r'   c                 *      fd j         D             S )Nc                 ,    i | ]}|j         |         S r+   )__dict__)r2   keyr!   s     r&   
<dictcomp>z'BoW.get_config_dict.<locals>.<dictcomp>U   s"    DDDCT]3'DDDr'   )r   r9   s   `r&   get_config_dictzBoW.get_config_dictT   s     DDDD43CDDDDr'   c                     t          t          j                            |d          d          5 }t	          j        |                                 |d           d d d            d S # 1 swxY w Y   d S )Nconfig.jsonw   )indent)openospathjoinjsondumprM   )r!   output_pathfOuts      r&   savezBoW.saveW   s    "',,{M::C@@ 	>DId**,,d1====	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   *A&&A*-A*c                     t          t          j                            | d                    5 }t	          j        |          }d d d            n# 1 swxY w Y   t          di |S )NrO   r+   )rS   rT   rU   rV   rW   loadr   )
input_pathfInconfigs      r&   r]   zBoW.load[   s    "',,z=99:: 	$cYs^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ }}V}}s   AAA)r   )__name__
__module____qualname____doc__r   strr   floatboolr   r   r,   intr1   r:   r   r@   r6   rM   r[   staticmethodr]   __classcell__)r%   s   @r&   r   r      sv         *,%&*.#7 #7Cy#7 3:&#7 #	#7
 $(#7 #7 #7 #7 #7 #7JS&[ 1    5d3i 5d3i 5 5 5 51 1 1 GH< <#DI<@C<	g*+U\9	:< < < < E E E> > >   \    r'   r   )rW   loggingrT   typingr   r   r   r@   r   r   r   r	   	getLoggerra   r   Moduler   r+   r'   r&   <module>ro      s      				 & & & & & & & & & &          * * * * * *		8	$	$R R R R R") R R R R Rr'   