
    si3                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
  ej        e          Z G d de          ZdS )    N)List)Dataset)SentenceTransformer)InputExamplec                       e Zd ZdZ	 	 ddedededefdZ	 ddededededd
f
dZ		 	 	 dde
e
e                  dededefdZd Zd Zd Zd Zd Zd
S )ParallelSentencesDatasetu  
    This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
    sentence in different languages. For example, the file can look like this (EN	DE	ES):
    hello world     hallo welt  hola mundo
    second sentence zweiter satz    segunda oración

    The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
    embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
    mapped to this English sentence embedding.

    When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

    teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
    returns a list of sentence embeddings
       Tstudent_modelteacher_model
batch_sizeuse_embedding_cachec                     || _         || _        g | _        g | _        g | _        g | _        g | _        g | _        || _        || _	        i | _
        d| _        dS )a+  
        Parallel sentences dataset reader to train student model given a teacher model

        Args:
            student_model (SentenceTransformer): The student sentence embedding model that should be trained.
            teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
            batch_size (int, optional): The batch size for training. Defaults to 8.
            use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
        r   N)r
   r   datasetsdatasets_iteratordatasets_tokenizeddataset_indicescopy_dataset_indicescacher   r   embedding_cachenum_sentences)selfr
   r   r   r   s        j/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__z!ParallelSentencesDataset.__init__   sg      +*!#"$!$&!
$#6 !    d   N   filepathweightmax_sentencesmax_sentence_lengthreturnc                    t                               d|z              g }|                    d          rt          j        |dd          nt	          |d          5 }d}|D ]y}|                                                    d          }	|$|dk    rt          d	 |	D                       |k    rO|                    |	           |d
z  }||dk    r||k    r nzddd           n# 1 swxY w Y   | 	                    ||||           dS )a  
        Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

        Args:
            filepath (str): Filepath to the file.
            weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
            max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
            max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

        Returns:
            None
        zLoad z.gzrtutf8)encodingr   	Nc                 ,    g | ]}t          |          S  len.0sents     r   
<listcomp>z6ParallelSentencesDataset.load_data.<locals>.<listcomp>X   s    ===4SYY===r      )r   r   r    )
loggerinfoendswithgzipopenstripsplitmaxappendadd_dataset)
r   r   r   r   r    parallel_sentencesfIncountline	sentencess
             r   	load_dataz"ParallelSentencesDataset.load_data<   s     	Gh&''';C;L;LU;S;S 
TYx7777Y]vZ
 Z
 Z
 	E   JJLL..t44	'3+a//==9===>>ATTT")))444
 ,1B1BuP]G]G]E!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	" 	v]`s 	 	
 	
 	
 	
 	
s   A?C((C,/C,r:   c                   	 i 	|D ]}|$|dk    rt          d |D                       |k    r(|d         }|	vrt                      	|<   |D ]}	|                             |           ||dk    rt          	          |k    r nt          	          dk    rd S | xj        t          	fd	D                       z  c_        t          | j                  }| j                            t          		                                                     | j
                            d           | j                            |g|z             d S )Nr   c                 ,    g | ]}t          |          S r(   r)   r+   s     r   r.   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>p   s    999tT999r   c                 :    g | ]}t          |                   S r(   r)   )r,   r-   sentences_maps     r   r.   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>   s&    "V"V"V3}T':#;#;"V"V"Vr   )r7   setaddr*   r   sumr   r8   listitemsr   r   extend)
r   r:   r   r   r    r>   source_sentencer-   
dataset_idrC   s
            @r   r9   z$ParallelSentencesDataset.add_datasetd   s    + 	 	I#/'!++99y999::=PPP'lOm3314o.! 9 9o.2248888(]Q->->3}CUCUYfCfCf}""Fc"V"V"V"V"V"V"VWWW''
T-"5"5"7"788999%%a(((##ZL6$9:::::r   c                    g }g }| j         D ]D}|                     |          \  }}|                    |           |                    |           E|                     |          }t	          ||          D ]4\  }}|D ],}| j                            t          |g|                     -5t          j        | j                   d S )N)textslabel)	r   
next_entryr8   get_embeddingszipr   r   randomshuffle)	r   source_sentences_listtarget_sentences_listdata_idxsrc_sentencetrg_sentencessrc_embeddingssrc_embeddingtrg_sentences	            r   generate_dataz&ParallelSentencesDataset.generate_data   s     " ", 	8 	8H*.//(*C*C'L-!((666!((7777 ,,-BCC,/@U,V,V 	[ 	[(M= - [ [
!!,l^="Y"Y"YZZZZ[ 	tz"""""r   c                    | j         |         | j        |                  \  }}| j        |xx         dz  cc<   | j        |         t          | j         |                   k    r)d| j        |<   t          j        | j         |                    ||fS )Nr/   r   )r   r   r*   rR   rS   )r   rV   sourcetarget_sentencess       r   rO   z#ParallelSentencesDataset.next_entry   s    #'=#:4;QRZ;[#\  x(((A-(((!(+s4=3J/K/KKK/0D"8,N4=2333'''r   c                 j     j         s# j                            | j        dd          S g }|D ] }| j        vr|                    |           !t          |          dk    rC j                            | j        dd          }t          ||          D ]\  }}| j        |<    fd|D             S )NFT)r   show_progress_barconvert_to_numpyr   c                 *    g | ]}j         |         S r(   )r   )r,   r-   r   s     r   r.   z;ParallelSentencesDataset.get_embeddings.<locals>.<listcomp>   s!    AAAt$T*AAAr   )r   r   encoder   r   r8   r*   rQ   )r   r>   new_sentencesr-   new_embeddings	embeddings   `     r   rP   z'ParallelSentencesDataset.get_embeddings   s    ' 	%,,doae -   
  	+ 	+D4///$$T***}!!!/66$/Uei 7  N $'}n#E#E 7 7i-6$T**AAAAyAAAAr   c                     | j         S )N)r   )r   s    r   __len__z ParallelSentencesDataset.__len__   s    !!r   c                     t          | j                  dk    r|                                  | j                                        S )Nr   )r*   r   r\   pop)r   idxs     r   __getitem__z$ParallelSentencesDataset.__getitem__   s9    tz??a   z~~r   )r	   T)r   Nr   )__name__
__module____qualname____doc__r   intboolr   strr?   r   r9   r\   rO   rP   ri   rm   r(   r   r   r   r      sO        ( $( * + 	
 "   < gj&
 &
&
%(&
?B&
`c&
	&
 &
 &
 &
V !#&"; "; cO"; "; 	";
 !"; "; "; ";H# # #"( ( (B B B*" " "         r   r   )r3   loggingrR   typingr   torch.utils.datar   sentence_transformersr   sentence_transformers.readersr   	getLoggerrn   r0   r   r(   r   r   <module>r{      s             $ $ $ $ $ $ 5 5 5 5 5 5 6 6 6 6 6 6		8	$	$q  q  q  q  q w q  q  q  q  q r   