
    si                     ^    d dl mZ d dlZd dlmZ d dlmZmZ d dl	m
Z
  G d de          ZdS )    )ListN)Dataset)NLTK_IMPORT_ERRORis_nltk_available)InputExamplec                   T    e Zd ZdZd fdee         fdZd Zd Ze	d
d            Z
d	S )DenoisingAutoEncoderDataseta  
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    c                 6    t                               |           S N)r	   delete)ss    m/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py<lambda>z$DenoisingAutoEncoderDataset.<lambda>   s    @[@b@bcd@e@e     	sentencesc                     t                      s+t          t          j        | j        j                            || _        || _        d S r   )r   ImportErrorr   format	__class____name__r   noise_fn)selfr   r   s      r   __init__z$DenoisingAutoEncoderDataset.__init__   sB     "" 	Q/6t~7NOOPPP" r   c                 f    | j         |         }t          |                     |          |g          S )N)texts)r   r   r   )r   itemsents      r   __getitem__z'DenoisingAutoEncoderDataset.__getitem__   s0    ~d#4==#6#6"=>>>>r   c                 *    t          | j                  S r   )lenr   )r   s    r   __len__z#DenoisingAutoEncoderDataset.__len__!   s    4>"""r   333333?c                 t   ddl m}m}  ||           }t          |          }|dk    r| S t          j                            |          |k    }t          |          dk    r"d|t          j                            |          <    |            	                    t	          j
        |          |                   }|S )Nr   )TreebankWordDetokenizerword_tokenizeT)nltkr$   r%   r    nprandomrandsumchoice
detokenizearray)text	del_ratior$   r%   wordsnkeep_or_notwords_processeds           r   r   z"DenoisingAutoEncoderDataset.delete%   s    ????????d##JJ66KinnQ'')3{q  /3K	((++,1133>>rx{?[\\r   N)r"   )r   
__module____qualname____doc__r   strr   r   r!   staticmethodr    r   r   r	   r	   
   s        	 	 7f6e ! !$s) ! ! ! !? ? ?# # #    \  r   r	   )typingr   numpyr'   torch.utils.datar   transformers.utils.import_utilsr   r   *sentence_transformers.readers.InputExampler   r	   r9   r   r   <module>r?      s              $ $ $ $ $ $ P P P P P P P P C C C C C C( ( ( ( (' ( ( ( ( (r   