
    si                     R   d dl Z d dlmZ d dlmZmZ d dlmZmZ d dl	Z	d dl
mZmZmZ d dlmZ  e            rd dlmZ  e j        e          Z G d d	          Z G d
 dee          Z G d dee          Z G d dee          Z G d dee          Z G d dee          ZdS )    N)defaultdict)
accumulatecycle)IteratorList)BatchSamplerConcatDatasetSubsetRandomSampler)is_datasets_available)Datasetc                   4     e Zd ZdZd fdZdeddfdZ xZS )SetEpochMixinz
    Required for a BatchSampler as the Trainer will call set_epoch on the BatchSampler at the beginning of each epoch.
    The BatchSampler can then set the generator seed accordingly.
    returnNc                 H     t                      j        |i | d| _        d S Nr   )super__init__epoch)selfargskwargs	__class__s      P/var/www/icac/venv/lib/python3.11/site-packages/sentence_transformers/sampler.pyr   zSetEpochMixin.__init__   s*    $)&)))


    r   c                     || _         d S )N)r   )r   r   s     r   	set_epochzSetEpochMixin.set_epoch   s    


r   )r   N)__name__
__module____qualname____doc__r   intr   __classcell__r   s   @r   r   r      si         
     
s t        r   r   c                       e Zd ZdS )DefaultBatchSamplerN)r   r   r    r   r   r%   r%       s        Dr   r%   c                   ~     e Zd Z	 	 	 ddddededee         dej        d	ed
df fdZ	d
e
ee                  fdZ xZS )GroupByLabelBatchSamplerNr   datasetr   
batch_size	drop_lastvalid_label_columns	generatorseedr   c                    t                                          |||           || _        || _        || _        || _        || _        | j        dz  dk    rt          d          |pg D ]}||j        v r
|d         } nt          d| d          ~t          t                    }	t          |          D ] \  }
}|	|                             |
           !fd|	                                D             | _        d S )N      zEThe batch size for `GroupByLabelBatchSampler` must be divisible by 2.labelz None of the valid_label_columns z are in the dataset.c                 T    i | ]$\  }}t          |          d z  x||d         %S )r0   Nlen).0r2   sample_indicesnum_sampless      r   
<dictcomp>z5GroupByLabelBatchSampler.__init__.<locals>.<dictcomp>D   sP     
 
 
%~">22a77
>,;,/
 
 
r   )r   r   r)   r*   r+   r-   r.   
ValueErrorcolumn_namesr   list	enumerateappenditemsgroups)r   r)   r*   r+   r,   r-   r.   column_namelabelsr@   
sample_idxr2   r8   r   s               @r   r   z!GroupByLabelBatchSampler.__init__%   s8    	*i888$""	?Q!##deee.4" 	k 	kKg222 ) 3 i@SiiijjjT""!*6!2!2 	- 	-J5M  ,,,,
 
 
 
)/
 
 
r   c              #   :  K   | j         r.| j        r'| j                             | j        | j        z              t	          | j                                                  }g }t          j        t          | j                  | j                   D ]|}||         }| j        |         }|
                    |           t          |          | j        k    r8|d | j                 V  || j        d          }t          |          | j        k    8}| j        s|r|V  d S d S d S )Nr-   )r-   r.   manual_seedr   r<   r@   keystorchrandpermr5   extendr*   r+   )r   rB   partial_batch	label_idxr2   sampless         r   __iter__z!GroupByLabelBatchSampler.__iter__J   s?     > 	?di 	?N&&ty4:'=>>>dk&&(())DK(8(8DNSSS 	A 	AI9%Ek%(G  )))m$$77#$5do$56666 -do.?.? @ m$$77 ~ 	 - 	 	  	  	  	 r   )NNr   )r   r   r   r!   boolr   strrH   	Generatorr   r   rN   r"   r#   s   @r   r(   r(   $   s         *.%)#
 #
#
 #
 	#

 "#Y#
 ?#
 #
 
#
 #
 #
 #
 #
 #
J (49-                r   r(   c                        e Zd Zg ddfdddededee         dej        d	ed
df fdZ	d
e
ee                  fdZd
efdZ xZS )NoDuplicatesBatchSamplerNr   r)   r   r*   r+   r,   r-   r.   r   c                    t                                          |||           t          |j                  t          |          dhz  z  x}r|                    |          }|| _        || _        || _        || _        || _	        d S )Ndataset_name)
r   r   setr;   remove_columnsr)   r*   r+   r-   r.   )	r   r)   r*   r+   r,   r-   r.   label_columnsr   s	           r   r   z!NoDuplicatesBatchSampler.__init__]   s     	*i888 455=P9Q9QUcTd9dee= 	<,,];;G$""			r   c              #   p  K   | j         r.| j        r'| j                             | j        | j        z              t	          t          j        t          | j                  | j                   	                                          }|rt	                      }g }|D ]|}t	          | j        |         
                                          }||z  r4|                    |           t          |          | j        k    r|V   n!|                    |           }| j        s|V  |t	          |          z  }|dS dS )a5  
        Iterate over the remaining non-yielded indices. For each index, check if the sample values are already in the
        batch. If not, add the sample values to the batch keep going until the batch is full. If the batch is full, yield
        the batch indices and continue with the next batch.
        rE   N)r-   r.   rF   r   rV   rH   rI   r5   r)   tolistvaluesr>   r*   updater+   )r   remaining_indicesbatch_valuesbatch_indicesindexsample_valuess         r   rN   z!NoDuplicatesBatchSampler.__iter__o   s^      > 	?di 	?N&&ty4:'=>>>s4</@/@DN [ [ [ b b d dee 	455LM* ( ( #DL$7$>$>$@$@ A A </ $$U+++}%%88''''E##M2222 ~ ('''']!3!33)   	4 	4 	4 	4 	4r   c                     | j         rt          | j                  | j        z  S t          | j                  | j        z   dz
  | j        z  S )Nr1   )r+   r5   r)   r*   r   s    r   __len__z NoDuplicatesBatchSampler.__len__   sG    > 	Pt|$$77%%7!;OOr   )r   r   r   r!   rO   r   rP   rH   rQ   r   r   rN   rd   r"   r#   s   @r   rS   rS   \   s         *,%)   	
 "#Y ?  
     $4(49- 4 4 4 4@P P P P P P P P Pr   rS   c            
       z     e Zd Zdedee         dej        deddf
 fdZ	de
ee                  fdZdefd	Z xZS )
RoundRobinBatchSamplerr)   batch_samplersr-   r.   r   Nc                     t                                          ||d         j        |d         j                   || _        || _        || _        || _        d S r   r   r   r*   r+   r)   rg   r-   r.   r   r)   rg   r-   r.   r   s        r   r   zRoundRobinBatchSampler.__init__   R     	."3">q@Q@[\\\,"			r   c              #     K   | j                             | j        | j        z              d | j        j        D             }dgt          t          |                    z   }d | j        D             }t          t          t          |                              D ]@}||         	 fdt          ||                   D             V  /# t          $ r Y  d S w xY wd S )Nc                 ,    g | ]}t          |          S r&   r4   r6   r)   s     r   
<listcomp>z3RoundRobinBatchSampler.__iter__.<locals>.<listcomp>       IIIs7||IIIr   r   c                 ,    g | ]}t          |          S r&   iterr6   samplers     r   ro   z3RoundRobinBatchSampler.__iter__.<locals>.<listcomp>       KKKG$w--KKKr   c                     g | ]}|z   S r&   r&   r6   idxsample_offsets     r   ro   z3RoundRobinBatchSampler.__iter__.<locals>.<listcomp>   s    XXXss]*XXXr   )r-   rF   r.   r   r)   datasetsr<   r   rg   r   ranger5   nextStopIteration)r   r8   sample_offsetsrg   dataset_idxrz   s        @r   rN   zRoundRobinBatchSampler.__iter__   s     ""49tz#9:::II4<3HIIItJ{$;$;<<<KKt7JKKK s>':':!;!;<< 	 	K*;7MXXXXd>+;V6W6WXXXXXXX    	 	s   &#C


CCc                 h    t          d | j        D                       t          | j                  z  S )Nc                 ,    g | ]}t          |          S r&   r4   rt   s     r   ro   z2RoundRobinBatchSampler.__len__.<locals>.<listcomp>       DDDWCLLDDDr   )minrg   r5   rc   s    r   rd   zRoundRobinBatchSampler.__len__   s2    DD0CDDDEEDL_H`H```r   r   r   r   r	   r   r   rH   rQ   r!   r   r   rN   rd   r"   r#   s   @r   rf   rf      s         \* ?	
  
     (49-     a a a a a a a a ar   rf   c            
       z     e Zd Zdedee         dej        deddf
 fdZ	de
ee                  fdZdefd	Z xZS )
ProportionalBatchSamplerr)   rg   r-   r.   r   Nc                     t                                          ||d         j        |d         j                   || _        || _        || _        || _        d S r   ri   rj   s        r   r   z!ProportionalBatchSampler.__init__   rk   r   c              #     K   | j                             | j        | j        z              d | j        j        D             }dgt          t          |                    z   }d | j        D             }d t          |          D             }t          || j                   }d | j        D             }|D ]-}||         fdt          ||                   D             V  .d S )Nc                 ,    g | ]}t          |          S r&   r4   rn   s     r   ro   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>   rp   r   r   c                 ,    g | ]}t          |          S r&   r4   rt   s     r   ro   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>   s    GGGs7||GGGr   c                 <    g | ]\  }}t          |          D ]}|S r&   )r|   )r6   ry   length_s       r   ro   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>   s3    ```;3RWX^R_R_``Q3````r   rE   c                 ,    g | ]}t          |          S r&   rr   rt   s     r   ro   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>   rv   r   c                     g | ]}|z   S r&   r&   rx   s     r   ro   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>   s    TTT33&TTTr   )r-   rF   r.   r   r)   r{   r<   r   rg   r=   r
   r}   )	r   r8   r   num_batchesdataset_indicesdataset_idx_samplerrg   r   rz   s	           @r   rN   z!ProportionalBatchSampler.__iter__   s     ""49tz#9:::II4<3HIIItJ{$;$;<<<GG43FGGG``)K2H2H```1/T^\\\KKt7JKKK. 	U 	UK*;7MTTTT$~k7R2S2STTTTTTT	U 	Ur   c                 >    t          d | j        D                       S )Nc                 ,    g | ]}t          |          S r&   r4   rt   s     r   ro   z4ProportionalBatchSampler.__len__.<locals>.<listcomp>   r   r   )sumrg   rc   s    r   rd   z ProportionalBatchSampler.__len__   s"    DD0CDDDEEEr   r   r#   s   @r   r   r      s         \* ?	
  
     U(49- U U U UF F F F F F F F Fr   r   )loggingcollectionsr   	itertoolsr   r   typingr   r   rH   torch.utils.datar   r	   r
   sentence_transformers.utilr   r{   r   	getLoggerr   loggerr   r%   r(   rS   rf   r   r&   r   r   <module>r      s    # # # # # # ' ' ' ' ' ' ' ' ! ! ! ! ! ! ! !  M M M M M M M M M M < < < < < < !      		8	$	$       	 	 	 	 	- 	 	 	5  5  5  5  5 }l 5  5  5 p7P 7P 7P 7P 7P}l 7P 7P 7Pta a a a a]L a a aDF F F F F}l F F F F Fr   