
    [iԘ                        d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZ g d
Z G d de          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          Z G d dee          Z G d de          ZdS )    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr   c                        e Zd ZU dZdZg dZeed<   eed<   edz  ed<   e	ed<   e	ed	<   	 	 	 	 	 	 ddedededz  de	d	e	ddf fdZ
ddZddZd Zd Z	 	 d fdZ xZS )	_NormBasez,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   h㈵>皙?Treturnc           
         ||d}t                                                       || _        || _        || _        || _        || _        | j        rIt          t          j	        |fi |          | _
        t          t          j	        |fi |          | _        n,|                     dd            |                     dd            | j        r|                     dt          j        |fi |           |                     dt          j        |fi |           |  |  |                     dt          j        	 d
dt          j        id	 |                                D                        |  nB|                     dd            |                     dd            |                     dd            |                                  d S )Ndevicedtypeweightbiasrunning_meanrunning_varnum_batches_trackedr   r"   c                 &    i | ]\  }}|d k    ||S r"    .0kvs      M/var/www/icac/venv/lib/python3.11/site-packages/torch/nn/modules/batchnorm.py
<dictcomp>z&_NormBase.__init__.<locals>.<dictcomp>L   s#    OOO1!w,,q!,,,    r   )super__init__r   r   r   r   r   r   torchemptyr#   r$   register_parameterregister_bufferzerosonestensorlongitemsreset_parameters
selfr   r   r   r   r   r!   r"   factory_kwargs	__class__s
            r/   r4   z_NormBase.__init__&   s    %+U;;( #6 ; 	2#EK$O$O$O$OPPDK!%+l"M"Mn"M"MNNDII##Hd333##FD111# 	>  L K KN K K     uz,II.II   ,+  % * PO(<(<(>(>OOO	    33  666  555  !6===r1   c                     | j         rN| j                                         | j                            d           | j                                         d S d S Nr
   )r   r%   zero_r&   fill_r'   r@   s    r/   reset_running_statsz_NormBase.reset_running_statsV   s`    # 	- ##%%%""1%%%$**,,,,,	- 	-r1   c                     |                                   | j        r4t          j        | j                   t          j        | j                   d S d S N)rH   r   r   ones_r#   zeros_r$   rG   s    r/   r>   z_NormBase.reset_parameters^   sR      """; 	#Jt{###K	"""""	# 	#r1   c                     t           rJ   )NotImplementedErrorr@   inputs     r/   _check_input_dimz_NormBase._check_input_dimd   s    !!r1   c                 &     dj         di | j        S )Nzj{num_features}, eps={eps}, momentum={momentum}, affine={affine}, track_running_stats={track_running_stats}r*   )format__dict__rG   s    r/   
extra_reprz_NormBase.extra_reprg   s1    ? 88>P PAEP P	
r1   c           	      X   |                     dd           }||dk     rc| j        r\|dz   }	|	|vrS| j        )| j        j        t	          j        d          k    r| j        nt	          j        dt          j                  ||	<   t                                          |||||||           d S )Nversionr   r'   metar   r)   )	getr   r'   r!   r5   r;   r<   r3   _load_from_state_dict)r@   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrW   num_batches_tracked_keyrB   s             r/   rZ   z_NormBase._load_from_state_dictm   s     !$$Y55Ow{{0H{ '-/D&D#&j88 /;075<;O;OOO ,, auz:::	 23 	%%	
 	
 	
 	
 	
r1   r   r   TTNNr   N)__name__
__module____qualname____doc___version__constants__int__annotations__floatboolr4   rH   r>   rQ   rU   rZ   __classcell__rB   s   @r/   r   r      sU        66HXXXM	JJJdlLLL !$$(.  . .  .  $,	. 
 .  ".  
.  .  .  .  .  . `- - - -# # # #" " "
 
 
 
 
 
  
  
  
  
  
  
  
  
  
r1   r   c                   \     e Zd Z	 	 	 	 	 	 ddedededz  ded	ed
df fdZded
efdZ xZ	S )
_BatchNormr   r   TNr   r   r   r   r   r   c                 N    ||d} t                      j        |||||fi | d S Nr    )r3   r4   r?   s
            r/   r4   z_BatchNorm.__init__   sP     %+U;;#x1D	
 	
HV	
 	
 	
 	
 	
r1   rP   c           
         |                      |           | j        d}n| j        }| j        rN| j        rG| j        @| j                            d           | j        dt          | j                  z  }n| j        }	 | j        rd}n| j        d u o| j        d u }	 t          j
        || j        r| j        r| j        nd | j        r| j        r| j        nd | j        | j        ||| j                  S )N        r
         ?T)rQ   r   trainingr   r'   add_rm   r%   r&   F
batch_normr#   r$   r   )r@   rP   exponential_average_factorbn_trainings       r/   forwardz_BatchNorm.forward   s1   e$$$
 = ),&&)-&= 	?T5 	?'3(--a000=(14uT=U7V7V1V..15.	 = 	UKK,4T4;Kt;SK	
 | }(,(@!!$(MWT5MWDSWKI&H
 
 	
r1   rc   )
re   rf   rg   rk   rm   rn   r4   r   r~   ro   rp   s   @r/   rr   rr      s         !$$(
 

 
 $,	

 
 "
 

 
 
 
 
 
0
V 0
 0
 0
 0
 0
 0
 0
 0
 0
r1   rr   c                   Z     e Zd ZU eed<   eed<   	 	 	 	 	 	 d	 d fdZd fd	Zdd
Z xZS )_LazyNormBaser#   r$   r   r   TNr   c                    ||d} t                      j        d||ddfi | || _        || _        | j        r"t	          di || _        t	          di || _        | j        rct          di || _        t          di || _	        t          j        	 ddt          j        id |                                D             | _        d S d S )Nr    r   Fr"   c                 &    i | ]\  }}|d k    ||S r)   r*   r+   s      r/   r0   z*_LazyNormBase.__init__.<locals>.<dictcomp>   s#    KKKDAqa7ll1alllr1   r*   r2   )r3   r4   r   r   r	   r#   r$   r   r%   r&   r5   r;   r<   r=   r'   )	r@   r   r   r   r   r!   r"   rA   rB   s	           r/   r4   z_LazyNormBase.__init__   s%    %+U;; 		
 		
 		
 		
 		
 #6 ; 	A0BB>BBDK.@@@@DI# 
	 3 E En E ED2DD^DDD',|( (j( LKN$8$8$:$:KKK	( (D$$$
	 
	r1   c                     |                                  s-| j        dk    r$t                                                       d S d S d S )Nr   )has_uninitialized_paramsr   r3   r>   )r@   rB   s    r/   r>   z_LazyNormBase.reset_parameters   sP    ,,.. 	'43D3I3IGG$$&&&&&	' 	'3I3Ir1   c                 
   |                                  r|j        d         | _        | j        rxt	          | j        t                    sJ t	          | j        t                    sJ | j                            | j        f           | j                            | j        f           | j	        r@| j
                            | j        f           | j                            | j        f           |                                  d S d S rD   )r   shaper   r   
isinstancer#   r	   r$   materializer   r%   r&   r>   rO   s     r/   initialize_parametersz#_LazyNormBase.initialize_parameters  s   ((** 	$ %AD{ <!$+/EFFFFF!$)-CDDDDD''):(<===	%%t'8&:;;;' !--&(    ,,&(   !!#####	$ 	$r1   rc   rd   )	re   rf   rg   r	   rl   r4   r>   r   ro   rp   s   @r/   r   r      s         """"
      & 
& & & & & &P' ' ' ' ' '
$ $ $ $ $ $ $ $r1   r   c                       e Zd ZdZddZdS )r   a  Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the variance is calculated via the biased estimator,
    equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
    moving average of the variance is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    r   Nc                     |                                 dk    r=|                                 dk    r't          d|                                  d          d S d S Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrO   s     r/   rQ   zBatchNorm1d._check_input_dim^  W    99;;!		q 0 0RUYY[[RRRSSS  0 0r1   rd   re   rf   rg   rh   rQ   r*   r1   r/   r   r     s;        D DLT T T T T Tr1   r   c                       e Zd ZdZeZddZdS )r   aR  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                     |                                 dk    r=|                                 dk    r't          d|                                  d          d S d S r   r   rO   s     r/   rQ   z LazyBatchNorm1d._check_input_dim  r   r1   rd   )re   rf   rg   rh   r   cls_to_becomerQ   r*   r1   r/   r   r   d  s?         4  MT T T T T Tr1   r   c                       e Zd ZdZddZdS )r   a  Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    r   Nc                     |                                 dk    r%t          d|                                  d          d S N   zexpected 4D input (got r   r   rO   s     r/   rQ   zBatchNorm2d._check_input_dim  ?    99;;!Luyy{{LLLMMM r1   rd   r   r*   r1   r/   r   r     ;        E ENN N N N N Nr1   r   c                       e Zd ZdZeZddZdS )r   aU  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                     |                                 dk    r%t          d|                                  d          d S r   r   rO   s     r/   rQ   z LazyBatchNorm2d._check_input_dim  r   r1   rd   )re   rf   rg   rh   r   r   rQ   r*   r1   r/   r   r     ?         4  MN N N N N Nr1   r   c                       e Zd ZdZddZdS )r   a  Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, correction=0)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, correction=1)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    r   Nc                     |                                 dk    r%t          d|                                  d          d S N   zexpected 5D input (got r   r   rO   s     r/   rQ   zBatchNorm3d._check_input_dim>  r   r1   rd   r   r*   r1   r/   r   r     r   r1   r   c                       e Zd ZdZeZddZdS )r   aU  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
    r   Nc                     |                                 dk    r%t          d|                                  d          d S r   r   rO   s     r/   rQ   z LazyBatchNorm3d._check_input_dima  r   r1   rd   )re   rf   rg   rh   r   r   rQ   r*   r1   r/   r   r   D  r   r1   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddedededz  d	ed
ededz  ddf fdZddZ	ddZ
dedefdZedd            Z xZS )r   a  Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, correction=0)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics.
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    r   r   TNr   r   r   r   r   process_groupr   c	                 \    ||d}	 t                      j        |||||fi |	 || _        d S rt   )r3   r4   r   )r@   r   r   r   r   r   r   r!   r"   rA   rB   s             r/   r4   zSyncBatchNorm.__init__  sV     %+U;;#x1D	
 	
HV	
 	
 	
 +r1   c                     |                                 dk     r%t          d|                                  d          d S )Nr   z expected at least 2D input (got r   r   rO   s     r/   rQ   zSyncBatchNorm._check_input_dim  s<    99;;??U		UUUVVV ?r1   c                 V    |                     d          dk    rt          d          d S )Nr
   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rO   s     r/   _check_non_zero_input_channelsz,SyncBatchNorm._check_non_zero_input_channels  s4    ::a==AK   r1   rP   c                 V   |                      |           |                     |           | j        d}n| j        }| j        rU| j        rN| j        J | j                            d           | j        d| j                                        z  }n| j        }	 | j        rd}n| j        du o| j	        du }	 | j        r| j        r| j        nd}| j        r| j        r| j	        nd}|oB| j        o;t          j                                        ot          j                                        }|r|j        j        dddt          j                                        fvr.t%          d	t          j                                                   t          j        j        j        }| j        r| j        }t          j                            |          }|dk    }|s*t/          j        |||| j        | j        ||| j                  S |sJ t9          j        || j        | j        ||| j        |||	  	        S )
z(
        Runs the forward pass.
        Nrv   r
   rw   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or )rQ   r   r   rx   r   r'   ry   itemr%   r&   r5   distributedis_availableis_initializedr!   type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizerz   r{   r#   r$   r   sync_batch_normapply)	r@   rP   r|   r}   r%   r&   	need_syncr   
world_sizes	            r/   r~   zSyncBatchNorm.forward  s    	e$$$++E222
 = ),&&)-&= 	;T5 	;+777$))!,,,}$-043K3P3P3R3R-R**-1]*	 = 	UKK,4T4;Kt;SK	 &*]Xd6NXDTX 	 %)MWT5MWDSW 	  33!..003 !0022	 	  	'| 6688	)   !Bx==??B B  
 "-39M! 3 $ 2*99-HHJ"QI  	<	*	 	 	 "(	*
 
 
r1   c                    |}t          |t          j        j        j        j                  rt          j                            |j        |j        |j	        |j
        |j        |          }|j
        rCt          j                    5  |j        |_        |j        |_        ddd           n# 1 swxY w Y   |j        |_        |j        |_        |j        |_        |j        |_        t'          |d          r|j        |_        |                                D ]/\  }}|                    ||                     ||                     0~|S )aa  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        Nqconfig)r   r5   nnmodules	batchnormrr   r   r   r   r   r   r   no_gradr#   r$   r%   r&   r'   rx   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechilds         r/   r   z$SyncBatchNorm.convert_sync_batchnormI  s   H feh.8CDD 	7!H22#
* M } 5]__ 5 5+1=M()/M&5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 *0)<M&(.(:M%060JM-%+_M"vy)) 7(.%!0022 	 	KD%$$c00FF    s   B((B,/B,)r   r   TTNNNrd   rJ   )re   rf   rg   rh   rk   rm   rn   r   r4   rQ   r   r   r~   classmethodr   ro   rp   s   @r/   r   r   f  s'       d dR !$$($(+ ++ + $,	+
 + "+ Tz+ 
+ + + + + +"W W W W   _V _ _ _ _ _B < < < [< < < < <r1   r   )typingr   r5   r   torch.nnr   rz   r   torch.nn.parameterr   r   r	   
_functionsr   r   lazyr   r   r   __all__r   rr   r   r   r   r   r   r   r   r*   r1   r/   <module>r      s                * * * * * * * * U U U U U U U U U U 8 8 8 8 8 8 ! ! ! ! ! !        t
 t
 t
 t
 t
 t
 t
 t
n@
 @
 @
 @
 @
 @
 @
 @
FA$ A$ A$ A$ A$OY A$ A$ A$HIT IT IT IT IT* IT IT ITZT T T T TmZ T T TDJN JN JN JN JN* JN JN JN\N N N N NmZ N N NDJN JN JN JN JN* JN JN JN\N N N N NmZ N N ND` ` ` ` `J ` ` ` ` `r1   