
    [i.                       U d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ e
rdd
lmZ ddlZddlmZ dgZdaded<   e G d d                      ZedWd            Z	 dXdYdZedZd            Zd[dZ	 d\d]d'Zd^d0Zd_d7Z ed8          Z d`d;Z!	 dadbdBZ"dcdDZ#dddddddEdddIZ$ddddJdedMZ%	 	 	 dfddPdgdQZ&ddPdhdTZ' ej(        dUeV           dS )izUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                  "    e Zd ZU ded<   ddZdS )
_FA4HandlezLibrary | NonelibraryreturnNonec                    d | _         d S N)r   )selfs    J/var/www/icac/venv/lib/python3.11/site-packages/torch/nn/attention/_fa4.pyremovez_FA4Handle.remove"   s        N)r   r   )__name__
__module____qualname____annotations__r    r   r   r   r      s6              r   r   devicetorch.devicer   intc                J    t           j                            |           \  }}|S r   )torchcudaget_device_capability)r!   major_s      r   _get_device_majorr*   &   s     z//77HE1Lr   flash_attn.cute.interfacemodule_pathstrc                Z    t          |           }| at          t                                S )z
    Register FA4 flash attention kernels with the PyTorch dispatcher.

    Args:
        module_path: Python module path to the FA4 implementation.
    )_fa4_import_moduler   r   _fa4_register_kernels)r,   r)   s     r   r   r   ,   s,     	;''A"+--...r   r   c                    t          j        |           }t          |d          rt          |d          st          d|  d          |S )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r,   modules     r   r/   r/   ;   sX    $[11F6,-- RWVEV5W5W RPkPPPQQQMr   r   c                    t          ddd          } |                     dt          d           |                     dt          d           |                     dt          d           |                     dt
          d           | S )NatenIMPLCUDA_flash_attention_forward_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   C   s    
&&&
)
)CHH')JFSSSHH(*LfUUUHH-<  
 HH6=  
 Jr   r    querytorch.Tensortensorstuple[torch.Tensor, ...]	cum_seq_qtorch.Tensor | Nonerequire_fp32$tuple[tuple[str, torch.Tensor], ...]c                   t          d |D                       sdS t          d |D                       dk    rdS | j        t          j        t          j        fvrdS |D ]!\  }}|j        t          j        k    r| dc S "||                                 dk    rd	S ||                                 d
k    rdS t          j        	                                sdS t          | j                  dvrdS d S )Nc              3  $   K   | ]}|j         V  d S r   )is_cuda.0ts     r   	<genexpr>z,_fa4_common_support_error.<locals>.<genexpr>Z   s$      **Qqy******r   zinputs must be CUDA tensorsc                    h | ]	}|j         
S r    )r!   rR   s     r   	<setcomp>z,_fa4_common_support_error.<locals>.<setcomp>\   s    &&&AH&&&r   r
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllendtyper%   float16bfloat16float32dimr&   is_availabler*   r!   )rG   rI   rK   rM   nametensors         r   _fa4_common_support_errorrg   T   s!    **'***** -,,
&&g&&&''1,,)){5=%.99988$ 3 3f<5=((222222 )UYY[[A--''!1!1((:""$$ $##&&g55<<4r   keyvalue	dropout_pfloatreturn_debug_maskboolalibi_slopes	seqused_kc                    |dk    rdS |rdS |dS | |j         t          j        k    rdS |j        sdS t	          | | ||f|          }|
|dk    rdS |S d S )	N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArX   z(query, key, value must be on same device)r_   r%   int32rQ   rg   )	rG   rh   ri   rj   rl   rn   ro   rK   errors	            r   _fa4_forward_support_errorru   n   s     C$$ 100++?ek)),,  	,++%	U E
 ...==4r   grad_outout	logsumexpwindow_size_left
int | Nonewindow_size_rightc
           	     b    |dk    rdS ||	dS t          || |||||f|d|ff          }
|
|
S d S )Nrq   rr   z windowed attention not supportedrx   )rM   )rg   )rv   rG   rh   ri   rw   rx   rj   rK   ry   r{   rt   s              r   _fa4_backward_support_errorr}      sn     C$$#'8'D11%	5#uc95"I.0	  E 4r   Ts
Unpack[Ts]tuple[Unpack[Ts]]c                 4    t          d | D                       S )Nc              3  B   K   | ]}|                     d d          V  dS )r
      N)	transposerR   s     r   rU   z#_transpose_dense.<locals>.<genexpr>   s0      44qQ""444444r   )tuple)rI   s    r   _transpose_denser      s    44G444444r   cu_seq_qcu_seq_kscalefloat | None	is_causal!tuple[torch.Tensor, torch.Tensor]c           	         t           t          d          t          t                     }||||d|||	|	                                nd d}|
|
|d<    |j        | ||fi |\  }
}|
|                                fS )NFA4 not registeredT)softmax_scalecausalry   r{   
return_lsecu_seqlens_qcu_seqlens_kro   rw   )r   r7   r/   
contiguousr2   )rG   rh   ri   r   r   r   r   ry   r{   ro   rw   r8   kwargslses                 r   _fa4_run_forwardr      s     /000 011F ,.  /8/DY))+++$	 	F u%v%eS%BB6BBHC    r   /tuple[torch.Tensor, torch.Tensor, torch.Tensor]c
                    t           t          d          t          t                     }
|
                    ||||| |                                ||	||
  
        \  }}}|||fS )Nr   )r   r   r   r   )r   r7   r/   r3   r   )rv   rG   rh   ri   rw   rx   r   r   r   r   r8   dqdkdvs                 r   _fa4_run_backwardr      s     /000 011F'' (  JBB r2:r   )r   ry   r{   ro   rn   rw   	cum_seq_kmax_qmax_kc
               v   t          | ||||	|||          }|t          d|           t          | |||||
|||||          \  }}t          j        dt          j        | j                  }t          j        dt          j        | j                  }t          j        d| j        | j                  }|||||fS )Nz)FA4 flash_attention forward unsupported: )r   )r_   r!   r    r   )	ru   r7   r   r%   zerosuint64r!   emptyr_   )rG   rh   ri   rK   r   r   r   rj   r   rl   r   ry   r{   ro   rn   rw   rt   r   	rng_statephilox_offset
debug_masks                        r   rB   rB      s    & '	 	E NuNNOOO HC DU\JJJIK%,u|LLLMQek%,GGGJYz99r   )r   ry   r{   r   unusedc                   t          | ||||||
|||
  
        }|t          d|           t          | |||||||||
  
        \  }}}|||fS )Nz*FA4 flash_attention backward unsupported: )r}   r7   r   )rv   rG   rh   ri   rw   rx   rK   r   r   r   rj   r   r   r   r   ry   r{   rt   r   r   r   s                        r   rC   rC   "  s    ( ( E OOOPPP" JBB r2:r   rq   Fr   c                  t          | ||||d d d           }|t          d|           t          | ||          \  }}	}
t          j        |           }|                    dd          }|                    d          }|	                    d          }t          ||	|
d d |||||||          \  }}}}}|                     d          }|                    d          }||d d |||||f	S )NzFA4 SDPA forward unsupported: r
   r   )r   rw   )ru   r7   r   r%   
empty_liker   sizerB   )rG   rh   ri   rj   r   rl   r   rt   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr)   r   r   r   r   r   r   s                         r   rD   rD   S  s.    '	 	E CECCDDDuc511GAq!
 &&H!!!Q''H&&))K&&))K3T			4 4 40AsI}j JJqMMEHHQKKE
 
r   philox_seedr   c               X   t          | ||||||
d d d 
  
        }|t          d|           t          |||||           \  }}}}}|                    d          }|                    d          }	t	          ||||||d d ||	|
||||          \  }}}t          |||          \  }}}|||fS )NzFA4 SDPA backward unsupported: r   r   )r}   r7   r   r   rC   )rv   rG   rh   ri   rw   rx   rK   r   r   r   rj   r   r   r   r   rt   r   r   r   ogor   r   r   s                           r   rE   rE     s    $ ( E DUDDEEE%eS%hGGNAq!QJJqMMEHHQKKE3
				  JBB" ""b"--JBBr2:r   FA4)register_fn)r!   r"   r   r#   )r+   )r,   r-   r   r   )r,   r-   r   r   )r   r   )r    )
rG   rH   rI   rJ   rK   rL   rM   rN   r   r   )rG   rH   rh   rH   ri   rH   rj   rk   rl   rm   rn   rL   ro   rL   rK   rL   r   r   )rv   rH   rG   rH   rh   rH   ri   rH   rw   rH   rx   rH   rj   rk   rK   rL   ry   rz   r{   rz   r   r   )rI   r   r   r   r   )rG   rH   rh   rH   ri   rH   r   rL   r   rL   r   r   r   rm   ry   rz   r{   rz   ro   rL   rw   rL   r   r   )rv   rH   rG   rH   rh   rH   ri   rH   rw   rH   rx   rH   r   rL   r   rL   r   r   r   rm   r   r   ) rG   rH   rh   rH   ri   rH   rK   rL   r   rL   r   r#   r   r#   rj   rk   r   rm   rl   rm   r   r   ry   rz   r{   rz   ro   rL   rn   rL   rw   rL   )"rv   rH   rG   rH   rh   rH   ri   rH   rw   rH   rx   rH   rK   rL   r   rL   r   r#   r   r#   rj   rk   r   rm   r   rH   r   rH   r   r   ry   rz   r{   rz   )rq   FF)rG   rH   rh   rH   ri   rH   rj   rk   r   rm   rl   rm   r   r   )rv   rH   rG   rH   rh   rH   ri   rH   rw   rH   rx   rH   rK   rL   r   rL   r   r#   r   r#   rj   rk   r   rm   r   rH   r   rH   r   r   ))__doc__
__future__r   r4   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r%   torch.libraryr   __all__r   r   r   r*   r   r/   r0   rg   ru   r}   r~   r   r   r   rB   rC   rD   rE   register_flash_attention_implr    r   r   <module>r      s      # " " " " "     ! ! ! ! ! !       % % % % % % % % 2 2 2 2 2 2 2 2        !        ! ! ! ! ! ! #
  $  # # # #             3/ / / / /       * :<	    4   B   6 \$5 5 5 5  $! ! ! ! !B   T #'$(%)(,##/: /: /: /: /: /:D #'$(%. . . . . .j #: : : : : : :Z !5 5 5 5 5 5p (	 ';W X X X X X Xr   