
    9i                   P   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZm Z  d dl!m!Z! d dl	m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dl8mFZFmGZG e,rd dlHmIZI ddgZJd dlKmLZLmMZMmNZNmOZO e,r\d dlmPZPmQZQmRZR d dl7mSZSmTZTmUZU d dlVmWZW d dlXmYZY d dlZm[Z[ ddl\m]Z] ddl^m_Z_ ddl`maZa ddlbmcZc ddldmeZemfZfmgZgmhZhmiZimjZj ddlkmlZl ddlmmnZnmoZo g d Zp e/d!          Zqejr        dNd$            Zsd d%ltmuZu d d&lvmwZw d d'lxmyZy d d(lzm{Z{ d d)l|m}Z} d d*l~mZ d d+lmZmZmZmZmZ d d,lmZmZ d d-lmZmZ dd.lmZ dd/lmZ ej        d0k    Z ej        e          Ze7j                            ed1          Z e/d2          Zee6j        e6j        f         Ze*e0e7j        ee7jU        f                  Zd3d4d5Zd6Zd6Zd6Zd7Zd8Zeedz
  z  d k    red9k    s
J d:            dOd=ZdPdAZ G dB dCe6j                  Z ej        dDE           G dF dG                      ZdQdRdOZ	 	 	 dSdTdRZ	 	 	 dSdTdSZejr        dUdT            ZdVdXZdWd[ZdXd_ZdYdbZdZdfZd[diZd\dmZd]dpZd^dsZd_dvZdw fd`d|ZdadZdbdcdZ	 	 dddedZ	 	 	 	 	 dfdgdZdhdZdidZdjdZdkdZdldZ e3d          Z e/ddD          Zee&e$ef         ef         Z G d de+e'eef                   ZŐdmdZƐdndZǐdodZȐdpdZɐdqdZʐdrdZ	 dsdtdÄZ̐dudƄZ͐dvdȄZΐdwd˄Zϐdxd̈́ZАdydфZѐdzdӄZҐd{dՄZӐd|d؄ZԐd}dڄZՐd~dۄZ eg dܢ          ZؐddZِddZڐddZd dlZܐddZg Zded<   ddZddZej        	 	 	 ddd            ZeZeZeZdPdddZdPdddZ ej        d9          dd            Z G d de)          Zej         G d d                      Z G d d           Z G d de          Zej        dd            Z G d d          Z G d de          Z G d d	e          Zejr        ddd            Zej        dd            Zej        dUd            ZddZ	 dsddZddZddZddZdPdPdDddd!ZddPd"dd(ZdPd)dd*ZdPd)dd+Z ej        d,          dUd-            Zdd6Z dd:Zdd<Ze0ee6j        f         Zd=eߐd><   ejr        	 ddd@            Zejr        ddA            Zejr        ddB            Zejr        ddC            Zejr        ddE            ZddFZ	ddGZ
ddHZddIZddJZddNZ	 	 	 	 dddSZdUdTZ G dU dV          Zdd[Zdd\Zdd^Zdd_Zdd`ZddaZddcZej        ddf            Z	 dsddjZddlZddmZddoZddpZddsZdduZ ej        ddx            Z!ddyZ"ejr        ddz            Z#ejr        dd{            Z$ejr        dd|            Z%dd}Z&dd~Z'ddZ(ddZ)dUdZ*dUdZ+ddZ,d~dZ- G d dej.                  Z/ddZ0ddZ1ddZ2	 dsddZ3ddZ4	 dsddZ5ddZ6ddZ7ddZ8d fddZ9d fddZ:ddZ;ddZ<ej         G d d                      Z=ej        dd            Z>ddZ?ddZ@ddZAddZBddZCddZDdÐdńZEdĐdǄZFdŐdɄZGdƐd˄ZHdǐd΄ZIdȐdӄZJdɐdԄZK	 dsdʐdۄZLdːd݄ZMd̐d߄ZNd͐dZOdUdZPddZQddddddddZRd eRS                                D             ZT ejU        d          ZVdΐdZWdϐdZXdАdZYdАdZZejr        dѐd            Z[ej         G d d                      Z\i Z]deߐd<   dҐdZ^ eE            Z_deߐd<   dӐd Z`dsdԐdZadՐdZb e/d          Zc e/d          Zd G d deecedf                   Ze e2dD          dsdDdEd֐d            ZfdאdZg G d dej.                  Zhejr        dؐd            ZidUdZjdِdZkdِdZldڐdZmdNdZndېdZodUdZpdܐdZqdZrdݐdZsdݐdZtdސd$Zu	 	 dߐdd,Zvdd.Zwdd1ZxdUd2Zydd6Zz	 	 ddd9Z{dd;Z| ej        dDE           G d< d=                      Z}ed>e$f         Z~ee~e}ge~f         Z G d? d@          Z e            ZddBZddDZddGZddJZddKZ eEg dL          ZddMZdS (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only)Path!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                     d t           D             } t          |           dk    sJ t          |           dk    rdn|                                 }|S )Nc                `    g | ]+}t          t          |                                          )|,S  )getattrtorchis_available.0xs     H/var/www/icac/venv/lib/python3.11/site-packages/torch/_inductor/utils.py
<listcomp>z get_gpu_type.<locals>.<listcomp>l   s3    KKK'%*;*;*H*H*J*JK!KKK    r5   r   rF   )	GPU_TYPESlenpop)
avail_gpusgpu_types     rV   get_gpu_typer^   j   sT    KKYKKKJz??aZA--vv:>>3C3CHOrX   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rF   rH         @      zmust be power of 2nbytesintc                .    | t           z   dz
  t            z  S )z/Round up to the nearest multiple of ALIGN_BYTESr5   )ALIGN_BYTES)rx   s    rV   _alignr|      s    [ 1$44rX   v
sympy.Exprboolc                   t          | t          j        t          j        f          r't	          t          t          | j                            S t          | t                    p"t          j	        | t                    t          k    S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdr{   )r}   s    rV   r   r      s]    !ei+,, -3{AF++,,,aK59Q#<#<#KKrX   c                  2    e Zd ZdZdZdZed
d            Zd	S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr5   Tvaluer~   rK   Optional[sympy.Expr]c                    t          |t          t          j        f          rt	          t          |                    S t          |          r|S d S N)r   ry   r   Integerr|   r   )clsr   s     rV   evalz
align.eval   sN    ec5=122 	&#e**%%%u 	L	 	rX   N)r   r~   rK   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rO   rX   rV   r   r      sB        FFEJ   [  rX   r   T)frozenc                  <    e Zd ZU dZded<   ded<   ded<   ded<   d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    ry   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rO   rX   rV   r   r      sO          
 GGG -,,,---- rX   r      d   fnCallable[[], Any]warmuprepfloatc                    |              t           j                                         t          j        t	          d          t           j        d          }t           j                            d          }t           j                            d          }|                                 t          d          D ] }|	                                  |              !|                                 t           j                                         |
                    |          dz  }t          dt	          ||z                      }t          dt	          ||z                      }	t          |          D ]} |              d t          |	          D             }d	 t          |	          D             }t           j                            t           j        j        j        g
          5 }
t           j                                         t          |	          D ]}|	                                 ||                                          t           j        j                            d          5   |              ddd           n# 1 swxY w Y   ||                                          t           j                                         t          j        d t%          ||          D                       }ddd           n# 1 swxY w Y   t          j        |                                          }t*                              d           t*                              |
                                                    dd                     t3          d |
                                D                       }|r$|t7          j        d |D                       dz  z  }t*                              d|           |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArF   dtypedeviceTenable_timing   r5   c                N    g | ]"}t           j                            d           #S Tr   rQ   rF   EventrT   _s     rV   rW   zfp8_bench.<locals>.<listcomp>   s+    QQQA5:##$#77QQQrX   c                N    g | ]"}t           j                            d           #S r   r   r   s     rV   rW   zfp8_bench.<locals>.<listcomp>   s+    OOO!!!!55OOOrX   
activitiesRunCudaModuleNc                >    g | ]\  }}|                     |          S rO   )elapsed_time)rT   ses      rV   rW   zfp8_bench.<locals>.<listcomp>   s(    GGG41aQ^^AGGGrX   
raw eventsself_device_time_totalsort_by	row_limitc                p    g | ]3}|j         t          j        k    rt          j        d |j                  1|4S )zfused_abs_max_\d)device_typera   CUDArematchnamerT   events     rV   rW   zfp8_bench.<locals>.<listcomp>   sJ     	
 	
 	
!Z_44H0%*==I	  JIIrX   c              3  $   K   | ]}|j         V  d S r   device_time_totalr   s     rV   	<genexpr>zfp8_bench.<locals>.<genexpr>
  s%      QQE3QQQQQQrX        @@profiling results: %s ms)rQ   rF   synchronizeemptyry   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestablerb   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventss                  rV   	fp8_benchr      s    BDDD	JKJu}VLLLE *"""66K
  t 44I1XX  
	J**9559K 1c&;.//00H1c#+,,--H 8__  
QQxQQQKOOuXOOOI			N+0
 
  
 
 
 

   x 	" 	"AKKMMMN!!###&&77                aL!!!!
   GG3{I+F+FGGG
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
" *U


 
 
"
"CIIlIIann$$-EQS$TTUUU	
 	
	
 	
 	
	 	O  
OQQQQQQQ	

 II(#...Js8   (BK=*J5K=JK=J	A(K==LLFis_vetted_benchmarkingc                F    ddl m}   |t                    | |||          S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   r   r   s        rV   do_bench_using_profilingr     sB    " UTTTTTE**+DEE
FC/  rX   c                   |sddl m}  |              |              t          j                                         t          j        t          d          t          j        d          }t          j                            d          }t          j                            d          }|                                 t          d          D ] }|
                                  |              !|                                 t          j                                         |                    |          dz  }	t          d	t          ||	z                      }
t          d	t          ||	z                      }t          |
          D ]} |              t          j                                         t          j                            t          j        j        j        g
          5 }t          |          D ] }|
                                  |              !t          j                                         ddd           n# 1 swxY w Y   t"                              d           t"                              |                                                    dd                     t+          d |                                D                       }t/          |          |z  dk    rt1          dt/          |          |          t/          |          |z  t+          fdt3          |          D                       }|                                 |                                }t"                              d           t"                              |                    d                     t7          d |D                       dz  |z  }t"                              d|           |S )r   r   )may_ban_benchmarkingr   rF   r   Tr   r   r5   r   Nr   r   r   r   c                R    g | ]$}|j         t          j        k    |j        d k    "|%S )zContext Sync)r   ra   r   r   r   s     rV   rW   z-_do_bench_using_profiling.<locals>.<listcomp>h  s>     	
 	
 	
 JO33
n8T8T 8T8T8TrX   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                ,    g | ]\  }}|z  d k    |S r   rO   )rT   r   r   num_event_per_groups      rV   rW   z-_do_bench_using_profiling.<locals>.<listcomp>w  s8     	
 	
 	
5&&!++ +++rX   zprofiling time breakdown)r   c              3  $   K   | ]}|j         V  d S r   r   r   s     rV   r   z,_do_bench_using_profiling.<locals>.<genexpr>  s%      AA%e%AAAAAArX   r   r   )r   r   rQ   rF   r   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rb   r   rZ   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   actual_eventsr   r  s                   @rV   r   r   *  s    " MMMMMMBDDD	JKJuyHHHE *"""66K
  t 44I1XX  
	J**9559K 1c&;.//00H1c#+,,--H 8__  
	J			N+0
 
  
 
 ! 
x 	 	AKKMMMBDDDD
   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! IIlIIann$$-EQS$TTUUU	
 	
	
 	
 	
 O ?h&!++-  	
 
 	
 o..9	
 	
 	
 	
%o66	
 	
 	
 M !..00MII()))IIm!!B!//000
AA=AAA
A
AF
JX
UCII(#...Js   'AII	Ic                    	 ddl m}  t          j                            dd           | d uo(t          t          t          j        dd           d          S # t          $ r Y dS t          $ r}dt          |          v sJ Y d }~dS d }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rQ   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrP   opsImportErrorr  rL   )r  r   s     rV   has_torchvision_roi_alignr    s    ------667I6RRR$ 
EI}d33[*
 *
 	
    uu   0CFF::::uuuuus   AA 
B	"	B	+BB	r   "Union[Optional[torch.device], str]torch.devicec                :   | t          j        d          j        S t          | t                    rt          j        |           } | j        dvrM| j        Ft          | j                  }t          j        | j        |j        	                                          S | S )Ng        )cpumeta)index)
rQ   r   r   r   rL   typer  r_   Workercurrent_devicer   device_interfaces     rV   decode_devicer     s    ~|C  ''&# &f%%{/))fl.B3FK@@|FK/?/F/U/U/W/WXXXXMrX   itIterable[sympy.Expr]c                `    t          j        t          j        | t          j        j                  S r   )	functoolsreduceoperatormulr   SOner!  s    rV   sympy_productr+    s    HL"egk:::rX   seq1Sequence[sympy.Expr]seq2c           	         t          |           t          |          k    sJ t          j        t          d t	          | |          D                                 S )Nc              3  &   K   | ]\  }}||z  V  d S r   rO   )rT   abs      rV   r   zsympy_dot.<locals>.<genexpr>  s*      >>daAE>>>>>>rX   )rZ   r   expandr  r   )r,  r.  s     rV   	sympy_dotr4    sN    t99D		!!!!<>>c$oo>>>>>???rX   Iterable[_T]ValuesView[_T]c                >    d | D                                              S )Nc                .    i | ]}t          |          |S rO   )r   rS   s     rV   
<dictcomp>zunique.<locals>.<dictcomp>  s     !!!BqEE1!!!rX   )valuesr*  s    rV   uniquer;    s"    !!b!!!((***rX   numberUnion[int, sympy.Expr]denomc           
        t          | t          j                  st          |t          j                  r4t          t          j        |           t          j        |                    S t          | t
                    rt          |t
                    s/J |  dt          |            d| dt          |                       t          | |          S )Nz: , )r   r   Exprre   sympifyry   r  runtime_ceildiv)r<  r>  s     rV   rp   rp     s     &%*%% DE5:)F)F Du}V,,emE.B.BCCC fc"" z%'='=  ;;T&\\;;U;;d5kk;;   65)))rX   keyOptional[torch.dtype]c                   | dS t          |                               d          d         }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|                    d' t          |                                          D                        t          | t                     r| n
d(||          S ))Nz*i8.r   r   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                    i | ]}||S rO   rO   )rT   r}   s     rV   r9  z_type_of.<locals>.<dictcomp>  s    1111111rX   *)rL   splitupdatelistr:  r   )rD  	dtype_strtyss      rV   _type_ofrs    so   
 {uCs##B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /  C4 JJ11d3::<<00111222S#&&@33,@I,@,@@rX   lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                    d | D             S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                6    g | ]}t          j        |          S rO   )r   rB  rT   r   s     rV   rW   z-convert_shape_to_inductor.<locals>.<listcomp>  s"    ***EM!***rX   rO   rt  s    rV   convert_shape_to_inductorr{    s     +*c****rX   r   Union[int, torch.SymInt]c                    ddl m} t          | t                    r| nNt          | t          j                  rt          |           n%|j        j        j        	                    | d          S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r5   VN)hint)
virtualizedr  r   ry   r   r   graphsizevars	shape_envcreate_symintnode)r   r  s     rV   convert_to_symintr    sr      a	
 !U]++LCFFF!+==ad=KKrX    Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                    d | D             S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                ,    g | ]}t          |          S rO   )r  ry  s     rV   rW   z+convert_shape_to_symint.<locals>.<listcomp>  s!    ...Qa  ...rX   rO   rz  s    rV   convert_shape_to_symintr     s     /.#....rX   optorch._ops.OpOverloadc                H    t          d | j        j        D                       S )z-
    Does this op overload have aliasing
    c              3  (   K   | ]}|j         d uV  d S r   )
alias_inforT   r1  s     rV   r   zis_view.<locals>.<genexpr>  s)      FFAq|4'FFFFFFrX   )any_schema	argumentsr  s    rV   is_viewr  
  s&     FF1EFFFFFFrX   c                    dS NFrO   )r   s    rV   <lambda>r    s     rX   user4   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   | j         dk    rdS t          | j        t          j        j                  s| j        t          j        u sdS t          t          j        j        | j                  }|t          j        u st          |          r t          fd| j        D                       S t          j        j        |j        v p
 |          S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  8   K   | ]}t          |          V  d S r   )is_pointwise_use)rT   ur  s     rV   r   z#is_pointwise_use.<locals>.<genexpr>$  s.      KKA#A77KKKKKKrX   )r  r   targetrQ   _ops
OpOverloadr&  getitemr   r  r   usersTag	pointwisetags)r  r  r  s    ` rV   r  r    s     v  u3:uz4559<xGW9W9Wu%*'44F!!!WV__!KKKKKKKKKK9&+-H1H1HHrX   r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c           	        t           j                                        g d	fd} j        | gt	          t           j        |||f          R  }t          | j        j                  dk    r+t          | j        j        d         j
                  dk    r|f}                    |           t           j                            i           }|fS )
Nargtorch.TensorrK   r4   c                x                         |                                dt                               S )Nr  )appendplaceholderrZ   )r  g
graph_argss    rV   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg/  s8    #}}43z??44555rX   r5   r   Tensor)r  r  rK   r4   )rQ   fxGraphr  r$   r  rZ   r  returnsrL   r  outputr3   )r  r   r  r  nodegmr  r  s         @@rV   gen_gm_and_inputsr  )  s     	A%'J6 6 6 6 6 6 6 1?u|^dF^LL  D 	FN"##q((&q).//8;;wHHTNNN			b!	$	$Bz>rX   rF   Nonec                    | dk    rd S t          |           }|                                r|                                 d S d S Nr  )r_   rR   r   r  s     rV   r   r   A  sT    /77$$&& '$$&&&&&' 'rX   modelCallable[..., Any]example_inputsSequence[Any]r   c                    t          |           t          j        d           t          j                    }t          |          D ]} | | }t          |           t          j                    }|J ||z
  S )Ni9  )r   rQ   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1s           rV   timedr  I  s     	d				B5\\  'F				B7NrX   rO   
         ?repeatbaselinec                     t          j         fdt          |          D                       }t          j        |          z  }t	          ||z  d           |                                S )Nc                4    g | ]}t                    S rO   )r  )rT   r   r   r  r  r   s     rV   rW   z%print_performance.<locals>.<listcomp>d  s'    LLLuneV	4	4LLLrX   z.6f)rQ   r   r   medianprintr   )r  r  r   r  r  r   timingstooks   ```  `  rV   print_performancer  [  s{     lLLLLLLLeFmmLLL G <  5(D	TH_
"
"###99;;rX   objmethodc                `     t          | |                      t          | |fd           dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   rO   )r  s   rV   r  z#precompute_method.<locals>.<lambda>n  s     rX   N)rP   setattr)r  r  r  s     @rV   precompute_methodr  k  s8    !WS&!!##FC(((((rX   methodsr   c                0    |D ]}t          | |           dS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  s      rV   precompute_methodsr  q  s.     ' '#v&&&&' 'rX   r1  r2  c                P    t          | |k              t          | |k               z
  S r   )ry   )r1  r2  s     rV   cmpr  w  s!    q1u::AE

""rX   rU   Union[int, Sequence[int]]sizeSequence[int]c                    t          | t                    r| g|z  S t          |           dk    r" t          |           | d         g          |z  S | S )Nr5   r   )r   ry   rZ   r  )rU   r  s     rV   pad_listliker  {  sS    !S sTz
1vv{{tAww!v%%HrX   tuple[_T, ...]list[_T]c                V    t          |           dk    rg S dd}t          | |          S )	Nr   elemrs   rK   rL   c                    t          | t                    r| S ddlm} t          | |          sJ |                                 S )Nr5   )rD   )r   rL   	schedulerrD   get_name)r  rD   s     rV   	sort_funcztuple_sorted.<locals>.sort_func  sP    dC   	K000000$ 122222}}rX   rD  )r  rs   rK   rL   )rZ   sorted)rU   r  s     rV   tuple_sortedr    s?    
1vv{{	    !####rX   PRV)	covariantc                  .    e Zd Zedd            ZddZdS )CachedMethodr   r   rK   r  c                    d S r   rO   )r   s    rV   clear_cachezCachedMethod.clear_cache  s    ),rX   r   P.argsr  P.kwargsr  c                    d S r   rO   selfr   r  s      rV   __call__zCachedMethod.__call__  s      rX   N)r   r   rK   r  )r   r  r  r  rK   r  )r   r   r   staticmethodr  r   rO   rX   rV   r  r    s2        ,,, \,DDDDDDrX   r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c           	         | j         }d| dd| i}t          d| d d d                                |            t          j        |           || d                   }dfd}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rK   r  c                L    t          |           rt          |            d S d S r   r  delattrr  rD  s    rV   r  z"cache_on_self.<locals>.clear_cache  s5    4 	D#	 	rX   r  r   rK   r  r   execlstripr$  wrapsr  )r   r   ctxwrapperr  rD  s        @rV   cache_on_selfr    s    ;D
t


C *C				 		 !		 		 (+		 		 		 FHH   "iob!!#&=&=&=">??G      &GNrX   Callable[P, RV]c                     t          |           S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  )r   s    rV   cache_property_on_selfr    s    
 rX   
class_name*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]]c                     d fd}|S )Nr   FN_TYPE[P, RV]rK   c           	         d d| j          dd| i}t          d d d d                                |            t          j        |           |d	                   }dfd}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerr  r   rK   r  c                L    t          |           rt          |            d S d S r   r	  r  s    rV   r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s5    tS!! #c"""""# #rX   r  r  )r   r  r  r  rD  r  s       @rV   r  z'cache_on_self_and_args.<locals>.wrapper  s     4:33333 Rj (+	 
 03  "  $ )	
 	
 	
, $	##CL11	# 	# 	# 	# 	# 	# (rX   )r   r  rK   r  rO   )r  r  s   ` rV   cache_on_self_and_argsr    s)    
$ $ $ $ $ $L NrX   node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                    ddl m} t          | t                    r6t	          j        t          j        d | D             t                                S t          | |j	                  r| j
        S t                      S )Nr5   irc                T    g | ]%}t          |d           |j        |j        j        &S )r  )r  r  originsrT   r  s     rV   rW   z%aggregate_origins.<locals>.<listcomp>  sK        4((	 .2Y		!  rX   ) r$  r   rp  r$  r%  r&  or_r"   r=   r&  )r  r$  s     rV   aggregate_originsr*    s     -&& L  *   LL	
 	
 		
 
M2?	3	3 $$||rX   Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                   t          |           }|dk    r.d fd|D             }t          t          |                    }n|dk    rg }|D ]}|j        dk    rd }d}d|j        v r|j        d         d         }nd	|j        v r|j        d	         d         }d
}|sOt          |d         t                    r|                    |d         |z              |                    |d         j        |z              t          t          |                    }n|dk    rd |D             }nt          d
                    dg|z             S )Noriginal_atenc                    | j         d         }d}t          |t          j        j                  r|j        j        }n@t          |t          j        j                  r!t          |	                                          }|S )Nr/  r(  )
r  r   rQ   r  r  _overloadpacketr   HigherOrderOperatorrL   r   )originr/  rD  s      rV   get_origin_meta_strz2get_fused_kernel_name.<locals>.get_origin_meta_str  sl    "K8MC-)>?? 0#3<M5:+IJJ 0-,,..//JrX   c                h    g | ].}|j         d k    r!d|j        v r|j        d         # |          /S )r  r/  )r  r  )rT   r3  r4  s     rV   rW   z)get_fused_kernel_name.<locals>.<listcomp>  sW     
 
 
yO++6;..O,8	  '' 988rX   rQ   r  r(  source_fn_stackr   fwd_source_fn_stackbackwardr5   inductor_nodec                2    g | ]}|j         d k    |j        S r  )r  r   rT   r3  s     rV   rW   z)get_fused_kernel_name.<locals>.<listcomp>9  s-     
 
 
"VY/5Q5QFK5Q5Q5QrX   r   fused)r*  r  r"   r  r  r   rL   r  r   NotImplementedErrorjoin)r  r,  all_originssourcesr3  	source_fnsuffixr4  s          @rV   get_fused_kernel_namerD  
  s    $M22KO++	 	 	
 
 
 
%
 
 
 G,,--	g	%	%! 	C 	CFyO++ 	$33 &,= >r BII*fk99 &,A B2 FI'F  ilC00 CNN9Q<&#89999NN9Q<#86#ABBBG,,--	o	-	-
 
&1
 
 
 "!88WI'(((rX   r  r8   tuple[str, str]c                h
   t          |           }d |D             }t          j        t                    }t          j        t                    }d|rt	          d |D                       }t          |          dk    r[|d         j        t          d          s%d t          j	                  D             }|_
        |                    fd	           |D ]D}d
|j        v r|j        d
         |j        d
         }	d}
t          |	t          j        j                  rt#          |	j                  }
n@t          |	t          j        j                  r!t#          |	                                          }
|
r ||
                             |j                   d|j        v r:|j        d         d         j        }
||
                             |j                   |j                            d          dk    r%||j                                     |j                   Fdnd}|j         d| dd                    |                                           dd                    |                                           d}|j         dg}t5          |                                          D ]G\  }}|                    |j         d| dd                    t5          |                                Hddlm |                    |j         d           t	                      }g }t          | j                  slddlm } d4fd!}d5d$d6fd'}| D ]R}t          |d(          r|j!        t          |j!        d)          r|j!        j"        |j!        j"        D ]}|j        |v r|#                    |j                   |j        $                    |j                  }|H |||j                  \  }}|                    |j         d*| d+ ||           d,| d           t          |j!        d-          ri|j!        j%        ]|j!        j%        D ]P}|j        $                    |j                  }|$ |||j                  \  }}|                    d.|z              QT|D ]5}|                    |j         d|&                    d/0                      6|                    |j         d1d2                    |                      |d3                    |          fS )7aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                (    g | ]}|j         d k    |S r;  r  r<  s     rV   rW   z'get_kernel_metadata.<locals>.<listcomp>T  s$    WWW&):V:Vf:V:V:VrX   Nc              3  $   K   | ]}|j         V  d S r   )r  )rT   ns     rV   r   z&get_kernel_metadata.<locals>.<genexpr>^  s$      "C"Cq17"C"C"C"C"C"CrX   r5   r   )_inductor_kernel_metadata_node_to_idx_mapc                    i | ]\  }}||	S rO   rO   )rT   idxrI  s      rV   r9  z'get_kernel_metadata.<locals>.<dictcomp>c  s    "V"V"Vfc11c"V"V"VrX   c                    j         |          S r   )rJ  )rI  single_graphs    rV   r  z%get_kernel_metadata.<locals>.<lambda>f  s    lTUVW rX   r  r/  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r@  z], Original ATen: []z" Source node to ATen node mapping:z   z => r#  z Graph fragment:r~  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerL   rK   tuple[str, ir.Layout | None]c                   t          | j                  r,t          | j        j                  r| j        j        j        }n| j        }||}n|j        }	 |                                 }n# t          $ r d }Y nw xY w||fS r   )r   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr>  )rU  rW  r]  r   layoutr$  s        rV   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbl33 5
K9 9 5 #)+"2">KK"("4K&"DD&+D"#..00FF* " " "!FFF"V|#s   A, ,A;:A;shapeIterable[int]c                H    dd                     d | D                        dS )N[r@  c                ,    g | ]}t          |          S rO   )rL   rS   s     rV   rW   z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>  s    %<%<%<c!ff%<%<%<rX   rT  )r?  )ra  s    rV   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape  s.    @499%<%<e%<%<%<==@@@@rX   r_  ir.Layout | Nonec                    | dS  | j                    } | j                   }| j         }dt          | j                  | | | dS )Nr(  ")r  strider   r!   r   )r_  shape_annotationstride_annotationdevice_annotationrf  s       rV   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout  s|    >2&5ofk&B&B#D '6v}'E'E$G!'-}$6!?FL1 ?3C ?(?*;? ? ?rX   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)rU  rV  rW  rL   rK   rX  )ra  rb  rK   rL   )r_  rg  rK   rL   )'r*  collectionsdefaultdictrp  r"   rZ   r  r  r  nodesrJ  sortr  r   rQ   r  r  rL   r1  r2  r   r  getcommentr?  keysr  itemsr(  r$  r=   r  r  ro  rp  addtry_get_bufferrq  format_node) r  r  r@  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapr  r/  rD  sort_strmetadatadetailed_metadataoriginal_noderx  	all_reads
all_writesr  r`  rn  rI  rrU  
input_namer_  woutput_namer   r$  rN  rf  s                                 @@@rV   get_kernel_metadatar  A  s=   $ $M22KWW;WWWN ,T22N$066
 L 
""C"CN"C"C"CCC}"")!,2L<)TUU Y"V"V	,BT8U8U"V"V"VIXFWWWW       8 8di''DIo,F,R Io6MC-)>?? 0-788M5:+IJJ 0-,,..// :"3'..ty999$)##)K(+0C3&&ty1111Y]],-->>49%,,TY777)5)A%%zH? 	C 	CX 	C 	CtyyATATAVAV7W7W 	C 	C99%7%<%<%>%>??	C 	C 	C  $OOOOP &~';';'='= > > 
 
u  PP=PPdiiu6N6NPP	
 	
 	
 	

   GO!E!E!EFFF%/\\	 "
-99 B	=&&&&&&$ $ $ $ $ $(A A A A
 
 
 
 
 
 # = =q-00 AM4I1='22 q}7J7V]0  6Y..$!af---!"!7!7!?!?!>$-<_VQV-L-L*
F)00& \ \J \ \.v66\ \NX\ \ \    AM844
=,8]1 = =!"!7!7!?!?!>$)8)H)HQ"))#*;<<<<" 	 	D$$?WWt'7'7PT'7'U'UWW    	  GO!U!Usxx
?S?S!U!UVVVTYY01111rX   initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                    t          |           } t          |           }| r\|                                 }|j        D ]>}|r ||          r||vr*|                    |           |                     |           ?| \|S )zJReturns the set of nodes whose values depend on those within initial_queue)rp  r"   r[   r  r~  r  )r  r  dominated_setr  users        rV   dominated_nodesr    s    
 ''M}--M
 +  ""J 	+ 	+D {{400 =((!!$'''$$T***  + rX   Sequence[IRNode]dict[str, IRNode]c                    ddl m d
fdt          |          \  }}fd|D             }t          |           \  }}fd	|D             }t          t	          j        g ||R            S )Nr5   r#  rI  r>   rK   r   c                   t          | j                  r | j                  S t          | j                  r | j                  S t          | j                  o(t          | j        j        j        j        f           S r   )	r   rZ  r[  r\  r>   ComputedBufferInputsKernelInputBufferTemplateBuffer)rI  r$  is_unrealized_nodes    rV   r  z*gather_origins.<locals>.is_unrealized_node  s    a&& 	.%%af---a'' 	.%%af---!RY'' 

!!	1
 1
 -
 	
rX   c                4    g | ]} |          |j         S rO   r&  rT   valr  s     rV   rW   z"gather_origins.<locals>.<listcomp>  s-    WWWc?Q?QRU?V?VWckWWWrX   c                4    g | ]} |          |j         S rO   r  r  s     rV   rW   z"gather_origins.<locals>.<listcomp>  s,    SSSC;M;Mc;R;RSCKSSSrX   )rI  r>   rK   r   )r(  r$  r#   r"   	itertoolschain)	r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsr$  r  s	          @@rV   gather_originsr    s     
 
 
 
 
 
 
" %V,,NAWWWW^WWWN"4((OL!SSSS<SSSLioE|EnEEEFFFrX   exprc                P    d
ddfddfddfd	 |           S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    r  r~   rK   r   c                    t          | t          j                  o(t          | j                  dk    o| j        d         dk    S )N   r   r   )r   r   MulrZ   r   )r  s    rV   is_neg_leadzsympy_str.<locals>.is_neg_lead  s9    tUY''VC	NNa,?VDIaLTVDV	
rX   rL   c                n   t          | t          j                  rt          | j                  dk    rP | j        d                   r: | j        d                    d | j        d         j        d                    S d                    t          | j                            S  |           S )Nr  r5   r   z - z + )r   r   r   rZ   r   r?  r   )r  r  sympy_str_muls    rV   sympy_str_addz sympy_str.<locals>.sympy_str_add  s    dEI&& 	' 49~~""{{49Q<'@'@"'-	!55__--	RSHYZ[H\:]:]___zz#mTY"?"?@@@ =&&&rX   c                    t          | t          j                  rL |           rd | j        d                    S d                    t          | j                            S  |           S )N-r5   z * )r   r   r  r   r?  r   )r  r  sympy_str_atoms    rV   r  z sympy_str.<locals>.sympy_str_mul#  sx    dEI&& 	({4   B :>>$)A,77999zz#ndi"@"@AAA!>$'''rX   c                   t          | t          j                  r| j        S t          | t          j        t          j        f          rd |            dS t          | t          t          t          t          f          r=| j
        j         dd                    t          t          | j                             dS t!          |           S )N()r@  )r   r   Symbolr   r   r  ri   rf   rg   rh   funcr   r?  r   	sympy_strr   rL   )r  r  s    rV   r  z!sympy_str.<locals>.sympy_str_atom.  s    dEL)) 	9uy%)455 	-}}T**----(HMNN 	i(RR499SDI5N5N+O+ORRRRt99rX   )r  r~   rK   r   r  r~   rK   rL   rO   )r  r  r  r  r  s    @@@@rV   r  r    s    
 
 
 

	' 	' 	' 	' 	' 	' 	'	( 	( 	( 	( 	( 	( 	(      =rX   r  ValueRanges[Any]c                    ddl m} t          j        r2t	          |j        dd           x}r|j        dk    rt          |           S t          j	                    S )Nr5   r~  current_node
index_expr)
r  r  ro   compute_all_boundsrP   interpreterr  rl   rm   unknown)r  r  fx_nodes      rV   get_bounds_index_exprr  ;  sh     	!%~tDDDW% Nl**5!!!"$$$rX   prefixc                    | d         dk    S )Nr   r  rO   )r  s    rV   prefix_is_reductionr  I  s    !9rX   rk   rL  sympy.Symbolc                L    | t           j        k    sJ t          | |dd          S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rk   SIZErj   )r  rL  s     rV   sympy_index_symbol_with_prefixr  M  s0     TY vsDdCCCCrX   checkc                6    | st           j        ot           j        S r   )ro   debug_index_assertsassert_indirect_indexing)r  s    rV   generate_assertr  Y  s    /V/TV5TTrX   r   c                L    | d         dk    sJ t          j        | dd          S )r  r   r   Tr  )r   r  r   s    rV   sympy_index_symbolr  ]  s.     7c>>>> <d====rX   replacementsdict[sympy.Expr, Any]c                    d	dt          j        |                               fd|                                D                       S )
z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedr~   replacementUnion[sympy.Expr, str]rK   r  c                    t          | t          j                  sJ t          |t                    r!t          j        || j        | j                  S |S )Nr  )r   r   rA  rL   r  r   is_nonnegative)r  r  s     rV   	to_symbolzsympy_subs.<locals>.to_symbolo  s^     (EJ/////k3'' 	< +$3    rX   c                0    i | ]\  }}| ||          S rO   rO   )rT   kr}   r  s      rV   r9  zsympy_subs.<locals>.<dictcomp>~  s)    ===1IIaOO===rX   )r  r~   r  r  rK   r  )r   rB  xreplacer}  )r  r  r  s     @rV   
sympy_subsr  i  s^        =''====(:(:(<(<===  rX   ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                
   t          | t          j                  pit          | t          j                  oOt	          d t          j        |                                 |                                           D                       S )Nc              3  4   K   | ]}t          |          V  d S r   is_symbolicrS   s     rV   r   zis_symbolic.<locals>.<genexpr>  s(      NN1ANNNNNNrX   )	r   rQ   r1   r  r  r  r  r  rj  )r1  s    rV   r  r    sf    a&& 1el## 	ONN	!((**(M(MNNNNNrX   c                 4    t          d | D                       S )Nc              3  4   K   | ]}t          |          V  d S r   r  r  s     rV   r   z"any_is_symbolic.<locals>.<genexpr>  s(      ,,!{1~~,,,,,,rX   r  )r   s    rV   any_is_symbolicr    s    ,,t,,,,,,rX   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarr  torch.fx.GraphModuleOptional[torch.fx.Node]c                    ddl m} | j        j        D ]@}t	          |          r|c S |j                            d          x} ||          r|c S Ad S )Nr   )r)   r  )%torch.fx.experimental.symbolic_shapesr)   r  rx  is_cudagraph_unsafe_fx_noder  rz  )r  r)   r  r  s       rV   %get_first_incompatible_cudagraph_noder    s|     LKKKKK  &t,, 	KKK9=='''C49N9Ns9S9S4KKK4rX   c                    t          t          t          | j        j                                      }|j        dk    sJ |S )z$Get the output node from an FX graphr  )nextiterreversedr  rx  r  )r  	last_nodes     rV   output_noder    s<    T(28>223344I<8####rX   OrderedSet[torch.device]c                   | j                             d          }t          d |D                       }t          |           j        d         }t          |t                    r|n|f}t          d |D                       }||z  S )Nr  r  c              3     K   | ]H}t          |j                            d           t          j                  4|j        d          j        V  IdS r  N)r   r  rz  rQ   r  r   r'  s     rV   r   z"get_all_devices.<locals>.<genexpr>  s`       9 9dimmE**EL999	%9 9 9 9 9 9rX   r   c              3     K   | ]g}t          |t          j        j                  !t          |j                            d           t          j                  S|j        d          j        V  hdS r  )r   rQ   r  r4   r  rz  r  r   )rT   r  s     rV   r   z"get_all_devices.<locals>.<genexpr>  sw       7 7c58=))7 sx||E**EL99	77 7 7 7 7 7rX   )r  
find_nodesr"   r  r   r   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rV   get_all_devicesr    s    ++}+==.8 9 9%9 9 9 / /M "oo"1%G$We44Dww7*H,6 7 77 7 7 - -K ;&&rX   c                    t          t          j                                                  D ]} |                     d          st          j        |          }|j        D ]}|                    d          rt          ||          }t          |t          j	        j
        j        j                  rV|j        D ]N}t          |t          j	        j
        j        j                  r#|j        j        j                                         Ot          j        | = dt          j        v r<t          j        d         }t'          |j        j        j                  `|j        j        `t1          j                     d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rp  sysmodulesr|  
startswith__dict__rP   r   rQ   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rV   unload_xpu_triton_pydsr(    sS   CK,,..// % %%%&NOO 	K$ 	< 	<I##I.. < I..EO3EV  	< #)"8 < <%"!O3EY  <
 #M-199;;;K$$ #+--k12"())2J#JLLLLLrX   _registered_cachesc                    t          | d          rt          | j                  st          |  d          t                              |            | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r  callabler+  AttributeErrorr)  r  r  s    rV   clear_on_fresh_cacher/    sY    
 3&& Jhs.G.G JHHHIIIc"""JrX   c                 B    t           D ]} |                                  dS )z&
    Clear all registered caches.
    N)r)  r+  r.  s    rV   clear_cachesr1    s0     "   rX   cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              #  |  K   t                       ddlm}  |t          j        |                    	 t
          j                            t          j	        di          5  t                              d            |t          j                            d                    t
          j                            t          j	        di          5  dV  t          | t                    rqt          |           dk    s
J d	            t          j                                      r5t          j                  }|                     fd
|D                        ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |r`t'                      r,t(          j                                        rt/                       t1          j        t'                      fd           n*# t4          $ r t                              d            w xY wt                       dS # t                       w xY w)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)r4  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc           	         i | ]D}d |v|t           j                            t           j                            |                    ES )z.lock)ospathgetsizer?  )rT   ftriton_cache_dirs     rV   r9  zfresh_cache.<locals>.<dictcomp>"  sP       $%#*!#3#3 !"27??27<<@PRS3T3T#U#U#3#3#3rX   c                >    t                               d|          S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  r?  rD  inductor_cache_dirs      rV   r  zfresh_cache.<locals>.<lambda>2  s$    S[[@&% 6A 6 6 rX   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r1  torch._inductor.cpp_builderr9  tempfilemkdtempr   patchdictr>  environr   r   r?  r?  r   rZ   existslistdirro  
is_windowsrQ   rH   rR   r(  shutilrmtree	ExceptionrE  )r2  r4  r6  r9  filesrF  rB  s        @@rV   fresh_cacherV    s      NNNDDDDDD11(2Bs2K2K2KLL)Z__J24FG
 
 	 	 II35GHHH77/::    .@BR-STT  mT22 
}--2224W222w~~&677  "
+; < <%,,   ).                  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	(  	|| )	 6 6 8 8 )&(((M" )ll          >@RSSS 	sh   ,G1 $A1FBE, F,E0	0F3E0	4F7G1 FG1 
FA%G1 0H+ 1'HH+ +H;)reverseseqrW  	list[int]c                   | j         }t          t          |                     }t          t	          ||d                    }|st          t          |                    S |S )NTrD  rW  )__getitem__r   rZ   rp  r  r  )rX  rW  gettera_rsort_idxs        rV   argsortr`  E  s[    _F
C//C F3FD999::H (HX&&'''OrX   r  r+   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                    d
 fd}d t          |          D             }t          |t          j        |          |          }d	 |D             }|S )Nr1  tuple[int, sympy.Expr]r2  rK   ry   c                    | \  }}|\  }}d	fd} |||k               rdS  |||k              rdS ||k     rdS ||k    rdS dS )
Nr  %Union[bool, torch.SymInt, sympy.Expr]rK   r   c                `    t          | t                    r| S                     | d          S )NT)size_oblivious)r   r   evaluate_expr)r  r  s    rV   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate_  s3    $%% **4*EEErX   r   r5   r   )r  re  rK   r   rO   )r1  r2  a_idxa_valb_idxb_valri  r  s          rV   r  zargsort_sym.<locals>.cmp[  s    uu	F 	F 	F 	F 	F 	F
 8EEM"" 	28EEM"" 	1
 5==15==2qrX   c                h    g | ]/\  }}|t          |t          j                  r|j        j        n|f0S rO   )r   rQ   r1   r  r  )rT   rL  r   s      rV   rW   zargsort_sym.<locals>.<listcomp>s  sJ       C 
Z5<88?afkka@  rX   r[  c                    g | ]\  }}|S rO   rO   )rT   rL  r   s      rV   rW   zargsort_sym.<locals>.<listcomp>x  s    &&&fc1c&&&rX   )r1  rc  r2  rc  rK   ry   )r  r  r$  
cmp_to_key)r  rX  rW  r  exprsr  s   `     rV   argsort_symrr  U  s~         0 nn  E 5i2377IIIE&&&&&FMrX   r   torch.dtypec                v    | t           j        k    rdS t          j        d|                                           S )Nrw   rO   r   )rQ   rk  r   element_sizeru  s    rV   get_dtype_sizerw  |  s7     q;r'''44666rX   c                      e Zd ZU ded<   dS )LineContextr   contextNr   r   r   r   rO   rX   rV   ry  ry    s         LLLLLrX   ry  c                  $    e Zd ZU ded<   ded<   dS )ValueWithLineMaprL   r   zlist[tuple[int, LineContext]]line_mapNr{  rO   rX   rV   r}  r}    s'         JJJ++++++rX   r}  c                      e Zd ZdZd1d2dZej        d3d
            Zd4dZd5dZ	d5dZ
d6dZd7dZd5dZd6dZd8dZd9dZd:d;dZd:d<dZd:d<d Z	 d=d>d%Zd?d(Zd5d)Zd@d,ZdAd/Zd0S )BIndentedBuffer   r   initial_indentry   rK   r  c                "    g | _         || _        d S r   )_lines_indent)r  r  s     rV   __init__zIndentedBuffer.__init__  s    GI%rX   tabwidthr7  c              #  V   K   | j         }	 || _         d V  || _         d S # || _         w xY wr   )r  )r  r  prevs      rV   set_tabwidthzIndentedBuffer.set_tabwidth  s@      }	!$DMEEE DMMMDDM    s    	(r}  c                   t                      }d}g }| j        D ]}t          |t                    r |            }|$n4t          |t                    r|                    ||j        f           W|}t          |t                    sJ |                    |           |                    d           |d|	                    d          z   z  }t          |                                |          S )Nr5   ru  )r   r  r   DeferredLineBasery  r  rz  rL   writecountr}  getvalue)r  bufr   linemaplilines         rV   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    jj13+ 	& 	&B".// rtt<  B,, 2:///dC(((((IIdOOOIIdOOOTZZ%%%%AA888rX   rL   c                4    |                                  j        S r   )r  r   r  s    rV   r  zIndentedBuffer.getvalue  s    ''))//rX   c                   t                      }| j        D ]}t          |t                    r |            }|$nt          |t                    r;|}t          |t
                    sJ |                    d          r|                    |d d                    |                    |           |                    d           |                                S )N\r   ru  )	r   r  r   r  ry  rL   endswithr  r  )r  r  r  r  s       rV   getrawvaluezIndentedBuffer.getrawvalue  s    jj+ 	  	 B".// rtt<  B,, dC(((((}}T""  		$ss)$$$$		$		$||~~rX   c                8    | j                                          d S r   )r  clearr  s    rV   r  zIndentedBuffer.clear  s    rX   r   c                *    t          | j                  S r   )r   r  r  s    rV   __bool__zIndentedBuffer.__bool__  s    DK   rX   c                &    d| j         | j        z  z  S )NrS  )r  r  r  s    rV   r  zIndentedBuffer.prefix  s    dlT]233rX   c                0    |                      d           d S )Nru  	writeliner  s    rV   newlinezIndentedBuffer.newline  s    trX   r  )Union[LineContext, DeferredLineBase, str]c                   t          |t                    r| j                            |           d S t          |t                    rA| j                            |                    |                                                      d S |                                r2| j                            |                                  |            d S | j                            d           d S Nr(  )r   ry  r  r  r  with_prefixr  stripr  r  s     rV   r  zIndentedBuffer.writeline  s    dK(( 	#Kt$$$$$.// 	#Kt//>>?????ZZ\\ 	#K$++--77788888Kr"""""rX   lines3Sequence[Union[LineContext, DeferredLineBase, str]]c                :    |D ]}|                      |           d S r   r  )r  r  r  s      rV   
writelineszIndentedBuffer.writelines  s2      	! 	!DNN4    	! 	!rX   r5   offset'contextlib.AbstractContextManager[None]c                L     t           j        d fd            } |            S )NrK   r7  c               3     K   xj          z  c_         	 d V  xj          z  c_         d S # xj          z  c_         w xY wr   r  )r  r  s   rV   r  z"IndentedBuffer.indent.<locals>.ctx  sQ      LLF"LL'&&s   + =rK   r7  )
contextlibcontextmanager)r  r  r  s   `` rV   indentzIndentedBuffer.indent  sB    		"	' 	' 	' 	' 	' 	' 
#	"	' suurX   c                &    | xj         |z  c_         d S r   r  r  r  s     rV   	do_indentzIndentedBuffer.do_indent      rX   c                &    | xj         |z  c_         d S r   r  r  s     rV   do_unindentzIndentedBuffer.do_unindent  r  rX   F
other_codeUnion[IndentedBuffer, str]r  c           	        t          |t                    rt          d          }|j        D ]X}t          |t                    sA|r?t          |t          |          t          |                                          z
            }Yt          j	        |          rd}|j        D ]b}t          |t                    r| j        
                    |           2t                              | |t          |          d                     cd S t          j        |          }|r|                                }|sd S |                                }|                    d          D ]}|                     |           d S )Ninfr   ru  )r   r  r   r  ry  minrZ   r  mathisinfr  r  ry   textwrapdedentrstriprn  )r  r  r  r  r  r   s         rV   splicezIndentedBuffer.splice  s    j.11 	"5\\F") I I!$44 I I TS5G5G)GHHFz&!! ") H HdK00 HK&&t,,,,",,T4F3FGGGG	H H "44J 1'..00
 #**,,J%%d++ " "q!!!!" "rX   r  Callable[[Any], Any]c                b    t          | j                  }fd| j        D             |_        |S )Nr  c                &    g | ]} |          S rO   rO   )rT   r  r  s     rV   rW   z&IndentedBuffer.map.<locals>.<listcomp>  s!    999Tdd4jj999rX   )r  r  r  )r  r  r   s    ` rV   r   zIndentedBuffer.map  s7    DL9999999T[999

rX   c                P    t          |            d|                                  dS )Nr  r  )r  r  r  s    rV   __repr__zIndentedBuffer.__repr__  s'    t**11t}}1111rX   otherr   c                    | j         |j         k    sJ t          | j                   }|                    | j                   |                    |j                   |S )Nr  )r  r  r  r  )r  r  r   s      rV   __add__zIndentedBuffer.__add__  sV    |u},,,,DL999t{###u|$$$
rX   new_line)Union[DeferredLineBase, LineContext, str]c                    || j         v S r   )r  )r  r  s     rV   containszIndentedBuffer.contains!  s    4;&&rX   Nr  )r  ry   rK   r  )r  ry   rK   r7  )rK   r}  rK   rL   rK   r  rK   r   )r  r  rK   r  )r  r  rK   r  r   )r  ry   rK   r  )r  ry   rK   r  )F)r  r  r  r   rK   r  )r  r  rK   r  )r  r   rK   r  )r  r  rK   r   )r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  rO   rX   rV   r  r    s       H& & & & & ! ! ! !9 9 9 9(0 0 0 0   (   ! ! ! !4 4 4 4   # # # #! ! ! !	 	 	 	 	         EJ" " " " "4   
2 2 2 2   ' ' ' ' ' 'rX   r  c                  (     e Zd Zd fdZd	dZ xZS )
FakeIndentedBufferrK   r  c                H    t                                                       d S r   )superr  )r  	__class__s    rV   r  zFakeIndentedBuffer.__init__&  s    rX   r   rL   r   c                j    |dk    rt                               | |          S t          d| d          )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r  )r  r   s     rV   r  z#FakeIndentedBuffer.__getattribute__)  sK    ;**4666=$ = = =
 
 	
rX   r  )r   rL   rK   r   )r   r   r   r  r  __classcell__r  s   @rV   r  r  %  sQ             
 
 
 
 
 
 
 
rX   r  c               #     K   t           j        t           j        }} 	 d V  | |ct           _        t           _        d S # | |ct           _        t           _        w xY wr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rV   restore_stdout_stderrr  4  sR      %(ZNN@!/
CJJJ
CJ????s	   ; Ac                  R    e Zd ZdZddZddZdd	ZddZddZddZ	ddZ
ddZdS )r  z.A line that can be 'unwritten' at a later timer  rL   c                @    |                                 sd}|| _        d S r  )r  r  r  s     rV   r  zDeferredLineBase.__init__@  s"    zz|| 	D			rX   rK   Union[str, None]c                    t           )zJReturns either self.line or None to indicate the line has been 'unwritten'r>  r  s    rV   r   zDeferredLineBase.__call__E      !!rX   r   c                    t           )z3Returns a new deferred line with the same conditionr  r  s     rV   	_new_linezDeferredLineBase._new_lineI  r  rX   r  c                >    |                      | | j                   S r   r  r  )r  r  s     rV   r  zDeferredLineBase.with_prefixM  s!    ~~444555rX   c                Z    |                      | j                                                  S r   )r  r  r  r  s    rV   r  zDeferredLineBase.lstripP  s"    ~~di..00111rX   r  Union[int, slice]c                B    |                      | j        |                   S r   r  )r  r  s     rV   r\  zDeferredLineBase.__getitem__S  s    ~~di.///rX   r   c                *    t          | j                  S r   )r   r  r  s    rV   r  zDeferredLineBase.__bool__V  s    DIrX   ry   c                *    t          | j                  S r   )rZ   r  r  s    rV   __len__zDeferredLineBase.__len__Y  s    49~~rX   N)r  rL   )rK   r  )r  rL   rK   r   )r  rL   rK   r   )rK   r   )r  r  rK   r   r  rK   ry   )r   r   r   r   r  r   r  r  r  r\  r  r  rO   rX   rV   r  r  =  s        88   
" " " "" " " "6 6 6 62 2 2 20 0 0 0        rX   r  c                  4     e Zd ZdZd fdZdd	Zdd
Z xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`rD  rL   value_fnCallable[[], str]r  c                f    t                                          |           || _        || _        d S r   )r  r  rD  r  )r  rD  r  r  r  s       rV   r  zDelayReplaceLine.__init__`  s-     rX   rK   c                f    | j                             | j        |                                           S r   )r  replacerD  r  r  s    rV   r   zDelayReplaceLine.__call__e  s$    y  4==??;;;rX   c                8    t          | j        | j        |          S r   )r  rD  r  r  s     rV   r  zDelayReplaceLine._new_lineh  s    $->>>rX   )rD  rL   r  r  r  rL   r  )r  rL   rK   r  r   r   r   r   r  r   r  r  r  s   @rV   r  r  ]  sk        @@! ! ! ! ! !
< < < <? ? ? ? ? ? ? ?rX   r  c                  4     e Zd ZdZd fdZdd	Zdd
Z xZS )DelayMaybeLinez7At end of codegen return `line if `pred_fn() else None`pred_fnCallable[[], bool]r  rL   c                X    t                                          |           || _        d S r   )r  r  r  )r  r  r  r  s      rV   r  zDelayMaybeLine.__init__o  s&    rX   rK   
str | Nonec                <    |                                  r| j        nd S r   )r  r  r  s    rV   r   zDelayMaybeLine.__call__s  s     LLNN4tyy4rX   c                ,    t          | j        |          S r   )r  r  r  s     rV   r  zDelayMaybeLine._new_linev  s    dlD111rX   )r  r  r  rL   )rK   r  )r  rL   rK   r  r  r  s   @rV   r  r  l  sk        AA     5 5 5 52 2 2 2 2 2 2 2rX   r  index_or_deviceUnion[int, torch.device]c                   t          | t          j                  r| }n!t          j        t                      |           }t	          j        |          }t          j        j        r=|j        J |j        dk     s|j        dk    rt          
                    d           dS dS |j        dk    rdnd}|j        }||k     r!t          
                    d	||d
           dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrH   rt   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r   rQ   r   r^   r   createversionhipmajorr   rE  r  multi_processor_count)r
  r   propr  r  s        rV   
is_big_gpur  z  s    /5<00 ? lnno>>"6**D } z%%%:>>TZ2--KKPQQQ5tK5((bbbG*I7:%I>> 	 	
 	
 	
 u4rX   c                     t           j                                        r#t           j                                        j        S t           j                            d          j        S )NrF   )rQ   rH   rR   get_device_propertiesgpu_subslice_countrF   r  rO   rX   rV   get_max_num_smsr    sI    y Dy..00CC:++F33IIrX   c                     t           j                                        sdS t           j                            t           j                                                  } | j        dk    S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rQ   rF   rR   r  r  r  )device_propertiess    rV   
using_b200r    sQ     :""$$ u
889R9R9T9TUU"b((rX   c                     t           j                                        rt                      S t           j                                        } t                      | | ndz
  S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rQ   rH   rR   r  r  _get_sm_carveout_experimental)carveouts    rV   get_num_smsr#    sS     y !   x5577HH,@aHHrX   num_tma_descriptorsnum_programsOptional[int]r6   c                    ddl m}m} |t                      }|                    d          }|| z  t
          z  } |||| |j                              S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r5   )r6   WorkspaceZeroModeNF)r  	zero_moder   
outer_name)codegen.commonr6   r(  r#  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r$  r   r%  r6   r(  r)  r  s          rV   get_tma_workspace_argr/    s     @???????"}}!++E22I--0CCD<+<+--	   rX   r_  r?   allowed_layout_dtypeslist[torch.dtype]c                    | j         |vr!t                              d| j         |           t          | j        j                  o| j         |v ot          | j                  S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r_  r0  s     rV   _use_template_for_gpur4    sl     |000		RL!	
 	
 	
 	v}!"" 	&L11	&v}%%rX   backendc                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rO   r  rS   s     rV   rW   z)_use_autotune_backend.<locals>.<listcomp>  -       		  rX   rt  )upperro   max_autotune_gemm_backendsrn  r5  s    rV   _use_autotune_backendr=    P    ==??  !<BBDDJJ3OO    rX   c                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rO   r8  rS   s     rV   rW   z._use_conv_autotune_backend.<locals>.<listcomp>  r9  rX   rt  )r:  ro   max_autotune_conv_backendsrn  r<  s    rV   _use_conv_autotune_backendrB    r>  rX   )enable_int32enable_float8check_max_autotunerC  rD  rE  c                  ddl m}m} t          j        t          j        t          j        g}|r.t          j        t          j        t          j        t          j        g}|r+|                    t          j	        t          j
        g           t          | j        j                  ot          | |          p| j        j        dk    o| j        |v o?t           j        pt           j        p| o$t'          d          o || j        |j                  S )Nr5   )BackendFeaturehas_backend_featurer  TRITON)r+  rG  rH  rQ   r   rW  rY  ra  extendrQ  rR  r3  r   r  r4  r   ro   max_autotunemax_autotune_gemmr=  TRITON_TEMPLATES)r_  rC  rD  rE  rG  rH  layout_dtypess          rV   use_triton_templaterO    s    DCCCCCCC]ENEMBM Tu{S Ge153DEFFF v})** A)&-@@O "e+M0M
	P  VF$<VDV@V
	P "(++
	P  ~/NOOrX   output_layout
add_guardsmatricesr>   rQ  Optional[Layout]rR  c                    ddl m} ddlm dfd	dfd}dfddfd |            o%t	          fd|D                       o
 ||           S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer5   r~  
expr_bytesr=  rK   r   c                N    j         j                            | t                    S r   )r  r  statically_known_multiple_ofTMA_ALIGNMENT)rW  r  s    rV   _alignedzcan_use_tma.<locals>._aligned  s    w<<ZWWWrX   r_  rT  c                x    | dS | j         }| j        }| j        } | j                  sdS  |||d          S )NTFallow_float32)r  rj  r   r  )r_  sizesstridesr   r[  _is_tma_compatibles       rV   _is_tma_compatible_layoutz.can_use_tma.<locals>._is_tma_compatible_layout  sW    >4- x&& 	5!!%%tLLLLrX   r&  r>   c                    |                                  }|                                 }|                                 }|                                 j        j        v rdS  |||d          S )NFr]  )get_size
get_stride	get_dtyper  r  unaligned_buffers)r&  r_  r`  r   r  ra  s       rV   _is_tma_compatible_matrixz.can_use_tma.<locals>._is_tma_compatible_matrix#  sc    

,,.. ::<<174445!!%%uMMMMrX   r_  r-  r`  Sequence[_IntLike]r   rs  r^  c                   t          |           }|j        }|dk     s|dk    rdS |t          j        t          j        t          j        fvr|r|t          j        k    rdS r?j        j        	                    |           }j        j        	                    |          }nfd| D             }fd|D             }t          fd|D                       rdS fdt          |          D             }t          |          dk    rdS |d	         }	t          |          D ]\  }
}|
|	k    r ||z            s dS ||	         } ||z            sdS |t          j        k    r"j        j                            |d
          sdS dS )Nr  r   Fc                N    g | ]!}j         j                            |          "S rO   r  r  symbolic_hintrT   r   r  s     rV   rW   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>E  s,    HHHQqw'55a88HHHrX   c                N    g | ]!}j         j                            |          "S rO   rl  )rT   str  s     rV   rW   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>F  s,    NNN)77;;NNNrX   c              3  Z   K   | ]%}j         j                            |d            V  &dS r  N)r  r  statically_known_geqrn  s     rV   r   z:can_use_tma.<locals>._is_tma_compatible.<locals>.<genexpr>I  s;      PP117#88A>>>PPPPPPrX   c                Z    g | ]'\  }}j         j                            |d           %|(S r   )r  r  statically_known_equals)rT   r   rp  r  s      rV   rW   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>M  sH     
 
 
2w77A>>

 
 
rX   r5   r       T)rZ   itemsizerQ   r   rW  rQ  rY  r  r  guard_int_seqr  r  rs  )r_  r`  r   r^  rankrw  sizes_i	strides_ir  	inner_idxr   rp  	inner_dimr  r[  rR  s                rV   ra  z'can_use_tma.<locals>._is_tma_compatible.  s    5zz> !88taxx5 8KLLL M!&%-!7!75 	Og&44U;;G(66w??IIHHHH%HHHGNNNNgNNNI PPPPPPPPP 	5
 
 
 
"9--
 
 

 u::??5!H	 y)) 	 	EArI~~8BM** uu I&	x	H,-- 	5 E'''0@0U0Ur1
 1
' 5trX   c              3  .   K   | ]} |          V  d S r   rO   )rT   r&  rh  s     rV   r   zcan_use_tma.<locals>.<genexpr>l  s/      ??))!,,??????rX   )rW  r=  rK   r   )r_  rT  rK   r   )r&  r>   rK   r   )
r_  r-  r`  ri  r   rs  r^  r   rK   r   )torch.utils._tritonrV  r  r  r   )	rQ  rR  rS  rV  rb  r  r[  ra  rh  s	    `   @@@@rV   can_use_tmar    s   " :99999X X X X X XM M M M M M M	N 	N 	N 	N 	N 	N 	N: : : : : : : :z 	 	5????h?????	5%%m44rX   )rR  c                    t           j        j        r| nd }t          d |D                       ot	          |||dot           j        j        S )Nc              3  `   K   | ])}t          |                                          d k    V  *dS rr  )rZ   rd  )rT   r&  s     rV   r   z*use_triton_tma_template.<locals>.<genexpr>v  s7      55qC

"555555rX   rP  )ro   r;  enable_template_tma_storer   r  enable_persistent_tma_matmul)rQ  rR  rS  r_  s       rV   use_triton_tma_templater  q  sZ     %mEO]]4F55H55555 	7JOOO	7M6rX   c                `    t          || |dsdS ddlm} ddlm}  |            o	 |            S )NrP  Fr   )%has_triton_tensor_descriptor_host_tmar5   is_datacenter_blackwell_arch)r  r  r  codegen.cuda.cuda_envr  )rQ  rR  rS  r  r  s        rV   !use_triton_blackwell_tma_templater  |  sm     #	:    uIIIIIICCCCCC 1022U7S7S7U7UUrX   )maxsizec                 h    	 t           j                            d          duS # t          $ r Y dS w xY w)zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    zcutlass.cuteNF)	importlibutil	find_specr  rO   rX   rV   ensure_cute_availabler    sD    ~''77tCC   uus    # 
11mat_amat_ba_is_2db_is_2doffsOptional[Any]biasscale_resultc                   t                      sdS t          d          sdS ddlm} t	          |j        j                  sdS  |            sdS t          j        g}	t          ||	          sdS t          j        st          j        sdS t          | ||          sdS t          d | |fD                       rdS |r|rdS |dS ||dS dS )	a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr5   r  )rQ  c              3  4   K   | ]}t          |          V  d S r   )
is_dynamicrS   s     rV   r   z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>  s(      
1
1Q:a==
1
1
1
1
1
1rX   NT)r  r=  r  r  r3  r   r  rQ   rW  r4  ro   rK  rL  r  r  )
r  r  r_  r  r  r  r  r  r  rN  s
             rV    use_blackwell_cutedsl_grouped_mmr    s"   2 !"" u ++ uCCCCCC&-$%% u'')) u^$M 77 u 6#; u ue6::: u

1
15%.
1
1
111 u g u|u<3u4rX   r&  rI  r  c                   ddl m} |j        j                            ||z  |z  d          }|dk    s|t
          j        j        k     rdS ddlm	} t          j        j        rdS t          j        t          j        t          j        g}t!          | |          o&t
          j        pt
          j        ot'          d          }|r6 |            s,t(                              d	t
          j        j                   dS |S )
Nr5   r~  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)r  r  r  r  	size_hintro   rF   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rQ   r  r  r   rW  ra  r4  rK  rL  r=  r   rE  cutlass_dir)	r_  r&  rI  r  r  	gemm_sizer  rN  r   s	            rV   use_cutlass_templater    s    **1q519r*BBIA~~V[%NNNu>>>>>> } u ]ENEK@Mfm44 	- <F$<	-!),,   !!## 	KK4 '	   5JrX   op_namec                    t           j        j                                        }|dk    rdS |                                 d |                    d          D             v S )z8Check if CUTLASS should be used for the given operation.ALLTc                6    g | ]}|                                 S rO   r8  rS   s     rV   rW   z'_use_cutlass_for_op.<locals>.<listcomp>   s     IIIQqwwyyIIIrX   rt  )ro   rF   cutlass_enabled_opsr:  rn  )r  enabled_opss     rV   _use_cutlass_for_opr    sX    +17799Ket==??II+2C2CC2H2HIIIIIrX   r   _IntLikethreshold_multiplec           
     t   ddl m} t          j        j        |z  }t
          j        j         o|j        j	        
                    t          j        t          j        ||| z            t          j        |||z                                o.|j        j         o!|j        j         ot          j        j        dk    S )Nr   r~  )torch._inductor.virtualizedr  ro   r;  decompose_k_thresholdrQ   r  r  r  r  statically_known_truer   AndGeaot_modecpp_wrappernum_decompose_k_splits)r&  rI  r  r  r  r  s         rV   use_decompose_k_choicer    s     .-----"M?BTT M 		5G22I1A5661A566 
 
		5   		5 ##		5 M014rX   c           
     \   t           j        j        }ddlm} t          t          j        j                  ow|j	        j
                            t          j        t          j        ||| z            t          j        |||z                                o|j	        j         o|j	        j         S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   r~  )ro   rocmcontiguous_thresholdr  r  r   rQ   r  r  r  r  r  r   r  r  r  r  )r&  rI  r  r  r  s        rV   use_contiguousr    s     ";; .----- 	U] 	$G22I0145501455 
 
	$   	$ ##
rX   c                   t           j        j        }g d}t          |t          j                  r	|j        s|S |dk    rg S t          | t          j                  r| j        r!t          |t          j                  r
|j        sdnt          || z  ||z            dt	          j        |          }fd|D             }g g g }}}|D ]j}	||	z  }
|
dk     r|
|
dz
  z  dk    r|
dk    r|	                    |	           6|
dz  dk    r|	                    |	           U|	                    |	           kt           j
        d	k    r||z   |z   S ||z   |z   }|d |         S )
N)rt   rv  rv   ru      r   r  r  c                ,    g | ]}|k    |k    |S rO   rO   )rT   divisormax_k_splitmin_k_splits     rV   rW   z get_k_splits.<locals>.<listcomp>L  s8       k!!g&<&< 	&<&<&<rX   ru   r5   rv  
EXHAUSTIVE)ro   r;  r  r   r   rA  	is_numberr  divisorsr  max_autotune_gemm_search_space)r&  rI  r  k_splits_limitdefault_k_splitsr  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsr  r  s               @@rV   get_k_splitsr  4  s    ]9N .--!UZ    	1			1ej!! *!+ *1ej!!**++* !q&!q&))K~a  H      H =?B>) % %Q 3;; EAI!##$$Q''''RZ1__%%a(((( !!!$$$$,<< #55FF#&88>IK''rX   c                J    t           j                            |           j        S r   )rQ   rF   r  gcnArchNamer   s    rV   _rocm_native_device_arch_namer  m  s    :++F33??rX   Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                     	 dd l } ddlm}m} ddlm} t          j                            | j	                  }n'# t          $ r d
d}d
d} G d d	          }d }Y nw xY w||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrK   r  c                     g S r   rO   rO   rX   rV   r  z*try_import_ck_lib.<locals>.gen_ops_library      IrX   c                     g S r   rO   rO   rX   rV   r  z.try_import_ck_lib.<locals>.gen_ops_preselected  r  rX   c                      e Zd ZdS )*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rO   rX   rV   r  r    s        DrX   r  )rK   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r>  r?  dirname__file__r  )r  r  r  r  package_dirnames        rV   try_import_ck_libr  r  s    	
 	
 	
 	
 	
 	
 	
 	
	
 	
 	
 	
 	
 	
 '//+*>??   	 	 	 		 	 	 		 	 	 	 	 	 	 	  O-@/QQs   69 !AAc                f   t           j        st           j        sdS t          j        j        sdS | j        j        dk    rdS t          | j                  }d t           j	        j
        D             p|                    d          d         |ifd                                t           j	        j        z  D             }|sdS | j        t          j        t          j        t          j        fvrdS t%                      \  }}}}|st&                              d           dS |t           j	        _        dS )	NFrF   c                F    i | ]}|                     d           d         |S ):r   )rn  )rT   r  s     rV   r9  z#use_ck_template.<locals>.<dictcomp>  s(    DDDaqwws||ADDDrX   r  r   c                     g | ]
}|         S rO   rO   )rT   r  requested_archss     rV   rW   z#use_ck_template.<locals>.<listcomp>  s.     ! ! ! 	! ! !rX   z,Please pip install Composable Kernel packageT)ro   rK  rL  rQ   r  r  r   r  r  r  archrn  r|  ck_supported_archr   r   rW  rY  r  r   rE  ck_dir)r_  native_archrequested_supported_archsck_package_dirnamer   r  s        @rV   use_ck_templater    sA    6#; u= u}V##u 0>>KDD6;3CDDD #q!;IO! ! ! ! %%''&+*GG! ! ! % u|EM5>5=IIIu"3"5"51a BCCCu+FK4rX   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr5   r~  CKr   r  r   r  r  r=  r  r  r  r  r_  r&  rI  r  r  s        rV   use_ck_gemm_templater    se     	d## 	CF##	CG&&q1uqy2&>>BrX   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr5   r~  CKTILEr   r  r   r  r  s        rV   use_ck_tile_gemm_templater    se     	h'' 	CF##	CG&&q1uqy2&>>BrX   c                >    t          d          ot          |           S )Nr  )rB  r  r_  s    rV   use_ck_conv_templater    s    %d++G0G0GGrX   c                R    t           j        pt           j        o| j        j        dk    S r  )ro   rK  rL  r   r  r  s    rV   _use_template_for_cpur    s'    7v7&
-

%&rX   mat1Union[ReinterpretView, Buffer]mat2c                   ddl m} t          |j        |          sJ |j        j        }|j        j        }t          |           oe|                                t          j	        k    oCt          |          dk    o0t          |          dk    o|d         |d         k    o|d         dk    }t          | ||d          o|j                                        p|S )Nr5   )r?      r  F)require_constant_mat2)r$  r?   r   r_  r  rj  r  rf  rQ   rY  rZ   use_cpp_gemm_templateis_contiguous)r_  r  r  r?   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rV   use_cpp_bmm_templater    s     dk6*****
  I+$Kf%% 	"NN-	"^^q 	" "	" ^y|+		"
 ^q  " !t5QQQ !!##D'DrX   mat2_transposedr  is_woq_int4q_group_sizec                `   ddl m} ddlm} ddlm}	 ddlm}
 t          |           rt          d          sdS t          j        j        sdS |                                t          j        t          j        fv }t          j        t          j        t          j        t          j        g} |
|||r| j        nd ||          \  }}}} }}t+          ||f          rdS t-          ||j                  r|                                } |	|                                          \  }} |d	||||                                |                                |t3                      | |

  
        }dd}| j        |v o:|d uo6 ||          o+t-          ||j                  o|                                p| S )Nr5   r#  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refr  rU   r>   rK   r   c                f    |                                   |                                 d         dk    S )Nr   r5   )freeze_layoutre  rU   s    rV   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s*    	||~~b!Q&&rX   )rU   r>   rK   r   )r(  r$  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r=  ro   cppweight_prepackrf  rQ   rh  r]  rY  rW  halfr   has_free_symbolsr   BaseViewunwrap_viewparallel_num_threadsr\  is_module_buffer)r_  r  r  r  r  r  r  r$  r  r  r  	int8_gemmrN  r&  rI  r  r  r   r  r!  s                       rV   r  r    s     999999MMMMMM)))))) (( 0Ee0L0L u:$ u  U[%*$==I]ENEJLM")'"+5&,,'# # #Aq!VT4 A u$$$ "!!@@AQAQRROL!""			NN$$^^%%!(**!  J' ' ' '
 	% 	Cd"	C%%	C tR]++	C ""$$A,A(ArX   c                 R    t           j        pt           j         pt          d          S )NATEN)ro   rK  rL  r=  rO   rX   rV   use_aten_gemm_kernelsr0  ,	  s*    7v7 '	v	&	&'rX   c                  R    e Zd ZU  ej        d          Zded<   ddZddZdd
Z	dS )DebugDirManagerr   rL   prev_debug_namerK   r  c                B    t          t          j                  | _        d S r   )r  r2  counterr   r  s    rV   r  zDebugDirManager.__init__6	  s    .//rX   c                    t           j        j        j        | _        | j         d| j         | _        | j        t           j        j        _        d S )N_tmp_)rQ   _dynamoro   debug_dir_rootr3  r   new_namer  s    rV   	__enter__zDebugDirManager.__enter__9	  sA    $}3B/??dg??.2m+++rX   r   r   c                n    t          j        | j                   | j        t          j        j        _        d S r   )rR  rS  r:  r3  rQ   r8  ro   r9  )r  r   s     rV   __exit__zDebugDirManager.__exit__>	  s*    dm$$$.2.B+++rX   Nr  )r   r   rK   r  )
r   r   r   r  r  r5  r   r  r;  r=  rO   rX   rV   r2  r2  2	  su         ioa  G0 0 0 0< < < <
C C C C C CrX   r2  Callable[P, _T]r  r  tuple[_T, list[str]]c                   ddl m} t                      d	fd}t          j                            |d|          5  t          j                                          | |i |}d d d            n# 1 swxY w Y   |t                    fS )
Nr5   r:   coderL   rK   r  c                2                         |            d S r   )r~  rA  source_codess    rV   save_output_codez*run_and_get_code.<locals>.save_output_codeL	  s    rX   rE  rA  rL   rK   r  )
r  r;   r"   r   rL  r  rQ   r8  resetrp  )r   r   r  r;   rE  r  rD  s         @rV   run_and_get_coderH  C	  s    
 %$$$$$$.LLL      
		=*<>N	O	O % %T$V$$% % % % % % % % % % % % % % % 4%%%%s   'A00A47A4c                    t          | g|R i |\  }}g }|D ]5}|                    t          j        d|t          j                             6||fS )Nz	'''.*?''')rH  rJ  r   findallDOTALL)r   r   r  r  rD  kernelsrA  s          rV   run_and_get_kernelsrM  U	  sk     ,B@@@@@@FLG B Brz,bi@@AAAA7?rX   tuple[Any, list[str]]c                .     d fd}t          |          S )NrK   r   c                 h                 } |                                                                   | S r   )r  r8  )r  r   s    rV   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwarda	  s-    

rX   )rK   r   )rH  )r   rQ  s   ` rV   run_fw_bw_and_get_coderR  `	  s2         
 -...rX   c                b   ddl m} g dfddfd}t          j                            |d|          5  t          j                            |d          5  t
          j                                          | |i |}ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   S )zLGet the inductor-generated code, but skip any actual compilation or running.r5   r:   rA  rL   rK   r  c                2                         |            d S r   r  rC  s    rV   rE  z"get_code.<locals>.save_output_codeo	  s    D!!!!!rX   r  r;   r   c                     G d d          }| j         r|                                 n|                                 \  }} |j                   |r |j                    |            S )Nc                  "    e Zd ZdZd
dZddZd	S )@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerK   r  c                    d S r   rO   r  s    rV   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__v	  s    rX   r   r   r  c                    d S r   rO   r  s      rV   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.cally	  s    rX   Nr  r   r   r  r   rK   r  )r   r   r   r   r  r[  rO   rX   rV   DummyModulerX  s	  sB        FF        rX   r]  )r  codegen_with_cpp_wrappercodegenr   )r  r]  wrapper_codekernel_coderE  s       rV   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_moduler	  s    	 	 	 	 	 	 	 	 04/?SD))+++T\\^^ 	"k 	+,,, 	0[.///{}}rX   compile_to_modulerE  NrF  )r  r;   rK   r   )r  r;   r   rL  r  rQ   r8  rG  )r   r   r  r;   rb  r   rE  rD  s         @@rV   get_coderd  i	  st   $$$$$$ L" " " " " "     , 	
.0I	
 	
    	
-);=MNN	    	B                                                            s5   "B$'BB$B	B$B	B$$B(+B(c                    t          | g|R i |}dt          |          cxk    rdk    sn J dt          |                       |d         S Nr5   r  z%expected one or two code outputs got r   )rd  rZ   )r   r   r  rD  s       rV   get_triton_coderg  	  ss    B000000LL!!&&&&Q&&&&&CL0A0ACC '&& ?rX   c                    t          | g|R i |\  }}dt          |          cxk    rdk    sn J dt          |                       |d         S rf  )rH  rZ   )r   r   r  r   rD  s        rV   run_and_get_triton_coderi  	  sy     'r;D;;;F;;OA|L!!&&&&Q&&&&&CL0A0ACC '&& ?rX   tuple[Any, list[GraphLowering]]c                    ddl m ddlm} |j        g dfd	}t
          j                            |d
|          5   | |i |}d d d            n# 1 swxY w Y   |fS )Nr   r:   rB   r   r   r  rK   r  c                 v     | i | | d         }t          |          sJ                     |           d S )Nr  )r   r  )r   r  r  r;   graph_lowerings	real_inits      rV   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init	  sQ    	4"6"""Q%/////u%%%%%rX   r  r\  )torch._inductor.graphr;   torch._inductor.output_coderC   r  r   rL  r  )	r   r   r  rC   ro  r  r;   rm  rn  s	         @@@rV   run_and_get_graph_loweringrr  	  s     433333;;;;;;(IO& & & & & & & & 
		?J		B	B % %T$V$$% % % % % % % % % % % % % % % ?""s   	AAAaten_opoverride_fnc              #     K   ddl m} |j        |          }	 t          j        ||          |j        | <   dV  ||j        | <   dS # ||j        | <   w xY w)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrv  	loweringsr$  partial)rs  rt  rv  orig_fns       rV   override_loweringr{  	  s{       )((((( )G.&/&7W&M&M7#&-7###g7#----s   !A Apre_fnpost_fnOptional[Callable[..., Any]]c                ~     ddl m} |j        d	 fd}t          j        j                            |d|          S )
zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   rx  rK   c                T     | |            | |          }r | |           |S r   rO   )r  rx  outrz  r}  r|  s      rV   r  z(add_scheduler_init_hook.<locals>.wrapper	  sE    y%   gi'' 	&GIu%%%
rX   r  )r  r   rx  r   rK   r   )torch._inductor.schedulerr  r  unittestr   rL  r  )r|  r}  r  r  rz  s   ``  @rV   add_scheduler_init_hookr  	  sh     433333 G        =%%iWEEErX   msgc                    t           j        rt                              |            dS t                              |            dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)ro   developer_warningsr   rE  info)r  s    rV   developer_warningr  	  s:       CrX   c                    	 t           j                            d          } | dz   t          t           j                  k     rZt          t           j        | dz                      dk    r4t           j        | dz            d         dk    rt           j        | dz            S n# t          $ r Y nw xY wt           j        D ]0}|                    d          r|t          d          d         c S 1dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr5   r   r  z--only=N)r  argvr  rZ   
ValueErrorr  )rL  r  s     rV   get_benchmark_namer  	  s    	hnnX&&!Gc#(mm##CHS1W%&&**q!!$++8C!G$$    x ) )>>)$$ 	)s9~~''((((	) 4s   BB 
B)(B)r}  c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS r5   NrO   rS   s     rV   r   zis_ones.<locals>.<genexpr>
  &      %%!qAv%%%%%%rX   r   r}  s    rV   is_onesr  
      %%u%%%%%%rX   c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS )r   NrO   rS   s     rV   r   zis_zeros.<locals>.<genexpr>
  r  rX   r  r  s    rV   is_zerosr  
  r  rX   inputsSequence[torch.Tensor]c                4    t          d | D                       S )Nc              3     K   | ];}t          |t          j                  |j        t          j        d           k    V  <dS )r  N)r   rQ   r  r   )rT   r   s     rV   r   z is_cpu_device.<locals>.<genexpr>
  sY        dEL))u|E***     rX   r  )r  s    rV   is_cpu_devicer  
  s0           rX   r  c                    t          | t          j                  s
J d            | j        rt          j        S t          j        S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r   r   rA  r   rQ   rc  r[  )r  s    rV   get_sympy_Expr_dtyper  #
  sI    c5:&&  B   ~ {}rX   should_profileIterator[Any]c              /     K   | r5t          j        j        |i |5 }|V  d d d            d S # 1 swxY w Y   d S d V  d S r   )rQ   r   r   )r  r   r  r   s       rV   maybe_profiler  -
  s       ^#T4V44 	GGG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -11c                 Z    t           j        j        } | dk     rt          j                    } | S Nr5   )ro   r%  threadsrQ   get_num_threads)r  s    rV   r+  r+  6
  s(    j G{{'))NrX   c                 t    ddl m}   |             }|                    dt          j        j        rdnd          S )Nr5   )get_backend_options
num_stagesr  r  )runtime.triton_helpersr  rz  rQ   r  r  )r  optionss     rV   get_backend_num_stagesr  =
  sD    ;;;;;;!!##G;;|%-*;%BQQCCCrX   c                j   t          | t          j        j        j        j                  }||S ddlm}m} t          j        	                                o!t          j        
                                dk    }| t          j        t          j        t          j        fv sJ t          j        |          j                            d          rddlm}  |            }| t          j        t          j        fv r|r || |          S t          j        j        j        j        r |t          j        |          S  |t          j        |          S | t          j        t          j        fv r|r ||           S t          j        j        j        j        r |t          j                  S  |t          j                  S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)rw   r   
clock_rate)max_clock_rate)r   rQ   backendsrF   matmul
allow_tf32triton.testingr  r  rR   get_device_capabilityr   rW  rY  inspect	signature
parametersrz  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rV   get_device_tflopsr  E
  s    UEN,?,F,QRRRGMMMMMMMM*))++ 
0P0P0R0R W 1K
 U]ENEMBBBBB,--8<<\JJ 6888888!>##U]EN3333,,UH===>%0 	@,,U]HEEE&&u}h???U]EN3333,,U333>%0 	6,,U];;; '&u}555rX   c                 "    ddl m}   |             S )Nr   get_dram_gbps)r  r  r  s    rV   get_gpu_dram_gbpsr  q
  s     ,,,,,,=??rX   c                 t    ddl m}  | j        j                            d                              dd          S )Nr   r  max_shared_mem)triton.runtimer  r   r!  r  rz  r  s    rV   get_gpu_shared_memoryr  x
  s@    %%%%%% =44Q77;;<LaPPPrX   c                     t           j                                        rGt           j                                        j        } t           j                                        j        }nd} d}|| z  S )Nrv  i   )rQ   rF   rR   r  	warp_sizemax_threads_per_block)r  r  s     rV   get_max_numwarpsr  
  s`    z   %J4466@	 %
 @ @ B B X 	 $ I--rX   reduction_typec                ,    |                      d          S )Nwelford)r  r  s    rV   is_welford_reductionr  
  s    $$Y///rX   c                8    t          |           rdS | dk    rdS dS )Nr  online_softmax_reducer  r5   )r  r  s    rV   reduction_num_outputsr  
  s-    N++ q	2	2	2qqrX   c                 0    t          j                    dk    S )NLinux)platformsystemrO   rX   rV   is_linuxr  
  s    ?''rX   c                 "    t           j        dk    S )Nrq   )r  r  rO   rX   rV   rQ  rQ  
  s    <7""rX   itrIterable[Any]c                4    t          d | D                       S )Nc              3  Z   K   | ]&}t          |t          j                  o|j         V  'd S r   )r   r   rA  r  rS   s     rV   r   z#has_free_symbols.<locals>.<genexpr>
  s7      JJz!UZ((<_JJJJJJrX   r  )r  s    rV   r(  r(  
  s    JJcJJJJJJrX   c            	     t   ddl m} | D ]}t          ||j        |j        |j        |j        |j        f          rJt          |	                                pd          s#t          |
                                pd          r dS zt          ||j                  st          dt          |                     dS )Nr5   r#  rO   Tzunexpected type for is_dynamic F)r(  r$  r   rZ  r\  r)  r  r<   r(  maybe_get_sizemaybe_get_strider>   	TypeErrorr  )r   r$  ts      rV   r  r  
  s     I IbmR[":KRYW
 
 
	I   0 0 2 2 8b99 =M""$$*> >  tt Ary)) 	IGd1ggGGHHH5rX   c                      e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  rO   rX   rV   r  r  
  s          K *rX   r  r  r3   inpc                   ddl m} t          j        dd          5 }t	          j                    }t	          j                    } t          |t          |                    j        |  t          d|j
         |           t          |j
        |           t          j                    }t          ||          5   | |j
                   d d d            n# 1 swxY w Y   t          j                    |z
  }	 ||j
                   |j
                                         |                                 t          d	|j
         |           t          |j
        |           |                                |                                k    }
t"                              d
||j        |
|	           d d d            d S # 1 swxY w Y   d S )Nr5   )stable_topological_sortr  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  rJ  NamedTemporaryFileior   rd   r`   	propagater  r  r   nowrc   lint	recompiler  r   r  r   )r  r  r  r  r  rA  	before_ioafter_io
start_timetime_elapsedr  s              rV   pass_execution_and_saver  
  sA    988888		$
 
 
 
 
KMM	;==C	R#3C#8#8999CSII$"($$1----bhY''''\^^
#B,, 	 	DNNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	|~~
2)))

###!,,,,bhX&&&&  H$5$5$7$77hF	
 	
 	
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s7   BG <CG C	G  C	!CG  GG	input_buf"Optional[Union[Buffer, Operation]]c                l    ddl m} t          | |j                  ot          | j        |j                  S )zB
    Check if input buffer is a multi-outputs template buffer
    r5   r#  )r(  r$  r   CppTemplateBufferr_  MultiOutputLayoutr  r$  s     rV   is_multi_outputs_templater  
  sG     i!566 :".< < rX   c                    ddl m} t          | |j                  o1t	          | j                  dk    ot          | j        d                   S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r5   r#  r   )r(  r$  r   MultiOutputrZ   r  r  r  s     rV   #is_output_of_multi_outputs_templater  
  s\      	9bn-- 	;	 !!Q&	;%i&6q&9::rX   r   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                D   | dS ddl m} t          | |j                  o"t          | |j                   o|d u p| j        |u pt          |           |j        u ot          t          j
        j        d          o#| j        t          j
        j        j        j        k    pt          t          j
        j        d          o#| j        t          j
        j        j        j        k    pBt          t          j
        j        d          o#| j        t          j
        j        j        j        k    S )NFr5   r#  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r(  r$  r   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rQ   r  torchrecr
  defaultr  r  r  r  r$  s      rV   is_collectiver  
  s%    |u 	4-.. 	34000	34Z14+r1 	T

b'' 	

 	*,?@@ U$	(:(L(TT
 	*,DEE E$9%<DE 	*,CDD Y$	(:(P(XX/rX   "Optional[Union[IRNode, Operation]]c                :    ddl m} t          |           |j        u S Nr5   r#  )r(  r$  r  r  )r  r$  s     rV   is_waitr  $  s'    ::''rX   snoderD   	filter_fn-Optional[Callable[[BaseSchedulerNode], bool]]c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  o|d u p
 ||           S )Nr   GroupedSchedulerNodec              3  4   K   | ]}t          |          V  d S r   )contains_collectiverS   s     rV   r   z&contains_collective.<locals>.<genexpr>1  s+      @@a&q))@@@@@@rX   )r  r  r   r  snodesr  r  )r  r  r  s      rV   r   r   *  sr     ?>>>>>%-.. A@@5<@@@@@@$$P)t*;*Oyy?O?OPrX   c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  S )Nr   r  c              3  4   K   | ]}t          |          V  d S r   )contains_waitrS   s     rV   r   z contains_wait.<locals>.<genexpr>:  s*      ::=##::::::rX   )r  r  r   r  r!  r  r  )r  r  s     rV   r$  r$  6  sV    >>>>>>%-.. #::U\::::::uz"""rX   Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                    ddl m} t          |t          j        j                  r|g}t          | |j                  o| j        |v S r  )r(  r$  r   rQ   r  r  r  r  r  s      rV   is_fallback_opr(  ?  sT     "ej+,, TdB-..I43Cr3IIrX   buf_namename_to_bufname_to_fused_nodec                L    |||          j                                                  S r   )defining_opr  )r)  r*  r+  s      rV   buf_name_to_fused_snoder.  J  s#     k(3?HHJJKKrX   c                    dS r  rO   r  s    rV   r  r  U      u rX   collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                     ||           rd S |                     |            | j        D ]1}t          |j        ||          }||v rt	          |||||           2d S )Nr6  )r~  unmet_dependenciesr.  r   find_recursive_deps_of_node)r  r2  r*  r+  r6  depdefining_op_for_deps          rV   r;  r;  P  s     {5 5!!!' 
 
5Hk#5
 
 "444##	
 	
 	
 	
 	

 
rX   c                    dS r  rO   r0  s    rV   r  r  n  r1  rX   c           	         ||           rd S |                     |            |                                 D ]}|j        D ]}}|j        J |j                                        dk    r)|j                                        |vrE||j                                                 }||v rit          |||||           ~d S )NOUTPUTr9  )r~  get_outputsr  r  r  find_recursive_users_of_node)r  r2  r*  r+  r6  or  user_ops           rV   rB  rB  i  s    {5 5!!!    G 	 	D9(((y!!##x//y!!##+===(););)=)=>G,,,(""'    	 rX   dynamo_gm_num_inputsaot_fw_gm_num_inputsc                F    t           j        j        j        rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rQ   
_functorchro   functionalize_rng_ops)rE  rF  num_rng_seed_offset_inputss      rV   num_fw_fixed_argumentsrK    s2     $:A   "669SSSrX   fx_gc                   d	d}d}g }| j         j        D ]2}|j        dk    r% ||          r|                    |           |dz  }3|t	          t          t          |                              k    sJ t          |          S )
z>
    Infers which inputs are static for a backwards graph
    rU   r4   rK   r   c                J    d| j         vod| j         vod| j         vod| j         vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r   s    rV   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor  s@    af$ .!&(.!/.  qv-		
rX   r   r  r5   )rU   r4   rK   r   )r  rx  r  r  rp  r   rZ   )rL  rS  	arg_countstatic_arg_idxsrI  s        rV   count_tangentsrV    s    

 
 
 
 IOZ  4=  q!! 2&&y111NId5_)=)=#>#>??????rX   c                  :    e Zd ZU ded<   d
dZedd            Zd	S )	BoxedBoolr   r   rK   c                    | j         S r   )r   r  s    rV   r  zBoxedBool.__bool__  s
    zrX   r  r   Union[BoxedBool, bool]c                B    t          | t                    r	d| _        | S dS r  )r   rX  r   r.  s    rV   disablezBoxedBool.disable  s%    c9%% 	CIJurX   Nr  )r  r   rK   rZ  )r   r   r   r   r  r  r\  rO   rX   rV   rX  rX    sS         KKK       \  rX   rX  kernel_listc              #      K   ddl m} |j        	 	 	 dd fd}t          j                            |d|          5  d V  d d d            d S # 1 swxY w Y   d S )Nr5   r7   Tr  r8   kernel_namerL   ra  r  r5  gpur   cpp_definitionrK   r   c                N                         |            | |||||          S r   rU  )r  r_  ra  r  r`  ra  r]  orig_define_kernels         rV   define_kernelz.collect_defined_kernels.<locals>.define_kernel  s;     	;'''!!+{Hc>
 
 	
rX   rd  )NTN)r  r8   r_  rL   ra  rL   r  r5  r`  r   ra  r5  rK   r   )codegen.wrapperr8   rd  r   rL  r  )r]  r8   rd  rc  s   `  @rV   collect_defined_kernelsrf    s      555555-; #'(,
 
 
 
 
 
 
 
 
		/-	P	P                   s   AAAc                    | dz   S )N__original__rO   r  s    rV    get_cloned_parameter_buffer_nameri    s    .  rX   c                    | t           v S r   )rY   r  s    rV   r3  r3    s    YrX   c                ,    | dk    ot          |           S )NrG   )r3  r  s    rV   device_need_guardrl    s    U?-vf~~-rX   c                6   | t           j        k    r@t           j                                        r"t           j                                        dk     S | t           j        k    r t           j                                        rdS | t           j        t           j        fv S )N)r  r   T)rQ   rW  rF   rR   r  rH   rc  r   ru  s    rV   ,needs_fallback_due_to_atomic_add_limitationsrn    st    5:#:#:#<#<z//11F::	%.	 	 UY%;%;%=%=	 tej111rX   r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                *   | j         t          j        j        j        t          j        j        j        fv r|dS | j         t          j        j        j        k    rdnd}|d |fvp|ot          |          ot          |          p| j         t          j        j        j        k    oA|dk    o;|o9|dk    o3t          j
        j        o"t          j
        j        pt                      dk    p2||k    o|t          j        t          j        fv pt          j                    S )NFr~  r  r  r5   )overloadpacketrQ   r  atenscatter_reduce_scatter_reducescatter_r3  rn  ro   r%  fallback_scatter_reduce_sumdynamic_threadsr+  r   rc  $are_deterministic_algorithms_enabled)r  r  ro  rp  rq  rr  	reduce_tys          rV   use_scatter_fallbackr}    s=    	"IN*EIN,IJ	K 	K"u +uy~/FFFE 
 	tY// 	8 H''H<YGG		8 &%).*HH L%'LL  5(L 
6	L
 +J/C/E/E/J	8 i'SJ5:u{:S,S	8 577!rX   c                   ddl m}m} ddlm} t          dt          |            d           t          |           D ]!\  }}t          d|dd           ||u rt          d	           .||u rt          d
           Bt          ||          r|	                                }t          |rdnd d           |r*|j
        J t          d|j
        j        j                    t          d           |j        j        D ]}t          |           t          d           |j        j        D ]}t          |           t!          dt#          |                     dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesrS  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rZ   r  r   is_reductionr  r[  reduction_hintro  rp  rq  r  r  )r  r  r  r  rL  r  is_redr<  s           rV   dump_node_scheduler    s   
 ONNNNNNN777777	
:M 2 2
:
:
:;;;}-- H H	Tl#llll?""$%%%%%%%%&&&&m,, 	H&&((Ff.UU$???@@@ Ry,,,P1NPPQQQ*'-  c



+'.  c



 F$t**FFGGG'H HrX   r   r  c                    ddl m}  ||                                 t          | j                  z  t
          z  dk              S )Nr   )r  )r  r  storage_offsetrw  r   GPU_ALIGN_BYTES)r   r  s     rV   tensor_is_alignedr  +  sU     LKKKKK  				 	 >&,#?#?	??RVWW  rX   example_inputc                n    t          | j        j                  sdS t          j        pt          |           S r  )r3  r   r  ro   assume_aligned_inputsr  )r  s    rV   should_assume_input_alignedr  9  s6     -&+,, u'K+<]+K+KKrX   r  c                     t           j        j                                        } | st	          j                    S | j        r| j        j        st	          j                    S | j        j        }|                                S r   )	rQ   _guardsTracingContexttry_getr  nullcontextr  r  suppress_guards)tracing_contextr  s     rV   #maybe_get_suppress_shape_guards_ctxr  B  sw    
 m2::<<O (%''' $ (O,E,O (%''')3I$$&&&rX   tuple[_T, str]c                   t           j        j                            t          dd          5  t
          j                                         dd l}dd l	} |j
                    } |j        |          }ddlm} |                    |           |j        }|                    |j                    | |i |}	|                                }
|                    |           |                    |           d d d            n# 1 swxY w Y   |	|
fS )Nr   Tr   )output_code_log)r  r   rL  r  ro   rQ   r8  rG  r  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rV   run_and_get_cpp_coder  R  sZ    
		#	#FGT	:	: * *			(R[]]"W"#566======""2&&&$*
  ///T$V$$''))  ,,,%%b)))* * * * * * * * * * * * * * *  19s   CD  DDSequence[InputType]Optional[ShapeEnv]c                   t          |           }||j        S | D ]}t          |t          j                  r|j        j        c S t          |t          j                  r|                                D ],}t          |t          j                  r|j        j        c c S -|                                D ],}t          |t          j                  r|j        j        c c S -d S r   )	r`   r  r   rQ   r1   r  r  r  rj  )r  r  inputr  rj  s        rV   shape_env_from_inputsr  k  s    ((I ""  1 1eU\** 	(:'''' eU\** 	1

 / /dEL11 /9....../,,.. 1 1fel33 1!;0000001 4rX   Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                F     t                    dk    r S d fd}|S )Nr   
new_inputslist[InputType]rK   r   c                    t          |           \  }} |           }t          |          rt          j        ||           |S r   )copy_misaligned_inputsrZ   rQ   _foreach_copy_)r  old_tensorsnew_tensorsr  r  r  r  s       rV   r  z)align_inputs_from_check_idxs.<locals>.run  sZ    #9);$
 $
 [ eJ { 	; k:::
rX   )r  r  rK   r   )rZ   )r  r  r  r  s   ``` rV   align_inputs_from_check_idxsr    sN    
 ?q          JrX   c                   d|                                  v rd}nNt          d t          |                                  |                                           D                       dz   }t	          j        | |fd                                          }t	          j        ||                                  |                                           S )Nr   c              3  ,   K   | ]\  }}|d z
  |z  V  dS r  rO   )rT   ra  rj  s      rV   r   z)clone_preserve_strides.<locals>.<genexpr>  s/      TTf$TTTTTTrX   r5   r   )r  r  r   rj  rQ   
as_stridedclone)rU   needed_sizerU  s      rV   clone_preserve_stridesr    s    AFFHH}} TT#affhh

:S:STTTTTWXX 	 a+66<<>>FFAFFHHahhjj999rX   r  r  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                d   g }g }|du}|D ]}| |         }t          |t          j                  sJ dt          |                       |                                t
          z  rHt          |          | |<   |r4||v r0|                    |           |                    | |                    ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r   rQ   r  r  data_ptr	ALIGNMENTr  r  )r  r  r  r  r  ret_pair_definedr   _inps           rV   r  r    s     ')K&(K (t3 
2 
2!}$-- 	
 	
;tDzz;;	
 	
 	
 ==??Y& 	22488JqM 2A)9$9$9""4((("":a=111##rX   static_input_idxsc                   g }|D ]Y}| |         }t          |t          j                  r5|                                t          z  dk    r|                    |           Zt          |          t          |          k    r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r   rQ   r  r  r  r  rZ   )r  r  aligned_static_input_idxsrL  r  s        rV   remove_unaligned_input_idxsr    s     !#  2 2seU\** 	20@0@90LQR/R/R%,,S111
$%%->)?)???((rX   r   c                   ddl m} t          j        t          j                  j        }|j        j        j        }|j        j        j	        j
        }t          j        r"|j        j                            | |           dS |j        j                            | |k              rdS |j        r%|j        j                            | dk               rdS  ||           o ||           |k    S )Nr5   r~  Tg@xDF)r  r  rQ   iinfora  r   r  r  r  r  has_hintro   assume_32bit_indexing	check_leqr  aot_compilation)r   r  int_maxr  r  s        rV   expr_fits_within_32bitr    s    k%+&&*G *Iw)2H# 	""1g...t 	w--a7l;; t 	  711!d(;; 	 5 8A;;299Q<<722rX   compiled_graphrC   c                   t           j        j                                        }||j        t          |j                  dk    sJ t          |           |j        J |j        D ]}||j                            d            dt           j        j                                        x}r|j        d	fd|j                            t          fd|D                                  d S d S d S )
Nr   Fr   r   rK   ,Union[float, int, SymInt, SymFloat, SymBool]c                ~    t          |           S r                    |           S                     |           S r   )ry   deserialize_symexprevaluate_symexpr)r   fakify_first_callr  s    rV   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  sE     ("1vv( @(<<Q???$55a888rX   c              3  .   K   | ]} |          V  d S r   rO   )rT   r   r  s     rV   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s+      55!((1++555555rX   )r   r   rK   r  )
rQ   r  r  r  output_stridesrZ   r  r  r  r  )r  r  rz  rq  r  r  r  r  s        @@@rV   "set_tracing_context_output_stridesr    s>    m*2244Gw5A7)**a////).99	,888#2 	 	E}&--d3333$)!-6>>@@@3 >(+(=%9 9 9 9 9 9 9 &--5555u55555   ' AA	 	rX   c                    t           j        t           j        S t          j                    sdS t          j                                        rdS 	 ddlm}  n# t          $ r Y dS w xY w| t          j        	                    d          k    S )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ro   fx_graph_remote_cache	is_fbcoderQ   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rV    should_use_remote_fx_graph_cacher     s    #/++ u,,.. uHHHHHHH   uu  5#8#M#M8$ $  s   A 
A$#A$c                .    t          j        dd|           S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rV   normalize_namer  3  s    6"C...rX   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                    i | ]\  }}||	S rO   rO   )rT   r  r}   s      rV   r9  r9  C  s    GGG$!QAGGGrX   z^.*[.]c                    t                               dt          |                     }t                              ||          S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  rL   _triton_type_mappingrz  )r   triton_type_names     rV   triton_typer  I  s6    &**5#e**==##$46FGGGrX   c                    t                               | |           }|                    dd          }t          t          |          }t          |t          j                  sJ |S )Nr  r(  )_torch_triton_mappingrz  r  rP   rQ   r   r   )r   adjusted_type	type_namer  s       rV   triton_type_to_torchr  O  sY    )--eU;;M%%eR00Iy))Ii-----rX   r[  r   c                   | j          o|                                 |                                k    o|                                 |                                k    o| j        |j        k    o| j        |j        k    ow|                                                                 |                                                                k    o)|                                 |                                k    S r   )	is_mkldnnr  rj  r   r   untyped_storager  r  r[  r   s     rV   is_same_tensorr  W  s    N 	<IIKK5::<<'	<KKMMU\\^^+	< J%+%	< K5<'		<
   ""++--1F1F1H1H1Q1Q1S1SS	< !!U%9%9%;%;;rX   c                8   | j         o|                                 |                                k    oi| j        |j        k    oY| j        |j        k    oIt          j        j                            |           t          j        j                            |          k    S r   )r  r  r   r   rQ   r  mkldnnr  r  s     rV   is_same_mkldnn_tensorr  c  s     	PIIKK5::<<'	PJ%+%	P K5<'	P I%%d++uy/?/H/H/O/OOrX   tuple[str, ...]c                     dS )N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner)  xorrO   rO   rX   rV   boolean_opsr  m  s     rX   c                  $    e Zd ZU ded<   ded<   dS )OpDtypeRuler2   type_promotion_kindrE  override_return_dtypeNr{  rO   rX   rV   r  r    s*         8888000000rX   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr  r2   r  c                6    t          ||          t          | <   d S r   )r  r  )r   r  r  s      rV   #register_op_dtype_propagation_rulesr    s%    
 (32( (t$$$rX   zOrderedSet[str]op_requires_libdevice_fp64c                :    t                               |            d S r   )r  r~  r  s    rV   #register_op_requires_libdevice_fp64r    s    ""4(((((rX   r   c                    ddl m} | s|j                                        j        } | dk    rt
          j        S | dk    rdS | dk    rt
          j        S t
          j        S )Nr   r~  r  rG   rH   )	r  r  r  get_current_device_or_throwr  ro   cpu_backendxpu_backendcuda_backend)r   r  s     rV   get_current_backendr!    st    ------ Ag99;;@e!!			u			!!""rX   c                    | t           j        t           j        fv r/t          j        j        rt                      dk    rt           j        S | S )z"Maybe upcast [b]float16 to float32r;  )rQ   r   rW  ro   r;  codegen_upcast_to_fp32r!  rY  ru  s    rV   upcast_compute_typer$    sC     	%-000M0 	1!!X--}LrX   KeyTypeValTypec                  \    e Zd ZdZddZdd	ZddZddZdddZd dZ	d!dZ
d"dZd#dZdS )$
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                "    || _         i | _        d S r   r)  	new_items)r  r)  s     rV   r  zScopedDict.__init__  s    *13rX   rD  r%  rK   r&  c                H    || j         v r| j         |         S | j        |         S r   r-  r)  r  s     rV   r\  zScopedDict.__getitem__  s*    $.  >#&&!#&&rX   r   r  c                    || j         |<   d S r   )r-  )r  rD  r   s      rV   __setitem__zScopedDict.__setitem__  s    #srX   r  r   c                &    || j         v p|| j        v S r   r/  r  s     rV   __contains__zScopedDict.__contains__  s    dn$At/A(AArX   Nr  Optional[ValType]c                d    || j         v r| j         |         S | j                            ||          S r   )r-  r)  rz  )r  rD  r  s      rV   rz  zScopedDict.get  s5    $.  >#&&!%%c7333rX   ry   c                ^    t          | j                  }| j        D ]}|| j        vr|dz  }|S r  )rZ   r)  r-  )r  rI  r  s      rV   r  zScopedDict.__len__  s@    "## 	 	A***QrX   Iterator[KeyType]c              #  R   K   | j         E d {V  | j        D ]}|| j         vr|V  d S r   r,  )r  r  s     rV   __iter__zScopedDict.__iter__  sT      %%%%%%%% 	 	A***	 	rX   c                8    t          | j        p| j                  S r   )r   r)  r-  r  s    rV   r  zScopedDict.__bool__  s    D&8$.999rX   c                    t           r   r  r  s     rV   __delitem__zScopedDict.__delitem__  s    !!rX   )r)  r*  )rD  r%  rK   r&  )rD  r%  r   r&  rK   r  )rD  r  rK   r   r   )rD  r%  r  r4  rK   r4  r  )rK   r7  r  )rD  r%  rK   r  )r   r   r   r   r  r\  r1  r3  rz  r  r9  r  r<  rO   rX   rV   r(  r(    s         4 4 4 4' ' ' '
$ $ $ $B B B B4 4 4 4 4
      : : : :" " " " " "rX   r(  )frozen_defaultr   Optional[type[Any]]r   c              .    dfd}| |S  ||           S )Nr   rs   rK   c                2    t          j        | d          S )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rV   wrapzir_dataclass.<locals>.wrap  s    $S$vFFFFrX   )r   rs   rK   rs   rO   )r   r   rD  s    ` rV   ir_dataclassrE    sA    G G G G G G {499rX   Optional[list[int]]c                 v    t           j        j                                        } | | j        r| j        j        S d S r   )rQ   r  r  r  fw_metadatabw_donated_idxs)r  s    rV   get_donated_idxsrJ    s7    m2::<<O"'B"*::4rX   c                  "    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r5   r  r  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrO   rX   rV   rL  rL    s,        LKK	  GGGrX   rL  c                    t           j                            d          t          j        S dd l} dd l} t          | j        j	        d          rt          j
        S t          | j	        j	        d          rt          j        S t          j        S )Nr;  r   AttrsDescriptor)r  r  r  rL  rM  triton.backends.compilertriton.compiler.compilerr  r  compilerrO  rN  rQ  )r;  s    rV   #get_triton_attrs_descriptor_versionrW    s    ~))1+88########v'):;; 4 ,77	)+<	=	= 4+77 ,33rX   c                 :    t                      t          j        k    S r   )rW  rL  rQ  rO   rX   rV   triton_version_uses_attrs_dictrY    s    .004P4XXXrX   r  torch.fx.Nodec                   ddl m} | j        }t          |t          j        j                  sdS |t          j        j        j	        j
        t          j        j        j        j
        t          j        j        j        j
        fv r\ ||| j        | j        d          }|A|\  }}|d         }|D ]1}|-|j        d         j        t          j        t          j        fv r dS 2dS )	a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsNindicesr  )torch.fx.operator_schemasr\  r  r   rQ   r  r  r  ru  	index_putr  
index_put__unsafe_index_putr   r  r  r   r   rh  )r  r\  r  
normalizedr   r  r^  rL  s           rV   ,_fx_node_is_input_dependent_cudagraph_unsaferd  "  s     =<<<<<^Ffej344 u 	 (	!)	(0  
 ('GL'.t
 
 

 !"IAvY'G    ?sx'<JKA ( (  445rX   c                   | j         }t          |          t          v rdS t          |t          j        j                  rt          j        j        j	        |j
        v rdS t          |           rdS | j                            d          x}Jt          |t          t          f          s|gn|}|D ]&}t          |t          j                  r
|j        r dS 'dS )a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    Tr  NF)r  rL   FORBIDDEN_CUDAGRAPH_OPSr   rQ   r  r  r  r  cudagraph_unsafer  rd  r  rz  rp  r  r  	is_sparse)r  r  r  valsr}   s        rV   r  r  F  s     ^F 6{{---t 	65:011HL)V[88t 4G<< t |&&&3&sT5M::Cuu 	 	A!U\** q{ tt5rX   r@   c                    ddl m} t          | |j        |j        f          rdS t          | |j        |j        f          sdS t          | dd          }|t          |          rdS dS )ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r5   r#  TFr  N)	r(  r$  r   Conditional	WhileLoopr  r=   rP   r  )r  r$  r  s      rV   is_cudagraph_unsafe_oprm  k  s      $677 tdR.@AA udIt,,G:7CCt5rX   c                    t           j                            dd          } t          j                    rXddlm}  |            }|rFt           j                            |dd          }| r!t           j	                            || g          n|} | S )NLD_LIBRARY_PATHr(  r   )get_runtime_pathr  lib)
r>  rN  rz  ro   r  libfb.py.parutilrp  r?  r?  pathsep)r?  rp  runtime_pathlib_paths       rV   get_ld_library_pathrv    s    :>>+R00D K555555'')) 	Kw||L)UCCH8<J2:??Hd#3444(DKrX   c                @    ddl m} t          | |          o| j        d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperrx  r   partition_signatures)r  rx  s     rV   #is_codegen_graph_partition_subgraphr{    s9    LLLLLL 	7899 	5(4rX   c                     t           j        j        j        j        pt
          j        d uot           j        j        j        S r   )rQ   r  ro   r;  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionrO   rX   rV   is_using_cudagraph_partitionr    s8    %0 	F19E1 /
 
01rX   c                    ddl m} |j        j                            | d          r,|j        j                            | d          rt          j        S t          j        S )Nr5   r~  l        i   )	r  r  r  r  statically_known_ltrs  rQ   ra  rc  )r  r  s     rV   dtype_from_sizer    se    w++e  
'

/
/h
?
? {{rX   )r  rH   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r  rH   TF)rQ   r  r  _is_mkldnn_bf16_supportedr   s    rV   is_mkldnn_bf16_supportedr    ;     ey99;;;	+		t5rX   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN FP16.
    r  rH   TF)rQ   r  r  _is_mkldnn_fp16_supportedr  s    rV   is_mkldnn_fp16_supportedr    r  rX   elementsSequence[Sequence[T]]headersSequence[T]c           
        d |D             }| D ]l}t          |          t          |          k    sJ t          |          D ]8\  }}t          ||         t          t          |                              ||<   9mg }|                    d                    d t          ||          D                                  t          |          t          |          dz  z   t          |          dz
  z   }|                    d|z             | D ]B}|                    d                    d t          ||          D                                  Cd                    |          S )	Nc                F    g | ]}t          t          |                    S rO   )rZ   rL   )rT   r   s     rV   rW   ztabulate_2d.<locals>.<listcomp>  s$    +++ac#a&&kk+++rX   |c              3  ,   K   | ]\  }}d || d V  dS rS  NrO   )rT   hr  s      rV   r   ztabulate_2d.<locals>.<genexpr>  s2      HH41a,a,,,,HHHHHHrX   r  r5   r  c              3  ,   K   | ]\  }}d || d V  dS r  rO   )rT   r   r  s      rV   r   ztabulate_2d.<locals>.<genexpr>  s2      HHtq!l!QllllHHHHHHrX   ru  )rZ   r  r   rL   r  r?  r   r  )r  r  widthsrowr   r   r  total_widths           rV   tabulate_2dr    s]   ++7+++F 4 43xx3w<<''''cNN 	4 	4DAqF1Is3q66{{33F1II	4E	LLHH3w3G3GHHHHHIIIf++Vq1S[[1_EK	LL{"### J JSXXHHs37G7GHHHHHIIII99UrX   dict1r*  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c              #    K   t          |                                           t          |                                          z  }|D ];}|                     |          }|                    |          }|||n|||n|fV  <dS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r"   r|  rz  )r  r  r  r  all_keysrD  value1value2s           rV   	zip_dictsr    s      ( %**,,''*UZZ\\*B*BBH  	
 	
33 (FFj(FFj
 	
 	
 	
 	
	
 	
rX   config_patchesc                (   dd	}dd
}|                      dt          j        j                  }|                                 } |rm || dd            || dd            || dt
          j        j                     || dd            || dt          j        j	                    || dd           |                      dt          j
        j                  }|                      dt          j
        j                  }|dk    r|rt          d          | S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    r  r  config_namerL   config_valuer   rK   r  c                    |                      |t          t          |                    }||| |<   d S |s||k    rt          d| d| d          d S d S )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)rz  rP   ro   r  r  r  r  r   s       rV   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s     "";0L0LMM=*6N;''' 	5L00r;rrrrr  	 	00rX   c                    |                      |t          t          |                    }||k    rt                              d||           || |<   d S )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)rz  rP   ro   r   rE  r  s       rV   force_patch_configz8maybe_aoti_standalone_config.<locals>.force_patch_config  s_     "";0L0LMML  KKV  
 '3{###rX   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  r  r  rL   r  r   rK   r  )rz  ro   aot_inductor_modecompile_standalonecopyrQ   r  r  test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor  )r  r  r  r  r  r  s         rV   maybe_aoti_standalone_configr    s   "	 	 	 	
3 
3 
3 
3 (++. 3 
 $((**N R^%DdKKK^%GNNNAu}GXCX	
 	
 	
 	I<	
 	
 	
 	(,	
 	
 	

 	>+I5QQQ*..,1 
 -00.3 
 	)).E)]
 
 	

 rX   consts_sizetuple[bool, bool]c                d   t           j        j        r$t           j        j        dk    rt	          d          t           j        j        r,t           j        j        dk    rt	          d          d}d}||fS t           j        j        dk    rd}d}||fS | dk    rdS d}t          j                     }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ro   r  force_mmap_weights package_constants_on_disk_formatr  r  r  )r  use_mmap_weightsuse_external_weightss      rV   determine_aoti_mmap_flagsr  P  s     	.
@MQQJ
 
 	

 - 64	AA]    $#%555;}LL# #%555m##| !+---!111rX   c                     ddl m}  | j        j        }|dS t	          |t
                    st          d          |dk    rdS t          j        d|          st          d          dS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rn   NTz4Invalid AOTI model name: Model name must be a stringr(  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	rw  ro   r  model_name_for_generated_filesr   rL   r  r   r   )ro   
model_names     rV   is_valid_aoti_model_namer  y  s    
 '&&&&&$CJtj#&& QOPPPRt 8/<< 
d
 
 	
 4rX   r*   unbacked_onlyOrderedSet[sympy.Symbol]c                B    |rt          |           S t          |           S r   )r)   r(   )rU   r  s     rV   get_free_symbolsr    s$     $Q'''ArX   cudagraph partition due to Optional[BaseSchedulerNode]c                    t           j        j        sdS | |  }|rC|j        x}r:|                                x}r$|j                            dd          x}r| d| }t                              |           dS )z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	ro   r;  r}  r  get_origin_noder  rz  perf_hint_logrE  )r  r  r  warning_msgir_noder  r  s          rV   maybe_log_cudagraph_partitionr    s     =# "S""K 	F	!WF  //111WF $L,,]DAAA[	F %EEEE+&&&&&rX   dict[str, str]c                    i t           j        dt           j                            dt           j                            t
          j                            i} t          j                    rt          j
        d          | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr[  
PYTHONHOME)r>  rN  rz  rs  r?  r  r?  ro   r  	sysconfigget_path)envs    rV   python_subprocess_envr    sv    

* 	bjnn%rzsx'@'@
 
	 C   7%.v66LJrX   c                  (    e Zd ZU dZded<   ded<   dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    ry   num_partitionspartition_indexNr   rO   rX   rV   r  r    s8            rX   r  .c                      e Zd ZU dZded<   dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   rO   rX   rV   r  r    s#         .2G222222rX   r  CUDAGraphWrapperTypec                    | t           _        d S r   )r~  r  )r  s    rV   !set_customized_partition_wrappersr    s    5<*222rX    tuple[list[Any], dict[str, Any]]c                \   | j         j        }| j                             g || j         j        | j         j                  }| j         j        }t          j        ||f          \  }}ddfd|D             }dddfd	fd
|D             }t          j        ||          \  }}||fS )NrK   r   c                    t          | t          j        j        j                  o$t          | t          j        j        j                   S r   )r   rQ   r  r$  r>   GeneratorStater   s    rV   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  sA    !U_/677 

u!0A
 A
 =
 	
rX   c                v    g | ]5} |          r&t           j        j                            |d           n|6S )F)guard_shape)rQ   r  r$  ir_node_to_tensor)rT   r1  r  s     rV   rW   z%snode_args_kwargs.<locals>.<listcomp>
  sZ         =	,,QE,BBB  rX   r  c                0    t          j        | ||          S )Nr   )rQ   r   )r  r   r   s      rV   _tensorz"snode_args_kwargs.<locals>._tensor  s    {4uV<<<<rX   r   r   c                    t          | t          j                  s| S  |                                 | j        | j                  }|S r   )r   rQ   r  r  r   r   )r   r  r  s     rV   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor  s@    !U\** 	Hgaffhh22
rX   c                &    g | ]} |          S rO   rO   )rT   r1  r  s     rV   rW   z%snode_args_kwargs.<locals>.<listcomp>  s#    666q""666rX   r  )rK   r  )r   r   rK   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer#   tree_unflatten)r  r   r  	flat_argsflat_args_pytree_specr  r  r  s        @@@rV   snode_args_kwargsr    s   :D:,,*$*)*
 D ZF'-':D&>'J'J$I$
 
 
 

    	  I= = = =      7666I666I(4IJJLD&<rX   r<  r9   c                    ddl m} | j        }|j        j        r"|                    |j        j        dz             }|                    d          S )Nr5   r~  r   )primals_r  fwd_rng_staterR  rO  )r  r  r   r  removeprefixr  )r<  r  dep_names      rV   is_nonfreeable_buffersr     s`    xH 	w| =(();<<I  rX   template_dirr%   c                    t          ||  dz            5 }|                                cddd           S # 1 swxY w Y   dS )z,Load a template file and return its content.z	.py.jinjaN)openread)r   r  rA  s      rV   load_templater  -  s    	l////	0	0 Avvxx                 s   7;;c                    | j         }t          |t          j        j        t          j        j        f          sJ dt          |                       t          j        sdS t          t          j
        j        j        j        t          j
        j        j        j        g          }||v rdS t          t          j
        j        j        g          }t          |t          j        j                  r||v S t#          |            S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r   rQ   r  r  r2  r  ro   fallback_by_defaultr"   r  ru  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr    )r  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rV   should_fallback_by_defaultr  3  s   [F&
(FG  O ONVNNO O O % u *4IN)1IN*2	
* *& 333u 			@A M &%*899 '&&&t,,,,rX   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    | t           v S )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rV   is_collective_opr  h  s    n$$rX   r  )rx   ry   rK   ry   )r}   r~   rK   r   )r   r   )r   r   r   ry   r   ry   rK   r   )r   r   F)
r   r   r   ry   r   ry   r   r   rK   r   r  )r   r  rK   r  )r!  r"  rK   r~   )r,  r-  r.  r-  rK   r~   )r!  r5  rK   r6  )r<  r=  r>  r=  rK   r=  )rD  rE  rK   rL   )rt  ru  rK   rv  )r   r=  rK   r|  )rt  r  rK   r  )r  r  rK   r   )r  r4   r  r  rK   r   )r  r   r   r  r  r  rK   r  )rF   )r   rL   rK   r  )r5   rF   )
r  r  r  r  r   ry   r   rL   rK   r   )rO   r  r  r  rF   )r  r  r  r  r   ry   r  ry   r  r   r   rL   rK   r   )r  r   r  rL   rK   r  )r  r   r  r   rK   r  )r1  ry   r2  ry   rK   ry   )rU   r  r  ry   rK   r  )rU   r  rK   r  )r   r  rK   r  )r   r  rK   r  )r  rL   rK   r  )r  r   rK   r!  )r  r+  r,  r-  rK   rL   )r  r   r  r8   rK   rE  r   )r  r  r  r  rK   r  )r   r  r  r  rK   r  r  )r  r~   rK   r  )r  rL   rK   r   )r  rk   rL  ry   rK   r  )r  r   rK   r   )r   rL   rK   r  )r  r~   r  r  rK   r~   )r1  r   rK   r  )r   r   rK   r   )r  r  rK   r  )r  r  rK   r4   )r  r  rK   r  r  )r  r   rK   r   )NNT)r2  r3  r4  r5  r6  r   rK   r7  )rX  r  rW  r   rK   rY  )r  r+   rX  ra  rW  r   rK   rY  )r   rs  rK   ry   r  r  )r
  r  rK   r   r  )r$  ry   r   r  r%  r&  rK   r6   )r_  r?   r0  r1  rK   r   )r5  rL   rK   r   )
r_  r?   rC  r   rD  r   rE  r   rK   r   )rS  r>   rQ  rT  rR  r   rK   r   )rS  r>   rQ  r?   rR  r   rK   r   )r  r   r  r   r_  r?   r  r   r  r   r  r  r  r  r  r  rK   r   )
r_  r?   r&  ry   rI  ry   r  ry   rK   r   )r  rL   rK   r   r   )
r&  r  rI  r  r  r  r  ry   rK   r   )r&  r  rI  r  r  r  rK   r   )r&  r  rI  r  r  r  rK   rY  )r   rL   rK   rL   )rK   r  )r_  r?   rK   r   )r_  r?   r  r  r  r>   rK   r   )FTFN)r_  r?   r  r>   r  r>   r  r   r  r   r  r   r  r&  rK   r   )r   r>  r   r  r  r  rK   r?  )r   r  rK   rN  )r   r>  r   r  r  r  rK   r   )r   r>  r   r  r  r  rK   rL   )r   r>  r   r  r  r  rK   rj  )rs  r  rt  r  rK   r7  )r|  r  r}  r~  rK   r   )r  rL   rK   r  )rK   r5  )r}  r  rK   r   )r  r  rK   r   )r  r~   rK   rs  )r  r   r   r   r  r   rK   r  )r   rs  rK   r   )r  rL   rK   r   )r  rL   rK   ry   )r  r  rK   r   )
r  r  r  r3   r  r  r  rL   rK   r  )r  r  rK   r   )r  r  r  r  rK   r   )r  r  rK   r   )r  rD   r  r  rK   r   )r  rD   rK   r   )r  r%  r  r&  rK   r   )r)  rL   r*  r  r+  r  rK   r   )r  rD   r2  r3  r*  r4  r+  r5  r6  r7  rK   r  )rE  ry   rF  ry   rK   ry   )rL  r  rK   ry   )r]  r   rK   r7  )r   rL   rK   rL   )r   r5  rK   r   )r   rL   rK   r   )r   rs  rK   r   )r  r  r  r5  ro  rs  rp  rs  rq  rL   rr  r   rK   r   )r  r+  rK   r  )r   r  rK   r   )r  r  rK   r   )rK   r  )r   r>  r   r  r  r  rK   r  )r  r  rK   r  )r  r  r  r  r  r  rK   r  )rU   r  rK   r  )r  r  r  r  r  r  rK   r  )r  r  r  r  rK   r  )r   r~   rK   r   )r  r  r  rC   rK   r  )r   rs  rK   rL   )r   rL   rK   rs  )r[  r  r   r  rK   r   )rK   r  )r   rL   r  r2   r  rE  rK   r  )r   rL   rK   r  )r   r5  rK   rL   )r   rs  rK   rs  )r   r>  r   r   rK   r   )rK   rF  )rK   rL  )r  rZ  rK   r   )r  r@   rK   r   )r  r8   rK   r   )r  ry   rK   rs  )r   rL   rK   r   )r  r  r  r  rK   rL   )NN)
r  r*  r  r*  r  r  r  r  rK   r  )r  r  rK   r  )r  ry   rK   r  )rU   r*   r  r   rK   r  )r  N)r  rL   r  r5  r  r  rK   r  )rK   r  )r  r  rK   r  )r  rD   rK   r  )r<  r9   rK   r   )r   rL   r  r%   rK   rL   )r  rZ  rK   r   (  
__future__r   rv  r  rB  enumr$  r  r  r  r  r  r  r&  r>  r  r   rR  r   r  r  rJ  r  r  r  collections.abcr   r   r   r   r   r	   r
   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   rQ   torch.utils._pytreer!  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   !torch.fx.passes.regional_inductorr    torch.utils._dtype_abbrsr!   torch.utils._ordered_setr"   r#   r$   pathlibr%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   torch._prims_commonr2   torch.fxr3   torch.fx.noder4   r+  r6   re  r8   dependenciesr9   r  r;   r$  r<   r=   r>   r?   r@   rA   output_coderC   r  rD   rE   rY   rJ   r   r^   torch._dynamo.device_interfacer_   torch._dynamo.utilsr`   torch.autogradra   torch.autograd.profiler_utilrb   (torch.fx.passes.graph_transform_observerrc   torch.fx.passes.shape_proprd   torch.utils._sympy.functionsre   rf   rg   rh   ri   torch.utils._sympy.symbolrj   rk   torch.utils._sympy.value_rangesrl   rm   r(  ro   runtime.runtime_utilsrp   rC  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerr  rs   rM  rA  	VarRangesr  ry   	InputTypeGPU_KERNEL_BIN_EXTSr  r  rZ  r-  r{   r|   r   Functionr   rC  r   r   r   r   r  r   r+  r4  r;  rs  r{  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r*  rD  r  r  r  r  r  r  r  r  r  r  r  r  	frozensetrf  r  r  r  r#  r(  r)  r   r/  r1  r  rV  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher`  rr  	lru_cacherw  ry  r}  r  r  r  r  r  r  r  r  r  r#  r/  r4  r=  rB  rO  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r0  r2  rH  rM  rR  rd  rg  ri  rr  r{  r  r  r  r  r  r  r  r  r+  r  r  r  r  r  r  r  r  rQ  r(  r  Enumr  r  r  r  r  r  r   r$  r(  r.  r;  rB  rK  rV  rX  rf  ri  r3  rl  rn  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r}  r  compiler  r  r  r  r  r  r  r  r  r  r  r!  r$  r%  r&  r(  rE  rJ  rL  rW  rY  rd  r  rm  rv  r{  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  r  PartitionFnTyper  r  r~  r  r  r   r  r  r  r  rO   rX   rV   <module>rA     sg   " " " " " " "                       				        				  				      



                                                                   C B B B B B B B B B         $ $ $ $ $ $ $ $ $ ? ? ? ? ? ? : : : : : : E E E E E E 0 0 0 0 0 0 / / / / / / ; ; ; ; ; ; ; ;   ($ 
             >>>>>>>>>>>//////////CCCCCC$$$$$$"""""",,,,,,555555!!!!!!$$$$$$TTTTTTTTTTTTTTTT,,,,,,======== +**	GCLL
     D C C C C C 0 0 0 0 0 0 % % % % % % 2 2 2 2 2 2 K K K K K K 0 0 0 0 0 0              8 7 7 7 7 7 7 7 D D D D D D D D       = = = = = = lg%g!!00<HH WT]]UZ'(	U5<el:;<	'77 	 {Q'A--+2B2B2BDX2B2B2B5 5 5 5
L L L L    EN    d###       $#"G G G G GX #(	    4 #(	[ [ [ [ [|       ; ; ; ;@ @ @ @
+ + + +* * * *#A #A #A #AL+ + + +   "/ / / /G G G G @OI I I I I0   0' ' ' ' ' 	    ( %'     ) ) ) )' ' ' '# # # #   $ $ $ $  IcNNWTT"""
;sAv&*
+E E E E E8WQU^ E E E   :   + + + +\   ,4) 4) 4) 4)nW2 W2 W2 W2x 48    (G G G G:, , , ,^% % % %   	D 	D 	D 	DU U U U	> 	> 	> 	>   2   - - - - $)    $      ' ' ' '& 
			   < !#  " " " "	 	 	 	    .29 9 9 9 9z !5 $ "  49      ( 	$ $ $ $ $ $N Q7 7 7 7    *    , , , , , , , ,
S' S' S' S' S' S' S' S'l
 
 
 
 
 
 
 
 @ @ @ @       @? ? ? ? ?' ? ? ?2 2 2 2 2% 2 2 2     8 J J J J ) ) ) )I I I I #'    (           #     < :>RWp p p p p ph BG      BGV V V V V V  Q	 	 	  	> > > >B   BJ J J J CO, , , , , EF    *    . 5( 5( 5( 5(p @ @ @ @ R R R R:" " " "J      H H H H& & & &   : ""&"&= = = = =@' ' ' 'C C C C C C C C"& & & &$   / / / /( ( ( (V   	 	 	 	# # # #* . . . .$ IMF F F F F*	 	 	 	   B& & & && & & &              D D D D (6 (6 (6 (6V    Q Q Q Q. . . .0 0 0 0   ( ( ( (# # # #K K K K   (* * * * *$) * * * 
  
  
  
F      " -1# # # # #L( ( ( ( @D	Q 	Q 	Q 	Q 	Q# # # #J J J JL L L L *=)<
 
 
 
 
< *=)<    :T T T T       2            ,! ! ! !   . . . .2 2 2 2$ $ $ $NH H H H>   L L L L' ' ' '    2   :   0	: 	: 	: 	: 37$ $ $ $ $<   $$3 $3 $3 $3N   :   &/ / / / '#)* $%
 
  HG*>*D*D*F*FGGG  "*Y''H H H H   	 	 	 	       & 1 1 1 1 1 1 1 1
 68  7 7 7 7    /9jll  : : : :) ) ) )# # # # #    ')


')

-" -" -" -" -" 01 -" -" -"` D)))t      *)       49    4 4 4 42Y Y Y Y! ! ! !H" " " "J   4
 
 
 
   1 1 1 1    * 	 	 	 		 	 	 	   & "&!%	 
  
  
  
  
FR R R Rj&2 &2 &2 &2R   6    :(,' ' ' ' '2   6 d###       $# 38$./@ 3 3 3 3 3 3 3 3 *:)9);); &= = = =       F
 
 
 
   "- "- "- "-L 
 
 
 % % % % % %rX   