
    [iT                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ e
rd dlmZ d d	fd
efdZ e j        ed d          Z e j        ed d	          Ze G d d                      Ze G d d                      Z G d d          Z G d d          Zd#dZd d dfdZd Zd$dZe G d d                      Ze G d  d!                      Zd" Z dS )%    N)deque)	dataclass)AnyLiteralTYPE_CHECKINGprofile)
DeviceType)_KinetoEventc                     | j         S N)childrenxs    H/var/www/icac/venv/lib/python3.11/site-packages/torch/profiler/_utils.py<lambda>r      s    1:     Freversec              #      K   |rt           nd }t           ||                     }|r? ||          }|V   | ||                    D ]}|                    |           |=d S d S )Nc                     | S r    r   s    r   r   z_traverse.<locals>.<lambda>   s    q r   )reversedr   append)treenext_fnchildren_fnr   order	remaining
curr_eventchild_events           r   	_traverser!      s      0HH[[EeeDkk""I
 *WY''
 5Z!8!899 	* 	*K[))))	  * * * * *r   c                 *    |                                  S r   )popr   s    r   r   r      s    aeegg r   T)r   r   c                 *    |                                  S r   )popleftr   s    r   r   r      s     r   c                   ^    e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   e	d             Z
dS )EventMetricsr   duration_time_nsself_time_nsidle_time_nsqueue_depthc                 :    | j         dk    rdS | j        | j         z  S )Nr   g        )r(   r*   selfs    r   fraction_idle_timezEventMetrics.fraction_idle_time(   s&     A%%3 4#888r   N)__name__
__module____qualname__r(   int__annotations__r)   r*   r+   propertyr/   r   r   r   r'   r'   !   sp         cL#L#K9 9 X9 9 9r   r'   c                   2    e Zd ZU eed<   eed<   dZeed<   dS )Intervalstartendr   r+   N)r0   r1   r2   r3   r4   r+   r   r   r   r7   r7   /   s4         JJJ	HHHKr   r7   c                   F    e Zd Zd	dZd Zd ZdefdZdee	         fdZ
dS )
EventKeyreturnNc                     || _         d S r   event)r.   r?   s     r   __init__zEventKey.__init__7   s    


r   c                 4    t          | j        j                  S r   )hashr?   idr-   s    r   __hash__zEventKey.__hash__:   s    DJM"""r   c                 6    | j         j        |j         j        k    S r   )r?   rC   )r.   others     r   __eq__zEventKey.__eq__=   s    z}..r   c                     | j         j         S r   )r?   namer-   s    r   __repr__zEventKey.__repr__@   s    */##r   	intervalsc                 b   d}t          |d           }|rXt          | j        j        |d         j                  }t          | j        j        |d         j                  }||k     r|||z
  z  }d\  }}|t          |          k     r||         }||         }|dz  }|j        |j        k    r$|j        |j        k    r|dz  }N|j        |_        |}t          | j        j        |j                  }t          | j        j        |j                  }||k     r|||z
  z  }|t          |          k     |S )Nr   c                     | j         S r   r8   r   s    r   r   z,EventKey.intervals_overlap.<locals>.<lambda>E   s    AG r   key)r      rQ   )	sortedmaxr?   start_time_nsr8   minend_time_nsr9   len)	r.   rK   overlap_timeoverlap_startoverlap_endijprev_intervalcurr_intervals	            r   intervals_overlapzEventKey.intervals_overlapC   sK   9*;*;<<<	 	<
 8)A,:LMMMdj4il6FGGK{**m ;;1#i..  %aLM%aLMFA =#666 $}'888FA*7*;M'A
 8-:MNNMdj4m6GHHK{**m ;;! #i..  $ r   r<   N)r0   r1   r2   r@   rD   rG   strrJ   listr7   r_   r   r   r   r;   r;   6   s           # # #/ / /$# $ $ $ $4>      r   r;   c                   L    e Zd ZdeddfdZddZd ZddZd Zdde	de
fdZdS )BasicEvaluationprofr<   Nc                 :   || _         i | _        |                                  t          | j                                        d           | _        d | j        D             | _        g | _        |                                 | _	        | 
                                 d S )Nc                     | j         j        S r   )r?   rT   r   s    r   r   z*BasicEvaluation.__init__.<locals>.<lambda>j   s    qw/D r   rO   c                     g | ]	}|j         
S r   r>   .0es     r   
<listcomp>z,BasicEvaluation.__init__.<locals>.<listcomp>l   s    8881qw888r   )r	   metricscompute_self_timerR   keys
event_keyseventscuda_eventscompute_queue_depthqueue_depth_listcompute_idle_time)r.   re   s     r   r@   zBasicEvaluation.__init__e   s    57    L%D%D
 
 
 98888/1 $ 8 8 : :     r   c                    | j         j        J t          | j         j                                                  }|r|                                }|j        }|j        D ]!}||j        z  }|                    |           "t          |          | j	        vsJ d|j
         d|j                     t          |          | j	        t          |          <   |j        | j	        t          |                   _        |dS dS )zM
        Computes event's self time(total time - time in child ops).
        NzDuplicate id: z, )r)   )r	   kineto_resultsr   experimental_event_treer#   r(   r   r   r;   rm   rC   rI   r'   )r.   stackr   	self_timer    s        r   rn   z!BasicEvaluation.compute_self_timeq   s!    |*666dl1IIKKLL  	=J"3I)2 * *[99	[))))J''t|;;;CCC*/CC <;; 2>91U1U1UDL*--. ",!< L$$  	= 	= 	= 	= 	=r   c                    | j         j        J | j         j                                        }d d t          fd|D             d           }t          fd|D             d           }t          ||z   d	           | _        i }d
}|D ]"t          |fd|          }||<   ||n|}#d
}d}||z   | j        z   }	d }
g }|	                    |
           |	D ]}t          |d          rW|                                dz  }|                                |	                                z   dz  }||v r||         ||         }t          |d          rR|
                                }|
                                |                                z   }||v r||         ||         }nt          |d          r|j        }|j        }|t          |          k     rT||         
                                |k    r6|dz  }|t          |          k     r||         
                                |k    6||z
  dz   }t          |d
          }t          |d          st          |d          r&|                    t#          |||                     t          |d          r|| j        t'          |                   _        |S )z
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        Nc                 ~    h d}t          t          | d|                     t          fd|D                       S )z+Check if the event is a CUDA launch kernel.>   cudaLaunchKernel__cudaLaunchKernelcudaLaunchKernelExCcudaLaunchCooperativeKernel&cudaLaunchCooperativeKernelMultiDevicerI   c              3   B   K   | ]}                     |          V  d S r   )
startswithrj   patternrI   s     r   	<genexpr>zUBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel.<locals>.<genexpr>   s/      OOGtw//OOOOOOr   )ra   getattrany)rk   launch_patternsrI   s     @r   is_cuda_launch_kernelzBBasicEvaluation.compute_queue_depth.<locals>.is_cuda_launch_kernel   sR      O wq&!,,--DOOOOOOOOOOr   c                     |                                  t          j        k    rdS t          t	          | d|                                                     h d}t          fd|D                        S )z,Check if the event is a CUDA runtime kernel.FrI   >   cpymemfreeallocc              3       K   | ]}|v V  	d S r   r   r   s     r   r   zNBasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel.<locals>.<genexpr>   s'      KKw7d?KKKKKKr   )device_typer
   CUDAra   r   lowerr   )rk   exclude_patternsrI   s     @r   is_cuda_kernelz;BasicEvaluation.compute_queue_depth.<locals>.is_cuda_kernel   sy     }}*/11uwq&!,,--3355D  ?>>KKKK:JKKKKKKKr   c              3   2   K   | ]} |          |V  d S r   r   )rj   rk   r   s     r   r   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s4      DD1+@+@+C+CDQDDDDDDr   c                 *    |                                  S r   start_nsr   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>       !**,, r   rO   c              3   2   K   | ]} |          |V  d S r   r   )rj   rk   r   s     r   r   z6BasicEvaluation.compute_queue_depth.<locals>.<genexpr>   s2      ==1>>!+<+<=Q======r   c                 *    |                                  S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   r   r   c                 *    |                                  S r   r   r   s    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s    1::<< r   r   c                 X    |                                                                   k    S r   )linked_correlation_id)r   cuda_launch_events    r   r   z5BasicEvaluation.compute_queue_depth.<locals>.<lambda>   s'    !1133$::<<= r   rN   c                     t          | d          r|                                 dz  S t          | d          r|                                 S t          | d          r| j        S t	          d          )Nstart_us  r   rT   zUnknown Event Type)hasattrr   r   rT   	Exceptionr>   s    r   new_old_event_comparatorzEBasicEvaluation.compute_queue_depth.<locals>.new_old_event_comparator   su    uj)) /~~''$..uj)) (~~'''uo.. +**0111r   r   r   r   rT   rQ   )r	   rw   rq   rR   rr   index_of_first_matchsortr   r   duration_usr   duration_nsrT   rV   rW   rS   r   r7   rm   r;   r+   )r.   cuda_event_listcuda_launch_eventscuda_kernel_eventskernel_mappinglast_mapped_kernelindexcurrent_kernel_indexspawned_kernel_index
all_eventsr   rt   r?   
start_timeend_timecurrent_queue_depthr   r   r   s                   @@@r   rs   z#BasicEvaluation.compute_queue_depth   s    |*666,5<<>>
	P 
	P 
	P	L 	L 	L $DDDDDDD&&
 
 
 $=======&&
 
 

 "!339O9O
 
 
 35!3 	T 	T("= = = =(	  E 16N,-*/*;AS !'*<<t{J
	2 	2 	2 ,.4555  	P  	PEuj)) A"^^--4
!NN,,u/@/@/B/BBdJN**~e/D/P+9%+@(uj)) -"^^--
 >>++e.?.?.A.AAN**~e/D/P+9%+@(00 -"0
 , %s+='>'>>>'(<=FFHHZWW$)$ %s+='>'>>>'(<=FFHHZWW #79M"MPQ"Q"%&91"="=uj)) PWUJ-G-G P ''Z3FGG    00 P<OXe__-9r   c                 L   d}d}g }| j         rj| j        rc|t          | j        d         j        | j         d         j                  t          | j         d         j        | j        d         j                  gz  }| j         D ]O}|j        dk    r|s	|j        }d}|j        dk    r,|r*|                    t          ||j                             d}Pd | j	        D             }|D ]A}t          |                              |          | j	        t          |                   _        BdS )z4
        Computes idle time of the profile.
        Fr   r   Tc                     g | ]	}|j         
S r   r>   ri   s     r   rl   z5BasicEvaluation.compute_idle_time.<locals>.<listcomp>  s    444!ag444r   N)rt   rq   r7   rT   r8   r9   rV   r+   r   rm   r;   r_   r*   )r.   idle
idle_startidle_intervals
data_point
event_listr?   s          r   ru   z!BasicEvaluation.compute_idle_time   sI   
 
)+  	T[ 	Q5t7LQ7O7UVV.r26B8STT N
 / 	 	J%**4*'^
%))d)%%hz:;K&L&LMMM44t|444
 	0 	0E9A: :// L%)66	0 	0r   c                 L    ddl }t          t           j                            }d |D             }dd}g d}|t	          |          k     r||         k    r|dz  }%t          |dz   t	          |                    D ]x}t          |fd|          }t          |||          }	|	M||	         |k    rA                    t          ||	         j
        ||         j
                             ||n|} ny|dz  }|t	          |          k     Èfd	 j        D             }
|
r|                     fd
|
D             |j                  }|                     fd|
D             |j                  }||                    |          z
  |                    |          z  }||                    |          z
  |                    |          z  }|d|z  z   }d t!          t#          ||
d          t%          j        d          d          D             }
|
d|         }
|
S )a  
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        r   Nc                     g | ]	}|j         
S r   )r+   ri   s     r   rl   z/BasicEvaluation.rank_events.<locals>.<listcomp>   s    ===qQ]===r      rQ   c                     | k    S r   r   )r   bottom_threasholds    r   r   z-BasicEvaluation.rank_events.<locals>.<lambda>.  s    .?)? r   rN   )r8   r9   c                 >    g | ]}|                               |S r   )r_   )rj   r?   decrease_intervals     r   rl   z/BasicEvaluation.rank_events.<locals>.<listcomp>=  s>     
 
 
&&'899

 
 
r   c                 4    g | ]}j         |         j        S r   )rm   r)   rj   r?   r.   s     r   rl   z/BasicEvaluation.rank_events.<locals>.<listcomp>D  s#    JJJee$1JJJr   )dtypec                 4    g | ]}j         |         j        S r   )rm   r/   r   s     r   rl   z/BasicEvaluation.rank_events.<locals>.<listcomp>H  s#    PPPEe$7PPPr   g333333?c                     g | ]\  }}|S r   r   )rj   _r?   s      r   rl   z/BasicEvaluation.rank_events.<locals>.<listcomp>P  s,       Au   r   T)strict)rP   r   )torchrb   r   rt   rW   ranger   argmaxr   r7   r8   rm   tensorfloat32meanstdrR   zipoperator
itemgetter)r.   lengthr   rt   	qd_valuestop_threasholdr[   r\   next_minimum_idxpeak_idxr   rz   	idle_timenormalized_gainnormalized_selfheuristic_score_listr   r   s   `               @@r   rank_eventszBasicEvaluation.rank_events  s    	)> ? ?@@==,<===	#i..  |///Q1q5#i..11   $8????q$ $ $  ")1:JKKK 'Ih,?>,Q,Q%,, ,X6<>Nq>Q>W   
 -=,H((aAEFA+ #i..  .
 
 
 

 
 


  	-JJJJzJJJm %  I PPPPZPPPm %  I  )5::i+@+@@EIIiDXDXXO(5::i+@+@@EIIiDXDXXO#2S?5J#J   &,jFFF +A.. ! ! !  J $GVG,Jr   rQ   Tr   print_enablec                                            |          }|s|S |rdnd}|d                     fd|D                       z  }|rt          |           |S )NzOptimizable events:
zNo events to optimize

c                 |    g | ]8}d  d| dt          |j                   dj        |         j        dz  ddd  	9S )zP--------------------------------------------------------------------------------z
Event:                z
Source code location: z
Percentage idle time: d   z.2fz%
)source_code_locationr?   rm   r/   r   s     r   rl   z:BasicEvaluation.get_optimizable_events.<locals>.<listcomp>b  s            +EK88  |E*=C   
	   r   )r   joinprint)r.   r   r   r   outputs   `    r   get_optimizable_eventsz&BasicEvaluation.get_optimizable_events[  s    %%f--
 	,6U((<U$))    (  	
 	
 		
  	&MMMr   r`   )rQ   T)r0   r1   r2   r	   r@   rn   rs   ru   r   r3   boolr   r   r   r   rd   rd   d   s        
!W 
! 
! 
! 
! 
!= = = =,m  m  m ^0 0 0 08G G GR S D      r   rd   c                     ||t          |           k    rt          |           }t          ||          D ]} || |                   r|c S d S r   )rW   r   )seq	predicater8   r9   r[   s        r   r   r   p  sb    
{cSXXoo#hh5#  9SV 	HHH	4r   c                     | S r   r   r   s    r   r   r   y  s    a r   c                     | ||         } t          |           dk    rd S |                     t          | |                    |z   S )Nr   rO   )rW   r   rS   )r   rP   r8   r9   s       r   r   r   y  sG    
eCi.C
3xx1}}t99S#&&&''%//r   c                 `    | +t          j        d| j                  }|| j        } &| j        S dS )Nz
\.py\(.*\)zNo source code location found)researchrI   parent)r?   matchs     r   r   r     s8    

	-44=LEz**r   r<   c                  Z    ddl m}   |             5  	 d d d            d S # 1 swxY w Y   d S )Nr   r   )torch.autograd.profilerr	   r   s    r   _init_for_cuda_graphsr     s    //////	                   s    $$c                       e Zd ZU dZeed<   ed         ed<   ed         dz  ed<   eez  dz  ed<   eee	f         ed	<   dS )
TimelineEventz-Represents an event in the profiler timeline.	timestamp)r8   r9   regular
event_typefilenamenodeNmarker_type
identifierr?   )
r0   r1   r2   __doc__r3   r4   r   ra   dictr   r   r   r   r   r     ss         77NNN12222+,t3333c	D    S>r   r   c                   ^    e Zd ZU dZed         ed<   eez  ed<   edz  ed<   dZ	edz  ed<   dS )ContextStackEntryz5Represents a context (filename or node) in the stack.r   context_typer  Nmetadatatid)
r0   r1   r2   r  r   r4   ra   r3   r  r	  r   r   r   r  r    s^         ??,----c	TkCtr   r  c           
         ddl m} |                     dg           }g d }fd}|D ]}d|vsd|vr ||          rb|d         d	d
         }|                    d          r |d||           I	 t	          |          }n# t
          $ r Y nw xY w |d||           x|d         }                    t          |ddd|                                         d            g }	D ]v}
|
j	        xdk    rB |
j
        J |
j        dk    r{t          |
j
        t                    sJ |                    |
j
                  }|
j                            d          }|	                    t          d|
j
        ||                     |
j        dk    rd}|
j                            d          }t!          |	          D ]!}|j        dk    r|j        |k    r	|j        } n"|rV|                    di           }|
j
        |v r7||
j
                 }|	                    t          d|
j
        ||                     Rxdk    rf t)          t+          |	          dz
  dd          D ]A}|	|         }|
j        |j        k    r'|
j
        |j
        k    r|	                    |            nBdk    rd}d}|
j                            d          }t!          |	          D ]W}|j        |k    rJ|j        dk    r?|j        r8|j                            dd          }|j                            dd          } nX|s|r)|
j                            di           }|r||d<   |r||d<   	 xdS )an  
    Maps recorded profiler events to their corresponding fx nodes and adds stack traces.

    Builds a timeline of all events (regular ops and FX markers for filenames/nodes),
    sorts by timestamp, then processes chronologically while maintaining a context stack of active
    filename/node scopes. Regular events are augmented with stack traces and node names from the
    innermost active context. Runtime is O(n log n) for n events.

    Args:
        traced_data: Json of profiler events from Chrome trace

    Returns:
        Dict mapping recorded event names to their aten operations with added stack traces
    r   )_FX_METADATA_REGISTRYtraceEventsc                     |                      d          dk    oQ|                      dd                              d          o(|                      dd                              d          S )Ncatcpu_oprI    z## z ##)getr   endswithr>   s    r   is_fx_marker_eventzLmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.is_fx_marker_event  sc    IIe( 6		&"%%00776		&"%%..u55	
r   c           	          |d         }||d         z   }                     t          |d| ||                                          t          |d| ||                     d S )Ntsdurr8   r9   )r   r   )r   r  r?   start_tsend_tsevent_timelines        r   append_fx_marker_eventzPmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.append_fx_marker_event  s|    ;E%L((GZUKK	
 	
 	
 	&%ZGG	
 	
 	
 	
 	
r   r  r  rI      z.pyr   r   r   Nc                     | j         S r   )r   r   s    r   r   zBmap_recorded_events_to_aten_ops_with_stack_trace.<locals>.<lambda>  s    ak r   rO   r8   r	  node_metadatar9   rQ   r   stack_tracezNo model stack trace availabler  args	node_name)torch.fx.tracebackr  r  r  r3   
ValueErrorr   r   r   r   r  r  
isinstancera   r?   r  r   r  r	  r  r   rW   r#   
setdefault)traced_datar  trace_eventsr  r  r?   content
node_indexr  context_stacktimeline_eventr  r	  current_file_metadata	ctx_entryr  	node_metar[   current_stack_tracecurrent_node_name	event_tidr   r  s                         @r   0map_recorded_events_to_aten_ops_with_stack_tracer2    s    988888??="55L +-N
 
 

 
 
 
 
  Y YuU 2 2e$$ 	YFmAbD)G&& B&&z7EBBBB!$WJJ!   D&&vz5AAAA T{H!!-)T4QV"W"WXXXX 11222 .0M ) L> L>'%0<<<!-;;%n&?EEEEE4889RSSH(.22599C!(()&(A8S    
 $/699,0)(.22599C%-m%<%< " "	%2j@@ ) 4 44=4F1!E, 
(=(A(A/SU(V(V)4EE5B . 96I *00 1$*N,EyRU!" !"   s=11A5r2>>  A -a 0I&2i6LLL*59MMM%))!,,, '+#$(!*044U;;	!)-!8!8 
" 
"I }	11$1V;;	@R;2;2D2H2H -/O3 3/ 1:0B0F0Fvr0R0R- "E ' >*; >)/::62FFD* B.A]+( >,=[)=YL> L>s   4B
BB)r   Nr`   )!	functoolsr   r   collectionsr   dataclassesr   typingr   r   r   r   r	   torch.profilerr
   torch.autogradr   r   r!   partialtraverse_dfstraverse_bfsr'   r7   r;   rd   r   r   r   r   r   r  r2  r   r   r   <module>r<     s        				       ! ! ! ! ! ! . . . . . . . . . . + + + + + + % % % % % %  ,++++++ *>)=u * * * * * * !y 4E4EtTTT y ,,e  
 
9 
9 
9 
9 
9 
9 
9 
9        + + + + + + + +\I I I I I I I IX     Kqd 0 0 0 0+ + +                   O> O> O> O> O>r   