o
    h                     @   sx   d dl Z d dlZddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
 dd Zd	d
 Zdd Z	dddZdd ZdS )    N   )cdiv)runtime)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsc                 C   @   |t |d }tj|d d }t ||| t|| | }|S z# return compute throughput in TOPS    multiprocessor_count)minr   utilsget_device_propertiesr   backenddevicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflops r   R/var/www/html/ai/venv/lib/python3.10/site-packages/triton/ops/matmul_perf_model.pyget_tensorcore_tflops      r   c                 C   r	   r
   )r   r   r   r   r   r   r   r   r   get_simd_tflops   r   r   c                 C   sB   t j|}|d dk r|t jkrt| ||||S t| ||||S )Nr      )torchcudaget_device_capabilityfloat32r   r   )r   r   r   r   r   
capabilityr   r   r   
get_tflops   s   r$   Fc           ,      K   s  t jj}tj }|j}| }t||}t||	}|}|| | }t	||t	||	}}d| | | d }t
|||| |}|| }tj|d }td|| }td|d }t	td|d d d}t|||d |d	   }|d
 }|| | dd|d    }|| | d |d  } || | dd|d    }!|| | d |d  }"||! d }#| |" d }$|#| |$|  }%|d }&|| | | d }'|dkr|'|& }(n|&})|'|) }(|| d d |& }*|(|*7 }(t	||%|( }+|r	td|+ d| d|% d|( d|d  d |+S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r   r   CUDAr   r    current_devicer   element_sizer   maxr$   r   r   r   r   r   print),r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_msr   r   r   estimate_matmul_time"   sV   





rV   c                    s  t j }t j }|d  }|d j}g }| D ]1}|j}|d |d |d |jf\}	}
}}tj	
|d }|	|
 | | | }||krJ|| q|} |t jt jfvr\dd | D } i }| D ]9}|j}|d |d |d |d |j|jf\}	}
}}}}|	|
|||f}||v r|| ||f q`||fg||< q`g }| D ]O\}}|\}	}
}}}|d	 d
kr|	|
 | d }|td| d
 }d}||  tjd| fddd}|D ]	}||d	  qq|d	 d	 }d|_|| q|S )Nr0   r6   r7   r8   max_shared_memc                 S   s   g | ]}|j d  dkr|qS )r9   r%   )r;   ).0configr   r   r   
<listcomp>z   s    z&early_config_prune.<locals>.<listcomp>r9   r   r   i   r   i,  r   c                    s0   | d   dk rdt | d    S | d   S )Nr%   r   
   )abs)xoptimal_num_stagesr   r   <lambda>   s   z$early_config_prune.<locals>.<lambda>)key)r   r    r+   r!   r,   r   r;   r/   r   r   r   appendfloat16r"   r   itemsr   heapq	nsmallest)configs
named_argsr   r#   r<   r   pruned_configsrY   kwr6   r7   r8   r/   max_shared_memoryrequired_shared_memoryconfigs_mapr9   r   ra   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configr   r^   r   early_config_pruned   sT   




"rv   )F)re   r    r   _C.libtriton.tritonr   r   testingr   r   r   r   r   r$   rV   rv   r   r   r   r   <module>   s    
B