o
    hY                     @  s(  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, dd Z-dd Z.dd Z/dd Z0dd Z1dd Z2dd Z3e4 dYd!d"Z5dZd[d(d)Z6d\d+d,Z7d-d. Z8d/d0 Z9d]d5d6Z:d^d9d:Z;d;d< Z<d=d> Z=d?Z>d@Z?e>e>e?dAZ@dBZAdCZBeAeAeBdAZCdDZDdEdF ZEdGdH ZFe
dIdJdKgeG eG gdLZHdMdN ZIdOdP ZJdQdR ZKdSdT ZLdUdV ZMG dWdX dXZNdS )_    )annotationsN)
namedtuple)Path)AnyTuple   )add_external_libscompile_ptx_to_cubinget_shared_memory_sizeirtranslate_llvmir_to_hsacotranslate_llvmir_to_ptxtranslate_triton_gpu_to_llvmir)get_backendpath_to_ptxas)OutOfResources)get_cache_manager)driver)JITFunctionget_cuda_streamget_current_deviceget_device_capabilityversion_key)extract   )ast_to_ttir)	make_stubc                 C  s*   t | j}|  |  ||  | S N)r   pass_managercontextenable_debugadd_inliner_passrun)modpm r%   N/var/www/html/ai/venv/lib/python3.10/site-packages/triton/compiler/compiler.pyinline_triton_ir   s
   
r'   c                 C  s4   t | j}|  t|r|| ||  | S r   )r   r   r   r    _is_cudaadd_rewrite_tensor_pointer_passr"   r#   archr$   r%   r%   r&   ttir_compute_capability_rewrite'   s   

r,   c                 C  sl   t | } t| |} t| j}|  |  |  |  |	  |
  |  |  ||  | S r   )r'   r,   r   r   r   r    r!   add_triton_combine_passadd_canonicalizer_passadd_reorder_broadcast_passadd_cse_passadd_licm_passadd_symbol_dce_passr"   r*   r%   r%   r&   optimize_ttir2   s   

r3   c                 C  s,   t | j}|  || ||  | S r   )r   r   r   r    $add_convert_triton_to_tritongpu_passr"   )r#   	num_warpsr$   r%   r%   r&   ttir_to_ttgirB   s
   

r6   c                 C  s   t | j}|  |  |  t|tr|| |  |	  |
| |  |	  |  |  |  |  |  ||  | S r   )r   r   r   r    add_tritongpu_coalesce_pass,add_tritongpu_remove_layout_conversions_pass
isinstanceint$add_tritongpu_accelerate_matmul_pass(add_tritongpu_optimize_dot_operands_passadd_tritongpu_pipeline_passadd_tritongpu_prefetch_pass(add_tritongpu_decompose_conversions_pass'add_tritongpu_reorder_instructions_passr0   r2   r"   )r#   
num_stagesr+   r$   r%   r%   r&   optimize_ttgirJ   s$   



rB   c                 C  sP   |  D ]\}}t|dkst|dkr d S qt| t| t|  d S )Nr   )itemslenr   listkeysvalues)r#   libsnamepathr%   r%   r&   _add_external_libs_   s
    rK   c                 C  s.   |rt | | t|rt| |dS t| ddS )NFr   T)rK   r(   r   )r#   extern_libsr+   r%   r%   r&   ttgir_to_llirf   s
   
rM   returnr:   c                 C  sZ   t | tsJ tt| d\}}|dkrd| S |dkr!d| S |dkr)d| S td)	zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higher)r9   strmapr:   splitRuntimeError)cuda_versionmajorminorr%   r%   r&   ptx_get_versionr   s   r]   r#   r   r+   ptx_versionrV   c                 C  s&   |du rt  \}}t|}t| ||S )zr
    Translate TritonGPU module to PTX code.
    :param mod: a TritonGPU dialect module
    :return: PTX code
    N)r   r]   r   )r#   r+   r^   _rZ   r%   r%   r&   llir_to_ptx   s   
r`   ptxc                 C  s   t  \}}t| ||S )z
    Compile TritonGPU module to cubin.
    :param ptx: ptx code
    :param compute_capability: compute capability
    :return: str
    )r   r	   )ra   r+   ptxasr_   r%   r%   r&   ptx_to_cubin   s   
rc   c                 C  s   g d}| d }t d|d }d| d }tjttj	
 d}i }d}|D ]}|| }	tj|	rC|	|dt| < |d7 }q+|| }
tj|
rV|
|dt| < |S )N)z	opencl.bczocml.bczockl.bczoclc_finite_only_off.bczoclc_daz_opt_off.bcz!oclc_correctly_rounded_sqrt_on.bczoclc_unsafe_math_off.bczoclc_wavefrontsize64_on.bcr   zgfx(\w+)oclc_isa_version_z.bczthird_party/rocm/lib/bitcode/library_)researchgroupstriposrJ   joinr   __file__parentresolveexistsrV   )r+   #gpu_arch_agnostic_bitcode_librariesgfx_archgfx_arch_id!gpu_arch_specific_bitcode_librarybitcode_path_diramdgcn_bitcode_pathsibc_libbc_pathbc_gfx_pathr%   r%   r&   get_amdgcn_bitcode_paths   s"   	rz   c                  C  s   zUt jddd} t| d  }td|d 	d}|d }|d 	d	}|d }d
}t
|dkrPdtd|d d d td|d d }|||gW S  ty_   Y dS w )z
    get the amdgpu fulll ISA details for compiling:
    i.e., arch_triple: amdgcn-amd-amdhsa; arch_name: gfx906; arch_features: sramecc+:xnack-
    	ROCM_PATHz	/opt/rocm)defaultz/bin/rocminfozamd.*r   z--r   :    +z\w+z,-r   N)rj   getenv
subprocesscheck_outputdecoderf   rg   rh   ri   rX   rD   BaseException)rocm_path_dirrocminfogfx_arch_detailsarch_triplearch_name_features	arch_namearch_featuresr%   r%   r&   get_amdgpu_arch_fulldetails   s    r   rq   
gfx_triplegfx_featuresTuple[str, str]c                 C  s   t | |||S )z
    Translate TritonGPU module to HSACO code based on full details of gpu architecture.
    :param mod: a TritonGPU dialect module
    :return:
        - AMDGCN code
        - Path to HSACO object
    )r   )r#   rq   r   r   r%   r%   r&   llir_to_amdgcn_and_hsaco   s   r   srcpatternc                 C  s>   | sJ |  dD ]}| }||r|  d   S q	dS )zd
    Get kernel name from PTX code.
    This Kernel name is required when launching the kernel.
    
N)rX   ri   
startswith)r   r   liner%   r%   r&   get_kernel_name   s   
r   c                 C  s*   t d| }|d urdt|d S | S )Nz!tt\.ptr<(.*)>*r   )rf   rg   convert_type_reprrh   )xmatchr%   r%   r&   r      s   r   c                   s   t | tr\|d }|d }|dt }|dd}|dd}|dd	}d
d   fdd|D }	| j dd|  d|	 d| d| d| d| d| }
t|
	d
 S t | tscJ tt|  t  	d
 S )Nconfigs	signature	constantsr5      rA   r   debugFc                 S  s   t | jt | jfS r   )sorteddivisible_by_16
equal_to_1)confr%   r%   r&   <lambda>       zmake_hash.<locals>.<lambda>c                   s   g | ]} |qS r%   r%   ).0r   get_conf_keyr%   r&   
<listcomp>   r   zmake_hash.<locals>.<listcomp>-r~   zutf-8)r9   r   getdict	cache_keyrk   rG   hashlibmd5encode	hexdigestrV   r   	read_textr   )fnr+   kwargsr   r   r   r5   rA   r   configs_keykeyr%   r   r&   	make_hash   s   
>"r   z`^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$z=\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\))ttirttgirra   z-%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?z\.param\s+\.(\w+)z&"triton_gpu.num-warps"\s?=\s?(\d+)\s?:c                 C  s2   dd }i }| D ]}|| | r| | ||< q|S )Nc              	   S  s*   zt |  W dS  ttfy   Y dS w )NTF)jsondumps	TypeErrorOverflowError)r   r%   r%   r&   _is_jsonable  s   
z-_get_jsonable_constants.<locals>._is_jsonabler%   )r   r   serialized_constantsconstantr%   r%   r&   _get_jsonable_constants  s   r   c                 C  s   t | |}||_|S r   )r   parse_mlir_moduler   )rJ   r   moduler%   r%   r&   r   +  s   r   instance_descriptorr   r   )defaultsc                 C  s
   t | tS r   )r9   r:   r+   r%   r%   r&   r(   6  s   
r(   c                 C  sh   zdd l }W n ty   tdw | d u r2|jjd u r/t }t|} | d d | d  } | S t } | S )Nr   z'Triton requires PyTorch to be installedrT   r   )torchImportErrorversionhipr   r   r   )
capabilityr   devicer%   r%   r&   get_architecture_descriptor:  s   r   c                   s   | t|  t|D ]}|| dks|| d u r|| q| tjdd   d u r2tddd  fddf|d< d S )	Nr~   MI_GPU_ARCHr   z gfx_arch is None (not specified)c                 S     t |  S r   r   r   rJ   r%   r%   r&   r   T      z!add_rocm_stages.<locals>.<lambda>c                   s   t |  d d S )Nr   r   )r   r   rq   gfx_arch_full_detailsr%   r&   r   U  s    amdgcn)updaterz   rE   poprj   environr   rY   )r+   rL   stagesr   r%   r   r&   add_rocm_stagesI  s   
r   c                   s4   dd  fddf|d< dd  fddf|d< d S )Nc                 S  r   r   r   r   r%   r%   r&   r   \  r   z!add_cuda_stages.<locals>.<lambda>c                   
   t |  S r   )r`   r   r   r%   r&   r   ]     
 ra   c                 S  r   r   )r   
read_bytesr   r%   r%   r&   r   ^  r   c                   r   r   )rc   r   r   r%   r&   r   _  r   cubinr%   )r+   rL   r   r%   r   r&   add_cuda_stagesZ  s   

r   c           "        s8  | dd}t|}|dv rt| dd  nt|}|sJ |jd7i | |dko.t }|dv o5| }t | dt | dd| d|rR d	krRd
nd| dt d u rct | ddt }fddd f|d< fdd 	fddf|d< fdd fddf|d< dd  fddf|d< |rt | n|rt | n|	 | t
tr| dd |d 	d u rt gtdksJ |d< j}d}t
	trdd  t	d!D 		|d< nt
tsJ tjd"\}	}
t }dd l}|t|
 ||j}|d|d}	|t|
 	}|
dkrd|t|}t|dksLJ d#d|vs^t|d ks^J d$t|d d%d& |D }d'd  t|D 	t |! "|
}|s|rt#|	}n|$|	}t%t& fi |}t
trjd}}ntjd"\}}d }| d(}|'|pi }| |}|d urt(|}t)*|}W d    n	1 sw   Y  nt+ d)}|d*krd+|v s J d,|d+ |d+< ||d< t |! "|}t }}t |, |d  D ]\}
\}}| d"|
 }|
|kr9|}na| |}|d u ru||}td-krf| d.} |-|d |||< |-|d | || < n4|-||||< |-|| n%|
d-kr| d.} | | }!|!d usJ d/||||!f}n||}|
d0kr|||
< n|
d-krt|d ||
< nt|||
< |
dkrd+|vrt.||d+< |
d*krt/|d1d2|d3< |
d-krt/|d d4d2|d3< |d |d5< |s|s|0|
|||| |}q"|d u r|j-t)1||dd6||< |2|| t3|||S )8Ndevice_typecudar   r   ccr   r5   r   rA   K   r   r   rL   r   Fc                   s    S r   r%   r   )r   r%   r&   r   {  s    zcompile.<locals>.<lambda>astc                   r   r   r   r   r   r%   r&   r   |  r   c              	     s   t t| d  d S )Nr   )r   r+   )r3   r   r   )r+   r   r   r   r   r%   r&   r   }  s    r   c                   r   r   r   r   r   r%   r&   r   ~  r   c                   s   t t|  S r   )rB   r6   r   )r+   rA   r5   r%   r&   r     s    r   c                 S  r   r   r   r   r%   r%   r&   r     r   c                   s   t |  S r   )rM   r   )r+   rL   r%   r&   r     r   llirr   r   r   r   c                 S  s   i | ]	\}}||  qS r%   )ri   r   kvr%   r%   r&   
<dictcomp>  s    zcompile.<locals>.<dictcomp>,rO   z(Expected exactly one match for num_warpsz6num_warps in ttgir does not match num_warps in compilec                 S  s   g | ]}t |qS r%   )r   )r   tyr%   r%   r&   r     r   zcompile.<locals>.<listcomp>c                 S  s   i | ]\}}||qS r%   r%   r   r%   r%   r&   r     s    z.json)r5   rA   r   r   r+   ra   sharedz/ptx compilation must provide shared memory sizer   z.hsaco_pathz?Expected to have hsaco_path in metadata when we have the amdgcnr   z	// .globl)r   rI   z.globl
hsaco_path)binaryr%   )4r   r   r   r(   r   r   r   r   r   
add_stagesr9   r   r   rD   __name__rV   	enumeraterX   rj   rJ   basenamer   r   rf   rg   prototype_pattern	MULTILINErh   findallarg_type_patternttgir_num_warps_patternr:   rE   rF   indexr   make_launcher_stubr   r   	get_groupopenr   loadr   rC   putr
   r   add_meta_infor   	put_groupCompiledKernel)"r   r   r   _device_backendis_cudais_hipr   rI   first_stager_   ir_namer   rf   r   typesnum_warps_matches	param_tysso_pathfn_cache_managerextmetadatametadata_filenamemetadata_groupmetadata_pathfasmr   parsecompile_kernelir_filenamenext_modulerJ   extra_file_name
hasco_pathr%   )
r+   r   r   r   r   rL   r   rA   r5   r   r&   compileb  s  





$




 














r  c                      sF   e Zd ZdZdZdd Zdd Z fddZdd	 Zdd
dZ	  Z
S )r  Nc                 C  s   dd l }|jd|}|j|}|| _|j| t|d| _d|v r(|d nd| _	|d | _
|d | _|d | _|d | _| jd	vrIt| jnd | _|| _|| _d | _d | _d S )
Nr   __triton_launcherlaunchr   r5   rA   r   r   r   )importlib.utilutilspec_from_file_locationmodule_from_specr   loaderexec_modulegetattr	c_wrapperr   r5   rA   r   r   r   device_backendr  r  	cu_modulecu_function)selfr   r  r  r  	importlibspecr#   r%   r%   r&   __init__  s    




zCompiledKernel.__init__c           	      C  s   | j d urd S | jdv r't }tjdtjditj }tj|d }tjj	}n| j
s,J | j
 }| j
 }| j
|d }| j
 }| j|krOt| j|d|| jd | j| | j|\}}}}|| _|| _|| _ || _d S )Nr   r   r   max_shared_memzshared memoryrI   )r*  r   r   r   HIPCUDAbackendutilsget_device_propertiesload_binaryr)  get_kernel_binget_load_binary_fnr   r   r  r  n_spillsn_regsr+  )	r,  r   bin_path
max_sharedfn_load_binaryr#   funcr:  r9  r%   r%   r&   _init_handles$  s.   







$
zCompiledKernel._init_handlesc                   s   |dkr|    t |S )Nr(  )r?  super__getattribute__)r,  rI   	__class__r%   r&   rA  A  s   zCompiledKernel.__getattribute__c                   s       d d fdd
}|S )N)streamc                   sh   | d u rj dv rt } ntj d } j d  d  d jj| jtj	tj
g
|R   d S )N)r   rocmr   r   r   )r   r   r   
get_streamr(  r5   r   r+  r  launch_enter_hooklaunch_exit_hook)rD  argsgridr,  r%   r&   runnerI  s   
$
z*CompiledKernel.__getitem__.<locals>.runner)r?  )r,  rK  rL  r%   rJ  r&   __getitem__F  s   zCompiledKernel.__getitem__c              	   C  s   d| j v r
| j d S t \}}z*t|d}|| j d  W d    n1 s)w   Y  t||| _W t| nt| w | j| j d< | jS )Nsasswbr   )	r  tempfilemkstempr  writer   rN  rj   remove)r,  funfdrJ   r   r%   r%   r&   get_sassS  s   

zCompiledKernel.get_sassr   )r   
__module____qualname__rG  rH  r/  r?  rA  rM  rV  __classcell__r%   r%   rB  r&   r    s    r  )rN   r:   r   )r#   r   r+   r:   r^   r:   rN   rV   )ra   rV   r+   r:   )
r#   r   rq   rV   r   rV   r   rV   rN   r   )r   rV   r   rV   rN   rV   )O
__future__r   	functoolsr   r   rj   rf   r   rP  collectionsr   pathlibr   typingr   r   _C.libtriton.tritonr   r	   r
   r   r   r   r   common.backendr   r   runtime.autotunerr   runtime.cacher   runtime.driverr   runtime.jitr   r   r   r   r   tools.disasmr   code_generatorr   make_launcherr   r'   r,   r3   r6   rB   rK   rM   	lru_cacher]   r`   rc   rz   r   r   r   r   r   mlir_prototype_patternptx_prototype_patternr   mlir_arg_type_patternptx_arg_type_patternr   r   r   r   setr   r(   r   r   r   r  r  r%   r%   r%   r&   <module>   sx    $


 %