o
    hY                     @  s  d dl mZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZ d dlmZmZmZmZmZmZmZmZmZ ddlmZmZ ejejejeZdZd1dd	Z d
d Z!dd Z"dd Z#edZ$G dd dej%Z&e' dd Z(G dd dee$ Z)G dd de)e$ Z*ed2ddZ+edddddd3d%dZ+	d1dddddd&d4d*dZ+G d+d, d,Z,G d-d. d.Z-d/d0 Z.dS )5    )annotationsdivisionN)defaultdict
namedtuple)	CallableGenericIterableListOptionalTypeVarUnioncastoverload   )get_backendpath_to_ptxasz2.1.0c                 C  sN   | d u rt  } zddlm} || W S  ty&   dd l}|j| j Y S w )Nr   )_cuda_getCurrentRawStream)get_current_devicetorch._Cr   ImportErrortorchcudacurrent_streamcuda_stream)idxr   r    r   H/var/www/html/ai/venv/lib/python3.10/site-packages/triton/runtime/jit.pyget_cuda_stream   s   
r   c                  C  s   dd l } | j S Nr   )r   r   current_device)r   r   r   r   r      s   
r   c                 C  s   dd l }|j|  d S r   )r   r   
set_devicer   r   r   r   r   set_current_device$   s   r"   c                 C  s   dd l }|j| S r   )r   r   get_device_capabilityr!   r   r   r   r#   )   s   r#   Tc                      s:   e Zd ZdZd fddZdd Zdd	 Zd
d Z  ZS )DependenciesFinderz
    This AST visitor is used to find dependencies of a JITFunction. This can
    be used to invalidate a JITFunction's hash when its source code -- or
    that of its dependencies -- changes.
    returnNonec                   s*   t    t|d | _|| _d S )Nutf-8)super__init__hashlibmd5encode	hexdigestretglobals)selfr0   src	__class__r   r   r*   <   s   

zDependenciesFinder.__init__c                 C  s   | j |jd S N)r0   getid)r1   noder   r   r   
visit_NameA   s   zDependenciesFinder.visit_Namec                 C  sj   |  |j}t|tjr|  |j}t|tjs|d u s-t|dddks-t|dddr/d S t||jS )N__name__ tritonz.triton)visitvalue
isinstanceast	Attributegetattrendswithattr)r1   r8   lhsr   r   r   visit_AttributeD   s   *z"DependenciesFinder.visit_Attributec                 C  s   |  |j}|d u rd S t|rd S |jr#|jds!d|jv r#d S t|ts1J d|j d|j	d u rLt
|j}t|j|j}| | |j|_	tt|dd}| j|j	 | d| _t| j | _d S )Nztriton.z.triton.z
Function "zv" is being called from a Triton function but is not a Triton function itself. Decorate it with @triton.jit to fix thisnoinlineFr(   )r=   funcinspect	isbuiltin
__module__
startswithr?   JITFunctionr:   hashr@   parser2   r%   __globals__r/   strrB   r-   r+   r,   r.   )r1   r8   rH   treefinderrG   r   r   r   
visit_CallL   s    


zDependenciesFinder.visit_Call)r&   r'   )	r:   rK   __qualname____doc__r*   r9   rF   rT   __classcell__r   r   r3   r   r%   5   s    r%   c               	   C  s  dd l } g }ttd}|t|  g7 }W d    n1 s"w   Y  tj	t
d}| |gD ])}t|j|jjd}|t|  g7 }W d    n1 sXw   Y  q4ttj	t
dd}|t|  g7 }W d    n1 sw   Y  tj	t
d}| |gD ])}t|j|jjd}|t|  g7 }W d    n1 sw   Y  qt d }tt|dg }d	td | d d	| S )Nr   rbcompilerz_C/libtriton.solanguagez	--version-)pkgutilopen__file__r+   r,   readr.   ospathjoinTRITON_PATHiter_modulesmodule_finder	find_specnameoriginr   
subprocesscheck_outputTRITON_VERSION)r\   contentsfcompiler_pathliblanguage_pathptxasptxas_versionr   r   r   version_keyc   s.   
 rs   c                   @  s    e Zd ZU ded< dddZdS )KernelInterfacer$   runr&   c                 C  s   t ttjt t| j|dS )z
        A JIT function is launched with: fn[grid](*args, **kwargs).
        Hence JITFunction.__getitem__ returns a callable proxy that
        memorizes the grid.
        )grid)r   r$   	functoolspartialr   ru   )r1   rv   r   r   r   __getitem__   s   zKernelInterface.__getitem__N)r&   r$   )r:   rK   rU   __annotations__ry   r   r   r   r   rt   }   s   
 rt   c                      s   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	dd Z
edd Zdd Zdd Zdd Zd1ddZd1ddZd2dd Zd!d" Zd3d#d$Zed%d& Zd'd( Zd)d* Zd+d, Z fd-d.Zd/d0 Z  ZS )4rM   N   c                 C  s   t | dr| jS t| trdS t| tr*d| kr| dkrdS d| kr(| dkr(dS d	S t| tr1d
S | d u r7d S tdt|  d|  )Ndtypei1i   ii32l            l    u64i64fp32zUnsupported type z for )hasattrr|   r?   boolintfloat	TypeErrortypeargr   r   r   _key_of   s   



zJITFunction._key_ofc                 C  s"   t | drt | jdr| jjS dS )Ndevicer   r;   )r   r   r   r   r   r   r   
_device_of      
zJITFunction._device_ofc                 C  s"   t | drt| jtr|  S dS )N	is_pinnedF)r   r?   r   r   r   r   r   r   _pinned_memory_of   r   zJITFunction._pinned_memory_ofc                 C  sD   t | dr|  tj dkS t| tr| d dk| dkfS | d u fS )Ndata_ptrr   r{      r   r   rM   divisibilityr?   r   r   r   r   r   _spec_of   s
   


zJITFunction._spec_ofc                   sR   dd   fddt |D }fddt |D }tdddgt|t|S )	Nc                 S  sD   t | dr|  tj dkS t| tr| tj dkS | d u r dS dS )Nr   r   TFr   )xr   r   r   is_divisible_by_16   s   

z3JITFunction._get_config.<locals>.is_divisible_by_16c                   s&   h | ]\}} |r|j vr|qS r   )do_not_specialize.0ir   r   r1   r   r   	<setcomp>      & z*JITFunction._get_config.<locals>.<setcomp>c                   s:   h | ]\}}t |tst |tr|d kr| jvr|qS )r   )r?   r   r   r   r   r1   r   r   r      s   : instance_descriptordivisible_by_16
equal_to_1)	enumerater   tuple)r1   argsr   r   r   r   r   _get_config   s   zJITFunction._get_configc                 C  s   | d u rdS t | dd }i dddddd	d
dddddddddddddddddddddd d!d"d#}t| D ]}|||< qGt| t rU| S d$||  S )%Nz*i8.r   r}   float8e4fp8e4float8e5fp8e5float8e4b15fp8e4b15float16fp16bfloat16bf16float32r   float64fp64int8i8int16i16int32r~   int64r   uint8u8uint16u16uint32u32uint64r   *)rQ   splitlistvaluesr?   )key	dtype_strtysvr   r   r   _type_of   sN   	

zJITFunction._type_ofc                   s    d  fddt|D }|S )N,c                   s   g | ]	\}}  |qS r   )r   )r   r   kr   r   r   
<listcomp>       z/JITFunction._make_signature.<locals>.<listcomp>)rb   r   )r1   sig_key	signaturer   r   r   _make_signature   s   zJITFunction._make_signaturec                 C  s   t t| j|}|S r5   )dictzip
constexprs)r1   constexpr_key	constantsr   r   r   _make_constants   s   zJITFunction._make_constantsc	              	   C  s   t jd u rdS | jj}	| jj}
ddd t| j|d D }|	 d| d| d| d	}t|}G d
d d}t	|||||||d}t j||||
|	d|i|dddS )NF, c                 S  s   g | ]\}}| d | qS )z: r   r   rg   tyr   r   r   r      s    z*JITFunction._call_hook.<locals>.<listcomp>r   z[num_warps=z, num_stages=]()c                   @  s   e Zd Zdd ZdS )z.JITFunction._call_hook.<locals>.LegacyCompilerc                 S  s   || _ || _d S r5   )modulerg   )r1   r   rg   r   r   r   r*      s   z7JITFunction._call_hook.<locals>.LegacyCompiler.__init__N)r:   rK   rU   r*   r   r   r   r   LegacyCompiler   s    r   )r   r   r   	num_warps
num_stagesextern_libsconfigsr   )r   reprfncompileis_manual_warmupalready_compiled)
rM   
cache_hookr   r:   rK   rb   r   	arg_namesrQ   r   )r1   r   r   r   r   r   r   r   r   rg   r   	arg_reprsr   r   kwargsr   r   r   
_call_hook   s   
 $zJITFunction._call_hookr&   rQ   c                 C  s   | j |d}|dkr%d| dtj d| d| dtj d| d| d	S d
|v r3d| dtj dS |dkrDd| dtj d| dS dS )Nr;   (z.data_ptr() % z == 0) if hasattr(z,, "data_ptr")                         else (z % z == 0, z == 1) if isinstance(z,, int)                         else (False,)Tensorz == 0)r   z == 1)z(False,))rz   r6   rM   r   r1   r   arg_annotationr   r   r   _get_arg_specialization_key  s    z'JITFunction._get_arg_specialization_keyc                 C  sD   | j |d}d|v r| dS |dkrdS |dkrdS d| d	S )
Nr;   r   z.dtyper   r}   r   r   z_key_of(r   )rz   r6   r   r   r   r   _get_arg_sig_key  s   
zJITFunction._get_arg_sig_keydevice_types	List[str]pinned_memory_flags
List[bool]c                 C  sv   dd |D }d|v rdd l }|jjrdS dS tdd |D }tdd |D }|r/|r/dS t|dkr9|d S dS )	Nc                 S  s   g | ]}|d kr|qS )r;   r   r   device_typer   r   r   r         z5JITFunction._conclude_device_type.<locals>.<listcomp>r   r   hipc                 s  s    | ]}|d kV  qdS )cpuNr   r   r   r   r   	<genexpr>"  s    z4JITFunction._conclude_device_type.<locals>.<genexpr>c                 s  s    | ]}|V  qd S r5   r   )r   pinned_memory_flagr   r   r   r   #  s    )r   versionr   allanylen)r1   r   r   r   is_cpuis_pinned_memoryr   r   r   _conclude_device_type  s   z!JITFunction._conclude_device_typec                   s   fddt  jD } fddt  jD }d|}d fdd|D }dddd |D  d }ddd	d |D  d }d|}g }t |D ]\}	}
|	 jv rZqP| |
g7 }qPd|}d
dd  jD }ddd t j jD }d jj d| d| dt	|dkr| d
nd dt	|dkr| d
nd d| d| d| d| d| dddd  jD  d}t
 t  j j j j jttttd}t|| | jj S )Nc                   s    g | ]\}}| j vr| qS r   r   r   r   r   r   r   +       z.JITFunction._make_launcher.<locals>.<listcomp>c                   s    g | ]\}}| j v r| qS r   r  r   r   r   r   r   ,  r  r   c                   s   g | ]}  |qS r   )r   r   r   r   r   r   r   /  s    [c                 S     g | ]}d | dqS )z_device_of(r   r   r  r   r   r   r   0  r   ]c                 S  r	  )z_pinned_memory_of(r   r   r  r   r   r   r   1  r   r   c                 S  s   g | ]
}d | d| qS )"z": r   r  r   r   r   r   <  s    c                 s  s0    | ]\}}|t jkr|n| d | V  qdS )z = NrI   _empty)r   rg   dfltr   r   r   r   =  s   . z-JITFunction._make_launcher.<locals>.<genexpr>z
def r   z, grid=None, num_warps=4, num_stages=3, extern_libs=None, stream=None, warmup=False, device=None, device_type=None):
    from ..compiler import compile, CompiledKernel
    sig_key =  z,
    constexpr_key = r   r   z
    spec_key = aV  
    key = (version_key, sig_key, constexpr_key, spec_key, num_warps, num_stages, self.debug)
    if not extern_libs is None:
      key = (key, tuple(extern_libs.items()))
    assert num_warps > 0 and (num_warps & (num_warps - 1)) == 0, "num_warps must be a power of 2"
    assert grid is not None
    if callable(grid):
        grid = grid({z})
    grid_size = len(grid)
    grid_0 = grid[0]
    grid_1 = grid[1] if grid_size > 1 else 1
    grid_2 = grid[2] if grid_size > 2 else 1

    if device_type is None:
        device_types = [_device_type for _device_type in zW if _device_type != '']
        device_type = self._conclude_device_type(device_types, a  )

    device_backend = None
    if device_type not in ['cuda', 'hip']:
        device_backend = get_backend(device_type)
        if device_backend is None:
            raise ValueError('Cannot find backend for ' + device_type)

    if device is None:
        if device_type in ['cuda', 'hip']:
            device = get_current_device()
            set_current_device(device)
        else:
            device = device_backend.get_current_device()
            device_backend.set_current_device(device)
    if stream is None and not warmup:
        if device_type in ['cuda', 'hip']:
            stream = get_cuda_stream(device)
        else:
            stream = device_backend.get_stream()

    bin = cache[device].get(key, None)
    if bin is not None:
      if not warmup:
          bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, bin, zt)
      return bin
    # kernel not cached -- compile
    else:
      # build dict of constant values
      args = [z]
      all_args = c                 S  s   g | ]}| qS r   r   r  r   r   r   r   q      a  ,
      configs = self._get_config(*all_args),
      constants = self._make_constants(constexpr_key)
      constants.update({i: None for i, arg in enumerate(all_args) if arg is None})
      constants.update({i: 1 for i in configs[0].equal_to_1})
      # build kernel signature -- doesn't include specialized arguments
      signature = { i: self._type_of(_key_of(arg)) for i, arg in enumerate(all_args) if i not in self.constexprs }
      # build stub signature -- includes arguments that are specialized
      for i, arg in constants.items():
        if callable(arg):
          raise TypeError(f"Callable constexpr at index {i} is not supported")
      if not self._call_hook(key, signature, device, constants, num_warps, num_stages, extern_libs, configs):
        bin = compile(self, signature=signature, device=device, constants=constants, num_warps=num_warps, num_stages=num_stages, extern_libs=extern_libs, configs=configs, debug=self.debug, device_type=device_type)
        if not warmup:
            bin.c_wrapper(grid_0, grid_1, grid_2, bin.num_warps, bin.shared, stream, bin.cu_function, CompiledKernel.launch_enter_hook, CompiledKernel.launch_exit_hook, bin, *args)
        self.cache[device][key] = bin
        return bin
      return None
)rs   r   r1   r   r   r   r   cache__spec__r   r   r"   )r   r   rb   r   r   r   arg_defaultsr   r:   r  rs   r   r   r   r   r   r  r  r   r   r"   exec)r1   regular_argsconstexpr_argsr   sig_keysr   r   constexpr_keysspecializationsr   r   	spec_keys	grid_argsargs_signaturer2   scoper   r   r   _make_launcher*  sj   



,12E
zJITFunction._make_launcherc                   sZ  |_ |j_|_t|}dd |j D _dd |j D _	t
dd j	D _|d u r6g n|_fddjD _tt|_jjdd  _tt_d _g _d _tjd	d
dkrrdn|_|_dd   fdd|j D _fddj D _  _!|j"_"|j#_#|j$_$|j_d S )Nc                 S     g | ]}|j qS r   )rg   r   r   r   r   r   r     r  z(JITFunction.__init__.<locals>.<listcomp>c                 S  r  r   )defaultr  r   r   r   r     r  c                 s  s    | ]}|t jkV  qd S r5   r  r  r   r   r   r     s    z'JITFunction.__init__.<locals>.<genexpr>c                   s&   h | ]}t |tr j|n|qS r   )r?   rQ   r   indexr  r   r   r   r     r   z'JITFunction.__init__.<locals>.<setcomp>defTRITON_DEBUG01Tc                 S  s   t | tr| jS | S r5   )r?   r   r:   )r   r   r   r   <lambda>  s    z&JITFunction.__init__.<locals>.<lambda>c                   s   i | ]	\}}| |qS r   r   r   )normalize_tyr   r   
<dictcomp>  r   z(JITFunction.__init__.<locals>.<dictcomp>c                   s$   g | ]\}}d |v r j |qS )	constexpr)r   r!  r   r   r   r   r     s   $ )%r   rK   r   r   rI   r   
parametersr   r   r  r   has_defaultsr   textwrapdedent	getsourcer2   findr   r   r  rN   kernel_decoratorskernelr`   environr6   debugrG   rz   itemsr   r  ru   rV   r:   rP   )r1   r   r   r   r3  rG   r   r   )r'  r1   r   r*     s2   


zJITFunction.__init__c                 C  s<   | j d u rt| j| jd}||   |jt  | _ | j S )N)r0   r2   )rN   r%   rP   r2   r=   rO   r/   rs   )r1   dependencies_finderr   r   r   	cache_key  s
   
zJITFunction.cache_keyc                 O  s    | j ttj|i |ddiS )NwarmupT)ru   map
MockTensor
wrap_dtyper1   r   r   r   r   r   r7    s    zJITFunction.warmupc                 C  sH   t | j}t|t jsJ t|jdksJ t|jd t js"J |S )Nr   r   )r@   rO   r2   r?   Moduler  bodyFunctionDef)r1   rR   r   r   r   rO     s
   zJITFunction.parsec                 O  s   t d)Nz:Cannot call @triton.jit'd outside of the scope of a kernel)RuntimeErrorr;  r   r   r   __call__  s   zJITFunction.__call__c                   s6   |dkrd | _ tt| || |dkrd | _d S d S )Nr0  r2   )r1  r)   rM   __setattr__rN   )r1   rg   r>   r3   r   r   rA    s   
zJITFunction.__setattr__c                 C  s   d| j  d| jj dS )NzJITFunction(:r   )r   r   r:   r   r   r   r   __repr__  s   zJITFunction.__repr__r&   rQ   )r   r   r   r   r&   rQ   )NNNN)r:   rK   rU   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r  r  r*   propertyr6  r7  rO   r@  rA  rC  rW   r   r   r3   r   rM      s:    








i%
rM   r   r&   JITFunction[T]c                 C     d S r5   r   )r   r   r   r   jit     rI  r   r   r3  rG   r   Optional[Iterable[int]]r3  Optional[bool]rG   Callable[[T], JITFunction[T]]c                 C  rH  r5   r   rK  r   r   r   rI    s   )r   r   r3  rG   	interpretOptional[T]rO  4Union[JITFunction[T], Callable[[T], JITFunction[T]]]c                  s*   d fdd}| dur|| S |S )	a<  
    Decorator for JIT-compiling a function using the Triton compiler.

    :note: When a jit'd function is called, arguments are
        implicitly converted to pointers if they have a :code:`.data_ptr()` method
        and a `.dtype` attribute.

    :note: This function will be compiled and run on the GPU. It will only have access to:

           * python primitives,
           * builtins within the triton package,
           * arguments to this function,
           * other jit'd functions

    :param fn: the function to be jit-compiled
    :type fn: Callable
    r   r$   r&   rG  c                   s6   t | sJ rddlm} || S t|  dS )Nr   )GridSelectorrK  )callableinterpreter.interpreterrR  rM   )r   rR  r3  r   rO  rG   r   r   r   	decorator  s   zjit.<locals>.decoratorNr   r$   r&   rG  r   )r   r   r   r3  rG   rO  rV  r   rU  r   rI    s   c                   @  s0   e Zd ZdZedd Zdd Zedd ZdS )	r9  zr
    Can be used in place of real tensors when calling:
        kernel.warmup(MockTensor(torch.float32), ...)
    c                 C  s"   | j jdkr| jdkrt| S | S )Nr|   r   )r4   r:   rK   r9  r   r   r   r   r:  -  s   
zMockTensor.wrap_dtypec                 C  s
   || _ d S r5   )r|   )r1   r|   r   r   r   r*   4     
zMockTensor.__init__c                   C  s   dS r   r   r   r   r   r   r   7  rJ  zMockTensor.data_ptrN)r:   rK   rU   rV   rE  r:  r*   r   r   r   r   r   r9  (  s    
r9  c                   @  s.   e Zd Zdd Zdd Zdd Zdd	d
ZdS )TensorWrapperc                 C  s*   || _ || _|j| _|j| _| jj| _d S r5   )r|   baseis_cudar   shape)r1   rZ  r|   r   r   r   r*   =  s
   zTensorWrapper.__init__c                 C  s
   | j  S r5   )rZ  r   r   r   r   r   r   D  rX  zTensorWrapper.data_ptrc                 C  s   | j |S r5   )rZ  stride)r1   r   r   r   r   r]  G  s   zTensorWrapper.strider&   rQ   c                 C  s   d| j  d| j dS )NzTensorWrapper[r   r   )r|   rZ  r   r   r   r   __str__J  s   zTensorWrapper.__str__NrD  )r:   rK   rU   r*   r   r]  r^  r   r   r   r   rY  <  s
    rY  c                 C  sP   t | tr|| jjkr| jS t| j|S t| drt| |S tdt|  d)Nr   zCannot reinterpret a r   )r?   rY  rZ  r|   r   r   r   )tensorr|   r   r   r   reinterpretN  s   


r`  r5   rW  )r   rL  r3  rM  rG   rM  r&   rN  )r   rP  r   rL  r3  rM  rG   rM  rO  rM  r&   rQ  )/
__future__r   r   r@   rw   r+   rI   r`   ri   r,  collectionsr   r   typingr   r   r   r	   r
   r   r   r   r   common.backendr   r   ra   dirnameabspathr^   rc   rk   r   r   r"   r#   r$   NodeVisitorr%   	lru_cachers   rt   rM   rI  r9  rY  r`  r   r   r   r   <module>   sX    ,
.
  ^3