o
    hZ                     @   sz  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je jgZdd Zd	d
 Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  g de
edddeddd iedejdejd ejd!ejd"ejd#ejdejfd$d%ZG d&d' d'e jjZejZdS )(    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   sR   | |u r| S | t v sJ |t v sJ t D ]}| |u r|  S ||u r&|   S qd S N)_ordered_datatypes)abd r   G/var/www/html/ai/venv/lib/python3.10/site-packages/triton/ops/matmul.pyget_higher_dtype
   s   r   c                    s    fddS )Nc                    s   |     S r   )zero_)nargsnamer   r   <lambda>   s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero   s   r   c                  C   s   g } dD ]?}dD ]:}dD ]5}dD ]0}|dkrdnd}|  t|||dd	||d
 dD ]}|  t||||d	||tdd q+qqqq| S )N)r               )       )r   @   )r   r          r    r   r   r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r      r   C)r)   r*   pre_hook)appendr   r   )configsr)   block_mblock_kblock_nr*   split_kr   r   r   get_configs_io_bound   s(   

r4   r!   r"   r   r#   r   r+   r(   r    r   r   )MNK
   )r
   
perf_modeltop_k)r/   keyprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr7   r&   r'   r   r   )argsr   r   r   r   L   s    r   dot_out_dtyper$   r%   r&   GROUP_Mr'   c           (      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| }t t || ||}t t || ||}|| t d| } | |d d d f | | d d d f |   } || d d d f | |d d d f |	   }t j||f|d}!tdt ||| D ]h}"|rt 	| }#t 	|}$n1||"||   }%t jd|j
jd}&t j	| | d d d f |%k |&d}#t j	|| d d d f |%k |&d}$|#|j
j}#|$|j
j}$|!t j|#|$|d7 }!| || | 7 } ||| | 7 }q|!|j
j}!|| t d| }|| t d| }||d d d f |
 |d d d f |   }||k d d d f ||k d d d f @ }'|dkrht j||!|'d d S t j||!|'d d S )Nr   r	   )dtype)r	   r	   )maskother)	out_dtype)rB   )tl
program_idr   minarangemax_contiguousmultiple_ofzerosrangeloadrA   
element_tytodotstore
atomic_add)(ABr,   r5   r6   r7   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnr?   r$   r%   r&   r@   r'   r=   pidpid_zgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkacckr   r   k_remaining_0rB   r   r   r   _kernel-   sL   
+
,,
  ,(
rm   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc           	         s  | j }| ddkr| ddkr|  } |ddkr'|ddkr'| }| jd |jd ks5J d| j\ }|j\}| jtjtjtjfv sU|jtjtjtjfv rYt	j
}nt| j|j}t	j f||d}|d u r|t	j
t	jt	jfv r|tj}n&tj}n"t|t	jsJ d|t	j
krtj
}n|t	jt	jfv rtj}ntj} fdd}t| | || || d| d|d|d|d|d|dd	 |S )
Nr   r	   zincompatible dimensions)devicerA   z#dot_out_dtype must be a torch.dtypec                    s$   t  | d t | d  | d fS )Nr$   r%   r'   )r   )METAr5   r6   r   r   r      s   $ z_matmul._call.<locals>.<lambda>r+   )r?   r@   )ro   stride
contiguousshaperA   rE   float8e4float8e4b15float8e5torchfloat16r   emptyfloat32bfloat16int32
isinstancerm   )	r   r   r?   ro   r7   _c_dtypecgridr   rq   r   _call   s@   


z_matmul._callNc                 C   s   t j|||dS )N)r?   )rn   r   )ctxr   r   r?   r   r   r   forward   s   z_matmul.forwardr   )	__name__
__module____qualname__rm   kernel_locksstaticmethodr   r   r   r   r   r   rn      s    
)rn   )rx    r   r   r   r   r   r   rE   matmul_perf_modelr
   r   ry   r|   r{   r   r   r   r4   	constexprrm   autogradFunctionrn   applymatmulr   r   r   r   <module>   sr    8
4