o
    h<                     @   s   d dl Z ddlmZmZmZ ddlmZ eddd iedejd	ejd
ejdejdejf
ddZdddZ	dd Z
edejd	ejd
ejdejdejf
ddZdddZdd ZdddZG dd de jjZG dd dZdS )     N   )cdiv
heuristicsjit)languageEVEN_Kc                 C   s   | d | d  dkS )NKTILE_Kr    )nargsr
   r
   S/var/www/html/ai/venv/lib/python3.10/site-packages/triton/ops/blocksparse/matmul.py<lambda>       r   TILE_MTILE_Nr	   BLOCKc           *      C   s<  t d| }||d 7 }t d}t |d }t |d }|| t d||  }t d|}| ||  ||  |d d d f |  |d d d f |  }t |d }|| t d||  }t d|} |||  ||  |d d d f |
  | d d d f |	  }!t j||ft jd}"t|d| D ]F}#|rt |}$t |!}%n t j||d d d f |#k dd}$t j|!| d d d f |#k dd}%|"t j|$|%t jd7 }"||| 7 }|!||	 7 }!q|"|j	j
}&t d|| }'t d|| }(|||  ||  |'d d d f |  |(d d d f |  })t j|)|&d	d
 d S )Nr   r         dtypeg        )maskother	out_dtypeTr   )tl
program_idloadarangezerosfloat32rangedottor   
element_tystore)*ABC	stride_za	stride_ha	stride_ma	stride_ak	stride_zb	stride_hb	stride_bk	stride_nb	stride_zc	stride_hc	stride_mc	stride_ncr   grid_offsetlutr   r   r	   r   r   block_idoff_zoff_hstart_amoffs_amoffs_aka_ptrsstart_bnoffs_bnoffs_bkb_ptrsacckabcoffs_cmoffs_cnpcr
   r
   r   _sdd_kernel   sf   

  rJ   c
                 C   s  |  ddkr|  ddkr|  } | ddkr$| ddkr$| }|r2|| } }| | }}|r6dnd}
|r<dnd}| j|
 |j| }}||krXtd| d| d|	d u rqtj| jd	 |jd	 ||f| j| jd
}n|	j| jd	 |jd	 ||fksJ |	}|jd d|jd	 g}t| | |||  d	|  d|  |rdnd|  |rdnd| d	| d| |rdnd| |rdnd| d	| d| d| d|d	|||d|ddd |S )Nr   r   r   zInner dimension mismatch (A: z vs B: )r   r   device       )r   r   r	   r   
num_stages	num_warps)	stride
contiguousshape
ValueErrortorchemptyr   rO   rJ   )rD   rE   trans_atrans_btrans_cspdimsblockr6   widthsouta_dimb_dimKaKbrF   gridr
   r
   r   
sdd_matmulT   s6   
*"00 
	rf   c                 C   s&   | j dd| }| }|d fS )NFas_tuple)nonzeror#   intrU   )layoutr^   rO   r6   r
   r
   r   sdd_lutv   s   rl   GROUP_SIZE_Mc           4      C   s  t d}t d}t d}t d}t |||||\}}t d}||d  }t |d }t |d }t |d }t |d } || }!t |!d }"t |"d}"t d|}#t d|}$| ||  |"|  |#d d d f |  |$d d d f |  }%|| t d| }&t t |&| ||}&t |!}'t |'d}'|'t d| }(|||  | |  |&d d d f |
  |(d d d f |	  })t j||ft j	d}*|!d7 }!t |!d }+t |+d}+t |!},t |,d},t
|d| D ]=}-t |%}.t |)}/|*t j|.|/t j	d7 }*|%|+7 }%|)|,|	 7 })|!d7 }!t |!d }+t |+d}+t |!},t |,d},q|*|jj}0|| t d| }1|| t d| }2|| |  ||  |1d d d f |  |2d d d f |  }3t j|3|0|2d d d f |k d	 d S )
Nr   r   r   rQ   r      r   r   r   )r   r   num_programs	swizzle2dr   multiple_ofr   max_contiguousr   r    r!   r"   r#   r   r$   r%   )4r&   r'   r(   	stride_azr*   	stride_amr,   r-   r.   r/   	stride_bnr1   r2   	stride_cm	stride_cnDS0DS1r6   r   r   r	   rm   r   pid_mpid_n	num_pid_m	num_pid_npidzheaderoffsetr   columnr9   pincr7   r;   r<   par?   start_bkr@   pbrB   inc_ainc_brC   rD   rE   rF   rG   rH   rI   r
   r
   r   _dsd_kernel   s   











$r   c
                    s  |  ddkr|  ddkr|  } | ddkr$| ddkr$| }|||r*dnd  }
|d |d}||r>dnd| j} }|}|rLn|
}|rR|
n}|	d u rftj||||f|| jd}n|	j||||fksqJ |	}d} fdd}t| | |||  d|  d|  |rdnd|  |rdnd| d| d| |rdnd| |rdnd| d| d| |rdnd| |rdnd|
|f||t	|d	|d
d
d
d |S )Nr   r   r   r   rN      c                    s   t | d  gS )Nr   )r   )metaBS0BS3widthr
   r   r      r   zdsd_matmul.<locals>.<lambda>rP   rQ   )r   r   r	   r   rR   rS   rm   )
rT   rU   sizer   rX   rY   rO   rV   r   min)rD   rE   rZ   r[   r\   r]   r^   r6   r   r`   AS1BS1r   CS0CS1CS2CS3rF   r   re   r
   r   r   
dsd_matmul   s<   

000
r   c                 C   s  t | |rdnd}t |jdd\}}| }|| }|r&| jdd}	n
| ddjdd}	|	d}
t |}t j|dd dd	|dd< t 	||
d t | }|	dddf | }|
 }|dd  |dd 8  < || }|ddd|}||ddddf< |dddf  |d | 8  < |||dk  |||dk df< |d}|rt j|
| jd
}nLt jg t j| jd}d}t| dD ]8}| |ddddf 
  }| }dt j|| jd
 ||dk< t |||j|jdk  d f}||7 }q|| | }|dd  |dd | | 8  < |ddd|}|rF||ddddf< |dddf  |d | 8  < n|| |ddddf< |dddf  |d | | 8  < |||dk  |||dk df< |d}|d}|d | d|  }|| }t j||||fdd	d }t j||fdd	d }t jd|j|jd}t ||f}t ||f}|t j|}||fS )a  
    Generates the look-up table for incrementing pointers in the DSD/DDS matmul.
    Example (BLOCK=32, STEP=16)
    [[1, 0, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 1, 0, 0]]

    Then the offsets for A are
     [0 , 16, 32, 48] <- row 0
      \----/  \----/
      col=0   col=3
     [64, 80, 96, 112, 128, 144] <- row 1
      \----/   \----/  \------/
       col=1    col=2    col=3
     [160, 176, 192, 208]
    which leads to increments table
    [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16]

    Because B is dense, the offsets are
    [0, 16, 96, 112] <- row 0
    [32, 48, 64, 80]  <- row 1
    [0, 16, 64, 80]   <- row 2
    r   r   Trg   Fr   NrL   )dim)rO   rN   rQ      )rO   r   )rX   sum	ones_likeri   flatten	transposer   
zeros_likecumsumr   cloneviewrepeatr   rO   tensorint64r!   longcatTstackrU   r   r   typeint32r#   )rk   r^   steptransrO   sizeshead_idcol_idsegmentsnnz
num_blocksoffsetsB_idxB_incsdivA_idxcurrent_offsetzlayoutwmsumA_incsr   r   incspadr6   r
   r
   r   dsd_lut   sd   

  
"
$"$ 

 r   c
           
      C   s"   t || | | | |||||	d
S N)r`   )r   )
rD   rE   rZ   r[   r\   r]   r^   r6   r   r`   r
   r
   r   
dds_matmulZ  s   "r   c                   @   s0   e Zd ZeeedZedd Zedd Z	dS )_matmulsdddsdddsc                 C   sx   t j| ||||||||	|
|d
}| || || _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|d u| _|S r   )r   fnsave_for_backwardda_lutda_widthdb_lutdb_widthmoder]   r^   rZ   r[   r\   has_out)ctxrD   rE   rZ   r[   r\   r   r]   r^   c_lutc_widthr   r   r   r   r`   rF   r
   r
   r   forwardf  s   "
z_matmul.forwardc           
      C   s   | j \}}d\}}| j}| jd r4|d |d  |d  }tj| ||| j| j | j| j| j	| j
| j	}| jd r\|d |d  |d  }tj| ||| j | j| j| j| j	| j| j	}| jra|nd }	||d d d d d d d d d d d d |	fS )N)NNr   r   r   )saved_tensorsr   needs_input_gradr   r   r\   r[   rZ   r]   r^   r   r   r   r   r   )
r   dcrD   rE   dadbr   mode_damode_dbdoutr
   r
   r   backward{  s$   

"
"
z_matmul.backwardN)
__name__
__module____qualname__rf   r   r   r   staticmethodr   r   r
   r
   r
   r   r   b  s    
r   c                   @   s    e Zd ZdddZdddZdS )	matmulFc           	      C   sD  |dvrt d|| _|| _|| _|| _|| _|| _|j| _t	|d}| jdkrJt
|||\| _| _t|||d|\| _| _t|||d|\| _| _| jdkrtt|||| j |\| _| _t
|||\| _| _t|||| j|\| _| _| jdkrt|||| j|\| _| _t|||| j |\| _| _t
|||\| _| _d S d S )	Nr   z"Supported modes are: sdd, dsd, ddsrP   r   TFr   r   )NotImplementedErrorr^   r   rZ   r[   r\   rk   rV   r]   r   rl   r   r   r   r   r   r   r   )	selfrk   r^   r   rO   rZ   r[   r\   r   r
   r
   r   __init__  s.   



zmatmul.__init__Nc                 C   sB   t ||| j| j| j| j| j| j| j| j	| j
| j| j| j|}|S N)r   applyrZ   r[   r\   r   r]   r^   r   r   r   r   r   r   )r   rD   rE   r`   rF   r
   r
   r   __call__  s   zmatmul.__call__)FFFr   )r   r   r   r   r   r
   r
   r
   r   r     s    
r   r   )rX    r   r   r   r   r   	constexprrJ   rf   rl   r   r   r   r   autogradFunctionr   r   r
   r
   r
   r   <module>   sJ    
@"
K&
f1