o
    h                     @   s   d dl Z ddlmZ ddlmZ ddlmZ dd Zedejd	ejd
ejfddZedejd	ejd
ejfddZ	G dd de j
jZG dd dZdS )    N   )jit)language)next_power_of_2c                 C   s4   | dkrdS | dkrdS | dkrdS | dkrdS d	S )
N            i      i          )nr   r   T/var/www/html/ai/venv/lib/python3.10/site-packages/triton/ops/blocksparse/softmax.py	num_warps   s   r   ROW_SIZE
BLOCK_SIZEIS_DENSEc                  C   s  t d}t d}t d}|t d | }t d|
| }t d|
| }||| d  }t |d }t |d }|| }||| | | 7 }||| | 7 }|r]t d|
}n$|dt d t d |  }t j|| | ||k dd}|| | }||k }t j|| | |td d}|t j}|}||9 }|d ur||| 7 }||| 7 }|| d | }|dk||k @ }t j|||  | |dd}||7 }|t j}t ||k|	@ td |}t 	|}t j
| | | ||d d S )Nr   r   r	   maskotherinf        r   )tl
program_idnum_programsarangeloadfloattofloat32wheresoftmaxstore) OutA	stride_xzLUTRextent	stride_zr	stride_hrscale	is_causalr   r   r   hmzhmlane_nblock_nheadersizeoffsetoff_ansoff_lutstart_nr   aoutoff_lomask_lo
rel_logitsr   r   r   _blocksparse_softmax_fwd   sB   
	

 
rA   c           '      C   s   t d}t d}t d}|t d | }t d|| }t d|| }||| d  }t |d }t |d }|| | | }||| | 7 }||k }|||  | }|||  | }|rkt d|}n"|dt d t d |  }t j|| | |dd} | | | }t j|| |dd}!|!t j}!t j|| |dd}"|"t j}"t ||k|@ |!|!k@ d|!}!|!|"t |!|" d  }#|d ur|||
 7 }||| 7 }|	| d | }$|$dk|$|	k @ |@ }%t j	|||	  |$ |#|%d |#| }#| ||  | }&t j	|&| |#|d d S )Nr   r   r	   r   r   r   )
r   r   r   r   r   r    r!   r"   sumr$   )'DA
stride_zdxDOutstride_zdoutr%   stride_zoutr-   r(   DRr*   r+   r,   	stride_err.   r   r   r   r/   r0   r1   r2   r3   r4   r5   r6   r7   off_mnr   AsDOutsr9   r:   r;   r<   doutdar>   r?   DAsr   r   r   _blocksparse_softmax_bwdK   sD   


 rP   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )_softmaxc              	   C   s   t jg t j| jd}| }t| jd D ]}t || |d d d d f df}q|| }t 	|}t j
|d d dd|dd < | jddd d df }t j||fddd}	t |	|ft j|}
|
t| fS )	Ndtypedevicer   )dimr   F)as_tupler	   )torchtensorint64rT   clonerangeshapecatrB   
zeros_likecumsumnonzerostackviewtypeint32r    intmax)layoutblockrT   _emptysizesr/   total_sizesoffsetscolumnsr5   lutr   r   r   make_lut   s   (
z_softmax.make_lutc
                 C   s  |d urt |tjr|jjdksJ | }|jd }
|d |d | |
g}|d u r,dn|j}|d u r5dn| }t|}t	| |||d|||d |d |d |||t
||	t|d | || || _|| _|| _|| _|| _|| _|j| _|	| _|| _|S )Ncpur   r   )r   r   r   r   rU   r   r   r   r   )
isinstancerX   TensorrT   rd   itemr]   stride
empty_likerA   r   r   save_for_backwardspdimsri   maxlutr-   	rel_shaperel_stridesrS   	rel_dtypeis_denser.   )ctxr<   r-   r@   r.   ry   ri   ro   rz   r~   Mgridr{   r|   r=   r   r   r   forward   s:   

z_softmax.forwardc                 C   s   | j \}}d }| jd rtj| j| j|jd}|jd }| jd | jd | j	 |f}t
|}t| ||d||d||d| j||| jd | jd | jd | jd | j| j	t| j| jt| jd |d d |d d d d d d d d d d d d d d fS )Nr   rR   r   r   rU   r	   rr   )saved_tensorsneeds_input_gradrX   zerosr{   r}   rT   r]   ry   ri   rw   rP   rv   r-   r|   r.   r   rz   r~   r   )r   rM   r=   ro   drr   r   rN   r   r   r   backward   s6   






"

z_softmax.backwardN)__name__
__module____qualname__staticmethodrp   r   r   r   r   r   r   rQ      s    

%rQ   c                   @   s(   e Zd Zd	ddZddddddZdS )
r#   Fc                 C   s8   |j | _|| _|| _t| j| j|\| _| _|| _d S )N)	r]   ry   rh   ri   rQ   rp   ro   rz   r~   )selfrh   ri   rT   r~   r   r   r   __init__   s
   
zsoftmax.__init__g      ?N)r-   r@   r.   c                C   sL   |d ur|j |j krtd|j  t||||| j| j| j| j| j	}|S )Nz$relative position embedding must be )	rS   
ValueErrorrQ   applyry   ri   ro   rz   r~   )r   r<   r-   r@   r.   r   r   r   __call__   s   zsoftmax.__call__)F)r   r   r   r   r   r   r   r   r   r#      s    
r#   )rX    r   r   r   r   r   	constexprrA   rP   autogradFunctionrQ   r#   r   r   r   r   <module>   s.    6	
;Z