o
    h4                     @   s   d Z ddlZddlmZmZ ddlmZ edejdejdejd	ejfd
dZedejdejfddZ	edejdejdejdejdejf
ddZ
edejdejdejdejdejf
ddZG dd dejjZejZdS )ao  
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)

Sequence Parallel implementation inspired by HazyResearch
(see https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py)
    N   )cdivjit)languageBLOCK_MBLOCK_DMODELBLOCK_N	IS_CAUSALc           6   	   C   s  t d}t d}|| }t j| | ||f||	f|| df||fdd} t j|| ||f||fd||fdd}!t j|| ||f||fd||fdd}"|| t d| }#t d|}$t j|gt jdtd }%t j|gt jd}&t j||gt jd}'|d	 }(t | })|)|( |j	j
})d}*|r|d | n|}+t|*|+|D ]},t |!}-t |"}.t j||gt jd}/|rt |#d d d f |,|$d d d f  k|/td
}/|/t j|)|-dd7 }/t |%t |/d}0t j|%|0 }1t j|/|0d d d f  }2|&d |1 }3|'|3d d d f 9 }'|'t j|2|j	j
|.dd7 }'|&|1 t |2d }&|0}%t |!d|f}!t |"|df}"q|'|&d d d f  }'|||  |# }4t |4|%t j|&  t j|| ||f||f|| df||fdd}5t |5|'|j	j
 d S )Nr      )r
   r   )baseshapestridesoffsetsblock_shapeorder)r   r   )r   r
   dtypeinf/ldG?-infT
allow_tf32)tl
program_idmake_block_ptrarangezerosfloat32floatloadtor   
element_tyrangewheredotmaximummaxmathexp2sumadvancestorelog2)6QKVsm_scaleLOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vk	stride_vn	stride_oz	stride_oh	stride_om	stride_onZHN_CTXr   r   r   r	   start_moff_hz
qvk_offsetQ_block_ptrK_block_ptrV_block_ptroffs_moffs_nm_il_iaccqk_scaleqlohistart_nkvqkm_i_newalphap	acc_scalel_ptrsO_block_ptr r_   P/var/www/html/ai/venv/lib/python3.10/site-packages/triton/ops/flash_attention.py_fwd_kernel   s   


	


2
ra   D_HEADc           
      C   s   t d| t d| }t d|}t | |d d d f |  |d d d f  t j}t ||d d d f |  |d d d f  t j}t j|| dd}	t || |	 d S )Nr   r
   )axis)r   r   r   r   r    r   r)   r+   )
r2   DODeltar   rb   off_moff_nododeltar_   r_   r`   _bwd_preprocessr   s   66rk   SEQUENCE_PARALLELCAUSALc$           A      C   s  |"r|| tj| 7 }|#r|| }$nd}$|$td| }%|| td| }&td|!}'td| }(| |%d d d f | |(d d d f |   })||&d d d f | |(d d d f |   }*||&d d d f | |(d d d f |   }+||%d d d f | |(d d d f |   },||%d d d f | |(d d d f |   }-|||  }.|
||  }/tj|| gtjd}0tj|| gtjd}1t|*}2t|+}3t|$|| |D ]}4|4|' }5t|)}6|#rt|5d d d f |&d d d f kt	dt	d}7n
tj||!gtjd}7|7t
|6t|27 }7|7|9 }7t|/|5 }8tj|7|8d d d f  }9t|,}:|0tj
t|9 | jj|:dd7 }0t|.|5 };tj
|:t|3dd}<|9|<|;d d d f   |  | jj}=|1tj
t|=|6dd7 }1|"st|-}>|>tj
|=|2dd7 }>t|-|> n|"rttj
t|2t|=dd}>t|-|> |-|| 7 }-|)|| 7 })|,|| 7 },q|	|&d d d f | |(d d d f |   }?||&d d d f | |(d d d f |   }@t|?|0 t|@|1 d S )Nr   r   g        r   Tr   )r    r   int64r   r   r   r   r"   r#   r   r$   transr'   r(   r   r!   r+   )Ar-   r.   r/   r0   rQ   r2   rd   DQDKDVr1   D
stride_dqar3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   rC   rD   rE   rG   rU   	num_blockr   r   r   rl   rm   rS   offs_qmrM   rL   offs_kq_ptrsk_ptrsv_ptrsdo_ptrsdq_ptrsD_ptrsr]   dvdkrV   rW   rF   offs_m_currrR   rX   rO   r[   ri   Didpdsdqdv_ptrsdk_ptrsr_   r_   r`   _bwd_kernel_one_col_block   sd   
,,,,,


4
$&
",,r   c            &   	   C   s  |d } t d}!|!| }"|!| }#| |"| |#|  7 } ||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }t ||}$|std|$D ]L}%tg | |||| ||||||	|
|||||||||||||||||!|%|$R |||||d qdd S t d}%tg | |||| ||||||	|
|||||||||||||||||!|%|$R |||||d d S )Nr   r   )r   r   r   rl   rm   r
   )r   r   r   r"   r   )&r-   r.   r/   r0   r2   rd   rp   rq   rr   r1   rs   rt   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   rC   rD   rE   r   r   r   rl   rm   rQ   rG   off_zoff_hnum_block_nrU   r_   r_   r`   _bwd_kernel   s0  
			


			
r   c                   @   s&   e Zd ZedddZedd ZdS )
_attentionFc                 C   s  t j }|d dk rtdd}d}	|jd |jd |jd }
}}|
|kr,||ks.J |dv s4J t |}t|jd ||jd |jd	  d	f}t j|jd |jd	  |jd f|jt j	d
}|dkridnd}t
| |||||||d|d	|d|d|d|d	|d|d|d|d	|d|d|d|d	|d|d|jd |jd	 |jd f||	|||dd | ||||| || _|| _|| _|| _|| _|S )Nr      zEFlash attention currently only supported for compute capability >= 80   @   >          r   r   r   r
   devicer         )r   r   r   r	   	num_warps
num_stages)torchcudaget_device_capabilityRuntimeErrorr   
empty_liker   emptyr   r   ra   stridesave_for_backwardgridr0   r   causalsequence_parallel)ctxrR   rV   rW   r   r0   r   
capabilityr   r   LqLkLvrh   r   r1   r   r_   r_   r`   forward#  sD   
"
&.    	z_attention.forwardc                 C   s  d}| j \}}}}}| j}|jd }	| }|r.t|	|}
|
f|j }tj||j|jd}ntj	|tj
d}t|}t|}t|}t| jd | jd  f ||||| jd t| jd |rft|	|ndf |||| j|||||||| |d|d|d|d|d|d|d|d|d|d|d|d|jd |jd |jd f||| j|| jd	dd
 t|jdkr|jdd}|||d d d fS )Nr   r   r   r   r   r
   )r   rb   r   r   )r   r   r   rl   rm   r   r      )dim)saved_tensorsr   r   
contiguousr   r   r   r   r   
zeros_liker   r   rk   r   r   r   r0   numelr   r   lenr)   )r   ri   BLOCKrR   rV   rW   rh   r1   r   
seq_len_kvreplicasnew_dq_shaper   r   r~   rj   r_   r_   r`   backwardI  sN   





&  
z_attention.backwardN)F)__name__
__module____qualname__staticmethodr   r   r_   r_   r_   r`   r   !  s
    %r   )__doc__r    r   r   r   r   	constexprra   rk   r   r   autogradFunctionr   apply	attentionr_   r_   r_   r`   <module>   s\    
		
`YC
T