o
    hw{                     @   sb  d dl Z d dlZd dlmZ dd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd ZdUddZdd  Zd!d" Zd#d$ Ze r'd dlZd dlmZ d d%lmZmZ ejd&ejd'ejd(ejd)ejd*ejd+ejfd,d-Zejd'ejd(ejd*ejd+ejd.ejf
d/d0Zd1d2 Z d3d4 Z!d5d5dd6dd7d8ej"d9ej"d:ej"d;eej" d<e#d=eeee$ ee$ ee$ f  fd>d?Z%dd6dd@dAej"dBej"d;eej" d<e#d=eeee$ ee$ ee$ f  f
dCdDZ&ejdEejdFejfdGdHZ'dUdIdJZ(	K	6	dVdLej"dMej"dNej"dOeej" dPe)dQe#dRee) fdSdTZ*dS dZ(dZ&dZ%dZ*dS )W    N)get_device_capabilityc                  C   s@   t j sdS zdd l} | d uot dkW S  ty   Y dS w )NFr   )   r   )torchcudais_availabletritonr   ImportError)r    r	   N/var/www/html/ai/venv/lib/python3.10/site-packages/torch/sparse/_triton_ops.py_has_triton   s   
r   c                 C   s   | st |d S N)
ValueError)condmsgr	   r	   r
   check   s   r   c                 C   s   t |jtjk|  d d S )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layoutr   
sparse_bsr)f_nametr	   r	   r
   check_bsr_layout   s   
r   c                 C   s&   t |j|ko|jjdk|  d d S )Nr   z9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   r	   r	   r
   check_device   s   r   c                 C   s   t | dko| dk|  d|  d|  d |jdd  \}}|jdd  \}}t ||k|  d| d| d d S )N   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrnr	   r	   r
   check_mm_compatible_shapes%   s$   r$   c                 G   sF   t |j|ko|jtjtjtjft|  v |  d| d|j d d S )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r   )r   dtyper   halfbfloat16floattuple)r   r   r%   additional_dtypesr	   r	   r
   check_dtype6   s   
r+   c                    sP   t |dksJ dd   fdd}t|||  d|d  d|d	  d
 d S )Nr   c                 S   s   | | d @  S N   r	   )vr	   r	   r
   is_power_of_twoC   s   z(check_blocksize.<locals>.is_power_of_twoc                    s&   d}| D ]}|dko |o|}q|S )NT   r	   )bres	blocksizer/   r	   r
   is_compatible_blocksizeF   s   z0check_blocksize.<locals>.is_compatible_blocksizez(): sparse inputs' blocksize (r   z, r-   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r3   r5   r	   r4   r
   check_blocksize@   s   r7   c                 C   s(   |  ddkr|  ddkr|  S | S )Nr   r-   )stride
contiguous)r   r	   r	   r
   make_triton_contiguousT   s   r;   c                 G   s<   zt jdd |D  W S  ty   td|  d Y d S w )Nc                 s       | ]
}|j d d V  qd S Nr   r   .0r   r	   r	   r
   	<genexpr>]       z'broadcast_batch_dims.<locals>.<genexpr>Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorsr	   r	   r
   broadcast_batch_dims[   s
   rF   c                 g   s4    |D ]}t d g|  }||| < || V  qd S r   )slicer   )r   slice_rangerE   r   slicesr	   r	   r
   slicerb   s   rJ   c                 g   sP    |D ]"}t d g|  }t| |D ]\}}|d ur|||< q|| V  qd S r   )rG   r   zip)dimsrI   rE   r   sdd_slicer	   r	   r
   multidim_sliceri   s   rP   c                  g   s$    | D ]}|V  |  E d H  qd S r   )r9   )rE   r   r	   r	   r
   ptr_stride_extractorr   s
   rQ   c           	      #   s    dt    krdksJ  J dt   krdks!J  J dd l} fdd}fdd}|j|  D ]%}dd t |D }d	d t||D }|d d d
 g||R V  q8d S )Nr      c                  3   s(    t  D ]\} }td| |V  qd S )Nr   )rK   range)fgmg)	full_gridgrid_blocksr	   r
   generate_grid_points~   s   z.grid_partitioner.<locals>.generate_grid_pointsc                 3   s*       D ]\}}tt|| |V  qd S r   )itemsnextrP   )rI   r   t_dims)tensor_dims_mapr	   r
   generate_sliced_tensors   s   z1grid_partitioner.<locals>.generate_sliced_tensorsc                 S   s    g | ]\}}}t || |qS r	   )min)r@   rT   gprU   r	   r	   r
   
<listcomp>   s     z$grid_partitioner.<locals>.<listcomp>c                 S   s   g | ]\}}t ||| qS r	   )rG   )r@   r_   gr	   r	   r
   r`      s    r8   )r6   	itertoolsproductrK   )	rV   rW   r\   rb   rX   r]   
grid_pointgridrI   r	   )rV   rW   r\   r
   grid_partitionerx   s     rf   c                    sj   dd d d }|d u r|}ndd  t  fddt||D }t|||D ]^}}| |g|R   q&d S )N)i  rg   r8   c                 S   s   | d u r|S t dt| |S r,   )maxr^   )ra   rU   r	   r	   r
   valid_grid_dim   s   z%launch_kernel.<locals>.valid_grid_dimc                 3   s    | ]
\}} ||V  qd S r   r	   )r@   ra   rU   ri   r	   r
   rA      s    
z launch_kernel.<locals>.<genexpr>)r)   rK   rf   )kernelr\   rV   rW   cuda_max_gridre   sliced_tensorsr	   rj   r
   launch_kernel   s   rn   c                    s   |   d}|  d}t|  d}dd |D }tj|jd d gdd |D R  dd   |d	} |d	} ||jdd  } fd
d|D }|||g|R S )Nr   c                 S   s   g | ]	}t |d qS )r   )r;   	unsqueezer?   r	   r	   r
   r`      s    z"prepare_inputs.<locals>.<listcomp>c                 s   r<   r=   r>   r?   r	   r	   r
   rA      rB   z!prepare_inputs.<locals>.<genexpr>c                 S   s   |  || dt|d S )Nr   r-   )broadcast_toflattenr6   )r   
batch_dimsinvariant_dimsr	   r	   r
   batch_broadcast_and_squash   s   z2prepare_inputs.<locals>.batch_broadcast_and_squashr8   c                    s"   g | ]} ||j d d qS )r   Nr>   r?   ru   batch_dims_broadcastedr	   r
   r`      s    )crow_indicesro   col_indicesr;   valuesr   rC   r   )bsrdense_tensorsry   rz   r{   rE   r	   rw   r
   prepare_inputs   s&   &r~   c                 G   s~   t | |g|R  }| |d }| |d }| || jdd   }||jdd   }tj|||||jdS )Nrv   rp   r   sizer   )	rF   ry   rq   rz   r{   r   r   sparse_compressed_tensorr   )r   r|   rE   batch_shapery   rz   r{   r   r	   r	   r
   broadcast_batch_dims_bsr   s    r   c                 C   sH   | j ^ }}}|||d  |d ||d  |d g }| |ddS )Nr   r-   rp   r   )r   reshape	transpose)r   r3   restr    r#   	new_shaper	   r	   r
   tile_to_blocksize   s   

r   )OptionalTupleIS_BETA_ZEROBLOCKSIZE_ROWBLOCKSIZE_COLTILE_K	acc_dtype
allow_tf32c            5   	   C   s*  t jdd} t jdd}!|||   ||!  }"t |"}#t |"| }$|$|# }%|%dkr,d S t d|}&t d|}'|||   |	|#  |
|&d d d f   ||'d d d f   }(|||   ||#  })|||   ||!  ||&d d d f   }*|||   ||'d d d f   }+t d|},t|%D ]}-t j||f|d}.t |)}/td||D ]H}0|0|, }1|1|k }2t j|*||1d d d f   |2d d d f dd}3t j|+||/  ||1d d d f   |2d d d f dd}4|.t j|3|4|d7 }.q|r|.| 9 }.n| |. |t |(  }.t |(|.|j	j
 |(|	7 }(|)|7 })qd S )Nr-   axisr   r%           maskotherr   )tl
program_idloadarangerS   zerosdotstoretor%   
element_ty)5alphabetar   r   r   kr   
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_strider   r   	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blockr	   r	   r
   _sampled_addmm_kernel   s   #
		


r   GROUP_SIZE_ROWc           0      C   s  t jdd}t jdd}t jdd}t jdd}t jdd} t |||| |\}}|||  |	|  }!t |!}"t |!|	 }#|#|" }$|$dkrId S t d| }%t d|}&|||  ||"  ||%d d d f   ||&d d d f   }'|||  ||  ||&d d d f   ||%d d d f   }(|||  ||  ||  ||%d d d f   ||%d d d f   })|
||  ||"  }*t j| | f|d}+t|$D ]'},t |'}-t |*}.t |(||.  }/|+t j|-|/|d7 }+|'|7 }'|*|7 }*qt 	|)|+
|jj d S )Nr   r   r   r-   r   r   )r   r   num_programs	swizzle2dr   r   r   rS   r   r   r   r%   r   )0r   r   r   r   r   r   r   r   r   r   r   r   r   	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider   r   r   r   r   col_block_pidn_block_rowsn_block_colsr   r   r   r   r   r   r   dense_block_ptrsoutput_ptrsr   output_acc_blockr   values_blockdense_row_idxdense_blockr	   r	   r
   "_bsr_strided_dense_rowspace_kernel\  s   )





r   c              
      s   | d}| dd }| d}	||	|f}
|d ur4t|d d d d d ddt|d d    }nd }|d|d|d|d	|d
i}|jtjtjfv rQtj dntj	 d fdd}t
|||
| d S )Nr   r8   r-   rp   rR   r   r   NNr   Nr8   )r   rp   N)r   rp   TFc                    s,   t |  g t| R  dddd d S )N   r-   )r   r   r   
num_stages	num_warps)r   rQ   re   rm   r   r   r3   r	   r
   rk     s   z*_run_dense_rowspace_kernel.<locals>.kernelr   r)   r6   r%   r   r&   r'   r   float32float64rn   )r3   r{   ry   rz   denseoutputmax_grid	n_batchesr   r   rV   rW   r\   rk   r	   r   r
   _run_dense_rowspace_kernel  s(   


4r   c              
      s   | d}| dd }||f}|d ur.t|d d d d d ddt|d d    }nd }|d|d|d|	d|
di}|jtjtjfv rKtj d	ntj	 d
 fdd}t
|||| d S )Nr   r8   r-   r   r   )r   N)r   r8   )r   r   TFc                    s8   t |  gt| R  ddd d S )Nr-   r   )r   r   r   r   )r   rQ   r   r   r   r   r   r3   is_beta_zeror   tile_kr	   r
   rk     s    z)_run_sampled_addmm_kernel.<locals>.kernelr   )r   r   r   r3   r   r   r{   ry   rz   mat1mat2r   r   r   rV   rW   r\   rk   r	   r   r
   _run_sampled_addmm_kernel   s&   
4r   g      ?F)r   r   outskip_checksr   inputr   r   r  r  r   c                C   s`  d}t ||  t|| ||}	|st||| j t||| j |dkr3| jtju r3td| d| d | jtjurHt||| j t||| j nt|||j t	||| |d urt || t|||j t||| j t|j
|	j
koz| |  k| d|	j
 d|	  d|j
 d	|  	 |d u r|	j|jd
d}n||	 | dks| dkr|S | j
dd  }
|d}|d}|d}|dks|dkr| | |S |}t|||\}}}}}t||
d |f}t|||
d f}t|
 }t|||dk|
|||||||| |  dd  | dd  kr.| || j
 |S )Nsampled_addmmr   Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r   r8   r-   rp   )r   r   r   r   r%   r   boolr   r+   r$   r   _nnzr   copy_numelr{   r   mul_r~   r   rh   r   r9   r   )r  r   r   r   r   r  r  r   r   input_broadcastedr3   r    r#   r   
out_backupry   rz   r{   r   r	   r	   r
   r  +  st   






&r  )r  r  r   r|   r   c                C   s  d}|sIt ||  t|| |j t|| |j t|| | | d}|d}|  jdd  \}}	t	||  d| d| d t
|||	f n| jdd  \}}
|jdd  \}}t|| |}|d ur|s|||f }t	|j|kd| d|j d t	| p|dd d	 |d u r||||f }|  d
kr| S |  jdd  }|}t| ||\}}}}}t||d d d }t||d
 |d
 f}t||||||| |S )Nbsr_dense_mmr   r8   z"bsr_dense_mm(): dense.size(-1) == z( should be divisible by blocksize[0] == r   z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   )r   r   r   r+   r%   r$   r   r{   r   r   r7   rF   is_contiguousr   	new_emptyr  zero_r~   r   r   )r|   r   r  r  r   r   r    r#   	row_blockr   r!   r"   original_batch_dims_broadcastedexpected_out_shaper3   r  ry   rz   r{   r	   r	   r
   r    sX   


r  MAX_ROW_NNZTILEc                 C   sR  t jdd}t jdd}t jdd}| ||  ||  }t |}t || }|| }|dkr2d S t d|
}||| k }|||  ||  ||  }t j|| |td dt j}t j|dd}t|
|	|
D ]-}||
7 }||| k }t j|| |td dt j}t j|dd}t 	||k||}qjt 
|| }t j|dd}t|
|	|
D ]-}||
8 }||| k }t j|| |td dt j}t 
|| }|t j|dd7 }qt j|| || |jj|d t|
|	|
D ]5}||
7 }||| k }t j|| |td dt j}t 
|| }t j|| || |jj|d qd S )Nr   r   r-   r   infr   )r   )r   r   r   r   r(   r   r   rh   rS   whereexpsumr   r%   r   )r   r   r   r   r   r   values_nnz_col_block_strider  r   r  r  r   row_block_offset_pidr   r   r   r   r   
row_aranger   curr_row_values_ptrsrow_tilemax_row_valuer   curr_max_row_valuenumdenomr	   r	   r
   _bsr_softmax_kernel  s`   
"""""$r#  c                    s  d}t ||  t|| | j |  dks|  dkr|  S | jdd  \}}|  }|  jdd  \ d u r@t	|nt	| 
 ddd}|  dd ra|   }n|  }|dd dddd|  }|jd | f}d }	|dd df d|d	i}
 fd
d}t||
||	 |d| ddj|  j }tj| 
  |   || j| jdS )Nbsr_softmaxr   r   rp   r   r8   .r   r   c                    s0   t |  g t|  tdR   d S )Ni   )r#  rQ   r^   r   r   max_row_nnzr  r	   r
   rk   <  s   zbsr_softmax.<locals>.kernelr   )r   r+   r%   r  r	  cloner   r{   r   next_power_of_2ry   ro   rr   r   r  r:   r   rn   r   r   rz   r   )r  r&  r   r    r#   nnzry   r{   rV   rW   r\   rk   r	   r%  r
   r$    s>   

,	$

r$  r   querykeyvalue	attn_mask	dropout_p	is_causalscalec           
      C   sh  d}t | | d t |d u| d |d usJ t |jtjk| dtj d|j d t||| j t||| j t||| j t||| j t||| j |jtjur_t||| j t	|| |
ddd	d
d}|d u rw| ddks{|d	krt d
| d| d |d u rdt| d n|}	| |	 t|}tjjj| |dd t||}|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r   r   r8   r   F)r   r  r   z(): current value of scale == z results in division by zero.r-   T)pinplace)r   r   r   r   r   r   r+   r%   r  r  r   r   mathsqrtr{   r
  r$  nn
functionaldropoutr  )
r*  r+  r,  r-  r.  r/  r0  r   sdpascale_factorr	   r	   r
   r1  Q  sJ   	
 
r1  r   )r   FN)+r4  r   torch._inductor.cuda_propertiesr   r   r   r   r   r$   r+   r7   r;   rF   rJ   rP   rQ   rf   rn   r~   r   r   r   triton.languagelanguager   typingr   r   jit	constexprr   r   r   r   Tensorr  intr  r  r#  r$  r(   r1  r	   r	   r	   r
   <module>   s    
	
! z$%&{(0	
X
O	

FA2