o
    hu                     @   s(  d dl Z d dlZd dlZd dlmZ e s#dd ZdejfddZdS d dlZd dl	m
Z d dlmZmZ ejejd	d
iddejd	diddgdgdejd	ejfddZdejfddZejejddddddejddddddgddgdejdejdejdejfddZdd ZdS )     N)is_triton_availablec                 C      d S N )inputr   r   Y/var/www/html/ai/venv/lib/python3.10/site-packages/bitsandbytes/triton/quantize_global.pyquantize_global_transpose       r   xc                 C   r   r   r   )r
   r   r   r   quantize_global   r	   r   )early_config_pruneestimate_matmul_time
BLOCK_SIZEi      )	num_warpsi      )
num_stages
n_elements)configskeyc                 C   st   t jdd}|| }|t d| }||k }t j| | |d}	t |}
t jd|	|
  }t j|| ||d d S )Nr   )axismask     _@)tl
program_idarangeload	libdevicellrintstore)x_ptrabsmax_inv_ptr
output_ptrr   r   pidblock_startoffsetsr   r
   
absmax_invoutputr   r   r   _quantize_global   s   
r)   c                    sn   |    d}d| }tj| jdtjd}| jr|js J |   fdd}t	| | ||  ||fS )Nr         ?cudadevicedtypec                    s   t  | d fS )Nr   tritoncdiv)metar   r   r   <lambda>/   s    z!quantize_global.<locals>.<lambda>)
absmax	unsqueezetorchemptyshapeint8is_cudanumelr)   )r
   absmaxr'   r(   gridr   r3   r   r   )   s         )BLOCK_MBLOCK_NGROUP_MMNrB   rC   rD   c                 C   s  t d}||	 d |	 }||
 d |
 }|| }|| }t|||  |}|| ||  }|| | }||	 t d|	 }||
 t d|
 }| |d d d f | |d d d f |   } ||k d d d f ||k d d d f @ }t j| |d}t |}||	 t d|	 }||
 t d|
 }||d d d f | |d d d f |   }||k d d d f ||k d d d f @ }t jd||  }t j|||d d S )Nr   r   r   r   )r   r   minr   r   r   r   r    )Ar"   B	stride_am	stride_an	stride_bn	stride_bmrE   rF   rB   rC   rD   r$   grid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnr   ar'   r(   r   r   r   _quantize_global_transpose5   s(   
,(
,(rX   c              
      s   |    d}d| }| j\ tj dtjd}|dkr*|d ks,J | ddks<| ddks<J |ddksL|ddksLJ  fdd}t	| | ||| d| d|d|d 	 ||fS )Nr   r*   r+   r,   r   c                    s"   t  | d t | d  fS )NrB   rC   r/   )METArE   rF   r   r   r4   h   s   " z+quantize_global_transpose.<locals>.<lambda>)
r5   r6   r7   r:   r8   r9   r;   sizestriderX   )r   r>   r'   outr?   r   rZ   r   r   ^   s   
   4)mathr8   time bitsandbytes.triton.triton_utilsr   r   Tensorr   r0   triton.languagelanguager   triton.ops.matmul_perf_modelr   r   autotuneConfigjit	constexprr)   rX   r   r   r   r   <module>   sH    	