o
    hN                 +   @   sn  d dl Zd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z
d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ ddlmZmZ d	d
 Zi Zer	 i Zejejejfed< ejejfed< ej ej!fed< ej"ej#ej$fed< ej%ej&fed< i Z'ej(ej)fe'd< ej*ej+fe'd< ej,ej-fe'd< ej.ej/fe'd< ej(ej)fe'd< ej*ej+fe'd< i Z0ej1ej2ej3fe0d< ej4ej5fe0d< ej6ej7fe0d< ej8ej9ej:fe0d< ej;ej<fe0d< G dd dZ=G dd dZ>G dd dZ?i Z@de@ejA< de@ejB< de@ejC< de@ejD< de@ejE< ejAejFdd ddddZGdd d!ZHdd#d$ZIdd%d&ZJdd'd(ZKdd)d*ZLdd,d-ZMdd/d0ZNdd2d3ZOdd5d6ZPdd7d8ZQd9d: ZRd;d< ZSd=ed>ejTfd?d@ZUdAdB ZVdCdD ZWddEdFZX	ddHdIZY	G				ddJdKZZdd=edNedOe[d>efdPdQZ\G dRdS dSZ]dd=edUedVedNed>ef
dWdXZ^					T	dd=edYe]dVedUedNedZe_d>efd[d\Z`dd^d_Zadd=edVedNefd`daZbdd=edVedNefdbdcZcdd=edVedNed>efdedfZddd=edYe]dVedNedZe_d>efdgdhZedd=edYe]dVedNedZe_d>efdidjZfdd=edYe]dVedNedZe_d>efdkdlZgdd=edUedNed>efdmdnZh				dd=edoeeef dVedUedNed>efdpdqZidd=edUedNed>efdrdsZjdd=edUedNed>efdtduZk		v	v	w		v	ddxeldyedzed{ed|e[d}e[d~e_de[dede[de[de[dede[d>dfddZm	v	w		vddxeldyedzed{eded|e[de[d}e[d~e_de[dedededededede[de[dede[d>df*ddZn	v	w	ddxeldyedzed{eded|e[de[d}e[d~e_de[dedededede[de[d>df"ddZo	1ddeded~e_de_fddZpdedededefddZqejEfddZr				dd=ededNefddZs			dd=ededNefddZt			dd=ededNefddZuddejvfddZw				dddZx	vdddZyG dd dZzG dd dZ{G dd dZ|dd Z}dd Z~ejfddZ	vdddZdddZdddZdddZdZdddńZdddǄZejdfddɄZejfdd˄Zdd̈́ Zddτ ZdS )    N)norm)reduce)TupleAnyDict)Tensor)pack_dict_to_tensorunpack_tensor_to_dict   )COMPILED_WITH_CUDAlibc                 C   s   t tj| dS Nr
   )r   operatormul)iterable r   M/var/www/html/ai/venv/lib/python3.10/site-packages/bitsandbytes/functional.pyprod   s   r   adammomentumrmsproplionadagradlamblarsc                   @   s6   e Zd ZdZdd Zdd Zedd Zdd	d
ZdS )GlobalPageManagerNc                 C      t dNzCall get_instance() insteadRuntimeErrorselfr   r   r   __init__d      zGlobalPageManager.__init__c                 C   s
   g | _ d S N)paged_tensorsr    r   r   r   
initializeg      
zGlobalPageManager.initializec                 C   &   | j d u r| | | _ | j   | j S r$   	_instance__new__r&   clsr   r   r   get_instancej      

zGlobalPageManager.get_instanceFc                 C   s$   | j d d d D ]}t|| qd S )N)r%   prefetch_tensor)r!   to_cputr   r   r   prefetch_allq   s   zGlobalPageManager.prefetch_allF)	__name__
__module____qualname__r*   r"   r&   classmethodr.   r4   r   r   r   r   r   a   s    
r   c                   @   s4   e Zd ZdZdd Zdd Zedd Zdd	 ZdS )
CUBLAS_ContextNc                 C   r   r   r   r    r   r   r   r"   }   r#   zCUBLAS_Context.__init__c                 C   s
   i | _ d S r$   )contextr    r   r   r   r&      r'   zCUBLAS_Context.initializec                 C   r(   r$   r)   r,   r   r   r   r.      r/   zCUBLAS_Context.get_instancec                 C   sP   |j | jvr"tj }tj| tt	 | j|j < tj| | j|j  S r$   )
indexr;   torchcudacurrent_device
set_devicectc_void_pr   get_context)r!   deviceprev_devicer   r   r   rC      s   
zCUBLAS_Context.get_context)	r6   r7   r8   r*   r"   r&   r9   r.   rC   r   r   r   r   r:   z   s    
r:   c                   @   s,   e Zd ZdZdd Zdd Zedd ZdS )Cusparse_ContextNc                 C   r   r   r   r    r   r   r   r"      r#   zCusparse_Context.__init__c                 C   s   t t | _d S r$   )rA   rB   r   get_cusparser;   r    r   r   r   r&      s   zCusparse_Context.initializec                 C   r(   r$   r)   r,   r   r   r   r.      r/   zCusparse_Context.get_instance)r6   r7   r8   r*   r"   r&   r9   r.   r   r   r   r   rF      s    rF         r>   )r<   dtyperD   c                 G   sp   t |  t| }tt|}t|ttj}t	j
j||d}tj|| t|d|}d|_|j|_|S )Nshape)rK   countT)dtype2bytesr   r   cget_managed_ptrrA   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayr=   
frombufferviewis_pagedr<   page_deviceid)rK   rD   rM   	num_bytescuda_ptrc_ptr	new_arrayoutr   r   r   	get_paged   s   ra   Fc                 C   sR   | j sJ d|rd}n| j}t| j |   }tt| t	|t
| d S )Nz%Only paged tensors can be prefetched!r0   )rZ   r[   rO   rK   numelr   	cprefetchget_ptrrA   rQ   c_int32)Ar2   deviceidr\   r   r   r   r1      s   "r1   Tc                 C   s   d }|j tjkrttd|  dd }t|}n|j tjkr-ttd|  dd }t|}|d u r8t	d|  t|dd}|rN|rNt
| |d urNt
| |t|t||t|  |jsd|jrktj  d S d S )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rK   r=   float32getattrr   rA   c_floatuint8c_uint8NotImplementedErrorr1   rd   c_int64rb   rZ   r>   synchronize)	func_namerf   Bvalueprefetchfunccvaluerk   r   r   r   elementwise_func   s    
 rz   c                 C   s   t d| d | d S )Nfillrz   )rf   rv   rD   rw   r   r   r   r{          r{   c                 C   s   t d| d d d S )Naranger   r|   )rf   rD   r   r   r   r~      r}   r~   c                 C   s   t d| |d d S )N_mulr   r|   )rf   ru   rD   r   r   r   r      r}   r      c                 C   s   | rdnd}d| }|s|dk r| sd| nd| d }t |d|}d|  }|dkr/|S | d }t |d |  dg|  ||d    S )	N              rI   r   r
         ?   r   )r=   linspacerb   r   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgaplr   r   r   create_linear_map   s   0r   +ew?c                 C   s   |r)t t| ddd d  }dgd }t t| ddd d   }n&t t| ddd d  }dgd }t t| ddd d   }|| | }t|}| j}||  }|	 dksmJ |S )	Ng      ?	   r0   r      r      r   )
r   ppfr=   r   r   r   sortr   maxrb   )offsetuse_extra_valuev1v2v3vr   r   r   r   create_normal_map   s    
$ 
"

r      c                 C   s  |}|}| rdnd}|| || ksJ g }g }t td||   d||  dD ]\}	}
|d|
  q)g }ttjddg|d}d|d  }td| D ]I}|D ]D}|dkrZdnd}t t|D ]\}	}||d|	d    7 }qb|dkr~|d|   }n|d|| d    }|| | r||  qRqNt|d| ksJ |  |dk rdt| }t|D ]}	|d q|  t	|}||
  }|S )Nr
   r   rI   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   r=   r   r   )r   exponent_bitsprecision_bitsr   ephas_signevaluespvaluesivalr   lstbiasevaluebit_patternrv   pvalr   coder   r   r   create_fp8_map  sD   *

r      c                 C   s  g }|| rdnd }d||  d }t |D ]R}t| r%d|| |  d nd|| | d  d }tdd|}|dd |dd  d }	|d|d  |  |	  7 }| rh|d|d  |   |	  7 }q|dkrtdd|d }|dd |dd  d }	|d|d  |  |	  7 }| r|d|d  |   |	  7 }|d |d	 t|d| ksJ d
t| }
t |
D ]}|d q|  t|S )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r
   rI   g?Nr0          @
   r   r   r   )	r   intr=   r   r   r   r   r   r   )r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   r   r   r   create_dynamic_map1  s2   4  

r   c                 C   sn   t | d| d d}| }|d dt| }t|D ]}|d q|  t|}||   }|S )NrI   r
   )num_quantilesr   r   )	estimate_quantilesr   r   r   r   r   r   absr   )rf   r   qr   r   r   r   r   create_quantile_mapc  s   
r   c                  C   s8   t j sdS t j \} }| dkrdS | dkrdS dS )N
col_turingr   r   
col_ampere)r=   r>   is_availableget_device_capability)major_minorr   r   r   get_special_format_strr  s   r   c                 C   s   d}t  }| D ] }|d u rqt|dd}||jjdkp|M }|s'||jj q|s6tddd | D  t|dkrHtd	d
d | D  |S )NTrZ   Fr>   zZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 c                 S      g | ]}|j |jfqS r   rM   rD   .0r3   r   r   r   
<listcomp>      zis_on_gpu.<locals>.<listcomp>r
   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 c                 S   r   r   r   r   r   r   r   r     r   )setrm   rD   typeaddr<   	TypeErrorr   )tensorson_gpugpu_idsr3   rZ   r   r   r   	is_on_gpu}  s   
r   rf   returnc                 C   s   | du rdS t | j S )z
    Get the ctypes pointer from a PyTorch Tensor.

    Parameters
    ----------
    A : torch.tensor
        The PyTorch tensor.

    Returns
    -------
    ctypes.c_void_p
    N)rA   rB   r   data_ptr)rf   r   r   r   rd     s   rd   c                 C   s   t j }t j|  |S r$   )r=   r>   r?   r@   )rD   rE   r   r   r   pre_call  s   
r   c                 C   s   t j|  d S r$   )r=   r>   r@   )rE   r   r   r   	post_call  s   r   c              	   C   sn   d| t jkrdnd d| d| d|rdnd }tt|s2t| td| d	| d
|  d| tt|S )Nctransform_r       __to_r3   nz"Transform function not supported:  to z for data type z and transpose=)r=   int8hasattrr   print
ValueErrorrm   )rK   orderAorderOut	transposenamer   r   r   get_transform_func  s   2

r   rowc                 C   sD  t j}t| }|dkr| d }n|dkr| d | d  }| d }	| |f}
|r5|}|	}|}	| d d d |f}
|dks=|dkrF|| ||d|
fS |d	kr]d
|	d d
  }	|||	f||d|
fS |dkr|d
|	d d
  }	d|d d  }|||	f||d|
fS |dkrd
|	d d
  }	d
|d d
  }|||	f||d|
fS td| )NrI   r      r
   r0   r   colrJ   col32r      r   r   r   r   zTo_order not supported: )r=   zerosr   rq   )rM   rK   rD   to_order
from_orderr   	init_funcdimsrowscolsstatetmpr   r   r   get_transform_buffer  s6   
r   c                    s(  |d u r
| j |f}n|d }|d u r#t|d | j| j||d \}}n|d |f}t| j|||}|d  t dkrJt d }	t d }
n2|d urjt }t fdd|D }	t||	 }
t|	}	nt d  d  }	t d }
t	
 | j}||t| t||	|
 ||fS )Nr
   r   rI   c                    s   g | ]} | qS r   r   )r   r   rL   r   r   r     s    z$nvidia_transform.<locals>.<listcomp>)rM   r   rK   rD   r   r   rA   re   r   r:   r.   rC   rd   )rf   r   r   r`   r   r   ld	new_staterx   dim1dim2r   ptrr   rL   r   nvidia_transform  s.   	
r        `?r   r`   r   c              	   C   sB  |   dk rtd|    d|dkrtd| |dk r)|dkr)dd|  }|du r7tjd	tj| jd
}t| |g t| j}| jtjkr]t	
t| t|t|t|    n#| jtjkrxt	t| t|t|t|    ntd| j t| |dk rtd| }tdd| | j}|| }|S )a  
    Estimates 256 equidistant quantiles on the input tensor eCDF.

    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
    and the extreme quantiles close to 0 and 1 have high variance / large estimation
    errors. These large errors can be avoided by using the offset variable which trims
    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
    usually has a much lower error but is not a minimum entropy encoding. Given an offset
    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor. Any shape.
    out : torch.Tensor
        Tensor with the 256 estimated quantiles.
    offset : float
        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
    num_quantiles : int
        The number of equally spaced quantiles.

    Returns
    -------
    torch.Tensor:
        The 256 quantiles in float32 datatype.
    r   zQQuantile estimation needs at least 256 values in the Tensor, but Tensor had only z values.zgCurrently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles=r  r
   rI   N)r   rJ   zNot supported data type r      )rb   rq   r=   r   rl   rD   r   r   rK   r   cestimate_quantiles_fp32rd   rA   rn   rT   float16cestimate_quantiles_fp16r   roundr   longto)rf   r`   r   r   rD   stepidxr   r   r   r     s$    
**r   c                   @   sr   e Zd ZdZdZdd eD Zg dZdddZd	d
 Ze	de
eef dejdd fddZdddZdd ZdS )
QuantStatezVcontainer for quantization state components to work with Params4bit and similar clasesfp4nf4c                 C   s   g | ]}d | qS )bitsandbytes__r   )r   xr   r   r   r   >  s    zQuantState.<listcomp>)absmax	quant_mapnested_absmaxnested_quant_mapquant_state
quant_type	blocksizerK   rM   nested_blocksizenested_dtypenested_offsetNc	           	      C   s>   || _ || _|| _|| _|| _|| _|| _|| _|d u| _d S r$   )	r  rM   r   rK   r  r  r   state2nested)	r!   r  rM   r   r  r  rK   r   r  r   r   r   r"   B  s   zQuantState.__init__c                 C   sR   | j r| j| j| j| j| j| jg| jg}|| S | j| j| j| jd| jg}|| S )a$  
        ensures compatibility with older quant state scheme with nested lists.
        assumes the following layout:
        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
        N)r   r  rM   rK   r  r   r  r  )r!   r  	list_reprr   r   r   __get_item__M  s
   "zQuantState.__get_item__qs_dictrD   r   c              
   C   sp  dd |  D }t|sd|vrtdt|dks'|d dd | jvr3td	| j d
| dt|dkrG|d }|t|| dd |  D }t|	 
| js\J d|v rtt|d |}| |d ||d |d |tt|d d}nd\}}| |d |d ||d |d |tt|d |d durt|d nd||d}|S )aW  
        unpacks components of state_dict into QuantState
        where necessary, convert into strings, torch.dtype, ints, etc.

        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.

        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.        
        c                 S   s(   g | ]\}}d |v rt |tjr|qS )r  
isinstancer=   r   r   kr   r   r   r   r   f  s   ( z(QuantState.from_dict.<locals>.<listcomp>r  z<Expected packed or unpacked quant_state items, found neitherr
   r   .r0   z@There should be exactly one `quant_state` item with ending from z.
Detected c                 S   s    i | ]\}}| d d |qS )r(  r0   )splitr&  r   r   r   
<dictcomp>q  s     z(QuantState.from_dict.<locals>.<dictcomp>r  r  r  r  r  )r  r  r   rK   NNr  r  r  rK   rM   N)r  r  r  r   rK   rM   r   r  )itemsr   r   r)  valid_qs_type_keysupdater	   popr   keysissubsetvalid_qs_keysr=   tensorfloatr  rm   Size)r-   r#  rD   qs_keyr   r  r  r   r   r   	from_dictZ  s<   $
zQuantState.from_dictFc                 C   s   | j | j| j| jt| jdt| jd}| j	r6|
| jj| jj| jj t| jjd| j d |s:|S dd | D }dd | D }t||d| j  < |S )z
        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
        ztorch.)r  r  r  r  rK   rM   )r  r  r  r  r  c                 S   s"   i | ]\}}t |tjr||qS r   r$  r&  r   r   r   r*       " z&QuantState.as_dict.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjs||qS r   r$  r&  r   r   r   r*    r8  zquant_state.bitsandbytes__)r  r  r  r   strrK   striptuplerM   r   r.  r  cloner   itemr,  r   )r!   packedr#  qs_packed_dictnon_tensor_dictr   r   r   as_dict  s*   
zQuantState.as_dictc                 C   sN   | j || _ | jr%| j|| _| jj || j_ | jj|| j_d S d S r$   )r  r  r   r   r  r   )r!   rD   r   r   r   r    s   zQuantState.to)NNNNNNNr5   )r6   r7   r8   __doc__valid_quant_typesr-  r2  r"   r"  r9   r   r9  r   r=   rD   r7  rA  r  r   r   r   r   r  ;  s    
"
0r     r   r  c                 C   s<  |du rdt vrt | jt d< t d }|du r8|  }|| }||| dkr*dnd7 }tj|f| jtjd}|du rDtj| tj	d}| jj
dkr|dv sPJ t|}t| j}	|| j}t|| ||g | jtjkrtt|t| t|t||t|   nD| jtjkrtt|t| t|t||t|   n&| jtjkrtt|t| t|t||t|   ntd	| j t| j n| }tt|t| t|t|t|t|   |r| }
||
8 }t||d
d\}}t|||| j|
|d}||fS t|||| jd}||fS )a  
    Quantize tensor A in blocks of size 4096 values.

    Quantizes tensor A by dividing it into blocks of 4096 values.
    Then the absolute maximum value within these blocks is calculated
    for the non-linear quantization.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        The output tensor (8-bit).

    Returns
    -------
    torch.Tensor:
        The 8-bit tensor.
    tuple(torch.Tensor, torch.Tensor):
        The quantization state to undo the quantization.
    Ndynamicr   r
   rD   rK   rK   cpurD           r      @   ?Blockwise quantization only supports 16/32-bit floats, but got F)r  r   )r  r   r  rK   r   r  r  r   r  rK   ) 	name2qmapr   r  rD   rb   r=   r   rl   
zeros_likero   r   rA   re   r   r   rK   r   cquantize_blockwise_fp32rd   rT   r  cquantize_blockwise_fp16bfloat16cquantize_blockwise_bf16r   r   rH  cquantize_blockwise_cpu_fp32
c_longlongmeanquantize_blockwiser  )rf   r   r  r`   r  r   r   blocks
cblocksizerE   r   qabsmaxr  r  r   r   r   rZ    sF   

0004rZ  r  r  c                 C   s:  |dus
|dus
J |du r#|du r#dt vrt | jt d< t d }|du r0t|||tjd}|j}|jrLt	|j|j
}||j7 }|jtjkrL| }|du r[tj| j|j| jd}| jjdkrt| j}|j| j}|jdvr{td|j dt| ||g |jtjkrtt|jt| t|t|t|jt|   nN|jtjkrtt|jt| t|t|t|jt|   n+|jtjkrtt|jt| t|t|t|jt|   ntd	| j t | j |S |j! }t"t|t| t|jt|t#|jt#|   |S )
a[  
    Dequantizes blockwise quantized values.

    Dequantizes the tensor A with maximum absolute values absmax in
    blocks of size 4096.

    Parameters
    ----------
    A : torch.Tensor
        The input 8-bit tensor.
    quant_state : QuantState
        Object with code, absmax and other quantization state components.
    absmax : torch.Tensor
        The absmax values.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        Dequantized output tensor (default: float32)


    Returns
    -------
    torch.Tensor:
        Dequantized tensor (default: float32)
    NrE  rP  rJ   rH  rJ  rD  rK  rL  r   rM  rN  The blockwise of J is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]rO  )$rQ  r   r  rD   r  r=   rl   r  r   dequantize_blockwiser  r   rK   r4  emptyrM   r   r   r   r  r   r   r   cdequantize_blockwise_fp32rd   rA   rT   rb   r  cdequantize_blockwise_fp16rU  cdequantize_blockwise_bf16r   rH  cdequantize_blockwise_cpu_fp32rX  )rf   r  r  r   r`   r  r   rD   r   r   r   ra    s@   "


:::

8ra  rN  c                 C   s   |d u rd}d }| dkr	 g d}n(| dkrg d}n| dkr$g d}n| dkr:|d	kr6g d
d d d }nt d|d u rFt d|  dt|}||   }| dksZJ ||S )Nr>   r  )r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   r  )r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)r      r   rH   r   rI   r
   r   r   r0   iiaf4rN  )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r0   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supported   )rq   r   r   r   rb   r  )typenamerD   r  r   r   r   r   get_4bit_typeD  s*   





rp  c                 C      t | ||||dS Nr  quantize_4bitrf   r  r`   r  compress_statisticsr   r   r   quantize_fp4w     rw  c                 C   rq  Nr  rs  ru  r   r   r   quantize_nf4z  rx  rz  r  c              
   C   s  | j jdkrtd| j j |dvrtd| d|  }| j}|du rA|| }||| dkr3dnd7 }tj|f| j tjd	}|du rUtj|d d
 dftj| j d}|dv s[J t	| j }	t
| ||g | jtjkr|dkrttdt| t|t|t|t| nttdt| t|t|t|t| n| jtjkr|dkrttdt| t|t|t|t| n_ttdt| t|t|t|t| nF| jtjkr|dkrttdt| t|t|t|t| n!ttdt| t|t|t|t| ntd| j t| j  t|| j d}
|rV| }||8 }t|dd\}}~t||| j||
|||d}||fS t||| j||
|d}||fS )a  
    Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        The output tensor (8-bit).
    blocksize : int
        The blocksize used in quantization.
    quant_type : str
        The 4-bit quantization data type {fp4, nf4}

    Returns
    -------
    torch.Tensor:
        The 8-bit tensor with packed 4-bit values.
    tuple(torch.Tensor, torch.Size, torch.dtype, int):
        The quantization state to undo the quantization.
    r>   z0Device type not supported for FP4 quantization: r  4-bit quantization data type  is not implemented.Nr   r
   rF  rI   rJ   rI  r  rO  )rD   r   )r  )r  rM   rK   r  r   r  r   r  )r  rM   rK   r  r   r  )rD   r   rq   rb   rM   r=   r   rl   ro   r   r   rK   r   cquantize_blockwise_fp32_fp4rd   rA   re   rT   cquantize_blockwise_fp32_nf4r  cquantize_blockwise_fp16_fp4cquantize_blockwise_fp16_nf4rU  cquantize_blockwise_bf16_fp4cquantize_blockwise_bf16_nf4r   r   rp  rY  rZ  r  )rf   r  r`   r  rv  r  r   input_shaper[  rE   r   r   r]  r  r   r   r   r   rt  }  sN    
2222
22
rt  c                 C   rq  rr  dequantize_4bitrf   r  r  r`   r  r   r   r   dequantize_fp4  rx  r  c                 C   rq  ry  r  r  r   r   r   dequantize_nf4  rx  r  c           	   
   C   s  |dvrt d| d|dvrtd| d|du r2|dur$|dus&J t||j|j||d}n|j}|jrNt|j|j}||j	7 }|jt
jkrN| }|du r]t
j|j|j| jd	}| }t| j}t| ||g |jt
jkr|jd
krttdt| t|t|t|jt| nttdt| t|t|t|jt| n|jt
jkr|jd
krttdt| t|t|t|jt| ncttdt| t|t|t|jt| nI|jt
jkr,|jd
krttdt| t|t|t|jt| n"ttdt| t|t|t|jt| nt d| j t | j | jd dkrCdnd}|rL|! S |S )a  
    Dequantizes FP4 blockwise quantized values.

    Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.

    Parameters
    ----------
    A : torch.Tensor
        The input 8-bit tensor (packed 4-bit values).
    quant_state : QuantState
        object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
    absmax : torch.Tensor
        The absmax values.
    out : torch.Tensor
        Dequantized output tensor.
    blocksize : int
        The blocksize used in quantization.
    quant_type : str
        The 4-bit quantization data type {fp4, nf4}


    Returns
    -------
    torch.Tensor:
        Dequantized tensor.
    r^  r_  r`  r  r{  r|  N)r  rM   rK   r  r  rJ   r  rO  r   r
   TF)"r   rq   r  rM   rK   r  r   ra  r  r   r=   rl   r4  rb  rD   rb   r   r   r  r   cdequantize_blockwise_fp32_fp4rd   rA   rT   r  cdequantize_blockwise_fp32_nf4r  cdequantize_blockwise_fp16_fp4cdequantize_blockwise_fp16_nf4rU  cdequantize_blockwise_bf16_fp4cdequantize_blockwise_bf16_nf4r   r3   )	rf   r  r  r`   r  r  r   rD   is_transposedr   r   r   r    sD   


44
4444
r  c                 C   sx   |d u rdt vrt | jt d< t d }|| j}t|  }|jtjkr,|	 }| | }t
|||}|||ffS )NrE  )rQ  r   r  rD   r=   r   r   rK   rl   r4  quantize_no_absmax)rf   r   r`   r  inpr   r   r   quantize   s   r  r   c                 C   s~   |d us
|d us
J |d u r)|d u r)dt vrt | jt d< t d }|| j}|d u r1||f}t| |d |}||d  S )NrE  r
   r   )rQ  r   r  rD   dequantize_no_absmax)rf   r   r  r   r`   r   r   r   
dequantize.  s   r  c              	   C   s`   t | j}|du rtj| tjd}t| |g tt|t| t|t	
|   t| |S )a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    NrG  )r   rD   r=   rR  ro   r   r   	cquantizerd   rA   rT   rb   r   rf   r   r`   rE   r   r   r   r  B  s   
&r  c              	   C   sb   t | j}|du rtj| tjd}t|| |g tt|t| t|t	
|   t| |S )a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    NrG  )r   rD   r=   rR  rl   r   r   cdequantizerd   rA   rT   rb   r   r  r   r   r   r  _  s   
&r  r   r   optimizer_namegr   state1beta1epsr  lrr  beta2weight_decaygnorm_scale	unorm_vec	max_unormc                 C   s>  d}|dkrt |j }d}|jt jkrt|  d }n.|jt jkr*t|  d }n!|jt jkr?t	t|  dkr?t|  d }nt
d|j d|j t|||||g t|j}|t|t|t|t|t|t|t|t|t|	t|t|
t|t|t|t|t|  t| dS )	a$  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   Nr   r
   r   rI   AGradient+optimizer bit data type combination not supported: grad , optimizer )r=   r   r   r4  rK   rl   str2optimizer32bitr  rU  r   r   r   r   rD   rd   rA   rn   re   c_boolrb   r   )r  r  r   r  r  r  r  r  r  r  r  r  r  r  
skip_zeros
param_norm
optim_funcrE   r   r   r   optimizer_update_32bit|  s@   8
r  qmap1qmap2max1max2new_max1new_max2c                 C   s  d}|dkrt |j }t|j}t||||||
|||||g |jt jkr|jt j	krt
|  d t|t|t|t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  nk|jt jkr|jt j	krt
|  d t|t|t|t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  ntd|j d|j t| dS )a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r
   r  r  N)r=   r   r   r4  r   rD   r   rK   rl   ro   str2optimizer8bitrd   rA   rn   re   rb   r  r   r   )r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rE   r   r   r   optimizer_update_8bit  sr   H


r  absmax1absmax2c                 C   sx  d }t |j}t|||||
|||g |jtjkr&|jtjkr&t|  d }n:|jtjkr9|jtjkr9t|  d }n'|jtj	krT|jtjkrTt
t|  dkrTt|  d }ntd|j d|j t| t|||||
|||g t |j}|t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|  t| d S )Nr   r
   r   rI   r  r  )r   rD   r   rK   r=   rl   ro   str2optimizer8bit_blockwiser  rU  r   r   r   rd   rA   rn   re   r  rb   )r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rE   r   r   r   optimizer_update_8bit_blockwise^  sH   

r  grad	gnorm_vec
percentilec           
   	   C   s   t | j}t| |g | jtjkr&tt| t|t	
|t	
|   n$| jtjkrAtt| t|t	
|t	
|   n	td| j dt| t||d  }t|\}}t|| }d}	||kro|| }	|||	fS )a   Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimiation steps (number of past gradient norms).

    zGradient type z not supported!d   r   )r   rD   r   rK   r=   rl   r   cpercentile_clipping_g32rd   rA   re   rb   r  cpercentile_clipping_g16r   r   sqrtr   )
r  r  r  r  rE   current_gnormvalsr  
clip_valuer  r   r   r   percentile_clipping  s2   

r  	histogramindex1index2sourcec                 C   s   t | jdks	J | jtjksJ |jtjksJ |jtjks!J |jtjks)J | jjdks1J |jjdks9J |jjdksAJ |jjdksIJ t	| jd }t	|
 }t| |||g tt| t|t|t||| d S )NrI   r>   r   )r   rM   rK   r=   rl   int32rD   r   rA   re   rb   r   r   chistogram_scatter_add_2drd   )r  r  r  r  maxdim1r   r   r   r   histogram_scatter_add_2d  s   (r  c              
   C   s  t j s
t j  | j|ks|j|kr td| j d|j | j}|j}|}|}	d}
t|dkr|t|dkr||sI|	sI| jd |jd krId}
n|rZ|	sZ| jd |jd krZd}
n|rk|	rk| jd |jd krkd}
n|s{|	r{| jd |jd kr{d}
nt|dkrt|dkr|s|	s| jd |jd krd}
n|r|	s| jd |jd krd}
ny|r|	r| jd |jd krd}
nh|s|	r| jd |jd krd}
nWt|dkr#t|dkr#|s|	s| jd |jd krd}
n8|r|	s| jd |jd krd}
n'|r|	r| jd |jd krd}
n|s#|	r#| jd |jd kr#d}
|d urc|j}|
sbt|dkrbt|dkrb|d |d krb|d |d krb|d |d krb|d |d krbd}
nt|dkrt|dkr|s|	s|d |d f}n|r|	r|d |d f}n|r|	s|d |d f}n|s|	r|d |d f}nt|dkrt|dkr|s|	s|d |d |d f}n|r|	r|d |d |d f}ny|r|	s|d |d |d f}ng|s|	r|d |d |d f}nUt|dkrXt|dkrX|s#|	s#|d |d |d f}n5|r5|	r5|d |d |d f}n#|rG|	sG|d |d |d f}n|sX|	rX|d |d |d f}|
sltd	| d
| d| d
|	 d	|S )Nz3Expected torch.int8 input tensors A and B, but got  and TrI   r
   r   Fr   z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: r(  )	r=   r>   is_initializedinitrK   r   rM   r   r   )rf   ru   r`   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsoutr   r   r   check_matmul  s   ""
"r  ru   c                 C   sh  t | j}|d u rtd|  | jd krtd|j}|d }|j}	|jr3t|j|j}	|	|j	7 }	|d u rat
| jdkrRtj| jd | jd |f| j| jd}ntj| jd |f| j| jd}d}
|d }|d }|d }|d }| jd d d }t|| ||	|jg t|}t|
}
t|}t|}t|}t|}|jtjkr&| jtjkrt||
|t| t|t|	t|jt||||t|j n]| jtjkrt||
|t| t|t|	t|jt||||t|j n7| jtjkrt||
|t| t|t|	t|jt||||t|j ntd	| j td	| j t| |S )
NzGstate cannot None. gem_4bit( ) requires the state from quantize_4bit( )r0   zcDimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]r   r   r
   sizerK   rD   rI   z%Matmul not implemented for data type )r   rD   r   rb   rM   r  r   ra  r  r   r   r=   rb  rK   r   r   rA   re   ro   r  r   cgemm_4bit_inference_naive_fp16rd   r  rU  cgemm_4bit_inference_naive_bf16rl   cgemm_4bit_inference_naive_fp32rq   r   )rf   ru   r`   r  r  r   rE   Bshapeboutr  r   mr'  ldaldcldbr   r   r   	gemv_4bit2  sN   

(





@@@r  c                 C   sv  t | ||||}|d u rtj|tj| jd}t| jdkr>t|jdkr>| jd |jd kr>| jd |jd kr>t| ||S | j}|j}|rUt|dkrU|d |d f}n|rht|dkrh|d |d |d f}|ryt|dkry|d |d f}n|rt|dkr|d |d |d f}t|dkr4| d |jd krd}n| d |jd krd}t| jdkr|  d | jd krd}n)|  d | jd krd}n|  d | jd krd}n|  d | jd krd}t|dkr|d }|  |rdnd }	nt|dkrt|dkr|d |d  }|d }	|d }
|d }| |r,dnd }|d }nHt|dkr|t|dksDJ |d |d krV|d |d ks`t	d| d	| d}d}|d }
|d }|d |d  }|
}|d }	|
}t
 | j}t|| |g t|t|t|t|
t|t|t|t| t|t|t|	t| |S )
Nr  r   r   rI   r
   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  r=   r   r  rD   r   rM   batched_igemmstrider   r:   r.   rC   r   r   cigemmrA   r  re   rd   )rf   ru   r`   r  r  r  r  r  r   r  r  r'  r  r  r  r   r   r   igemmn  sz   (

$.*r  c                 C   s  t | jdkrt |jdkstd| j d|j t| ||||}|d u r0tj|tj| jd}| r=|	 d }d}nV|	 }|d |jd krU|
 }|	 d }n>|d |jd krgd	}|	 d }n,|d dkrx|
 }|	 d }n|d dkr|
 }|	 d }n
|
 }|	 d }|  r| 	 d }d}n8| 	 }|d | jd kr| 
 } | 	 d }d}n|d | jd kr| 	 d }d	}n| 
 } | 	 d }d}| jd }	| jd }
|jd }|jd }|}|jd |jd  }| jd | jd  }| jd |jd  }t | j}t|| |g t|t|t|t|t|
t|t|t| t|t|t|t|t|t|t|t|	 |S )
Nr   z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r
   Fr   rI   T)r   rM   r   r  r=   r   r  rD   is_contiguousr  
contiguousr:   r.   rC   r   r   cbatched_igemmrA   r  re   rd   c_longc_uint32)rf   ru   r`   r  r  r  r  sr  	num_batchr   r  r'  r  strideAstrideBstrideCr  r   r   r   r    sl   



.* r  c                 C   s  |d }|d }t |}	t |}
|
dksJ d|	dkr!|d }n|	dkr-|d |d  }|d  }}tt|dksBJ d| |d dkrZ|	dkrZtjd|d f| jtjdS |d dkry|	dkrytjt|d d |d g | jtjdS |	dkr|d u rt|d |d f|| jdd	\}}n|	dkr|d u rt|d |d |d f|| jdd	\}}|
dksJ d
| jj	dksJ |jj	dksJ | j
tjksJ |j
tjksJ |j
|ksJ |d dksJ |d dv sJ |d dksJ |d |d ks	J d| d| |d }| j}tj| j t | j}t| }t|}t|}|d }t|d }|dkrIt|d d d d }nt|d d d d }t|d }t|}t|}t|}d}td }t| ||g |dkr|tjkrt|||||||||||}n:t|||||||||||}n*|dkr|tjkrt|||||||||||}nt|||||||||||}|dkrtd| d| d|d  d|||f d|||f 
 tdtj| ||fS )Nr   rI   z:Only two dimensional matrices are supported for argument Br   r
   z(Input tensor dimensions need to be > 0: rF  r   r   zlen(B.shape)==3 not supportedr>   r   r   r0   zNMatmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = z @ r   r   r   r   r   r   zA: z, B: z, C: z; (lda, ldb, ldc): z; (m, n, k): zcublasLt ran into an error!)r   r   r   r=   rb  rD   r  r;  r   r   rK   r   r>   r@   r:   r.   rC   rd   rA   re   r   r  r   cigemmlt_turing_32cigemmlt_turing_8cigemmlt_ampere_32cigemmlt_ampere_8r   	Exception)rf   ru   SASBr`   SoutrK   shapeAshapeBdimsAdimsBr  r   r   formatBrE   r  ptrAptrBptrCr'  r  r  r  	has_errorptrRowScaler   r   r   igemmlt  s   
*







6r  c                 C   s  | j tjksJ |d ur|j tjksJ |d }t|dkr*|d |d  |d f}|d u r8tj|tj| jd}|d u rHtj|d tj| jd}|d u rXtj|d tj| jd}|jd |jd ksmJ |j d|j |jd |jd ksJ |j d|j t	| j}	t
| }
t
|}t
|}t
|}t
|}t
|}t
|}t|d }t|d }t| ||||||g t|
||||||||	 t|	 |S )Nr   r   r
   rI   rJ    vs )rK   r=   r  r  r   rb  rD   rl   rM   r   rd   rA   re   r   r   cdequant_mm_int32_fp16r   )rf   r  	row_stats	col_statsr`   new_row_statsnew_col_statsr   	out_shaperE   r   ptrOutptrRowStatsptrColStatsptrNewRowStatsptrNewColStatsptrBiasnumRowsnumColsr   r   r   
mm_dequant{  sF   

r  c              	   C   sh  | j tjksJ | j}| jd }t| jdkr"| jd | jd  }n| jd }|d d }|d d d }	|d u rFtj|ftj|d	d
}|d u rWtj|ftj|d	d
}|d u rm|dkrmtj	|	| d ftj
|d	}t| }
t|}t|}t|}t|}t|}t| j}t| |||g t|
|||t||| t| |dkr|d |||fS )Nr0   r   r   r
   r  r      rn  rJ   g     jr   )rK   r=   r  rD   rM   r   rb  rl   fill_r   r  rd   rA   re   r   r   r   cget_col_row_statsrn   r   cumsum_)rf   r  r	  nnz_block_ptr	thresholdrD   r   r   	col_tiles
tiled_rowsr   r  r  
ptrNnzrowsrE   r   r   r   get_colrow_absmax  sJ   








r  c                   @      e Zd Zdd ZdS )COOSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |ks0J || _|| _|| _|| _|| _	|| _
d S r$   )rK   r=   r  r  rb   r   r   nnzrowidxcolidxr   )r!   r   r   r"  r#  r$  r   r   r   r   r"     s   
zCOOSparseTensor.__init__Nr6   r7   r8   r"   r   r   r   r   r!        r!  c                   @   r   )CSRSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S r   )rK   r=   r  r  rb   r   r   r"  rowptrr$  r   )r!   r   r   r"  r(  r$  r   r   r   r   r"        
zCSRSparseTensor.__init__Nr%  r   r   r   r   r'    r&  r'  c                   @   r   )CSCSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S r   )rK   r=   r  r  rb   r   r   r"  colptrr#  r   )r!   r   r   r"  r+  r#  r   r   r   r   r"     r)  zCSCSparseTensor.__init__Nr%  r   r   r   r   r*    r&  r*  c                 C   sz   t j| jdd\}}|d t j| jd ft j| jjd}|j|	 |
 dd |d t| j| j| j|| j| jS NTreturn_countsr
   rJ   r   )r<   srcdim)r=   uniquer#  add_r   r   r  rD   scatter_r  r   r  r'  r   r"  r$  r   )cooAr   countsr(  r   r   r   coo2csr  s   

r6  c                 C   s   t | j\}}| j| }| j| }t j|dd\}}|d t j| jd ft j	| jj
d}|j| | dd |d t| j| j| j|||S r,  )r=   r   r$  r#  r   r1  r2  r   r   r  rD   r3  r  r   r  r*  r   r"  )r4  r   
col2rowidxr#  r   	colvaluesr5  r+  r   r   r   coo2csc  s   



r9  c                 C   sL   t j|ft j|d}t j|ft j|d}t j|f||d}t| |||||S )NrJ   )r=   r   r  r!  )r   r   r"  rD   rK   r#  r$  r   r   r   r   	coo_zeros,  s   r:  c                 C   s@  | j }| jtjksJ |jdksJ t| j }| jd }t| jdkr.| jd | jd  }	n| jd }	|d u s;|d u rDt| |d\}}}
|d u rRtj	| j|tj
d}|d u r`tj	| j|tj
d}d }t| }t|}t|}t|}t|}t| ||||g |dkr|
d  }|dkrt| jd | jd |
d  |}t|j}t|j}t|j}t|
}t|||||||||t|t|	t| t|j\}}||_|j| |_|j| |_n3t|||||d d d d tdt|	t| nt|||||d d d d t|t|	t| t| |||||fS )	Nr>   r0   r   r   r
   )r  rF  r   )rD   rK   r=   halfr   r   rM   r   r  r   r   rd   r   r=  r:  r#  r$  r   r   cdouble_rowcol_quantrA   rn   re   r   r   )rf   r	  r  out_colout_rowr  rD   rE   r   r   nnz_row_ptr
coo_tensorr   r  r  	ptrOutCol	ptrOutRowr"  	ptrRowIdx	ptrColIdxptrVal	ptrRowPtrr   r  r   r   r   double_quant3  s   





rG  c                 C   s  t | j}|d u r| j|f}n|d }|d u r)t|d | j| j||d |\}}n|d |f}|d }	t|	dkrHt|	d }
t|	d }nt|	d |	d  }
t|	d }t| |g |dkr|rst	
t| t||
| n}t	t| t||
| np|dkr|rt	t| t||
| n]t	t| t||
| nP|dkr|rt	t| t||
| n=t	t| t||
| n0|dkr|dkrt	t| t||
| n|dkrt	t| t||
| n
td| d	| t| ||fS )
Nr
   r   rI   r   r   r   r   z)Transform function not implemented: From r   )r   rD   rM   r   rK   r   rA   re   r   r   ctransform_row2col32Trd   ctransform_row2col32ctransform_row2turingTctransform_row2turingctransform_row2ampereTctransform_row2amperectransform_turing2rowctransform_ampere2rowrq   r   )rf   r   r   r`   r   r   r   rE   r   rM   r  r  r   r   r   	transform  sB   
,rP  c                 C   sj  |d u rt j| j|jd f|j|jd}| j}| j |ks J | j	 |ks)J | j
 |ks2J | j|jd ks<J | rBdnd}| |rKdnd }|jd }t j}t| j}t| j	}	t| j
}
t|}t|}t| j}t| j}t| j}t|jd }t|}t|}t| j| j	| j
||g t|||	|
||||||||t| |S )Nr
   rF  r   FT)r=   rb  r   rM   rD   rK   r"  r#  rb   r$  r   r   r  r  rF   r.   r;   rd   rA   re   r   r   	cspmm_coor  )r4  ru   r`   r"  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesr  r  cnnzcrowsAccolsAccolsBcldbcldcr   r   r   spmm_coo  s6   






(r[  c                 C   s~  |d u rt j| j|jd f|j| jjd}| j}t|j}| j	
 |ks&J | j
 |ks/J | j
 |ks8J | j|jd ksKJ | j d|j | rQdnd}| |rZdnd }|jd }t j| j	dd\}	}
|
d }t j|
dd\}}| }| }|d d	ksJ d
|d  d|jt jt jfv sJ t|}t|}t|}t| j	}t| j}t| j}t|}t|}t|}t|

 }t| j}t| j}t| j}t|jd }t|jd }t|}t|}t| j	| j| j|||g |jt jkr t|||||||||||||| n|jt jkr9t|||||||||||||| t| |S )Nr
   rF  r   r  FTr-  )
descendingr   z)Current max count per row is 8 but found r(  )r=   r   r   rM   rD   r   rK   r"  r   r#  rb   r$  r   r  r  r1  cumsumr   r   r  r   rd   rA   re   r   r    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8r   )r4  ru   dequant_statsr`   r"  rE   r  r  r  r   r5  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxrR  rS  rT  r  r  ptrDequantStats	cnnz_rowsrU  rV  rW  crowsBrX  rY  rZ  r   r   r   spmm_coo_very_sparse  s   
&





ri  g     _@vectorc                 C   s$  |dkrt |   }t | | d t j}||fS |dv r>t jt | |dd}t | t|  t j}||fS |dkru| j	}|  } |  | 
  }|dkrWd}d	| }| 
 }t || }	t ||  |	 |	 } | |fS |d
v r| j	}|  } t j| |ddt j| |dd }d||dk< d	| }t j| |dd}t || }	t ||  |	 |	 } | |fS |dkrt  B t | }
t j|
|dd}|d }|
||
k}t | | }||
| | | |< t | | t t j}W d    ||fS 1 sw   Y  ||fS d S )Nlinear   )rj  r   T)r0  keepdim	zeropointr   r
   g     o@)vector-zeropointrow-zeropointtruncated-vectorgffffff?)r=   r   r   r4  r
  r  r   amaxCrK   minaminno_grad	expand_asr   )r  r0  r  r  xqrK   dynaqxminxzpxabsxr  r   r   r   r   vectorwise_quant<	  sZ   



r~  c                 C   s$   |dkr| t  | tj}|S d S )Nrj  )rs  r  r=   rl   )rx  r  r  r  r   r   r   vectorwise_dequantj	  s   r  c                 C   s  |dkr|| t t   }|  | |S |dkr(d||  }|  | |S |dkrqd||  }|  }t|jdkrIt|jdkrI|d}t|jdkr\t|jdkr\|d}t|jdkrh||9 }n||9 }||S |dkr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr|d| 9 }n|d| 9 }|d|  9 }||S |d	kr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr||| t t   9 }n
||| t t   9 }||S |d
v rd|  }t|jdkr/t|jdkr/|d}t|jdkrDt|jdkrD|d}t|jdkrS||t  9 }n||t  9 }||t  9 }||S d S )Nrk  rn  r   rp  r   rI   r   ro  r   )rq  rj  )rs  r4  r  r   rM   squeezer3   )rx  S1S2rK   r  r   r  r   r   r   vectorwise_mm_dequantr	  sd   











 
 

r  c                 C   s   |   d|d |d   }|   }t| jdkr(t|jdkr(|d}t|jdkr8|| d 9 }n||d 9 }||d d 9 }||7 }||S )Nr   r
   rI   r   rl  )r4  r3   sumr   rM   r  r  )rx  rf   ru   r  r  rK   r   r  r   r   r   dequant_min_max	  s   "

r  c                 C   s   |d }|d }|dv sJ | j jdksJ tj|d | ftj| j d}t| }t|d }t|d }t| }	t|}
t|}t	| j }|dkr[t
|	|
|||| n|dkrit
|	|
|||| t| |S )Nr   r
   r  r>   rJ   r   r   )rD   r   r=   r   rb   r   rA   re   rd   r   r   cextractOutliers_turingcextractOutliers_amperer   )rf   r  r  r  formatAr`   idx_sizer   r   r   ptrIdxr  rE   r   r   r   extract_outliers	  s(   
r  c                 C   s6   t | }tt| t|t|  t| |S r$   )r=   rR  r   cpipeline_testrd   rA   rQ   rb   )rf   
batch_sizer`   r   r   r   pipeline_test	  s   
(r  r5   )T)NTr$   )Tr   T)r   T)Tr   rI   r   )Tr   r   )r   )r   F)r   NFNN)Nr  r   )NNNrD  F)NNNNrD  F)NrN  )NNrN  F)NNrN  Fr  )NNNrN  )NNNrN  r  r+  )NNNN)Nr   r   r   Nr   F)r   r   Nr   )r   r   F)r   )NFFN)NFF)NNNr   )NNNNr   )r
   rj  )rj  )ctypesrA   r   r   randomr=   mathscipy.statsr   numpyrU   	functoolsr   typingr   r   r   r   bitsandbytes.utilsr   r	   
cextensionr   r   r   rQ  r  cadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16r  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r  cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16r   r:   rF   rO   rl   r  rU  ro   r   rD   ra   r1   rz   r{   r~   r   r   r   r   r   r   r   r   rB   rd   r   r   r   r   r  r4  r   r  rZ  r   ra  rp  rw  rz  rt  r  r  r  r  r  r  r  r9  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r'  r*  r6  r9  r;  r:  rG  rP  r[  ri  rs  r~  r  r  r  r  r  r   r   r   r   <module>   s  















.
2

,
(6 wK

I3$P$$M

&	

n	

 	

=
,
\
?
\
Sf
4
.

`
,
$V

.;