o
    œÜÓh+W  ã                	   @   sz  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	 d dlm
Z
 d dlZd dlmZ dd„ ZejZ	 G dd	„ d	ƒZd
edeeef fdd„Zdejdejdejfdd„ZG dd„ dejjƒZejZejZejZdejde fdd„Z!dd„ Z"dd„ Z#eG dd„ dƒƒZ$G dd„ dejjƒZ%G d d!„ d!ejjƒZ&			"	d,d#ed$ed%ed&e$fd'd(„Z'd-d#ed$ed)ej(d%efd*d+„Z)dS ).é    N)Ú	dataclass)Úreduce)ÚTupleÚOptionalÚList)Úwarnc                 C   s   t tj| dƒS )Né   )r   ÚoperatorÚmul)Úiterable© r   úV/var/www/html/ai/venv/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.pyÚprod   s   r   c                   @   s<   e Zd ZdZdd„ Zdd„ Zedd„ ƒZdd	„ Zd
d„ Z	dS )ÚGlobalOutlierPoolerNc                 C   s   t dƒ‚)NzCall get_instance() instead)ÚRuntimeError©Úselfr   r   r   Ú__init__!   s   zGlobalOutlierPooler.__init__c                 C   s   t ƒ | _d | _d S ©N)ÚsetÚoutliersÚ	model_dimr   r   r   r   Ú
initialize$   s   
zGlobalOutlierPooler.initializec                 C   s&   | j d u r|  | ¡| _ | j  ¡  | j S r   )Ú	_instanceÚ__new__r   )Úclsr   r   r   Úget_instance(   s   

z GlobalOutlierPooler.get_instancec                 C   s2   | j d u r|| _ || j krd S | j | ¡ ¡ d S r   )r   r   ÚupdateÚtolist)r   Úoutlier_idxÚfeature_dimr   r   r   Úadd_outliers/   s
   

z GlobalOutlierPooler.add_outliersc                 C   s   t  t| jƒ¡ t j¡S r   )ÚtorchÚTensorÚlistr   ÚtoÚint64r   r   r   r   Úget_current_outlier_idx7   s   z+GlobalOutlierPooler.get_current_outlier_idx)
Ú__name__Ú
__module__Ú__qualname__r   r   r   Úclassmethodr   r!   r'   r   r   r   r   r      s    
r   Útransform_tileÚ	tile_sizec                 C   sè   |\}}d||   k rdk sJ ‚ J ‚t j|| t jd ||¡}t  |¡}tdƒD ]F}t j|d| ddd }|d  t j¡ 	¡ }t  
| ¡ d |k¡sRJ d	ƒ‚| |ƒ}	|	 |j¡d }
||
d|  7 }|| d| k rq |S q+|S )
aþ  
    Compute a permutation of indices that invert the specified (tiled) matrix transformation

    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
    :returns: indices
    r   l            ©Údtypeé   é   Útrunc)Úrounding_modeé€   zint overflow)r"   Úaranger&   ÚviewÚ
zeros_likeÚrangeÚdivr%   Úint8Ú
contiguousÚallÚintr/   )r,   r-   Úd1Úd2Útile_indicesÚpermuted_tile_indicesÚiÚith_dim_indicesÚsample_tile_iÚpermuted_tile_iÚith_permuted_indicesr   r   r   Úget_inverse_transform_indices;   s    
 
þrG   Úpermuted_tensorr@   Úreturnc                 C   s¤   | j |j \}}\}}|| ||   krdks!J dƒ‚ J dƒ‚|  d| ¡ ¡ ¡ }t |¡}||| ¡ < | |||| || ¡}| dddd¡}| ||¡ ¡ S )a  
    Undo a tiled permutation such as turing or ampere layout

    :param permuted_tensor: torch tensor in a permuted layout
    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
    :return: contiguous row-major tensor
    r   z+tensor must contain a whole number of tileséÿÿÿÿé   é   r   )	ÚshapeÚreshapeÚnumelÚtr"   Ú
empty_likeÚflattenÚpermuter;   )rH   r@   ÚrowsÚcolsÚ	tile_rowsÚ	tile_colsÚtensorÚoutputsr   r   r   Úundo_layoutV   s   ,
rZ   c                   @   s&   e Zd Zeddd„ƒZedd„ ƒZdS )Ú
MatMul8bitNÚvectorc                 C   sÚ   |d u rg d¢}|d dkr)t  ¡  t  ||¡}W d   ƒ n1 s#w   Y  n0t|jƒdkr3d}nd}tj|d|d\}}	tj|||d\}
}t ||
¡}t ||	||j	|¡}|j
s_|j
re|  ||¡ || _|| _|S )N)r0   r0   r0   r   r0   rL   r   rJ   ©ÚdimÚ
quant_type)r"   Úno_gradÚmatmulÚlenrM   ÚFÚvectorwise_quantÚigemmÚvectorwise_mm_dequantr/   Úrequires_gradÚsave_for_backwardr_   Ú	precision)ÚctxÚAÚBÚoutr_   ri   Úoutputr^   ÚqAÚSAÚqBÚSBÚioutr   r   r   Úforwardi   s&   
ÿ€zMatMul8bit.forwardc                 C   sœ  | j \}}| j}| j}d  }}|jrÐt|jƒdkr"ddg}g d¢}	ndg}ddg}	|d dkrMt ¡  t | 	|	¡|¡}W d   ƒ n1 sGw   Y  nƒt|jƒdkr¦t|jƒdkr¦| 
¡ }| ¡ sg| 
¡  tj| d|jd ¡d|d\}
}| ¡ s€| 
¡ }tj| d|jd ¡d|d\}}t | ¡ |
¡}t || ¡ ||j|¡}n*tj|||d\}
}tj|||d\}}t | 	|	¡|
¡}t || 	|	¡||j|¡}|jrGt|jƒdkrßdg}ndg}t|jƒdkrðg d¢}	|}nddg}	dg}|d dkrt ¡  t || 	|	¡¡}W d   ƒ n	1 sw   Y  n*tj|||d\}
}tj|||d\}}t |
| 	|	¡¡}t ||| 	|	¡|j|¡}||d d d fS )	NrK   r   r   )r   rL   r   r0   rL   rJ   r]   )Úsaved_tensorsr_   ri   rg   rb   rM   r"   r`   ra   rS   r;   Úis_contiguousrc   rd   r6   re   rP   rf   r/   )rj   Úgrad_outputrk   rl   r_   ri   Úgrad_AÚgrad_BÚdimsÚpermute_dimÚqgrad_outputÚS1ro   ÚS2Úigrad_BÚdim_Brq   ÚS3Úigrad_Ar   r   r   Úbackward‚   s”   


ÿ€
ý
ÿÿ
ÿ
ÿû
ÿ€
ÿûzMatMul8bit.backward)Nr\   N)r(   r)   r*   Ústaticmethodrt   rƒ   r   r   r   r   r[   h   s
    r[   Údevicec                    sF   t jj| ddk rdS t jj| d‰ d}t‡ fdd„|D ƒƒr!dS dS )z7check if this device supports the optimized int8 kernel)r…   )é   é   F)zGTX 1630zGTX 1650zGTX 1660c                 3   s    | ]}|ˆ v V  qd S r   r   )Ú.0Ú
model_name©Údevice_namer   r   Ú	<genexpr>ç   s   € z#supports_igemmlt.<locals>.<genexpr>T)r"   ÚcudaÚget_device_capabilityÚget_device_nameÚany)r…   Únvidia16_modelsr   rŠ   r   Úsupports_igemmltá   s   r’   c                 C   s&   | dv sJ d| › ƒ‚| dkrdS dS )N)Ú
col_turingÚ
col_amperez9please find this assert and manually enter tile size for r“   )r0   é    )r•   r•   r   )Úformatr   r   r   Ú_get_tile_sizeì   s   
ýr—   c                    sN   ‡ ‡fdd„}t  ¡  t|tˆƒƒ ˆ ¡W  d   ƒ S 1 s w   Y  d S )Nc                    s"   t j|  ˆ ¡dˆdd  | j¡S )NÚrow)Ú
from_orderÚto_orderr   )rc   Ú	transformr%   r…   ©Úx©r…   r–   r   r   Ú<lambda>õ   s   " zget_tile_inds.<locals>.<lambda>)r"   r`   rG   r—   r%   )r–   r…   r›   r   rž   r   Úget_tile_indsô   s   
$ÿr    c                   @   sˆ   e Zd ZU dZeej ed< dZe	ed< dZ
dZdZdZdZdZdZdZdZdZdZdZdZdZdZdZe ¡ Zdd„ Zed	d
„ ƒZdS )ÚMatmulLtStateNÚ_tile_indicesFÚforce_no_igemmltç        Tc                 C   s.   d | _ d | _d | _d | _d | _d | _d | _d S r   )ÚCBÚCxBrr   ÚSCBÚCxBtÚSBtÚCBtr   r   r   r   Úreset_grads  s   
zMatmulLtState.reset_gradsc                 C   s"   | j d u rt| j| jjƒ| _ | j S r   )r¢   r    ÚformatBr¦   r…   r   r   r   r   r@     s   
zMatmulLtState.tile_indices) r(   r)   r*   r¢   r   r"   r#   Ú__annotations__r£   Úboolr¥   r¦   rr   r§   r¨   r©   rª   ÚsubBÚoutlier_poolÚhas_accumulated_gradientsÚ	thresholdÚidxÚis_trainingÚhas_fp16_weightsÚmemory_efficient_backwardÚuse_poolrc   Úget_special_format_strr¬   r«   Úpropertyr@   r   r   r   r   r¡   ù   s.   
 
r¡   c                   @   s,   e Zd Zeddefdd„ƒZedd„ ƒZdS )ÚMatMul8bitLtNc                 C   s¶  t |jƒo|j }d| _t|jƒdkrUd| _|| _|| _|| _|jd |jd kr?t	j
|jd d… |jdd …  |j|jdS t	j
|jd d… |jd d…  |j|jdS |j}|j}|jd u ret ¡ |_|jt	jkrut d|j› d¡ t|jƒd	kr…| d|jd ¡}tj| t	j¡|jd
\}	}
}}}|jdkrç|d urç|jrÓt	 |j¡ ¡ }d|	d d …|f< d|
d d …|f< |d d …|f }|d d …|f  ¡  ¡ |_||_ n,|j!d u ræ|rætj"|j#|d\|_!|_$n|jsý|j!d u rý|rýtj"|j#|d\|_!|_$d }|jr\t%|dd ƒd urdnd}| &¡  o|jd | 'd¡k}|r'| ¡ }|j(r.|r4|j!d u r[| )¡  t | t	j¡¡\}|_*|_+|_,}|rXtj"||d\|_!|_$n||_#nd}|d urÆ|jsÆt	 |j¡}||_ |j!d urƒt -|j!|j$|j  .¡ ¡}n|j#d d …|j  ¡ f  /¡ }||j+ 0dd¡ d  ¡  ¡  |j¡|_d|	d d …|j  ¡ f< d|
d d …|j  ¡ f< |d d …|j  ¡ f }|j$rÏ|j$d n|j}t|ƒd	krå|d |d |d f}n|d |d f}|r7t "|	d¡\}}t 1||j!||j$¡\}}|d u s|jt	jkr"tj2||||j+|d}| |j¡}nMtj2||||j+d d}| |j¡ 3|¡}n8| /¡ }|j d urLd|d d …|j  ¡ f< t	j4j5 6||j# |j¡¡}| 7|j+ 8d¡ 9d¡¡}|d uro| 3|¡}|d ur‚|d ur‚|t	 :||j¡7 }|| _;|| _|| _<|j|j|d u r–d n|j| _=| _>| _?t@| jAd d… ƒr·|
||f| _B||j f| _Cnd d |g| _Bd| _C|  Dd d ¡ t|ƒd	krÐt	j/ndd„ }|| 0|¡ƒS )NFr   TrJ   r   ©r/   r…   z'MatMul8bitLt: inputs will be cast from z to float16 during quantizationrK   )r²   r¤   )rš   Úgradg     À_@Úcol32)Úbiasç@ €?rL   ©NNc                 S   s   | S r   r   rœ   r   r   r   rŸ   ¶  s    z&MatMul8bitLt.forward.<locals>.<lambda>)Er’   r…   r£   Úis_emptyr   rM   rk   rl   r¾   r"   Úemptyr/   r¬   r°   r   r   Úfloat16Úwarningsr   rb   rN   rc   Údouble_quantr%   r²   rµ   ÚuniqueÚcolidxÚlongrP   r;   r¯   r³   r¦   r›   r¥   rr   Úgetattrrv   Ústrider´   r«   rª   r§   ÚSCBtÚextract_outliersr=   Úcloner6   ÚigemmltÚ
mm_dequantÚadd_ÚnnÚ
functionalÚlinearÚmul_Ú	unsqueezer
   ra   ÚstateÚ
grad_shapeÚdtype_AÚdtype_BÚ
dtype_biasr   Úneeds_input_gradÚtensorsÚtensor_statesrh   )rj   rk   rl   rm   r¾   rÖ   Úusing_igemmltr¬   Úinput_shapeÚCAÚCAtÚSCAÚSCAtÚcoo_tensorAr³   ÚsubAÚhas_gradÚis_transposedr¥   Úcoo_tensorBr   r   ÚshapeBÚoutput_shapeÚC32Arp   Úout32ÚSout32rn   ÚA_wo_outliersÚ
clone_funcr   r   r   rt   '  s¾   ,,

"€ ú€(

*zMatMul8bitLt.forwardc                 C   s  | j r| jd u r
d nt | j¡}t | j¡t | j¡d |d fS | j\}}}}}| j\}}}	| j\}
}| j	}| j
}d  } }}|rH|jd| jd}t|jƒdkrZ| d|jd ¡ ¡ }t | tj¡¡\}}}}}|r«tj||dd\}}tj|ddd\}}t ||||¡\}}t ||||
¡}|jdkr«|d ur«|d d …|f  t | ¡ |¡7  < |rA|jd urët |d¡\}}|jd u rÍtj|j|dd	\|_|_t ||j||j¡\}}t ||||j¡ | j ¡ | j!¡}nV|j"d ur|j"j| j!dd
 #|j$ %d¡ &d¡¡}t ||¡ | j ¡ | j!¡}n.|j'd ur=t(|j'|j)ƒ | j!¡ #|j$ %d¡ &d¡¡}t ||¡ | j ¡ | j!¡}nt*dƒ‚||d |d fS )Nr   r.   rK   rJ   T)Ú	transposer½   r¤   )rš   rð   )Úcopyr   r¿   z>State must contain either CBt or CB or CxB matrix for backward)+rÁ   r¾   r"   r7   rk   rl   rÛ   rÜ   rÝ   r¬   rÖ   ÚsumrÚ   rb   rM   rN   r;   rc   rÅ   r%   rÃ   r›   rÎ   rÏ   r²   ra   rP   rª   r¨   r©   rË   r6   r×   rØ   r¥   rÔ   r§   rÕ   r
   r¦   rZ   r@   Ú	Exception)rj   rw   Ú	bias_gradÚ	req_gradAÚ	req_gradBÚ_Úreq_gradBiasrá   rå   rk   rã   r³   r¬   rÖ   rx   ry   Ú	grad_biasÚCgradÚCgradtÚSCgradÚSCgradtÚ
coo_tensorÚCxAtÚSAtÚC32gradÚSgradÚgradB32ÚSgradB32ÚgradA32ÚSgradA32r¥   r   r   r   rƒ   ¹  sN   
$

$&ýzMatMul8bitLt.backward)r(   r)   r*   r„   r¡   rt   rƒ   r   r   r   r   rº   #  s     rº   c                   @   s.   e Zd Zeddejfdd„ƒZedd„ ƒZdS )Ú
MatMul4BitNÚquant_statec                 C   s  d| _ t|jƒdkrLd| _ || _|| _|| _|j}|jd |d kr7tj|jd d… |dd …  |j|j	dS tj|jd d… |d d…  |j|j	dS tj
j |t ||¡ |j¡ ¡ |¡}|| _|j|j|d u rld n|j| _| _| _t| jd d… ƒr†||f| _|S d| _|S )	NFr   TrJ   r   r»   rL   rÀ   )rÁ   r   rM   rk   rl   r¾   r"   rÂ   r/   r…   rÑ   rÒ   rÓ   rc   Údequantize_4bitr%   rP   rÖ   rØ   rÙ   rÚ   r   rÛ   rÜ   )rj   rk   rl   rm   r¾   r  ÚB_shapern   r   r   r   rt   ò  s$   **&(
þzMatMul4Bit.forwardc                 C   sª   | j r| jd u r
d nt | j¡}t | j¡t | j¡d |d fS | j\}}}}}| j\}}d\}}	}
|r;|jd| j	d}
|rNt 
|t || j¡ |j¡ ¡ ¡}||	d |
d fS )N©NNNr   r.   )rÁ   r¾   r"   r7   rk   rl   rÛ   rÜ   rò   rÚ   ra   rc   r	  rÖ   r%   r/   rP   )rj   rw   rô   rõ   r÷   rø   rk   rl   rx   ry   rù   r   r   r   rƒ     s   

&zMatMul4Bit.backwardr  )r(   r)   r*   r„   rc   Ú
QuantStatert   rƒ   r   r   r   r   r  î  s
    r  r¤   rk   rl   rm   rÖ   c                 C   s*   |pt ƒ }|dkr||_t | ||||¡S )Nr¤   )r¡   r²   rº   Úapply)rk   rl   rm   rÖ   r²   r¾   r   r   r   ra   '  s   
ra   r  c                 C   s¢   |d usJ ‚|   ¡ | jd krH| jdkrH| jd |j dkr3td|j› d| j› ƒ t | ||||¡S tj| | 	¡ ||d}|d urF||7 }|S t | ||||¡S )NrJ   Fr   z4Some matrices hidden dimension is not a multiple of z^ and efficient inference kernels are not supported for these (slow). Matrix input size found: )rÖ   )
rO   rM   rg   Ú	blocksizer   r  r  rc   Ú	gemv_4bitrP   )rk   rl   r  rm   r¾   r   r   r   Úmatmul_4bit5  s   r  )NNr¤   NrÀ   )*r	   rÄ   Údataclassesr   Ú	functoolsr   Útypingr   r   r   r   r"   Úbitsandbytes.functionalrÒ   rc   r   r#   rX   r   Úcallabler=   rG   Ú
LongTensorrZ   ÚautogradÚFunctionr[   r  Ú	mm_cublasÚ
bmm_cublasÚmatmul_cublasr…   r®   r’   r—   r    r¡   rº   r  ra   r  r  r   r   r   r   Ú<module>   sP    t) L<úÿþý
ü"