o
    hV                 
   @   s  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dlZ	d dl
Zd dl
mZ d dlmZ ejddddd	d
 dZdddZG dd dejjZG dd dZdd Zdd Zejjdejejgddgddd Zdd Zdd  Zejjdejejej gg d!dejjd"ddgd#d$gdejd%g d&ejjd'ddgd(d)gdd*d+ Z!ejjd,ejejgddgdd-d. Z"d/d0 Z#d1d2 Z$d3d4 Z%d5d6 Z&dd8d9Z'd:d; Z(d<d= Z)d>gZ*d?gZ+d@dA dBdA e$e$e%fgZ,e,-e&e&e$e$e%f dCdDgZ.ddgZ/e0ee*e+e,e/Z1e0ee*e+e.e/Z2dEdF e2D Z3ejjdGe1e3ddHdI Z4dJdK Z5dLZ6ej7d7dMe6fdN8 Z9ej7dOdMe6fdN8 Z:ej7dOdMe6fdN8 Z;g dPZ<e0ee9e:e<e;Z1dQdF e1D Z3ejjdRe1e3ddSdT Z=dUZ6ej7d7dVe6fdN8 Z;ej7d7dWe6fdN8 Z9ej7dLdOe6fdN8 Z:e0ee;e9e:Z1dXdF e1D Z3ejjdYe1e3ddZd[ Z>dLZ6ej7d7dVe6fdN8 Z;ej7d7dWe6fdN8 Z9ej7dLdOe6fdN8 Z:ddgZ<e0ee;e9e:e<Z1d\dF e1D Z3ejjd]e1e3dd^d_ Z?dLZ6ej7d`dae6fdN8 Z*ej7d7dbe6fdN8 Z+ej7d7dMe6fdN8 Z@ej7d7dMe6fdN8 ZAg dcZ<e0ee*e+e@eAe<Z1dddF e1D Z3ejjdee1e3ddfdg ZBd`Z6ej7d`dae6fdN8 Z*ej7d7dbe6fdN8 Z+ej7d7dMe6fdN8 Z@e0ee*e+e@Z1dhdF e1D Z3ejjdie1e3ddjdk ZCdLZ6ej7dLdMe6fdN8 Z*ej7dLdMe6fdN8 Z+ej7dLdMe6fdN8 Z@ejDejEgZFdlgZGg dmZHdgZ<dLdUgZIe0ee*e+e@eIeFeGeHe<Z1dndF e1D Z3ejjdoe1e3ddpdq ZJd`Z6ej7d`dMe6fdN8 Z*ej7d7dVe6fdN8 Z+ej7d7dre6fdN8 Z@ej7d7dre6fdN8 ZAdsZId gZKe0ee*e+e@eAeIeKZ1dtdF e1D Z3ejjdue1e3ddvdw ZLd7gZ*d7gZ+d7gZ@d7gZAdxZIe0ee*e+e@eAeIZ1dydF e1D Z3ejjdze1e3dd{d| ZMdLZNdVZOeNeOdWd}feNeOd~dfeNeOd}dfgZ1ddF e1D Z3ejjde1e3ddd ZPdLZ6ej7dadMe6fdN8 Z*ej7dadre6fdN8 ZAdxZIddgZQddgZRe0ee*eAeIeQeRZ1ddF e1D Z3ejjde1e3ddd ZSdLZ6drgZ*drgZ+dxZIe0ee*e+eIZ1ddF e1D Z3ejjde1e3ddd ZTdLZ6ej7d`dWe6fdN8 Z*ej7d`dWe6fdN8 Z+e0ee*e+Z1ddF e1D Z3ejjde1e3ddd ZUdZ6ej7d`dWe6fdN8 Z*ej7d`dWe6fdN8 ZAej7d`dWe6fdN8 ZVe0eWe*eAeVZ1ddF e1D Z3ejjde1e3ddd ZXdZ6ej7d`dWe6fdN8 Z*ej7d`dWe6fdN8 ZAej7d`dWe6fdN8 ZVe0eWe*eAeVZ1ddF e1D Z3ejjde1e3dejYddd ZZdrd>gZ*dd?gZVd}dWgZAe0eWe*eAeVZ1ddF e1D Z3ejjde1e3dejYddd Z[dLZ6ej7dLdre6fdN8 Z*ej7dLdre6fdN8 Z+d gZ@ejDgZFdlgZGg dZHddgZ<dLgZIe0ee*e+e@eIeFeGeHe<Z1ddF e1D Z3ejjdoe1e3ddd Z\dLZ6d`gZ*dgZ+ejDgZFdgZGdlgZHe0ee*e+eFeGeHZ1ddF e1D Z3dd Z]dLZ6ej7d`dWe6fdN8 Z*ej7d`dWe6fdN8 Z+e0ee*e+Z1ddF e1D Z3ejjde1e3ddd Z^dLZ6ej7d`dre6fdN8 Z*ej7d`dre6fdN8 Z+ddgZ_e0ee*e+e_Z1ddF e1D Z3ejjde1e3ddd Z`dd ZadLZ6ej7dMdre6fdN8 Z*ej7dMdre6fdN8 Z+e0ee*e+Z1ddF e1D Z3ejjde1e3ddd Zbdd ZcdLZ6d>gZ*d}gZ+ejgZFddgZde0ee*e+eFedZ1ddF e1D Z3ejjde1e3ddd Zedd Zfdd ZgdLZ6d>gZ*d>gZ+ejDgZFe0ee*e+eFZ1ddF e1D Z3ejjde1e3dddÄ Zhd`ZNd`ZOg Z1e1-eNeOddf ddF e1D Z3ejjde1e3dddȄ Ziddʄ Zjdd̄ Zkdd΄ ZlddЄ Zmdd҄ ZnddԄ Zoddք Zpejjdejejej gg d!ddd؄ Zqejjrejst  ddڍejdddgdd߄ Zuejjrejst  ddڍejddgdd Zvdd Zwejjdddgddgdejjdddgddgdejjdg dg ddejjdejej ejgg dddd ZxejYddd Zyejjdddgddgdejjdejej ejgg ddejjddgdgddd ZzdS )    N)product)
functional)norm   Fx      i'  )	precisionsci_mode	linewidth	edgeitems	thresholdMbP?Tc                 C   sV   t | |||}|dk  }||kr)|r)td| d|  t j| ||| |S )Nr   z"Too many values not close: assert z < )torchisclosesumitemprinttestingassert_close)abrtolatolcountthrowidxsumval r   K/var/www/html/ai/venv/lib/python3.10/site-packages/tests/test_functional.pyassert_all_approx_close   s   r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )FFNTc                    s   t    tjj|||d| _tjj|||d| _t  tjj	| jj
 tjj	| jj
 W d    d S 1 s;w   Y  d S )Nbias)super__init__r   nnLinearfc1fc2no_gradinitxavier_uniform_weight)selfinput_featureshidden_sizer"   	__class__r   r   r$   !   s   

"zFFN.__init__c                 C   s   t | |}| |}|S N)r   relur'   r(   )r-   xr   r   r   forward*   s   
zFFN.forward)T)__name__
__module____qualname__r$   r5   __classcell__r   r   r0   r   r        s    	r    c                   @   s0   e Zd Zdd ZdddZdddZd	d
 ZdS )Timerc                 C   s   i | _ i | _i | _d S r2   )startsendsaggr-   r   r   r   r$   1   s   
zTimer.__init__defaultc                 C   sX   || j vr"tjjdd| j |< tjjdd| j|< | j |   d S | j|ddd}d S )NT)enable_timingF)evictprint_ms)r;   r   cudaEventr<   recordtock)r-   namemsr   r   r   tick6   s
   
z
Timer.tickTc                 C   s   || j v r=| j |   tj  | j| | j | }|| jvr&d| j|< | j|  |7  < |r=| j| | j | |rT|| jv rTt	| d| j| d dd | j| S )N        z took: g     @@z.5fs)
r<   rE   r   rC   synchronizer;   elapsed_timer=   popr   )r-   rG   rA   rB   rH   r   r   r   rF   >   s   



 
z
Timer.tockc                 C   s   i | _ i | _i | _td d S )NzResetting benchmark data)r;   r<   r=   r   r>   r   r   r   resetO   s   zTimer.resetN)r?   )r?   TT)r6   r7   r8   r$   rI   rF   rO   r   r   r   r   r:   0   s
    

r:   c                   C      d S r2   r   r   r   r   r   setupV      rQ   c                   C   rP   r2   r   r   r   r   r   teardownZ   rR   rS   dtypefloathalf)idsc                 C   s   t jdddd}|| }t|}t jddd|jd}t jj||ddd	 t j	dddd}|| }t|}t 
| |}t || }|d
k  dksSJ d S )N   rC   deviceg      `?g     ?   r   {Gz?r   r   皙?r   )r   randtoFestimate_quantileslinspacerZ   r   r   randnquantilerU   absr   r   )rT   Acodepercs	quantilesdiffr   r   r   test_estimate_quantiles^   s   



rl   c                  C   s   t dD ]`} tjdddd}t|}t||}t||}t||  	 }|dk s0J tj
dddd}t|}t||}t||}t||  	 }tjj||ddd |d	k sdJ qd S )
Nd   rX   rC   rY   gQ~?g{Gzt?r   r]   r   )ranger   rd   ra   rb   quantize_no_absmaxdequantize_no_absmaxrf   meanr   r_   r   r   )iA1rh   CA2rk   r   r   r   test_quantile_quantizationr   s   

rv   c            	      C   s2  g } g }t dD ]C}tjdddd}t|\}}t||}t|| }|t|d  }| | 	  || 	  | 	 dk sKJ qt
t| t|   t
t|t|  t dD ]2}tjdddd}t|\}}t||}t||  	 }tjj||ddd	 |d
k sJ qdd S )Nrm   rX   rC   rY   :0yE>gS㥋?r\   r   r]   gMbp?)rn   r   rd   ra   quantize
dequantizerf   appendrq   r   r   r   lenr_   r   r   )	diffsreldiffsrr   rs   rt   Sru   rk   reldiffr   r   r   test_dynamic_quantization   s*   r   )fp32fp16bf16nestedFalseTrue	blocksize)      rX      r[      @   signedsigned_Truesigned_Falsec                 C   s  g }g }t dD ]A}tjddd| d}tj|||d\}}	t||	}
t||
  }|t| d  }||	 
  ||	 
  qt|t| }t|t| }|dk s`J |dk sfJ |
j| ksmJ g }tj|d	}t dD ]B}tjddd| d}tj||||d
\}}	t||	}
t||
  }|t| d  }||	 
  ||	 
  qyt|t| }t|t| }|r|dk sJ |dk sJ n|dk sJ |dk sJ |
j| ksJ d S )Nrm   rX   rC   rZ   rT   )r   r   rw   I+?g;On?)r   )r   r   rh   gy&1l?Q?gy&1\?g~jt?)rn   r   rd   ra   quantize_blockwisedequantize_blockwiserf   rU   rz   rq   r   r   r{   rT   create_dynamic_mapr_   )rT   r   r   r   r|   r}   rr   rs   rt   r~   ru   rk   r   abserrrelerrrh   r   r   r   #test_dynamic_blockwise_quantization   sB   r   gtypec                 C   s  t jddd}t jddd}d}d}d}ttD ]f}|d7 }t j||| dd}tj||||d	\}}	}
||	k r=|
d
ksCJ |	| sCJ t | }|dkrU||d d < n|||d < t 	|\}}|| }t j
|t | t j
||	 t j
|| qd S )Nrm   rC   rY      r   r      rT   rZ   )
percentile      ?)r   zerosrn   krd   ra   percentile_clippingr   rU   sortr   r   sqrt)r   
gnorm_vec1
gnorm_vec2nstepr   rr   ggnorm1clip2gnorm_scalegnorm2valsr   clip1r   r   r   test_percentile_clipping   s,    r   c                 C   s0   t |  }t | | d } || t jfS N   )r   rf   maxroundr`   int8)r4   max1r   r   r   quant   s   r   c                 C   s   |   |d  S r   rU   )cmaxCr   r   r   dequant   s   r   c                 C   s   |  | d  |d  S r   r   )maxAmaxBrt   r   r   r   
mm_dequant   s   r   c                 C   sD   t jt | |dd}d||dk< t | | d } || t jfS )NTdimkeepdimr   r   r   )r   amaxrf   r   r`   r   )r4   r   r   r   r   r   quant_multi   s   r       c                 C   s   |dkr,t j| d|d}tjt||d dd}t|dd| jd f}|| j}n)|dkrUt j| d|d}tjt||dd}t|| jd ddf}|| j}d||dk< t| | d	 } || 	tj
fS )
Nr   z(c a) b -> c a b)r   Tr   r   za (b c) -> a b cr   r   )einops	rearranger   r   rf   tileshapeviewr   r`   r   )r4   r   
chunk_size	x_chunkedr   r   r   r   quant_multi_chunk  s   r   c                 C   s   |   }|  }d S r2   )minr   )rg   minAr   r   r   r   quant_minmax  s   r   c                 C   s   t | tt|  S r2   )r   rU   r{   )xxr   r   r   rq     s   rq   r    @  c                 C      t | S r2   r   r4   r   r   r   r   <lambda>"      r   c                 C   r   r2   r   r   r   r   r   r   #  r   linear
vectorwisec                 C      g | ]}d j | qS )z#dim1_{}_dim2_{}_quant_{}_batched_{}format.0r   r   r   r   
<listcomp>/      r   z"dim1, dim2, quant_methods, batchedc                 C   s  | | d  } ||d  }g }g }t dD ]}|rGtjddd| |d fdd}tjddd|d | fdd}|d |d\}	}
|d |d\}}n(tjdd| |fdd}tjdd|| fdd}|d |d\}	}
|d |d\}}tjj|d |	|
|d	d
d |rt||}t|
 | }nt||}t	|
|}|d |	||}|
 }|| }|| }t|| }|t| }||   ||   qd S )Nr   r   r         ?rC   sizerZ      r   g?r^   r]   r   )rn   r   normalr   r   bmmrU   mmra   igemmstdrf   rz   rq   r   )dim1dim2quant_methodsbatchederrors	relerrorsrr   rg   Br   Acr   Bcout2rt   outr   errr   r   r   r   test_approx_igemm5  s>   r   c                  C   s   t jdd} |   d S )NrX   )bnbr%   StableEmbeddingreset_parameters)layerr   r   r   test_stable_embedding^  s   r   r   r[   r      )FFFTTFTTc                 C   r   )z2hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}r   r   r   r   r   r   i  r   z)hidden_dim, batch_dim, transpose, seq_dimc                 C   s  | | d  } ||d  }||d  }t tD ]}|d s || fn| |f}|d r2dtdd | fn	| dtdd f}tjdd|dd	tj}tjdd|dd	tj}|d so|d sot| | }	t	
||}
nZ|d s|d rt| |  }	t	
|| }
n=|d r|d st|  | }	t	
| |}
n |d r|d rt|  |  }	t	
| | }
tj|
 |	 qt tD ]{}||| f}|d rdtdd | fn	| dtdd f}tjdd|dd	tj}tjdd|dd	tj}|d s+|d s+t| | }	t	
||}
n|d sI|d rIt| |  }	t	
|| }
tj|
 |	 qd S )
Nr   r   r   r   r   r   rC   r   )rn   r   randomrandintr   r`   r   matmulrU   ra   r   tr   r   )
hidden_dim	batch_dim	transposeseq_dimrr   shapeAshapeBrg   r   r   r   r   r   r   
test_igemmo  sV   

r     r   r   c                 C   r   )z"seq_dim{}_hidden_dim{}_batch_dim{}r   r   r   r   r   r         
zseq_dim, hidden_dim, batch_dimc           	      C   s   | | d  } ||d  }||d  }t dD ]O}tjdd|| |fddtj}tjdd|| dfddtj}td	| | }tj|jd |jd tj	|j
d
}tj|||d}tj| | qd S )Nr   r      r   r   rC   r   rX   zbsi, bso->ior   r   )rn   r   r  r`   r   einsumrU   emptyr   int32rZ   ra   r   r   r   )	r  r  r  rr   rg   r   r   ioutr   r   r   r   test_dim3_igemm  s*   r  c                 C   r   )z1seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}r   r   r   r   r   r     r   z)seq_dim, hidden_dim, batch_dim, transposec                 C   s  dd }| | d  } ||d  }||d  }g }g }g }g }t tD ]}	tjdd|| |fdd}
|r?tjd	dd
|fdd}ntjd	d|d
fdd}||
\}}}|rt||rYdnd	d\}}t|| }t|
| }| 	d	||  }|
 }||  | d | }t|
dd\}}t|| }t|| |}n?t|d	d\}}|	d	||  }t||}t|
|}|
 }|| | d | }t|
dd\}}t||}t|||}| }|| }|| }|| }t|| }|t|d  }t|| }|t|d  }||   ||   ||   ||   q"t|dk s@J t|dk sIJ d S )Nc                 S   sN   t j| ddd}t j| ddd}|| d }d| | |  | t j||fS )Nr   Tr          @r   )r   r   aminr`   r   )r4   r   r   scaler   r   r   min_max  s   "z"test_minmax_igemm.<locals>.min_maxr   r   rJ   r   rC   r   r   r[   r   r   ?  Hz>r   g333333?)rn   r   r   r   r   ra   r   r  r  r   rU   r   r   rf   rz   rq   r   )r  r  r  r  r  errsrelerrserrs2relerrs2rr   rg   r   r   r   r  r   r   r   r   offsetr   out3r   r   r   err2relerr2r   r   r   test_minmax_igemm  sb   r$  r   r   r   )r   r   r   r   c                 C   r   )z,dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}r   r   r   r   r   r     r   z!dim1, dim2, dim3, dim4, transposec                 C   s  ||d  }||d  }||d  }t tD ]}|d r!| ||fn| ||f}|d r/| ||fn| ||f}tjdd|ddtj}tjdd|ddtj}	|d sg|d sgt| |	 }
t	||	}nr|d s|d rt| |	
g d }
t	||	
g d}nO|d r|d st|
g d |	 }
t	|
g d|	}n,|d r|d rt|
g d |	
g d }
t	|
g d|	
g d}tj| |
  qd S )	Nr   r   r   r   r   rC   r   )r   r   r   )rn   r   r   r  r`   r   r   rU   ra   r   permuter   r   )r   r   dim3dim4r  rr   r	  r
  rg   r   r   r   r   r   r   	test_ibmm%  s0     r(  c                 C   r   )zdim1_{}_dim2_{}_dim3_{}r   r   r   r   r   r   F      zdim1, dim2, dim3c           	   	   C   s|   ||d  }||d  }t tD ]+}tj||fdd}tj|dd\}}t||}| }t||ddt	|d d	 qd S )
Nr   rC   r   r   r  r\   皙?Mb`?r   r   r   )
rn   r   r   rd   ra   vectorwise_quantvectorwise_dequantnumelr   int)	r   r   r&  rr   rg   qASArs   r   r   r   r   test_vector_quantI  s   r3  row)colr4  col32c                 C   r   )zKdim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}r   r   r   r   r   r   c  r)  z:dim1, dim2, dim3, dims, dtype, orderA, orderOut, transposec                 C   s  |dkr
t dkr
d S |tjkrt dkrd S t||||}|dkr0tjdd| |fdd|}	n|dkrCtjdd| ||fdd|}	tj|	|d\}
}|d	kr\tj	|	
 |

  n|d
krntj	|	 
 |

  n|dkr|dkr|	jd |	jd d|	jd d    }n|dkr|	jd |	jd  |	jd d|	jd d    }|
 |ksJ n|dkrJ|	jd d|	jd d   |	jd d|	jd d    }|
 |ksJ |	jd d |	jd d dkrdnd }t|	jd D ]V}t|	jd D ]L}||	jd  }|}|d |d dkrdnd }|d |d dkr"dnd | }d||  }|d }|d d }|	
 ||  |	||f ksHJ qq|dkrctj|
|d	|d\}}tj	|	| d S d S )Nr  r6  r   r   r   rC   r   to_orderr4  r5  r   r   r   
col_turing   r[   )
from_orderr8  state)	out_orderr   r  ra   get_transform_funcr  r`   nvidia_transformr   r   flattenr  r   r/  rn   )r   r   r&  dimsrT   orderAorderOutr  funcrg   r   r~   r   total_coltiler4  r5  rr   jcoltilerowtiler   col2row2r   r   r   r   test_nvidia_transformf  sh   (
($

rK  rX   )r   r  c                 C   r   )z.dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}r   r   r   r   r   r     r   z!dim1, dim2, dim3, dim4, dims, ldbc                 C   sr  t tD ]}|dkrtjdd| |fddtj}n|dkr.tjdd| ||fddtj}tjdd||fddtj}t| |  }	t	
|d\}
}t	
|d\}}t	|
|||\}}t	j|d	|d
\}}tj|	|  tjdd||fddtj}t| | }	t	j
|ddd\}}t	|
|||\}}t	j|d	|d
\}}tj|	|  qd S )Nr   r   r   rC   r   r  r6  r9  r4  r<  T)r  )rn   r   r   r  r`   r   r  rU   r  ra   	transformigemmltr?  r   r   )r   r   r&  r'  rA  ldbrr   rg   r   C1ru   r2  B2SBC2SCC3r~   B2tSBtr   r   r   test_igemmlt_int  s:   rX  )r   c                 C   r   )z'dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}r   r   r   r   r   r     r   zdim1, dim2, dim3, dim4, dimsc                 C   s&  t  }ttD ]}|dkrtjdd| |fdd }n|dkr.tjdd| ||fdd }tj||fdd }tjj	
| t|| }	t|| }
|d|jd }t |\}}}}}t |\}}}}}t |d	\}}t j||d
\}}t ||||\}}t ||||}qd S )Nr   r   r   rC   r   r  rY   r6  r7  )ra   get_special_format_strrn   r   r   r   rV   rd   r%   r*   r+   r  r  r   r   r   double_quantrM  rN  r   )r   r   r&  r'  rA  formatBrr   rg   r   rP  rS  CACAtstatsAstatsAt
coo_tensorCBCBtstatsBstatsBtC32Ar2  CxBrR  out1_32Sout1_32outputr   r   r   test_igemmlt_half  s,   rk   0  i   i <     c                 C   r   z"batch_{}_seq_{}_model_{}_hidden_{}r   r   r   r   r   r     r  zbatch, seq, model, hiddenc                 C   s   t  }tj| ||dd }tj| ||dd }tjdd||fdd }tjdd||fdd }td tj}	|d|j	d 
 }|d|j	d 
 }tj  t }
ttD ]
}t|| }q^tj  t |
 }t| d S )NrC   rY   r   r   r    rY  )ra   rZ  r   rd   rV   r  r   r   r   r   
contiguousrC   rL   timern   r   r  r  )batchseqmodelhiddenr\  rg   gradw1w2rT   t0rr   out1t16r   r   r   test_bench_8bit_training#  s    

	r|  r9  
col_amperec                 C   r   )z.dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}r   r   r   r   r   r     r)  z#dim1, dim4, dims, formatB, has_biasc              	   C   sz  t jdddd }d }|rt j|dt jd}t }tdD ]}t j| |dd}t j||dd}	t |	 |	
 	 }
|rC|
|7 }
tj|dd\}}tj|	dd\}}t|d	\}}t||\}}t||||\}}tj|d
|d\}}t| ||
 }|r||7 }|
ddd}|
| }
|| }tj||| | |d}| }t|
|ddtd| d qd S )Nr   r   r   r   rC   r   rY   r  r6  r4  rL  r   rY  r!   r   r*  r\   r,  )r   r  r   rd   float16ra   rZ  rn   r  rV   r  r-  r?  rN  vectorwise_mm_dequantrU   r   r   r   r@  r/  r   r0  )r   r'  rA  r\  has_biasinnerr"   rr   rg   r   rP  rs   r   B1r   ru   r2  rQ  rR  rS  rT  rU  r~   C4r   C5r   r   r   r   test_dequant_mm  s0   r  c                 C   r   )zdim1_{}_dim2_{}_dims_{}r   r   r   r   r   r     r)  zdim1, dim2, dimsc                 C   s  t tD ]}d}tj| |dd }| }d|t|dk< |dkrTt| d\}}t| d\}	}t| d\}
}t| d\}}nJ t	j
||d	\}}}tjt|d
ddd}t||kd }tj|jd d |j|jd}|d|dd < tj|| tj|
| tj| | t	j
|dd	\}}}tj|	| tj|| |d u sJ qd S )N      @rC   rY   rJ   r   r   r   Fr   zC(rows row_tiles) (cols block_size)-> rows cols row_tiles block_sizer   r[   )	row_tiles
block_sizer  r   )rn   r   r   rd   rV   clonerf   rU   r   ra   get_colrow_absmaxr   r   r   r@  r   r   rT   rZ   cumsumr   r   r0  )r   r   rA  rr   r   rg   A_truncated
row_stats1_
col_stats1row_stats1_trunccol_stats1_trunc
row_stats2
col_stats2nnz_block_ptr2	A_blockednnz_rows1_countsnnz_block_ptr1r   r   r   test_colrow_absmax  sJ   r  c                 C   r   zdim1_{}_dim2_{}r   r   r   r   r   r   1  r)  z
dim1, dim2c                 C   sJ  t tD ]}tj| |dd }tj|dd\}}tj|dd\}}t|\}}	}
}}tjj	||ddd tjj	|	|ddd |	
 }tj||dddk  }tj|	|dddk  }d}||| krwtd	| d
|| d J ||| krtd	| d
|| d J tj	|  |
 tj	|  | qd S )NrC   rY   r   r  r   r]   )r   r+  zMin error exceeded z  elements are different. Error: .4f)rn   r   r   rd   rV   ra   r-  r[  r   r   r/  r   r   r   r   r@  rU   )r   r   rr   rg   out_col1Scolout_row1Srowr]  r^  r_  r`  ra  r   num_not_close_rowsnum_not_close_cols	min_errorr   r   r   test_double_quant4  s4   r  r   c                 C   r   zdim1_{}_dim4_{}_inner_{}r   r   r   r   r   r   `  r)  zdim1, dim4, innerc           !      C   s  t tD ]}tj| |dd }tj||dd }t| |  }t|\}}}	}
}t|\}}}}}tj	|dd\}}tj	|dd\}}tj
|  |	 tj
|  | tj
j||ddd tj
j||ddd t|d\}}t|d\}}t||||\}}t|||	|}t|d\}}t|d\}}t||||\}}tj|d	|d
\}}t| || }t||   }t||   } | |d ksJ qd S )NrC   rY   r   r  r   r   r   r6  r9  r4  rL  gffffff?)rn   r   r   rd   rV   r  r  ra   r[  r-  r   r   r@  rU   r?  rN  r   r  rf   rq   r   )!r   r'  r  rr   rg   r   rz  C1aC1bstats1astats1bra  C2aC2bstats2astats2brs   r   r  r   ru   r2  rQ  rR  outC32rT  r   rS  rU  r~   r!  err1r"  r   r   r   test_integrated_igemmltc  s2   r     c                 C   r   r  r   r   r   r   r   r     r)  z"Row scale has some bugs for amperec           -   	   C   s  t  }g g g }}}g g }}d}	ttD ] }
tj| |dd }tj||dd }tjj	| t
|| }t
| |  }t |\}}}}}t j|dd\}}t |d\}}t ||\}}t j|dd\}}d| |	 }t|| }t j||||tj|d	\}}t j|d
|d\} }!t|  }"|"dkrd}	n|"d }	| | | | d }#t
| |  }$t |\}%}&}'}(}t |%|\}}t ||||\}}t ||||'})t j|ddd\}*}t j|ddd\}}t
|* |  }+|+| | d },|t||)    |t||#    |t||,    qtd tt|t|  tt|t|  tt|t|  d S )Nr   rC   rY   r   
quant_typer6  r        $@rT   	row_scaler4  rL  r         ?r   r  vector)r   r  ro  )ra   rZ  rn   r   r   rd   rV   r%   r*   r+   r  r  r[  r-  r?  	ones_likerN  r   rf   r   rU   r   rz   rq   r   r   r   r{   )-r   r'  r  r\  r  r"  err3relerr1r#  r  rr   rg   r   rP  rz  r  r  r  r  ra  rb  absmaxBru   r2  rQ  rR  rs   r   r   r  r  rT  rU  r~   maxvalr!  r  r  r  r  r  r   r]  rt   out4r   r   r   test_igemmlt_row_scale  sT   

r  c                 C   r   r  r   r   r   r   r   r     r)  c           #   	   C   s  g g g }}}g g }}d}t j| |dd }	t j||dd }
t jj|
 ttD ]
}t |	|
	 }q.t j
  t }ttD ]
}t |	|
	 }qFt j
  tdt |  t|	\}}}}}tj|
dd\}}t|d\}}t|t\}}tj|	dd\}}d	| | }|| }t j
  t }ttD ]}tj||||t j|d
\}}qt j
  tdt |  t|
\}} }!}"}t|t\}}t j
  t }ttD ]}t||||\}}qt j
  tdt |  d S )Nr   rC   rY   16r   r  r6  r  r  r  zrow-wisezvector-wise)r   rd   rV   r%   r*   r+   rn   r   r  r  rC   rL   rq  r   ra   r[  r-  r?  r\  rN  r   )#r   r'  r  r  r"  r  r  r#  r  rg   r   rr   rP  ry  r  r  r  r  ra  rb  r  ru   r2  rQ  rR  rs   r   r   r  r  rT  r  r  r  r  r   r   r   test_row_scale_bench  sJ   






r  )r6  r9  r}  c                 C   r   )zAdim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}r   r   r   r   r   r     s    c                 C   s   t tD ]w}|dkrtjdd| |fdd|}	n|dkr,tjdd| ||fdd|}	d|	dd< |rE|	  }
tj	|
|d\}}n	tj	|	|d\}}tj
|	||d	\}}|d
 d
 |d
 d
 ksfJ |d
 d |d
 d kstJ tj|| qd S )Nr   
   c   rC   r   r  rY  r7  )r8  r  r   r   )rn   r   r   r  r`   r   r  rp  ra   r?  rM  r   r   )r   r   r&  rA  rT   rB  rC  r  rr   rg   Atrz  S1r   S2r   r   r   test_transform   s*   r  !   c                 C   r   )z.dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}r   r   r   r   r   r   K  r   c            
      C   s   t  } t|  tdD ]I}tdd tj	dd}tdd tj	dd}t 
|d\}}t 
|| \}}t j||||tjd}t| |  }	qd S )Nr   r      rY  r   r6  rT   )ra   rZ  r   rn   r   arangerC   r`   r   r   r?  rN  r  rU   r  )
r\  rr   r   r   CaSaCbSbr   c2r   r   r   test_overflowQ  s     r  c                 C   r   r  r   r   r   r   r   r   f  r)  c                 C   s   d}t tD ]k}tj| |dd }t||k}t|\}}}}	}
tj||d\}}}}	}
|
d urq|| }t|}|
j	||
j
 |
j f< tj|| ||dk }| |d d  }tjj||dk |dd	d
 qd S )Nr  rC   rY   r  r   r   r   r^   r   r  )rn   r   r   rd   rV   rf   ra   r[  
zeros_likevaluesrowidxlongcolidxr   r   rU   	unsqueeze)r   r   r   rr   rg   r   CA2r^  r_  r`  ra  r]  rs   ru   r   r   r   test_coo_double_quanti  s.   
r  c                 C   r   )zdim1_{}_dim2_{}_transposed_B_{}r   r   r   r   r   r     r)  zdim1, dim2, transposed_Bc              	   C   s   d}t jdddd }ttD ]}}t | |  }|r)t ||  }n
t ||  }t ||k}|dk	  }	t 
|\}
}|| }t|jd |jd |	|
 | |}|| }|rxt|| }t || }nt||}t ||}t||dd	d
d qd S )Nr  r   r   r~  r   r   r   r\   Q?   r   r   r   )r   r  r   rn   r   rd   rC   rV   rf   r   wherera   COOSparseTensorr   r0  spmm_coor  r  r   )r   r   transposed_Br   r&  rr   rg   r   r   nnzrowscolsr  cooAru   r   rz  r   r   r   test_spmm_coo  s,    r  c                  C   s  d} d}|d }d}| | }|}|}d}t j||dd }t j||dd }	tdD ]
}
t||	 }q,t j  t		 }tt
D ]
}
t||	 }qDt j  t		 | }t ||k}|dk  }t||   t |\}}|| }t|jd |jd || | |}tdD ]}
t||	}qt j  t		 }tt
D ]}
t||	}qt j  t		 | }t|| t||  d S )	Nr   rX   r   rC   rY   r  r   r   )r   rd   rV   rn   r   r  r  rC   rL   rq  r   rf   r   r   r   r/  r  ra   r  r   r0  r  )rr  rt  ru  rs  r   r   r&  r   rg   r   rr   rP  ry  t8r   r  r  r  r  r  r   tspr   r   r   test_spmm_bench  sH   

 


r  c                 C   r   r  r   r   r   r   r   r     r)  c                 C   sf  d}d}t tD ]}t| |  }t| |  }t|| }t	|\}}	}
}}t
||\}}t	|\}}}}}t
|d\}}t||||\}}t||||
}tj	||d\}}}}}t
|d\}}t||||\}}t||||
}|d usJ t|| }|| }t||   }t||   }||k sJ qd S )Nr  r9  r6  r  )rn   r   r   rd   rC   rV   r  r  ra   r[  rM  rN  r   r  rf   rq   r   )r   r   r   r\  rr   rg   rw  rz  Cw1Cw1tstatsw1statsw1tra  CTw1Sw1r]  r^  r_  r`  rf  r2  rh  ri  r   r!  r  out5r  r"  r   r   r   test_integrated_sparse_decomp  s2   r  c                  C   s   t dd  } t dd  }t | | }t| |}t| | }t || 	 
 }t || 	 
 }|dk sFJ |dk sLJ t|| d S )Nr[   r   g?)r   rd   rV   rC   r  r  r   matmul_cublasrf   rq   r   r   )r   r   c1r  c3r  r"  r   r   r   test_matmuls  s   r  r   onesc                 C   r   )z$dim1_{}_dim2_{}_dtype_{}_out_func_{}r   r   r   r   r   r     r  zdim1, dim2, dtype, out_funcc                 C   s  t t|}d}tj| |dd }|tjkr*tj||d dd }tjj| ntj||d dd }tjj| tj	|dd\}}t
d t||k}|dk  }	t|\}
}|| }t|jd	 |jd |	|
 | |}|| }t| | }||jtj|jd
}|| 7 }tj|||d}d}| }t|| }| }|| }|| }t|| dd|d tjd	|jd dd}d S )Ngffffff
@rC   rY   r   r   r  ro  r   r   r   r  竪>r\   r  r  rY  r  r   )getattrr   rd   rV   r  r%   r*   r+   ra   r-  r   rf   r   r   r  r  r   r0  r  rZ   r  spmm_coo_very_sparser/  mathceilr   r   r  )r   r   rT   out_funcr   rg   r   rR  r   r  r  r  r  r  ru   rz  r   r   pr   r   r   idx_colr   r   r   test_spmm_coo_very_sparse!  s@   

 r   c                  C   s   d} t dd  }t || k}|dk  }t |\}}|| }t	|j
d |j
d || | |}|| }t|}	|	jdd  |	jd d  }
|
 |j
d ks\J t j|
 |dkd |dk}t j|| |	j d S Nr   r   r   rY  )r   rd   rV   rC   rf   r   r   r  ra   r  r   r0  coo2csrrowptrr/  r   r   r  r  )r   rg   r   r  r  r  r  r  ru   csrAcountsr   r   r   test_coo2csr_  s     
r  c                  C   s   d} t dd  }t || k}|dk  }t |\}}|| }t	|j
d |j
d || | |}|| }t|}	|	jdd  |	jd d  }
|
 |j
d ks\J t j|
 |dkd | dk}t j| | |	j d S r  )r   rd   rV   rC   rf   r   r   r  ra   r  r   r0  coo2csccolptrr/  r   r   r  r  r  )r   rg   r   r  r  r  r  r  ru   cscAr  r   r   r   test_coo2cscs  s     
r
  c                 C   r   )zdim1_{}_dim2_{}_dtype_{}r   r   r   r   r   r     r)  zdim1, dim2, dtypec           !      C   s  d}t j| |dd }t j||d dt jd}t jj| | 	 }t
|\}}}	}
}t jd|jd dd	}d
|d d |f< t ||k}|dk  }t |\}}|| }t
|jd |jd || | |}|| }t
j|||
d}t || }t
|| }||
  d }t j|jdd\}}|d }t j|dd\}}tt |  t jj||ddd d}| }t !|| }t"||dd|d t j#$  t%% }t&dD ]}t
'||}qt j#$  tdt%% |  t j#$  t%% }t&dD ]	}t
||}qt j#$  tdt%% |  t j#$  t%% }t&dD ]}t
j|||
d}q-t j#$  tdt%% |  t j#$  t%% }t&dD ]	}t ||}qTt j#$  tdt%% |  t j#$  t%% }t&dD ]}t(||}t
j|||
d}|| } qyt j#$  tdt%% |  t j#$  t%% }t&dD ]}t(||}t j|d d |f | | |d qt j#$  tdt%% |  t j#$  t%% }t&dD ]	}t(||}qt j#$  tdt%% |  d S ) N      @rC   rY   r   r   r   rY  r  r          @r   )dequant_statsr   T)return_counts)
descendingr^   r   r  r  r\   r  r  rm   zcusparse fp16r   zint8+dequantr  zsparse+ matmulr  zpartial matmul))r   rd   rV   r  r  r%   r*   r+   r  rp  ra   r[  r  r   rf   r   r   r  r  r0  r  r  uniquer  r  r   r   medianrU   r   r   r/  r  r  r   rC   rL   rq  rn   r  r   )!r   r   rT   r   rg   r   Btrb  rc  rd  re  ra  r  r   r  r  r  r  r  ru   r   rz  r!  r  r   	max_countmax_idxr  r   r   ry  rr   r   r   r   r   test_spmm_coo_dequant  s    
	









(


r  i   i h  c                 C   r   rn  r   r   r   r   r   r     r)  c                 C   s  d}t  }tj| ||dd }tj||tjdd}tjj	| t 
|\}}	t j
|dd\}
}t |\}}t j|dd\}}tj||dd  }|  tjd|d	d
 }d|d d d d |f< tjj||dddd  }tj||d  }tjj||ddd  }tj|| |d t|D ]
}t||  qtj  td tj  t }t|D ]
}t||  qtj  td|  d| d| d| d| d|  d| d| dt | dd tj  t }t|D ]}tj|| |d qtj  td|  d| d| d| d| d|  d| d| dt | dd tj  t }t|D ]}tj|| |d qGtj  td|  d| d| d| d| d|  d| d| dt | dd d S )Ni  rC   rY   r   T)compress_statisticsFr   )r   r   r  r  r  )quant_statero  zpytorch fp16: [,z], [z]->[z]: r  rK   z
bnb nf4: [zbnb nf4+DQ: [)ra   rZ  r   rd   rV   r  r  r%   r*   r+   quantize_fp4quantize_nf4r   Linear8bitLtrC   evalr  matmul_4bitr  rn   r  rL   r   rq  )rr  rs  rt  ru  itersr\  rg   r   B_fp4r<  B_fp4_cstate_cB_nf4	state_nf4B_nf4_cstate_nf4_c
linear8bitoutlierslinearMixedBitlinear8bit_trainlinear8bit_train_threshrr   ry  r   r   r   test_bench_matmul	  sN   


J

J

Nr+  c                   C   s  dd } d}d}d}d| }t j|| |dd d	 }t j||dd d	 }t ||}| }| }t ||}t| |   }	d
}
t |||
 }||d
	dd
|
 8 }| |\}}}t
| |  t
||  ||   d
}
d}t || |
 |}||d|
 7 }|| }| |\}}}t ||}||d| 8 }|| }d
}d
}d}d}t || | || | }|||d	d
d | ||d
	dd
 |  8 }||| |jd
  8 }|||  }| |\}}}| |\}}}t ||}|||d	d
d | ||d
	dd
 |  8 }||| |jd
  8 }|||  }t
d t
| d d  t
| d d  t
|	 d d  t
| d d  t
| d d  t
| d d  t ||   }t ||	   }t ||   }t ||   }t ||   }t ||   }t
|||||| d S )Nc                 S   sf   | j }|  } |  |   }|dkrd}d| }|  }t|  | d }||  | } | ||fS )Nr   r   g     o@r   )rT   rU   r   r   r   r   )r4   rT   dynaqxminxzpxr   r   r   quant_zp  s   
z test_zeropoint.<locals>.quant_zpr   r   rX   r   rC   rY   r*  r   rY  r  r   ro  r  )r   rd   rV   r  rU   r   r  rp  r   r   r   r   r   r   r@  rf   rq   r   ) r0  rr  rs  rt  ru  rg   r   C0rP  rU  zprS  cacqaczar  r  r]  qazpar  zpbqbC6rb  C7r  r"  r  err4err5err6r   r   r   test_zeropoint  sn   88r?  c                  C   s  t tD ]} d}ttjd|d dd  }tjdd|dd	tj}|d d |	 f }t
|d
\}}t
|||}|jd |d ksJJ |jd | ksUJ tj|| t
|d\}}t
|||}|jd |d ksvJ |jd | ksJ tj|| qd S )Nr   r   r   r   )r  r   r   r   rC   r   r9  r}  )rn   r   r   r  r  r0  rC   r`   r   r  ra   rM  extract_outliersr   r/  r   r   )rr   r	  r   rg   	outliers1r]  r2  	outliers2r   r   r   test_extract_outliers  s    "rD  c               	   C   s   g } g }d}d}dD ]^}dD ]Y}t dD ]R}tj|||dd}t }tj||d\}	}
tj|	|
|d}tt |  t|| }|t|d  }| 	|
   |	|
   | d	 d
k sfJ qqq
d S )Nr   )r   r@  r   cpurY   r   rw   rY  r   )rn   r   rd   rq  ra   r   r   r   rf   rz   rq   r   )r|   r}   rr  rs  ru  r   rr   rs   ry  rt   r~   ru   rk   r   r   r   r   test_blockwise_cpu_large  s(   rG  c                  C   s  t ddD ]} d|  }td| | }g }g }t dD ];}tjdddd}tj||d\}}t||}	t||	 }
|
t|d	  }|	|

   |	|
   qg }g }t dD ];}tjdddd}tj||d\}}t||}	t||	 }
|
t|d	  }|	|

   |	|
   q`g }g }t dD ]9}tjdddd}t|\}}t||}	t||	 }
|
t|d	  }|	|

   |	|
   qqd S )
Nr      Trm   rX   rC   rY   rh   rw   )rn   ra   create_fp8_maprC   r   rd   r   r   rf   rz   rq   r   r_   )e_bitsp_bitsrh   r   r   rr   rs   rt   rT  ru   rk   r   r   r   r   test_fp8_quant-  sF   rM  c                  C   s>  t ddD ]} dD ]}g }g }d }|dkr!tjd| d }n@|dkr=t| d }| | d }td|||  }n$|d	krMtd| d
 |  }n|dkratj	dddd}t
||  }t| d|  d|  d fv s|J d|  d| | dksJ t dD ]}tj	dddd}||   }g }	g }
|d
 D ]}t||  }|	|  |
||   qt|	 }	t|
 }
tj||d\}}t||}t|	 | }t|| }||   ||d|      | rt|
|  }qtj|	| qq
qd S )Nr   	   )r   fp8dynamicre   r   T)
total_bitsrO  r   rP  r   re   r   rC   rY   zbits: z
, method: r[   r  r   rI  g|=)rn   ra   create_linear_maprC   r  r  rJ  r   r   rd   create_quantile_mapr  r/  rf   r   argminrz   r   Tensorr   r   r   r0  rq   r   r   r   )bitsmethodabserrsr  rh   ebitspbitsr  rr   q1v1vr   q2r  v2r"  r  r   r   r   test_few_bit_quant]  sR   
6
r`  c            
      C   s^  t dD ]>} tjdddd}t ddD ].}tddd| }tt| }t	j
|d	d| d
}t||  }|dk sAJ qqt dD ]e} tjdddd}t ddD ]U}d| d }td	dd| d }tdd| d d}|| }dd|  }	t|	d|	 |}tt| }t	j
|d| d d}t||  }|dk sJ qVqGd S )Nrm   rX   rC   rY   r   rN  gC8
!?gt^?r   )r   num_quantilesg~jt?r   r   )ra  gQ?)rn   r   rd   nprc   rU  r   ppfrC   ra   rb   rf   rq   r  )
rr   datarV  r  val1val2r   total_valuesr   r   r   r   r   test_kbit_quantile_estimation  s0   rh  c                  C   s   t jdddd } tdddd }tj| |d\}}t|  d	}t j	  t

 }td
D ]	}t| \}}q2t j	  d S )NrX   rC   rY   Tr  r   r   rI  gaa@@rm   )r   r_   rV   ra   rJ  rC   r   r   r   rL   rq  rn   )r   rh   r6  r2  max_theoretical_mury  rr   r   r   r   test_bench_dequantization  s   
rj  c                 C   s^  t tddgdd}i }|D ]S}d}d}|\}}}}	|d |d  |d  |	d  }
|r.dnd	}|d |d  }|dkrH|	dkrCd}n|d
 }nd| | d  }|	rUdnd	}|| | }|||
< qtjddd| d}tj|dd\}}t||}||   }||  d  	 }|d	k}
|	 }|j
| ksJ | dk sJ | dk sJ d S )Nr   r   r   )repeatr  r:  r   g      r   g      ?r  rX   rC   r   r   rF  rw   r*  Q?)listr   r   rd   ra   r  dequantize_fp4rf   rU   rq   rT   r   )rT   r   rh   rV  resultr"   signe1e2p1r   expfracrs   r6  r2  ru   r   r   r   r   r   test_fp4_quant  s4    

rv  zthis test requires a GPU)reasonr  fp4nf4c                 C   s>  dD ]}g }g }t dD ]}tjdddd }tj||| d\}}tj||d| d\}}	tj||| d	}
tj||	| d	}||
   }||  d
  	 }|	 }|
|  | dk sdJ | dk slJ ||   }||  d
  	 }|	 }|
|  | dk sJ | dk sJ qqd S )N)r   r   r  rX   rC   rY   r   r  T)r   r  r  r  gV瞯<g)\(?rl  )rn   r   rd   rV   ra   quantize_4bitdequantize_4bitrf   rU   rq   rz   r   )r  r   errs1r  rr   rs   r^  SA2q3SA3ru   A3r   r   r   r   r   test_4bit_compressed_stats  s.   r  c                 C   s   d}t jdddd }tj||| d\}}| d }| d }|| }|d }|d	 }	t jd
ddd }
d}t j  t		 }t
|D ]}tj|||| d qGt j  d S )Nr[   rm  rl  rC   rY   rz  r   g    eAi   r   rm   )r   r_   rV   ra   r{  r/  rd   rC   rL   rq  rn   r|  )r  r   r   r6  r2  
input_sizeoutput_size	num_bytesGBmax_theoretical_sr   r  ry  rr   r   r   r   test_bench_4bit_dequant	  s   
r  c                  C   s   t  } | d d  | dd    }d}t| |dk rTttd| d dd| }t| |d9 }g }|D ]}|||d  ||  d  q8t| |dk sd S d S )Nr:  ir   r   r   )ra   create_normal_maptolistr   rm  rn   rz   )rh   r  
num_pivotsr   pivotsrr   r   r   r   test_normal_map_tree2	  s     r  r[  DQ_TrueDQ_Falsestorage_typekind)r'   r(   attnattn_packed)r   r   r   c           )   
   C   s6  dD ]}g }g }g }g }g }	g }
g }g }g }t dD ]O}|dkr<tjd|| dd}tj|d || ddt| }nY|dkr\tjdd| | dd}tj|d| | ddt| }n9|d	krxtjd|| dd}tj||| ddt| }n|d
krtjd|| dd}tj|d || ddt| }tj|||d\}}t|| }tj	|| |d}d|_
t|| |}||   }||   }||   }t| d }t| d }t| d }|| }|| }|| }| }| } | }!||   ||   ||   ||   |	|   |
|   ||  ||   ||!  t| d |d  d }"t||dd|"dd}"qt|t| t| }t|t| t| }t|t| t| }t|t| t| }t|	t|	 t| }t|
t|
 t| }t|t| t| }#t|t| t| }$t|t| t| }%|| }&|| }'|| }(t| dd   t| dd   td|  td|  td|#  | tjkrc|dkr0|dk s(J |dk s/J n|dk s7J |dk s>J |&dk rH|&dksJJ |'dk rT|'dksVJ |(dk r`|(dksbJ q| tjkr|dkr|d k suJ |d!k s|J |#d"k sJ n|d k sJ |d#k sJ |#d"k sJ |&dk r|&dksJ |'dk r|'dksJ |(dk r|(dksJ q| tjkr|dkr|d$k sJ |d%k sJ |#d&k sJ n|dk sJ |d'k sJ |#d(k sJ |&dk r|&dks J |'d)k r
|'d*ksJ |(d+k r|(d,ksJ qd S )-N)r   r[   r   rX   rm   r'   r   rC   r   r   r(   r  r  r  r  r  rL  Tgh㈵>gǺV?r[   r\   F)r   r   izinference vs training abs: zinference vs training rel: zinference vs training max: r   gמY?g-C6J?giUMu?g-C6*?gGz?gףp=
?gHj>gư>r  g>ga2U0*C?gy&1|?r   r+  ga2U0*S?gp=
ף?gQ?gRQ?g\(\?)rn   r   rd   r  r   ra   r{  r  r  	gemv_4bitrequires_gradr   r  rf   rU   r   rz   rq   r   r0  r/  r   r   r{   r   r@  r  float32bfloat16))rT   r  r[  r  r   r}  r  errs3relerrs1r  relerrs3	max_errs1	max_errs2	max_errs3rr   rg   r   qBr<  rU  rS  rP  r  r"  r  mag1mag2mag3r  r#  relerr3max_err1max_err2max_err3r   maxerr1maxerr2maxerr3absratiorelratiomaxratior   r   r   test_gemv_4bitA	  s   
"" 


r  c                  C   s2  d} t j| | tjd}t j| | tjd}t j| | tjd}|js"J |js'J |jdks.J |jdks5J t |d t |d t |d |dk 	 | |  ksUJ |dk 	 | |  kscJ ||
  }|dk 	 | |  kswJ t || t || t || |dk 	 | |  ksJ d S )	Ni@  r  r   g      1@   r   i!     )ra   	get_pagedr   r  uint8is_pagedpage_deviceidfillr   r   rU   _mul)r   rg   r   rQ  rt   r   r   r   test_managed	  s&   

 r  c                 C   s   d}t jtjdd t jdd|fd }dd |D }|D ]S}t jddd	d	|f|d
d}t j||d
d}tj	|| |d\}}t 
|| }	t|| |}
d|_t|| |}t j||	 t j|| t j||
 q!d S )Nr  r   i2i    r   c                 S   s   g | ]
}|d |d    qS )r   r   )r   r   r   r   r   r   	  s    z&test_gemv_eye_4bit.<locals>.<listcomp>r*  r   rC   )r   rT   rZ   r   r  T)r   r  manual_seedrb  r  r  r   eyera   r{  r  r  r   r  r  r   r   )r  rT   r[  rA  r   rg   r   r  r<  rU  rS  rP  r   r   r   test_gemv_eye_4bit	  s    r  )r   r   r   T)r   ){r  r  rq  	itertoolsr   r   pytestr   numpyrb  bitsandbytesr   r   ra   scipy.statsr   set_printoptionsr   r   r%   Moduler    r:   rQ   rS   markparametrizer  r  rl   rv   r   r  r   r   r   r   r   r   r   r   rq   r   r   methodsrz   method_namesr   rm  r  values_namesnamesr   r   r   r  r  r  r  r  r  r  r  r$  r&  r'  r(  r3  r   r  rT   a_orderr=  rA  rK  rO  rX  rk  
batch_sizeseqdimr|  r\  r  r  r  r  r  zipr  skipr  r  r  r  r  r  r  r  r  r  out_functionr   r  r
  r  r+  r?  rD  rG  rM  r`  rh  rj  rv  skipifrC   is_availabler  r  r  r  r  r  r   r   r   r   <module>   s   

&
 -

	
&
3

E


>
!
(



 
%
-
%
"
A
-


+
#
=
d
 V0< 
$% 
~
% 