o
    h@                      @   s  d dl Z d dlmZ e sdd ZdS d dlZd dlmZ d dlm	Z	m
Z
 dd Zdd	 Zejejd
dddddddejdd
ddddddejddddddddejddddddddejd
d
ddddddejd
dddddddejdd
ddddddejd
dddddddejddddddddejd
dd
dddddejdd
d
dddddejddd
dddddejddd
dddddejd
d
d
dddddejd
dddddddejdd
ddddddejd
dddddddejddddddddge  g de	e
dddeddd iejdejdejdejd ejd!ejd"ejd#ejdejd$ejfd%d&Zd'd ZdS )(    N)is_triton_availablec                 C   s   d S N )abstate_xstate_wbiasr   r   f/var/www/html/ai/venv/lib/python3.10/site-packages/bitsandbytes/triton/int8_matmul_mixed_dequanitze.pyint8_matmul_mixed_dequanitze   s    r   )early_config_pruneestimate_matmul_timec                    s    fddS )Nc                    s   |     S r   )zero_)nargsnamer   r
   <lambda>   s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r
   init_to_zero   s   r   c                  C   s   g } dD ]A}dD ]<}dD ]7}dD ]2}|dkrdnd}|  tj|||dd	||d
 dD ]}|  tj||||d	||tdd q,qqqq| S )N)               )       )r   @   )r   r         r   r   r      BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r      r   C)r%   r&   pre_hook)appendtritonConfigr   )configsr%   block_mblock_kblock_nr&   split_kr   r   r
   get_configs_io_bound   s(   

r2   r   r   r   r   r   r   r'   r$   r   r   r   r   )MNK
   )r   
perf_modeltop_k)r-   keyprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr5   r"   r#   r   r   )argsr   r   r
   r   E   s    r   	divfactorhas_biasr    r!   r"   GROUP_Mr#   ACC_TYPEc           .      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | } || t d| }!| | t d| }"t t |!| ||}#t t |"| ||}$|| t d| }%| |#d d d f | |%d d d f |   } ||%d d d f | |$d d d f |   }|| t d| }!| | t d| }"t |}&t ||# d d d f }'t j||ft j	d}(t
dt ||| D ]O})|rt | }*t |}+n(||)||   },t j| |%d d d f |,k dd}*t j||%d d d f |,k dd}+|(t |*|+7 }(| || | 7 } ||| | 7 }q|&|'|(|	   }(|(|jj}(|
rJt ||" |jj}|(|d d d f  }(||!d d d f | |"d d d f |   }|!|k d d d f |"|k d d d f @ }-|dkrt j||(|-d d S t j||(|-d d S )Nr   r   )dtypeg        )maskother)rB   )tl
program_idcdivminarangemax_contiguousmultiple_ofloadzerosint32rangedottorA   
element_tystore
atomic_add).ABr(   r	   state_x_ptrstate_w_ptrr3   r4   r5   r=   r>   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnr    r!   r"   r?   r#   r;   r@   pidpid_zgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkw_factorx_factoracckr   r   k_remainingrB   r   r   r
   _int8_matmul_mixed_dequantize&   sR   
+
,,

  ,(
rq   c                    s  | j }d}|d u rdnd}| ddkr| ddkr|  } |ddkr1|ddkr1| }| jd |jd ks?J d| j\ }|j\}	tj f|tjd}
tj} fdd}t	| | ||
||| |||| d| d|d|d|
d|
dd|d	 |
S )
NgA@?r   r   zincompatible dimensions)devicerA   c                    s(   t  | d t | d  | d fS )Nr    r!   r#   )r+   rF   )METAr3   r4   r   r
   r      s   ( z.int8_matmul_mixed_dequanitze.<locals>.<lambda>r'   )r?   r@   )
rr   stride
contiguousshapetorchemptyfloat16rD   float32rq   )r   r   r   r   r	   rr   r=   r>   r5   _cr@   gridr   rt   r
   r      s(   

)rx    bitsandbytes.triton.triton_utilsr   r   r+   triton.languagelanguagerD   triton.ops.matmul_perf_modelr   r   r   r2   autotuner,   
heuristicsjit	constexprrq   r   r   r   r
   <module>   sr    C