"""AWQ (Activation aware Weight Quantization) integration file"""
from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_torch_available
from ..utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion


if is_torch_available():
    import torch
    import torch.nn as nn


AWQ_FUSED_MAPPINGS = {
    "mistral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "llama": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
}


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
) -> bool:
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend
    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or "
            "check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
    elif backend == AwqBackendPackingMethod.LLMAWQ:
        from awq.quantize.qmodule import WQLinear

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        target_cls = WQLinear_GEMM if quantization_config.version == AWQLinearVersion.GEMM else WQLinear_GEMV
    else:
        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
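

# Usage sketch (illustrative only, not executed): roughly how a quantizer could call
# `replace_with_awq_linear`. The `model` object and the config values below are assumptions
# made for the example, not part of this module.
#
#     config = AwqConfig(bits=4, group_size=128, version=AWQLinearVersion.GEMM)
#     model, has_been_replaced = replace_with_awq_linear(
#         model, quantization_config=config, modules_to_not_convert=["lm_head"]
#     )
#     # `has_been_replaced` being False usually means no eligible `nn.Linear` module was found.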
}| jj}t| jd|}||d< ||d< ||d< |j|d< |S td)af  
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise ValueError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    # Always default to `quantization_config.modules_to_fuse`
    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        # Handle hidden_size, num_attention_heads, num_key_value_heads on our own.
        hidden_size = model.config.hidden_size
        num_attention_heads = model.config.num_attention_heads
        num_key_value_heads = getattr(model.config, "num_key_value_heads", num_attention_heads)

        # Fill `current_fused_mapping` with the expected values
        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. "
            "Please pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers "
            "https://github.com/huggingface/transformers to add its support."
        )

    return current_fused_mapping
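

# Shape sketch of the mapping returned by `get_modules_to_fuse` for a Llama/Mistral-style model.
# The numeric values below are illustrative assumptions; the real ones come from `model.config`
# and `quantization_config.fuse_max_seq_len`:
#
#     {
#         "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
#         "mlp": ["gate_proj", "up_proj", "down_proj"],
#         "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
#         "use_alibi": False,
#         "hidden_size": 4096,
#         "num_attention_heads": 32,
#         "num_key_value_heads": 8,
#         "max_seq_len": 2048,
#     }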


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`dict`):
            The quantization configuration to use.
    """
    # Convert the dict into an `AwqConfig` object, otherwise fields such as `backend`
    # would not be available.
    awq_config = AwqConfig.from_dict(quantization_config)
    backend = awq_config.backend

    modules_to_fuse = get_modules_to_fuse(model, awq_config)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    for name, module in model.named_modules():
        # Replace layer norms
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace MLP layers
        _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)

        # Replace attention layers
        _fuse_awq_attention_layers(model, module, modules_to_fuse, name, QuantAttentionFused)
    return model
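

# Usage sketch (illustrative only, not executed): fusing is normally triggered through the
# quantization config rather than by calling `fuse_awq_modules` directly. Assuming an AWQ
# checkpoint name of the caller's choosing:
#
#     quantization_config = AwqConfig(bits=4, do_fuse=True, fuse_max_seq_len=2048)
#     model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config)
#
# `AutoModelForCausalLM` and `checkpoint` are placeholders supplied by the caller, not imports
# of this module.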


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            module._modules[module_name] = target_cls(
                old_module.weight,
                old_module.variance_epsilon,
            ).to(old_module.weight.device)
            del old_module
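

# Effect sketch: for a LLaMA-style decoder layer, the loop above would swap e.g.
# `module.input_layernorm` (an RMSNorm holding `weight` and `variance_epsilon`) for a
# `FasterTransformerRMSNorm` built from those same tensors, kept on the original device.
# The attribute names are illustrative; the real ones come from the "layernorm" entry of
# the fused mapping.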


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device
        activation_fn = ACT2FN[model.config.hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`dict`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    if len(modules_to_fuse["attention"]) == 0:
        return

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, we pack the QKV layers together
        q_proj = getattr(module, modules_to_fuse["attention"][0])
        previous_device = q_proj.qweight.device

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj
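

# Note on `cat_dim` above: autoawq's GEMM layout stores packed weights with output channels on
# the second axis, so stacking Q/K/V means concatenating on dim=1, while the GEMV layout keeps
# output channels on dim=0. This is a simplified description of autoawq internals, not a formal
# specification.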