o
    h{                     @   s|  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ eeZdZdZg dZdededejfddZ dejdejfddZ!dejdejdejdejfddZ"G dd dej#Z$G dd  d ej#Z%G d!d" d"ej#Z&G d#d$ d$eZ'd%Z(d&Z)ed'e(G d(d) d)e'Z*ed*e(G d+d, d,e'Z+dS )-z PyTorch CodeGen model.    )OptionalTupleUnionN)nn)CrossEntropyLoss   )ACT2FN)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )CodeGenConfigSalesforce/codegen-2B-monor   )zSalesforce/codegen-350M-nlzSalesforce/codegen-350M-multizSalesforce/codegen-350M-monozSalesforce/codegen-2B-nlzSalesforce/codegen-2B-multir   zSalesforce/codegen-6B-nlzSalesforce/codegen-6B-multizSalesforce/codegen-6B-monozSalesforce/codegen-16B-nlzSalesforce/codegen-16B-multizSalesforce/codegen-16B-mononum_posdimreturnc                 C   sV   ddt d|d|   }t dt j| t jd| }t jt |t |fddS )	N      ?i'  r      zi , j -> i jdtyper   r   )torcharangeeinsumfloatcatsincos)r   r   inv_freqsinusoid_inp r$   b/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positions7   s   r&   xc                 C   sb   | d d d d d d d d df }| d d d d d d dd df }t j| |fdd} | dS )Nr   r   r   )r   stackflatten)r'   x1x2r$   r$   r%   rotate_every_two>   s   ""
r.   tensorr    r!   c                 C   s`   t |d d d d d d d f dd}t |d d d d d d d f dd}| | t| |  S )Nr   r   )r   repeat_interleaver.   )r/   r    r!   r$   r$   r%   apply_rotary_pos_embF   s   &&r1   c                       s   e Zd Z fddZdd Zdd Z		ddd	Z					
	
ddeej	 dee
ej  deej	 deej deej	 dee dee dee
eje
ej f ee
eje
ej e
ejdf f  f fddZ  ZS )CodeGenAttentionc                    s   t    |j}| jdttj||ftjddd||dd t	
|j| _t	
|j| _|j| _|j| _| j| j | _| j| j | jkrTtd| j d| j dttj| jtjdt | _t	j| j| jd	 dd
| _t	j| j| jdd
| _|j| _| jp| j}t||| _d S )Ncausal_maskr   r   F)
persistentzEembed_dim must be divisible by num_attention_heads (got `embed_dim`: z and `num_attention_heads`: z).r   )bias) super__init__max_position_embeddingsregister_bufferr   trilonesboolviewr   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropouthidden_size	embed_dimnum_attention_headshead_dim
ValueErrorsqrtr/   float32toget_default_dtype
scale_attnLinearqkv_projout_proj
rotary_dimr&   embed_positions)selfconfigmax_positionspos_embd_dim	__class__r$   r%   r7   M   s4   

$zCodeGenAttention.__init__c                 C   sJ   | |jd d || |f }| |jd d d |jdd   }|S )Nr(   r)   )r(   )reshapeshape)rR   r'   n_headdim_headmp_numreshapedr$   r$   r%   _split_headsl   s    &zCodeGenAttention._split_headsc                 C   s   t |jdkr|ddddd }nt |jdkr%|dddd }n
tdt |j | dd	 || f }||S )
zM
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
           r   r   r   r      z3Input tensor rank should be one of [4, 5], but is: Nr)   )lenrY   permute
contiguousrG   sizer=   )rR   r/   rE   attn_head_size	new_shaper$   r$   r%   _merge_headsq   s   
zCodeGenAttention._merge_headsNc                 C   s   | d| d}}| jd d d d || |d |f }|tj}|tj}t||dd}	|	| j }	t|	j	j
}
tj|
|	j	d|	j}
t||	|
}	|d ur[|	| }	tjdd|	}	|	|j	}	| |	}	|d urv|	| }	t|	|}||	fS )Nr)   r(   r   r   )rd   r3   rJ   r   rI   matmul	transposerL   finfor   minr/   devicewherer   Softmaxr@   )rR   querykeyvalueattention_mask	head_maskquery_length
key_lengthr3   attn_weights
mask_valueattn_outputr$   r$   r%   _attn~   s$   	&

zCodeGenAttention._attnFhidden_states
layer_pastrr   position_idsrs   	use_cacheoutput_attentionsr   .c                 C   s  |  |}d}	||jd d |	df }
| j| j |	 }tj|
|dd\}}}| j|| j| j|	d}| j|| j| j|	d}| j|| j| j|	d}|dddd}| j	}|j
|j
krc||j
}|| _	|| }tj||jd d dd\}}| jd ur|d d d d d d d | jf }|d d d d d d | jd f }|d d d d d d d | jf }|d d d d d d | jd f }t|||}t|||}tj||gdd}tj||gdd}nt|||}t|||}|dddd}|dddd}|d ur|d }|d }tj||fd	d}tj||fd	d}|d
u r'||j|f}nd }| |||||\}}| || j| j}| |}| |}||f}|rS||f7 }|S )Nr`   r(   r   )r\   r   r   r   r   r)   T)rN   rX   rY   rF   rE   r   splitr^   rb   rQ   rl   rJ   rP   r1   r   r   ry   rg   rO   rB   )rR   rz   r{   rr   r|   rs   r}   r~   qkvr\   	qkv_split	local_dimro   rq   rp   rQ   sincosr    r!   k_rotk_passq_rotq_passpast_key
past_valuepresentrx   rv   outputsr$   r$   r%   forward   sX   

""""




zCodeGenAttention.forward)NNNNNNFF)__name__
__module____qualname__r7   r^   rg   ry   r   r   FloatTensorr   Tensor
LongTensorr<   r   r   __classcell__r$   r$   rV   r%   r2   L   sD    
,	"r2   c                       s6   e Zd Z fddZdeej dejfddZ  ZS )
CodeGenMLPc                    sJ   t    |j}t||| _t||| _t|j | _	t
|j| _d S N)r6   r7   n_embdr   rM   fc_infc_outr   activation_functionactr>   rA   dropout)rR   intermediate_sizerS   rD   rV   r$   r%   r7      s   
zCodeGenMLP.__init__rz   r   c                 C   s,   |  |}| |}| |}| |}|S r   )r   r   r   r   )rR   rz   r$   r$   r%   r     s
   



zCodeGenMLP.forward)	r   r   r   r7   r   r   r   r   r   r$   r$   rV   r%   r      s    "
r   c                       s   e Zd Z fddZ						ddeej deeej  deej deej	 d	eej d
ee
 dee
 deeej eeejeejdf f  f fddZ  ZS )CodeGenBlockc                    sR   t    |jd ur|jnd|j }tj|j|jd| _t|| _	t
||| _d S )Nr`   eps)r6   r7   n_innerr   r   	LayerNormlayer_norm_epsilonln_1r2   attnr   mlp)rR   rS   	inner_dimrV   r$   r%   r7     s
   

zCodeGenBlock.__init__NFrz   r{   rr   r|   rs   r}   r~   r   .c              	   C   sx   |}|  |}| j|||||||d}	|	d }
|	dd  }| |}|
| | }|r1|f| }|S |f|dd   }|S )Nrz   r{   rr   r|   rs   r}   r~   r   r   )r   r   r   )rR   rz   r{   rr   r|   rs   r}   r~   residualattn_outputsrx   r   feed_forward_hidden_statesr$   r$   r%   r     s(   

	

zCodeGenBlock.forwardr   )r   r   r   r7   r   r   r   r   r   r   r<   r   r   r   r$   r$   rV   r%   r     s2    
(	r   c                       s>   e Zd ZdZeZdZdZdgZdZ	 fddZ
dd	 Z  ZS )
CodeGenPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    transformerTr   past_key_valuesc                    s   t  j|i | d S r   )r6   r7   )rR   inputskwargsrV   r$   r%   r7   C  s   zCodeGenPreTrainedModel.__init__c                 C   s   t |tjfr!|jjjd| jjd |jdur|jj	  dS dS t |tj
rD|jjjd| jjd |jdurB|jj|j 	  dS dS t |tjrY|jj	  |jjd dS dS )zInitialize the weights.g        )meanstdNr   )
isinstancer   rM   weightdatanormal_rS   initializer_ranger5   zero_	Embeddingpadding_idxr   fill_)rR   moduler$   r$   r%   _init_weightsF  s   

z$CodeGenPreTrainedModel._init_weights)r   r   r   __doc__r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr7   r   r   r$   r$   rV   r%   r   7  s    r   aJ  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a:
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoProcenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zaThe bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZdd Zdd Zeede	e
eed																						dd
eej deeeej   deej deej deej deej deej dee dee dee dee deeef fddZ  ZS )CodeGenModelc                    s   t     j| _ j| _t j| j| _t j	| _
t fddt jD | _tj| j jd| _t j j j | _d| _|   d S )Nc                    s   g | ]}t  qS r$   )r   ).0_rS   r$   r%   
<listcomp>  s    z)CodeGenModel.__init__.<locals>.<listcomp>r   F)r6   r7   r   rD   
vocab_sizer   r   wter>   
embd_pdropdrop
ModuleListrangen_layerhr   r   ln_frk   rP   n_ctxrE   gradient_checkpointing	post_initrR   rS   rV   r   r%   r7     s    zCodeGenModel.__init__c                 C      | j S r   r   rR   r$   r$   r%   get_input_embeddings     z!CodeGenModel.get_input_embeddingsc                 C   
   || _ d S r   r   rR   new_embeddingsr$   r$   r%   set_input_embeddings     
z!CodeGenModel.set_input_embeddingsbatch_size, sequence_length
checkpointoutput_typer   N	input_idsr   rr   token_type_idsr|   rs   inputs_embedsr}   r~   output_hidden_statesreturn_dictr   c                 C   sn  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d ur4|d ur4td|d urP| || | }|d|d }|j	d }n|d urb| d d }|j	d }ntd|d urm|j
n|j
}|d ur||d|d }|d u rd}td gt| j }n	|d d d}|d u rtj||d | tj|d}|d}|d ur|dkrtd||d}|d d d d d d f }|j| jd}d	| t| jj }| || j j}|d u r| |}|}|d ur| |}|| }| |}||df }| jr| jr|rtd
 d}|r!dnd }|	r(dnd }|
r/dnd }tt| j|D ]S\}\}}|
rG||f }| jr_| jr_|  |j!|d |||| ||	}n||||||| ||	d}|d }|du r|||d f }|	r|||rdnd f }q9| "|}||}|
r||f }|stdd ||||fD S t#||||dS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer(   r   z5You have to specify either input_ids or inputs_embedsr)   )r   rl   z$batch_size has to be defined and > 0r   r   zh`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting `use_cache=False`...Fr$   r   Tr   r   c                 s   s    | ]	}|d ur|V  qd S r   r$   )r   vr$   r$   r%   	<genexpr>A  s    z'CodeGenModel.forward.<locals>.<genexpr>)last_hidden_stater   rz   
attentions)$rS   r~   r   r}   use_return_dictrG   %warn_if_padding_and_no_attention_maskrd   r=   rY   rl   tuplera   r   r   r   long	unsqueezerJ   r   rj   rk   get_head_maskr   r   r   r   trainingloggerwarning_once	enumeratezip_gradient_checkpointing_func__call__r   r	   )rR   r   r   rr   r   r|   rs   r   r}   r~   r   r   input_shape
batch_sizerl   past_lengthrz   token_type_embedsoutput_shapepresentsall_self_attentionsall_hidden_statesiblockr{   r   r$   r$   r%   r     s   









zCodeGenModel.forward)NNNNNNNNNNN)r   r   r   r7   r   r   r   CODEGEN_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr	   _CONFIG_FOR_DOCr   r   r   r   r   r   r<   r   r   r   r$   r$   rV   r%   r     s\    	

r   zM
    The CodeGen Model transformer with a language modeling head on top.
    c                       s0  e Zd ZdgZ fddZdd Zdd Zdd	d
Zee	
deeeed												d deej deeeej   deej deej deej deej deej deej dee dee dee dee deeef fddZedeeej  dejdeeej  fddZ  ZS )!CodeGenForCausalLMzlm_head.weightc                    s4   t  | t|| _t|j|j| _| 	  d S r   )
r6   r7   r   r   r   rM   r   r   lm_headr   r   rV   r$   r%   r7   T  s   
zCodeGenForCausalLM.__init__c                 C   r   r   r	  r   r$   r$   r%   get_output_embeddings\  r   z(CodeGenForCausalLM.get_output_embeddingsc                 C   r   r   r
  r   r$   r$   r%   set_output_embeddings_  r   z(CodeGenForCausalLM.set_output_embeddingsNc           	      K   s   | dd }|r>|d d jd }|jd |kr|}n|jd d }|d d |d f }|d ur>|d d |jd  d f }| dd }| dd }|d urs|d u rs| dd }||dkd |rs|d d |jd  d f }||| d|||d	S )
Nr   r   r   r   rr   r|   r(   r}   )r   r   r}   r|   rr   r   )getrY   r   cumsummasked_fill_)	rR   r   r   r   r   r   remove_prefix_lengthrr   r|   r$   r$   r%   prepare_inputs_for_generationb  s.   z0CodeGenForCausalLM.prepare_inputs_for_generationr   r   r   r   rr   r   r|   rs   r   labelsr}   r~   r   r   r   c                 C   s  |dur|n| j j}| j||||||||	|
||d}|d }| |tj}d}|durb||j}|dddddf  }|dddf  }t	 }||
d|d|
d}||j}|sx|f|dd  }|durv|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r   rr   r   r|   rs   r   r}   r~   r   r   r   .r(   r   )losslogitsr   rz   r   )rS   r   r   r	  rJ   r   rI   rl   rc   r   r=   rd   r   r
   r   rz   r   )rR   r   r   rr   r   r|   rs   r   r  r}   r~   r   r   transformer_outputsrz   	lm_logitsr  shift_logitsshift_labelsloss_fctoutputr$   r$   r%   r     sD   zCodeGenForCausalLM.forwardbeam_idxc                    s   t  fdd| D S )a  
        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        c                 3   s&    | ]}t  fd d|D V  qdS )c                 3   s$    | ]}| d  |jV  qdS )r   N)index_selectrJ   rl   )r   
past_stater  r$   r%   r     s   " z>CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>.<genexpr>Nr   )r   r{   r  r$   r%   r     s
    
z4CodeGenForCausalLM._reorder_cache.<locals>.<genexpr>r  )r   r  r$   r  r%   _reorder_cache  s   	z!CodeGenForCausalLM._reorder_cacher   )NNNNNNNNNNNN)r   r   r   _tied_weights_keysr7   r  r  r  r   r  r  r   r  r
   r  r   r   r   r   r   r   r<   r   r   staticmethodr   r   r$   r$   rV   r%   r  K  sv    
$	

Dr  ),r   typingr   r   r   r   torch.utils.checkpointr   torch.nnr   activationsr   modeling_outputsr	   r
   modeling_utilsr   utilsr   r   r   r   configuration_codegenr   
get_loggerr   r   r  r  %CODEGEN_PRETRAINED_MODEL_ARCHIVE_LISTintr   r&   r.   r1   Moduler2   r   r   r   CODEGEN_START_DOCSTRINGr  r   r  r$   r$   r$   r%   <module>   sH   
" -+ 2 4