""" PyTorch BioGPT model."""
import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_biogpt import BioGptConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"

BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/biogpt",
    "microsoft/BioGPT-Large",
]


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
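
    Example (illustrative sketch only; this class is internal to the modeling file and the sizes below are
    arbitrary assumptions, not values taken from any released checkpoint):

    ```python
    >>> import torch

    >>> emb = BioGptLearnedPositionalEmbedding(num_embeddings=1024, embedding_dim=64)
    >>> attention_mask = torch.ones(1, 5, dtype=torch.long)
    >>> positions = emb(attention_mask)  # position ids are derived from the mask and shifted by the internal offset
    >>> positions.shape
    torch.Size([1, 5, 64])
    ```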
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 and adjusts num_embeddings accordingly
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        attention_mask = attention_mask.long()

        # create positions depending on attention_mask
        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

        # cut positions if `past_key_values_length` is > 0
        positions = positions[:, past_key_values_length:]

        return super().forward(positions + self.offset)

class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj, reusing the cache where possible
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # reshape twice so that attn_weights keeps its gradient and can be returned
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class BioGptDecoderLayer(nn.Module):
    def __init__(self, config: BioGptConfig):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = BioGptAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
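
        Example (illustrative sketch only; the import path, config defaults, and tensor sizes are assumptions
        used purely for demonstration):

        ```python
        >>> import torch
        >>> from transformers import BioGptConfig
        >>> from transformers.models.biogpt.modeling_biogpt import BioGptDecoderLayer

        >>> layer = BioGptDecoderLayer(BioGptConfig())
        >>> hidden_states = torch.randn(1, 5, layer.embed_dim)  # (batch, seq_len, embed_dim)
        >>> hidden_states, present_key_value = layer(hidden_states, use_cache=True)
        ```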
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention: the decoder's uni-directional self-attention cached key/values
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class BioGptPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


BIOGPT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BIOGPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape
            `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you
            can choose to directly pass an embedded representation. This is useful if you want more control over how to
            convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
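
    Example (illustrative sketch of how these inputs are typically produced; it relies only on the
    `microsoft/biogpt` checkpoint already referenced in this file, and the prompt text is an arbitrary assumption):

    ```python
    >>> from transformers import AutoTokenizer, BioGptModel

    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
    >>> model = BioGptModel.from_pretrained("microsoft/biogpt")

    >>> inputs = tokenizer("COVID-19 is", return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
    ```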
"""


@add_start_docstrings(
    "The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
    BIOGPT_START_DOCSTRING,
)
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, self.embed_dim, self.padding_idx)
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # length of the cached past key/value states
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input) * self.embed_scale

        if attention_mask is None:
            attention_mask = torch.ones(
                (inputs_embeds.shape[0], inputs_embeds.shape[1] + past_key_values_length),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        elif attention_mask.shape[1] != past_key_values_length + input_shape[1]:
            raise ValueError(
                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                f"{past_key_values_length + input_shape[1]} (sum of the lengths of current and past inputs)"
            )

        # embed positions
        positions = self.embed_positions(attention_mask, past_key_values_length)

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
)
class BioGptForCausalLM(BioGptPreTrainedModel):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
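
        Example (illustrative sketch; the prompt text and generation settings are arbitrary assumptions):

        ```python
        >>> from transformers import AutoTokenizer, BioGptForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

        >>> inputs = tokenizer("COVID-19 is", return_tensors="pt")
        >>> outputs = model(**inputs, labels=inputs["input_ids"])  # labels are shifted inside the model
        >>> loss, logits = outputs.loss, outputs.logits

        >>> generated = model.generate(**inputs, max_new_tokens=20)
        >>> text = tokenizer.decode(generated[0], skip_special_tokens=True)
        ```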
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, attention_mask, inputs_embeds=None, past_key_values=None, **kwargs
    ):
        # only keep the tokens that do not already have cached key/value states
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "attention_mask": attention_mask,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
            }
        )

        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    """
    BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
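
        Example (illustrative sketch; the two-label setup and input text are arbitrary assumptions, and a real
        NER head would be fine-tuned before use):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, BioGptForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForTokenClassification.from_pretrained("microsoft/biogpt", num_labels=2)

        >>> inputs = tokenizer("Aspirin inhibits platelet aggregation.", return_tensors="pt")
        >>> labels = torch.zeros_like(inputs["input_ids"])
        >>> outputs = model(**inputs, labels=labels)
        >>> loss, logits = outputs.loss, outputs.logits  # logits: (batch_size, sequence_length, num_labels)
        ```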
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
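
    Example (illustrative sketch; the two-label setup and input text are arbitrary assumptions, and a real
    classifier would be fine-tuned before use):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, BioGptForSequenceClassification

    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
    >>> model = BioGptForSequenceClassification.from_pretrained("microsoft/biogpt", num_labels=2)

    >>> inputs = tokenizer("The patient responded well to treatment.", return_tensors="pt")
    >>> outputs = model(**inputs, labels=torch.tensor([1]))
    >>> loss, logits = outputs.loss, outputs.logits  # logits come from the last non-padding token of each row
    ```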
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_length = -1
        else:
            if input_ids is not None:
                sequence_length = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
                sequence_length = sequence_length.to(logits.device)
            else:
                sequence_length = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_length]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value