"""PyTorch MPT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_mpt import MptConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "mosaicml/mpt-7b"
_CONFIG_FOR_DOC = "MptConfig"

MPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "mosaicml/mpt-7b",
    "mosaicml/mpt-7b-storywriter",
    "mosaicml/mpt-7b-instruct",
    "mosaicml/mpt-7b-8k",
    "mosaicml/mpt-7b-8k-instruct",
    "mosaicml/mpt-7b-8k-chat",
    "mosaicml/mpt-30b",
    "mosaicml/mpt-30b-instruct",
    "mosaicml/mpt-30b-chat",
]


def build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max=8, device=None):
    r"""
    Link to paper: https://arxiv.org/abs/2108.12409 - the alibi tensor is not causal as the original paper mentions; it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of the MPT source code, which leads to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    """
    alibi = torch.arange(1 - sequence_length, 1, dtype=torch.int32, device=device).view(1, 1, 1, sequence_length)
    num_heads_power_of_2 = 2 ** math.ceil(math.log2(num_heads))

    base = torch.arange(1, num_heads_power_of_2 + 1, dtype=torch.float32, device=device)
    base = base * (alibi_bias_max / num_heads_power_of_2)

    slopes = 1.0 / torch.pow(2, base)
    slopes = slopes.view(1, num_heads_power_of_2, 1, 1)

    if num_heads_power_of_2 != num_heads:
        # When the head count is not a power of two, interleave the slopes so that the
        # first `num_heads` entries match the reference MPT slopes.
        slopes = torch.concat([slopes[:, 1::2, ...], slopes[:, ::2, ...]], dim=1)[:, :num_heads, ...]

    alibi = alibi * slopes
    return alibi.squeeze(0)
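

# A minimal, illustrative sketch of the tensor this helper returns (values worked out
# by hand from the math above, not quoted from the MPT reference): with `num_heads=4`
# and `sequence_length=5`, head 0 gets slope 1 / 2 ** (8 / 4) = 0.25, and the bias is
# broadcast over the query dimension and added to the raw attention scores:
#
#     alibi = build_mpt_alibi_tensor(num_heads=4, sequence_length=5)
#     alibi.shape    # torch.Size([4, 1, 5])
#     alibi[0, 0]    # tensor([-1.0000, -0.7500, -0.5000, -0.2500, 0.0000])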


class MptAttention(nn.Module):
    """Multi-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    """

    def __init__(self, config: MptConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.n_heads = config.n_heads
        self.max_seq_length = config.max_seq_len
        self.head_dim = self.hidden_size // self.n_heads
        self.softmax_scale = config.attn_config.softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.hidden_size / self.n_heads)

        self.attn_dropout_p = config.attn_config.attn_pdrop
        self.Wqkv = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False)
        self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        mixed_qkv = self.Wqkv(hidden_states)
        query_states, key_states, value_states = mixed_qkv.chunk(3, dim=2)
        query_states = query_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            if len(past_key_value) != 0:
                key_states = torch.cat([past_key_value[0], key_states], dim=2)
                value_states = torch.cat([past_key_value[1], value_states], dim=2)
            past_key_value = (key_states, value_states)
        else:
            past_key_value = (key_states, value_states)

        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * self.softmax_scale

        query_length = seq_length if past_key_value is None else seq_length + past_key_value[0].shape[2]

        if position_bias is not None:
            if len(position_bias.shape) != 3:
                raise ValueError(f"Expecting position_bias shape to be 3 dimensions, got {len(position_bias.shape)}")
            key_length = key_states.shape[-2]

            position_bias_query_index = max(0, position_bias.size(1) - query_length)
            position_bias_key_index = max(0, position_bias.size(2) - key_length)

            position_bias = position_bias[:, position_bias_query_index:, position_bias_key_index:]

            attention_scores = attention_scores + position_bias

        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, torch.finfo(query_states.dtype).min)

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).to(value_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attn_dropout_p, training=self.training)

        context_states = torch.matmul(attn_weights, value_states)
        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
        attn_output = self.out_proj(context_states)

        return attn_output, attn_weights, past_key_value


class MptMLP(nn.Module):
    def __init__(self, config: MptConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.up_proj = nn.Linear(hidden_size, 4 * hidden_size, bias=False)
        self.act = nn.GELU(approximate="none")
        self.down_proj = nn.Linear(4 * hidden_size, hidden_size, bias=False)
        self.hidden_dropout = config.attn_config.attn_pdrop

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        hidden_states = self.act(self.up_proj(hidden_states))

        intermediate_output = self.down_proj(hidden_states)

        output = F.dropout(intermediate_output, p=self.hidden_dropout, training=self.training)
        output = output + residual

        return output


class MptBlock(nn.Module):
    def __init__(self, config: MptConfig):
        super().__init__()
        hidden_size = config.hidden_size

        self.norm_1 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_1.bias = None

        self.num_heads = config.n_heads
        self.attn = MptAttention(config)

        self.norm_2 = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_2.bias = None

        self.ffn = MptMLP(config)

        self.dropout_rate = config.attn_config.attn_pdrop
        self.resid_attn_dropout = nn.Dropout(self.dropout_rate)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_bias: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.norm_1(hidden_states)

        residual = hidden_states

        # Self attention.
        attn_outputs, attn_weights, past_key_value = self.attn(
            layernorm_output,
            position_bias=position_bias,
            attention_mask=attention_mask,
            past_key_value=layer_past,
        )

        hidden_states = self.resid_attn_dropout(attn_outputs) + residual

        layernorm_output = self.norm_2(hidden_states)

        # Get residual
        residual = hidden_states

        # MLP.
        output = self.ffn(layernorm_output, residual)
        outputs = (output,)

        if use_cache:
            outputs += (past_key_value,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # hidden_states, present, attentions


class MptPreTrainedModel(PreTrainedModel):
    config_class = MptConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MptBlock"]
    _keys_to_ignore_on_load_missing = [r"lm_head.*."]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, LayerNorm):
            if module.bias is not None:
                module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    @staticmethod
    def _convert_to_mpt_cache(
        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        """
        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
        batch_size_times_num_heads = batch_size * num_heads
        # key:   [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
        # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].reshape(batch_size_times_num_heads, head_dim, seq_length),
                layer_past[1].reshape(batch_size_times_num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )
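

# A hedged shape sketch for `_convert_to_mpt_cache` above; the concrete sizes are
# invented for illustration and follow the key/value layouts documented in the method:
#
#     key = torch.zeros(2, 8, 64, 16)    # [batch_size, num_heads, head_dim, kv_length]
#     value = torch.zeros(2, 8, 16, 64)  # [batch_size, num_heads, kv_length, head_dim]
#     ((key2, value2),) = MptPreTrainedModel._convert_to_mpt_cache(((key, value),))
#     key2.shape    # torch.Size([16, 64, 16])
#     value2.shape  # torch.Size([16, 16, 64])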
MPT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`MptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

MPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.

            Each element of `past_key_values` is a tuple (past_key, past_value):
            - past_key: [batch_size * num_heads, head_dim, kv_length]
            - past_value: [batch_size * num_heads, kv_length, head_dim]
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.",
    MPT_START_DOCSTRING,
)
class MptModel(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_heads

        # Embedding + LN Embedding
        self.wte = nn.Embedding(config.vocab_size, self.hidden_size)

        # Transformer blocks
        self.blocks = nn.ModuleList([MptBlock(config) for _ in range(config.n_layers)])

        # Final Layer Norm
        self.norm_f = LayerNorm(self.hidden_size, eps=config.layer_norm_epsilon)
        # backward compatibility with weights on the Hub
        self.norm_f.bias = None

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def build_mpt_alibi_tensor(self, num_heads, sequence_length, alibi_bias_max=8, device=None):
        return build_mpt_alibi_tensor(num_heads, sequence_length, alibi_bias_max, device)

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.blocks))

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        hidden_states = inputs_embeds

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Compute alibi tensor: check build_mpt_alibi_tensor documentation
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = self.build_mpt_alibi_tensor(self.num_heads, self.config.max_seq_len, device=hidden_states.device)

        causal_mask = _prepare_4d_causal_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )
        causal_mask = causal_mask.bool()

        for block, layer_past in zip(self.blocks, past_key_values):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    alibi,
                    causal_mask,
                    layer_past,
                    use_cache,
                    output_attentions,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=causal_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    position_bias=alibi,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        # Add last hidden state
        hidden_states = self.norm_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
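

# A hedged usage sketch for the bare `MptModel`; the checkpoint name comes from
# `_CHECKPOINT_FOR_DOC` above, while the prompt text is an arbitrary illustration:
#
#     from transformers import AutoTokenizer, MptModel
#
#     tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
#     model = MptModel.from_pretrained("mosaicml/mpt-7b")
#     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#     outputs = model(**inputs)
#     outputs.last_hidden_state.shape  # (batch_size, seq_length, hidden_size)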


@add_start_docstrings(
    """
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    MPT_START_DOCSTRING,
)
class MptForCausalLM(MptPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: torch.Tensor):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs,
    ) -> dict:
        # only keep the tokens that do not have their past computed yet
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # if `inputs_embeds` are passed, we only want to use them in the first generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            batch_size, seq_length, vocab_size = shift_logits.shape
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def _reorder_cache(
        self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        """
        # Get a copy of `beam_idx` on all the devices where we need those indices.
        device_to_beam_idx = {
            past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past
        }
        reordered_past = tuple(
            (
                layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]),
                layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]),
            )
            for layer_past in past
        )
        return reordered_past


@add_start_docstrings(
    """
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    MPT_START_DOCSTRING,
)
class MptForSequenceClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = MptModel(config)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    MPT_START_DOCSTRING,
)
class MptForTokenClassification(MptPreTrainedModel):
    def __init__(self, config: MptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = MptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
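

# A hedged sketch of the token-classification loss computed above; the shapes are
# illustrative assumptions:
#
#     logits: (batch_size, seq_length, num_labels)
#     labels: (batch_size, seq_length)
#     loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))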


@add_start_docstrings(
    """
    The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MPT_START_DOCSTRING,
)
class MptForQuestionAnswering(MptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = MptModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
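

# A hedged usage sketch for `MptForQuestionAnswering`; the argmax span decoding shown
# is a common convention, not part of this module, and the texts are invented:
#
#     from transformers import AutoTokenizer, MptForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
#     model = MptForQuestionAnswering.from_pretrained("mosaicml/mpt-7b")
#     inputs = tokenizer("Who trained MPT?", "MosaicML trained the MPT models.", return_tensors="pt")
#     outputs = model(**inputs)
#     start = outputs.start_logits.argmax(-1)  # most likely span start index
#     end = outputs.end_logits.argmax(-1)      # most likely span end index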