o
    h                     @   s  d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z  e!e"Z#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+dZ,dZ-g dZ.G dd de	j/Z0G dd dej1j2Z3G dd de4Z5d d! Z6G d"d# d#ej1j2Z7G d$d% d%e	j/Z8G d&d' d'e	j/Z9G d(d) d)e	j/Z:G d*d+ d+e	j/Z;G d,d- d-e	j/Z<G d.d/ d/e	j/Z=G d0d1 d1e	j/Z>G d2d3 d3e	j/Z?d4d5 Z@ejAjBd6d7 ZCejAjBd8d9 ZDejAjBd:d; ZEG d<d= d=e	j/ZFG d>d? d?e	j/ZGG d@dA dAeZHdBZIdCZJedDeIG dEdF dFeHZKedGeIG dHdI dIeHZLG dJdK dKe	j/ZMG dLdM dMe	j/ZNG dNdO dOe	j/ZOedPeIG dQdR dReHZPedSeIG dTdU dUeHZQedVeIG dWdX dXeHZRdS )Yz PyTorch DeBERTa model.    )Sequence)OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)softmax_backward_data)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   microsoft/deberta-basez!lsanochkin/deberta-large-feedbackz' Paris'z0.54z#Palak/microsoft_deberta-large_squadz' a nice puppet'gQ?      )r   zmicrosoft/deberta-largezmicrosoft/deberta-xlargezmicrosoft/deberta-base-mnlizmicrosoft/deberta-large-mnlizmicrosoft/deberta-xlarge-mnlic                       s0   e Zd Z fddZdd Zedd Z  ZS )ContextPoolerc                    s2   t    t|j|j| _t|j| _|| _	d S N)
super__init__r   Linearpooler_hidden_sizedenseStableDropoutpooler_dropoutdropoutconfigselfr&   	__class__ b/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.pyr   C   s   

zContextPooler.__init__c                 C   s8   |d d df }|  |}| |}t| jj |}|S Nr   )r%   r"   r   r&   pooler_hidden_act)r(   hidden_statescontext_tokenpooled_outputr+   r+   r,   forwardI   s
   

zContextPooler.forwardc                 C      | j jS r   )r&   hidden_sizer(   r+   r+   r,   
output_dimS   s   zContextPooler.output_dim)__name__
__module____qualname__r   r2   propertyr6   __classcell__r+   r+   r)   r,   r   B   s
    
r   c                   @   s4   e Zd ZdZedd Zedd Zedd ZdS )	XSoftmaxa  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    ```python
    >>> import torch
    >>> from transformers.models.deberta.modeling_deberta import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    ```c                 C   sX   || _ |tj }||tt|jj}t	|| j }|
|d | | |S r-   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)r(   inputmaskr=   rmaskoutputr+   r+   r,   r2   t   s   
zXSoftmax.forwardc                 C   s$   | j \}t| ||| j|}|d d fS r   )saved_tensorsr   r=   )r(   grad_outputrL   	inputGradr+   r+   r,   backward   s   
zXSoftmax.backwardc           
      C   s   dd l m  m} ddlm}m} | jd||jd d}| jd| d| jdtj	dtj
d	d
||jd d}|| ||| jdt	t|  jd
}	|| |	|}	|| |	|| jdtj	dtjd	d
S )Nr   )rA   rF   CastLong)to_iSubConstantr   rD   )value_tBool)torch.onnx.symbolic_helperonnxsymbolic_helpertorch.onnx.symbolic_opset9rA   rF   opcast_pytorch_to_onnxr?   rB   int64rC   typerD   rE   r@   )
gr(   rJ   r=   sym_helprA   rF   mask_cast_valuer_maskrL   r+   r+   r,   symbolic   s   "($zXSoftmax.symbolicN)r7   r8   r9   __doc__staticmethodr2   rP   re   r+   r+   r+   r,   r<   X   s    


r<   c                   @   s   e Zd Zdd ZdS )DropoutContextc                 C   s   d| _ d | _d| _d| _d S )Nr   r   T)r%   rJ   scale
reuse_maskr5   r+   r+   r,   r      s   
zDropoutContext.__init__N)r7   r8   r9   r   r+   r+   r+   r,   rh      s    rh   c                 C   s   t |ts
|}d }n|j}||j9 }|jr|jnd }|dkr2|d u r2dt| d|  	tj
}t |tr?|jd u r?||_||fS )Nr   r   )
isinstancerh   r%   ri   rj   rJ   r?   
empty_like
bernoulli_r>   r@   )rI   local_contextr%   rJ   r+   r+   r,   get_mask   s   

 

ro   c                	   @   sZ   e Zd ZdZedd Zedd Zedejj	dejj
deeef d	ejj
fd
dZdS )XDropoutzlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                 C   sD   t ||\}}dd|  | _|dkr | | ||d| j S |S )Ng      ?r   r   )ro   ri   rH   rA   )ctxrI   	local_ctxrJ   r%   r+   r+   r,   r2      s   
zXDropout.forwardc                 C   s0   | j dkr| j\}||d| j  d fS |d fS )Nr   r   )ri   rM   rA   )rq   rN   rJ   r+   r+   r,   rP      s   
zXDropout.backwardra   rI   rr   returnc                 C   s4   ddl m} |}t|tr|j}d}|| |||S )Nr   )symbolic_opset12T)
torch.onnxrt   rk   rh   r%   )ra   rI   rr   rt   	dropout_ptrainr+   r+   r,   re      s   
zXDropout.symbolicN)r7   r8   r9   rf   rg   r2   rP   r?   _CGraphValuer   floatrh   re   r+   r+   r+   r,   rp      s    
	
4rp   c                       sB   e Zd ZdZ fddZdd Zdd Zdd
dZdd Z  Z	S )r#   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                    s    t    || _d| _d | _d S r-   )r   r   	drop_probcountcontext_stack)r(   r|   r)   r+   r,   r      s   

zStableDropout.__init__c                 C   s$   | j r| jdkrt||  S |S )zr
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        r   )trainingr|   rp   applyget_context)r(   xr+   r+   r,   r2      s   zStableDropout.forwardc                 C   s   d| _ d | _d S r-   )r}   r~   r5   r+   r+   r,   clear_context   s   
zStableDropout.clear_contextTr   c                 C   s2   | j d u rg | _ d| _| j D ]}||_||_qd S r-   )r~   r}   rj   ri   )r(   rj   ri   cr+   r+   r,   init_context   s   

zStableDropout.init_contextc                 C   sT   | j d ur'| jt| j kr| j t  | j | j }| j|_|  jd7  _|S | jS )Nr   )r~   r}   lenappendrh   r|   r%   )r(   rq   r+   r+   r,   r      s   
zStableDropout.get_context)Tr   )
r7   r8   r9   rf   r   r2   r   r   r   r;   r+   r+   r)   r,   r#      s    
r#   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).-q=c                    s8   t    tt|| _tt|| _|| _	d S r   )
r   r   r   	Parameterr?   onesweightzerosbiasvariance_epsilon)r(   sizeepsr)   r+   r,   r     s   

zDebertaLayerNorm.__init__c                 C   sj   |j }| }|jddd}|| djddd}|| t|| j  }||}| j| | j	 }|S )NT)keepdim   )
rD   r{   meanpowr?   sqrtr   r>   r   r   )r(   r/   
input_typer   varianceyr+   r+   r,   r2     s   
zDebertaLayerNorm.forward)r   r7   r8   r9   rf   r   r2   r;   r+   r+   r)   r,   r   
  s    r   c                       $   e Zd Z fddZdd Z  ZS )DebertaSelfOutputc                    s<   t    t|j|j| _t|j|j| _t	|j
| _d S r   )r   r   r   r    r4   r"   r   layer_norm_eps	LayerNormr#   hidden_dropout_probr%   r'   r)   r+   r,   r     s   
zDebertaSelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   r"   r%   r   r(   r/   input_tensorr+   r+   r,   r2   %     

zDebertaSelfOutput.forwardr7   r8   r9   r   r2   r;   r+   r+   r)   r,   r     s    r   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaAttentionc                    s(   t    t|| _t|| _|| _d S r   )r   r   DisentangledSelfAttentionr(   r   rL   r&   r'   r)   r+   r,   r   -  s   



zDebertaAttention.__init__FNc           
      C   sJ   | j ||||||d}|r|\}}|d u r|}| ||}	|r#|	|fS |	S )N)query_statesrelative_posrel_embeddings)r(   rL   )
r(   r/   attention_maskoutput_attentionsr   r   r   self_output
att_matrixattention_outputr+   r+   r,   r2   3  s    	zDebertaAttention.forwardFNNNr   r+   r+   r)   r,   r   ,      
r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )DebertaIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   r    r4   intermediate_sizer"   rk   
hidden_actstrr   intermediate_act_fnr'   r)   r+   r,   r   R  s
   
zDebertaIntermediate.__init__r/   rs   c                 C      |  |}| |}|S r   )r"   r   r(   r/   r+   r+   r,   r2   Z     

zDebertaIntermediate.forward)r7   r8   r9   r   r?   Tensorr2   r;   r+   r+   r)   r,   r   Q  s    r   c                       r   )DebertaOutputc                    sB   t    t|j|j| _t|j|j| _	t
|j| _|| _d S r   )r   r   r   r    r   r4   r"   r   r   r   r#   r   r%   r&   r'   r)   r+   r,   r   a  s
   

zDebertaOutput.__init__c                 C   r   r   r   r   r+   r+   r,   r2   h  r   zDebertaOutput.forwardr   r+   r+   r)   r,   r   `  s    r   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaLayerc                    s,   t    t|| _t|| _t|| _d S r   )r   r   r   	attentionr   intermediater   rL   r'   r)   r+   r,   r   p  s   


zDebertaLayer.__init__NFc                 C   sH   | j ||||||d}|r|\}}| |}	| |	|}
|r"|
|fS |
S )Nr   r   r   r   )r   r   rL   )r(   r/   r   r   r   r   r   r   r   intermediate_outputlayer_outputr+   r+   r,   r2   v  s   	
zDebertaLayer.forward)NNNFr   r+   r+   r)   r,   r   o  r   r   c                       sN   e Zd ZdZ fddZdd Zdd Zdd	d
Z					dddZ  Z	S )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    s~   t    t fddt jD | _t dd| _| jr:t dd| _	| j	dk r/ j
| _	t| j	d  j| _d| _d S )	Nc                    s   g | ]}t  qS r+   )r   .0_r&   r+   r,   
<listcomp>  s    z+DebertaEncoder.__init__.<locals>.<listcomp>relative_attentionFmax_relative_positionsr   r   r   )r   r   r   
ModuleListrangenum_hidden_layerslayergetattrr   r   max_position_embeddings	Embeddingr4   r   gradient_checkpointingr'   r)   r   r,   r     s   
 

zDebertaEncoder.__init__c                 C   s   | j r	| jj}|S d }|S r   )r   r   r   )r(   r   r+   r+   r,   get_rel_embedding  s   z DebertaEncoder.get_rel_embeddingc                 C   sN   |  dkr|dd}||dd }|S |  dkr%|d}|S )Nr   r   r   r
   )r=   	unsqueezesqueeze)r(   r   extended_attention_maskr+   r+   r,   get_attention_mask  s   
z!DebertaEncoder.get_attention_maskNc                 C   sB   | j r|d u r|d ur|dn|d}t||d|j}|S )Nr   )r   r   build_relative_positiondevice)r(   r/   r   r   qr+   r+   r,   get_rel_pos  s   zDebertaEncoder.get_rel_posTFc              
   C   s<  |  |}| |||}|rdnd }|rdnd }	t|tr"|d }
n|}
|  }t| jD ]T\}}|r8||f }| jrK| jrK| 	|j
|
|||||}n
||
|||||d}|r[|\}}|d urx|}t|trw|d t| jk ru||d  nd }
n|}
|r|	|f }	q-|r||f }|stdd |||	fD S t|||	dS )Nr+   r   )r   r   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r+   )r   vr+   r+   r,   	<genexpr>  s    z)DebertaEncoder.forward.<locals>.<genexpr>last_hidden_stater/   
attentions)r   r   rk   r   r   	enumerater   r   r   _gradient_checkpointing_func__call__r   tupler   )r(   r/   r   output_hidden_statesr   r   r   return_dictall_hidden_statesall_attentionsnext_kvr   ilayer_moduleatt_mr+   r+   r,   r2     s^   





	
"

zDebertaEncoder.forward)NN)TFNNT)
r7   r8   r9   rf   r   r   r   r   r2   r;   r+   r+   r)   r,   r     s    
	
r   c                 C   sj   t j| t j|d}t j|t j|d}|dddf |dd| d }|d| ddf }|d}|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    rD   r   Nr   r   r   )r?   arangelongviewrepeatr   )
query_sizekey_sizer   q_idsk_idsrel_pos_idsr+   r+   r,   r     s   $
r   c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r   expandr   )c2p_posquery_layerr   r+   r+   r,   c2p_dynamic_expand     *r   c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r   )r   r   	key_layerr+   r+   r,   p2c_dynamic_expand  r   r   c                 C   s*   |  | d d | d|df S )Nr   r   r   )	pos_indexp2c_attr   r+   r+   r,   pos_dynamic_expand  r   r  c                       sB   e Zd ZdZ fddZdd Z				ddd	Zd
d Z  ZS )r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                    s  t    |j|j dkrtd|j d|j d|j| _t|j|j | _| j| j | _tj	|j| jd dd| _
ttj| jtjd| _ttj| jtjd| _|jd ur]|jng | _t|d	d| _t|d
d| _| jrtj	|j|jdd| _tj	|j|jdd| _| jrt|dd| _| jdk r|j| _t|j| _d| jv rtj	|j| jdd| _d| jv rt	|j| j| _t|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   Fr   rV   r   talking_headr   r   r   c2pp2c) r   r   r4   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   r    in_projr   r?   r   r{   q_biasv_biaspos_att_typer   r   r  head_logits_projhead_weights_projr   r   r#   r   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probr%   r'   r)   r+   r,   r   )  s:   




z"DisentangledSelfAttention.__init__c                 C   s4   |  d d | jdf }||}|ddddS )Nr   r   r   r   r
   )r   r  r   permute)r(   r   new_x_shaper+   r+   r,   transpose_for_scoresL  s   
z.DisentangledSelfAttention.transpose_for_scoresFNc                    sR  |du r  }|jddd\}}	}
nRdd j jjjd ddfdd	tdD dgd d d |jd jd
} fdd	tddD \}}fdd	|||fD \}}	}
|jddddf  }|
j	ddddf  }
d}dt
j }ttj|dtjd
| }||j|jd
 }t||	dd}jrĈ|}||	|||}|dur|| }jr|dddddddd}t||d}|}jr|dddddddd}t||
}|dddd }| dd d }||}|r'||fS |S )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, optional):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, optional):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr
   r   r=   c                 S   s0   |d urt ||  |  S t ||  S r   )r?   matmult)wbr   r+   r+   r,   linear|  s   z1DisentangledSelfAttention.forward.<locals>.linearr   c                    s0   g | ] t j fd dtjD ddqS )c                    s   g | ]
}|d     qS )r
   r+   r   r   )kwsr+   r,   r     s    z@DisentangledSelfAttention.forward.<locals>.<listcomp>.<listcomp>r   r  )r?   catr   r  )r   )r(   r"  )r!  r,   r     s   0 z5DisentangledSelfAttention.forward.<locals>.<listcomp>rV   c                    s.   g | ]}| |  j | jd qS )rV   )r>   rD   r   )r/   r  qkvbqkvwr+   r,   r     s   . r   c                    s   g | ]}  |qS r+   )r  )r   r   r5   r+   r,   r     s    r   r   r   )r  r  chunkr   r  r   r>   rD   r  r  r   r  r?   r   rB   r   r{   r  	transposer   r  disentangled_att_biasr  r  r  r<   r   r%   r  
contiguousr   )r(   r/   r   r   r   r   r   qpr   r   value_layerr   r!  r   rel_attscale_factorri   attention_scoresattention_probscontext_layernew_context_layer_shaper+   )r/   r  r$  r%  r(   r"  r,   r2   Q  sF   &

"" 
"
"
z!DisentangledSelfAttention.forwardc                 C   sl  |d u r| d}t|| d|j}| dkr"|dd}n| dkr.|d}n| dkr=td|  tt| d| d| j}|	 
|j}|| j| | j| d d f d}d}d| jv r| |}	| |	}	t||	d	d}
t|| d|d d }tj|
d	t|||d
}
||
7 }d| jv r4| |}| |}|ttj| d	tjd|  }| d| dkrt| d| d|j}n|}t| | d|d d }t||d	dj
|jd}tj|d	t|||d
d	d}| d| dkr0|d d d d d d df d	}tj|dt|||d
}||7 }|S )Nr   r   r   r
   r      z2Relative position ids must be of dim 2 or 3 or 4. r  r   )r=   indexr  rV   )r   r   r   r=   r   r	  rE   maxr   r   r>   r  r  r  r?   r  r(  clampgatherr   r  r   rB   r{   rD   r   r  )r(   r   r   r   r   r.  r   att_spanscorepos_key_layerc2p_attr   pos_query_layerr_posp2c_posr  r   r+   r+   r,   r)    sV   





$"z/DisentangledSelfAttention.disentangled_att_biasr   )	r7   r8   r9   rf   r   r  r2   r)  r;   r+   r+   r)   r,   r     s    
#	
[r   c                       s*   e Zd ZdZ fddZdddZ  ZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    t|dd}t|d|j| _tj|j| j|d| _t|dd| _	| j	s,d | _
n	t|j| j| _
|jdkrCt|j| j| _| j|jkrTtj| j|jdd| _t|j|j| _t|j| _|| _| jd	t|jd
dd d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr  position_ids)r   r   )
persistent)r   r   r   r4   rA  r   r   
vocab_sizeword_embeddingsrC  position_embeddingsr   type_vocab_sizetoken_type_embeddingsr    
embed_projr   r   r   r#   r   r%   r&   register_bufferr?   r   r   )r(   r&   r@  r)   r+   r,   r     s$   


zDebertaEmbeddings.__init__Nc                 C   sN  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jd urI| | }nt|}|}	| j	rW|	|7 }	| j
jdkrf| |}
|	|
7 }	| j| j
jkrr| |	}	| |	}	|d ur| |	 kr| dkr|dd}|d}||	j}|	| }	| |	}	|	S )Nr   r   r   r   r3  r   )r   rD  r?   r   r   r   rG  rH  
zeros_likerC  r&   rI  rJ  rA  r4   rK  r   r=   r   r   r>   rD   r%   )r(   	input_idstoken_type_idsrD  rJ   inputs_embedsinput_shape
seq_lengthrH  
embeddingsrJ  r+   r+   r,   r2     s>   








zDebertaEmbeddings.forward)NNNNNr   r+   r+   r)   r,   r?    s    r?  c                   @   s*   e Zd ZdZeZdZdgZdZdd Z	dS )DebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertarH  Tc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rA|jjjd| jjd |jdurC|jj|j 	  dS dS dS )zInitialize the weights.g        )r   stdN)rk   r   r    r   datanormal_r&   initializer_ranger   zero_r   rB  )r(   moduler+   r+   r,   _init_weights7  s   

z$DebertaPreTrainedModel._init_weightsN)
r7   r8   r9   rf   r   config_classbase_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr\  r+   r+   r+   r,   rT  ,  s    rT  a  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.


    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZdd Zdd Zdd Zee	d	e
eeed
								ddeej deej deej deej deej dee dee dee deeef fddZ  ZS )DebertaModelc                    s8   t  | t|| _t|| _d| _|| _|   d S r-   )	r   r   r?  rS  r   encoderz_stepsr&   	post_initr'   r)   r+   r,   r     s   

zDebertaModel.__init__c                 C   r3   r   rS  rG  r5   r+   r+   r,   get_input_embeddings  s   z!DebertaModel.get_input_embeddingsc                 C   s   || j _d S r   re  r(   new_embeddingsr+   r+   r,   set_input_embeddings  s   z!DebertaModel.set_input_embeddingsc                 C   s   t d)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r(   heads_to_pruner+   r+   r,   _prune_heads  s   zDebertaModel._prune_headsbatch_size, sequence_length
checkpointoutput_typer]  NrN  r   rO  rD  rP  r   r   r   rs   c	              	      s  |d ur|n j j}|d ur|n j j}|d ur|n j j}|d ur*|d ur*td|d ur9 || | }	n|d urF| d d }	ntd|d urQ|jn|j}
|d u r_tj	|	|
d}|d u rltj
|	tj|
d} j|||||d} j||d||d}|d	 } jd	kr|d
 } fddt jD }|d } j } j|} j|}|d	d  D ]}|||d|||d}|| q|d }|s|f||rd	ndd   S t||r|jnd |jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )rN  rO  rD  rJ   rP  T)r   r   r   r   r   c                    s   g | ]} j jd  qS r&  )rb  r   r   r5   r+   r,   r     s    z(DebertaModel.forward.<locals>.<listcomp>Fr   r   r   )r&   r   r   use_return_dictr	  %warn_if_padding_and_no_attention_maskr   r   r?   r   r   r   rS  rb  rc  r   r   r   r   r   r   r/   r   )r(   rN  r   rO  rD  rP  r   r   r   rQ  r   embedding_outputencoder_outputsencoded_layersr/   layersr   r   rel_posr   sequence_outputr+   r5   r,   r2     sr   


zDebertaModel.forward)NNNNNNNN)r7   r8   r9   r   rf  ri  rl  r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r?   r   r@   r   r   r2   r;   r+   r+   r)   r,   ra    sL    
	

ra  z5DeBERTa Model with a `language modeling` head on top.c                       s   e Zd ZddgZ fddZdd Zdd Zee	d	e
eeed
eed									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    s,   t  | t|| _t|| _|   d S r   )r   r   ra  rU  DebertaOnlyMLMHeadclsrd  r'   r)   r+   r,   r     s   

zDebertaForMaskedLM.__init__c                 C   s
   | j jjS r   r  predictionsdecoderr5   r+   r+   r,   get_output_embeddings     
z(DebertaForMaskedLM.get_output_embeddingsc                 C   s   || j j_d S r   r  rg  r+   r+   r,   set_output_embeddings  s   z(DebertaForMaskedLM.set_output_embeddingsrm  z[MASK])ro  rp  r]  rJ   expected_outputexpected_lossNrN  r   rO  rD  rP  labelsr   r   r   rs   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur7t }||d| j j|d}|	sM|f|
dd  }|durK|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   rO  rD  rP  r   r   r   r   r   r   losslogitsr/   r   )
r&   rq  rU  r  r   r   rF  r   r/   r   )r(   rN  r   rO  rD  rP  r  r   r   r   outputsrx  prediction_scoresmasked_lm_lossloss_fctrL   r+   r+   r,   r2     s4   
zDebertaForMaskedLM.forward	NNNNNNNNN)r7   r8   r9   _tied_weights_keysr   r  r  r   ry  rz  r   _CHECKPOINT_FOR_MASKED_LMr   r|  _MASKED_LM_EXPECTED_OUTPUT_MASKED_LM_EXPECTED_LOSSr   r?   r   r@   r   r   r2   r;   r+   r+   r)   r,   r}    sX    	
	

r}  c                       r   )DebertaPredictionHeadTransformc                    sf   t    t|d|j| _t|j| j| _t|j	t
r#t|j	 | _n|j	| _tj| j|jd| _d S )NrA  )r   )r   r   r   r4   rA  r   r    r"   rk   r   r   r   transform_act_fnr   r   r'   r)   r+   r,   r   F  s   
z'DebertaPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )r"   r  r   r   r+   r+   r,   r2   Q  s   


z&DebertaPredictionHeadTransform.forwardr   r+   r+   r)   r,   r  E  s    r  c                       r   )DebertaLMPredictionHeadc                    s\   t    t|| _t|d|j| _tj| j|j	dd| _
tt|j	| _| j| j
_d S )NrA  Fr  )r   r   r  	transformr   r4   rA  r   r    rF  r  r   r?   r   r   r'   r)   r+   r,   r   Y  s   

z DebertaLMPredictionHead.__init__c                 C   r   r   )r  r  r   r+   r+   r,   r2   g  r   zDebertaLMPredictionHead.forwardr   r+   r+   r)   r,   r  X  s    r  c                       r   )r~  c                    s   t    t|| _d S r   )r   r   r  r  r'   r)   r+   r,   r   o  s   
zDebertaOnlyMLMHead.__init__c                 C   s   |  |}|S r   )r  )r(   rx  r  r+   r+   r,   r2   s  s   
zDebertaOnlyMLMHead.forwardr   r+   r+   r)   r,   r~  n  s    r~  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       s   e Zd Z fddZdd Zdd Zeede	e
eed																		dd
eej deej deej deej deej deej dee dee dee deeef fddZ  ZS ) DebertaForSequenceClassificationc                    s~   t  | t|dd}|| _t|| _t|| _| jj}t	
||| _t|dd }|d u r2| jjn|}t|| _|   d S )N
num_labelsr   cls_dropout)r   r   r   r  ra  rU  r   poolerr6   r   r    
classifierr&   r   r#   r%   rd  )r(   r&   r  r6   drop_outr)   r+   r,   r     s   


z)DebertaForSequenceClassification.__init__c                 C   s
   | j  S r   )rU  rf  r5   r+   r+   r,   rf    r  z5DebertaForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S r   )rU  ri  rg  r+   r+   r,   ri    s   z5DebertaForSequenceClassification.set_input_embeddingsrm  rn  NrN  r   rO  rD  rP  r  r   r   r   rs   c
              
   C   s:  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}| |}d}|dur| j jdu r| jdkrQt	 }|
d|j}|||
d}n| dks^|ddkr|dk }| }|ddkrt|d||d|d}t|d|
d}t }||
d| j |
d}n^td|}nUtd}||| d  }nC| j jdkrt	 }| jdkr|| | }n+|||}n%| j jdkrt }||
d| j|
d}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)rO  r   rD  rP  r   r   r   r   r   r   
regressionsingle_label_classificationmulti_label_classificationr  )r&   rq  rU  r  r%   r  problem_typer  r   r	   r   r>   rD   r=   r   nonzeror   r?   r7  r   r   r{   rB   
LogSoftmaxsumr   r   r   r   r/   r   )r(   rN  r   rO  rD  rP  r  r   r   r   r  encoder_layerr1   r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxrL   r+   r+   r,   r2     sh   



 


z(DebertaForSequenceClassification.forwardr  )r7   r8   r9   r   rf  ri  r   ry  rz  r   r{  r   r|  r   r?   r   r@   r   r   r2   r;   r+   r+   r)   r,   r  x  sP    	

r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       s   e Zd Z fddZeedeee	e
d									ddeej deej deej d	eej d
eej deej dee dee dee deee	f fddZ  ZS )DebertaForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   )r   r   r  ra  rU  r   Dropoutr   r%   r    r4   r  rd  r'   r)   r+   r,   r     s   
z&DebertaForTokenClassification.__init__rm  rn  NrN  r   rO  rD  rP  r  r   r   r   rs   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|
dd  }|durO|f| S |S t|||
j	|
j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r   r  )r&   rq  rU  r%   r  r   r   r  r   r/   r   )r(   rN  r   rO  rD  rP  r  r   r   r   r  rx  r  r  r  rL   r+   r+   r,   r2     s0   

z%DebertaForTokenClassification.forwardr  )r7   r8   r9   r   r   ry  rz  r   r{  r   r|  r   r?   r   r@   r   r   r2   r;   r+   r+   r)   r,   r    sL    	

r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       s   e Zd Z fddZeedeee	e
eeeed										ddeej deej deej d	eej d
eej deej deej dee dee dee deee	f fddZ  ZS )DebertaForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
r   r   r  ra  rU  r   r    r4   
qa_outputsrd  r'   r)   r+   r,   r   ?  s
   
z$DebertaForQuestionAnswering.__init__rm  )ro  rp  r]  r  r  qa_target_start_indexqa_target_end_indexNrN  r   rO  rD  rP  start_positionsend_positionsr   r   r   rs   c              
   C   sF  |
dur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|
s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r   r  )ignore_indexr   )r  start_logits
end_logitsr/   r   )r&   rq  rU  r  splitr   r*  r   r   r6  r   r   r/   r   )r(   rN  r   rO  rD  rP  r  r  r   r   r   r  rx  r  r  r  
total_lossignored_indexr  
start_lossend_lossrL   r+   r+   r,   r2   I  sN   !






z#DebertaForQuestionAnswering.forward)
NNNNNNNNNN)r7   r8   r9   r   r   ry  rz  r   _CHECKPOINT_FOR_QAr   r|  _QA_EXPECTED_OUTPUT_QA_EXPECTED_LOSS_QA_TARGET_START_INDEX_QA_TARGET_END_INDEXr   r?   r   r@   r   r   r2   r;   r+   r+   r)   r,   r  7  sZ    
	

r  )Srf   collections.abcr   typingr   r   r   r?   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   configuration_debertar   
get_loggerr7   loggerr|  r{  r  r  r  r  r  r  r  r  %DEBERTA_PRETRAINED_MODEL_ARCHIVE_LISTModuler   autogradFunctionr<   objectrh   ro   rp   r#   r   r   r   r   r   r   r   r   jitscriptr   r   r  r   r?  rT  DEBERTA_START_DOCSTRINGry  ra  r}  r  r  r~  r  r  r  r+   r+   r+   r,   <module>   s   

?'1%"e


 CL,oO
oB