o
    hi                  
   @   sv  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( e&)e*Z+dZ,dZ-dZ.dgZ/dd Z0da1e% re$ re+2d ze0  W n e3y Z4 ze+5de4  W Y dZ4[4ndZ4[4ww 	 dd Z6d\ddZ7d\ddZ8d\ddZ9d d! Z:G d"d# d#ej;j<Z=G d$d% d%ej;j<Z>G d&d' d'Z?d]d(d)Z@d*d+ ZA			d^d,d-ZBG d.d/ d/e
jCZDG d0d1 d1e
jCZEG d2d3 d3e
jCZFG d4d5 d5e
jCZGG d6d7 d7e
jCZHG d8d9 d9e
jCZIG d:d; d;e
jCZJG d<d= d=e
jCZKG d>d? d?e
jCZLG d@dA dAe
jCZMG dBdC dCe
jCZNG dDdE dEeZOdFZPdGZQe"dHePG dIdJ dJeOZRe"dKePG dLdM dMeOZSG dNdO dOe
jCZTe"dPePG dQdR dReOZUe"dSePG dTdU dUeOZVe"dVePG dWdX dXeOZWe"dYePG dZd[ d[eOZXdS )_z PyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigzuw-madison/mra-base-512-4r   AutoTokenizerc                     sL   t t jjjd d   fdd} | g d}td|ddad	d lad S )
Nkernelsmrac                    s    fdd| D S )Nc                    s   g | ]} | qS  r"   ).0file
src_folderr"   Z/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/mra/modeling_mra.py
<listcomp>C       z:load_cuda_kernels.<locals>.append_root.<locals>.<listcomp>r"   )filesr%   r"   r'   append_rootB   s   z&load_cuda_kernels.<locals>.append_root)zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verboser   )r   __file__resolveparentr
   r,   )r+   	src_filesr"   r%   r'   load_cuda_kernels>   s
   r2   zLoading custom CUDA kernels...zFailed to load CUDA kernels. Mra requires custom CUDA kernels. Please verify that compatible versions of PyTorch and CUDA Toolkit are installed: c                 C   s   t |  dkrtdt | dkrtd| ddkr#td| ddkr.td| jd	d
jdd	}| }| }| }t	||||\}}|dd	dddddddf }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr,   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatterr"   r"   r'   
sparse_max]   s   $rK   r6   c                 C   s   t |  dkrtdt | dkrtd| jd |jd kr&td| j\}}|| }tj|dtj|jd}| |||} | |dddf ||  ddf } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r4   z$mask must be a 2-dimensional tensor.r5   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r;   r<   r=   shapetorcharangelongrN   reshape)maskrE   
block_size
batch_sizeseq_len	num_block	batch_idxr"   r"   r'   sparse_masky   s   
&rZ   c           	      C   s"  |   \}}}|  \}}}|| dkrtd|| dkr"td| ||| ||dd} |||| ||dd}t|   dkrJtdt|  dkrVtdt|  d	krbtd
|  ddkrmtd| ddkrxtd|  } | }| }| }t| || S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r:   r7   r3   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r4   r5   r   r6   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r<   r=   rS   r@   r;   rA   rB   r,   mm_to_sparse)	dense_query	dense_keyrE   rU   rV   
query_sizer9   _key_sizer"   r"   r'   r]      s.   r]   c           	      C   s  |  \}}}|| dkrtd|  d|krtd|  d|kr'td|||| ||dd}t|   d	krAtd
t|  d	krMtdt|  dkrYtd| ddkrdtd|  } | }| }| }t| |||}|dd||| |}|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   r[   r4   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r:   r7   r3   ,sparse_query must be a 4-dimensional tensor.r\   r5   r6   z8The size of the third dimension of dense_key must be 32.)	r<   r=   rS   r@   r;   rA   rB   r,   sparse_dense_mm)	sparse_queryrE   r_   rF   rU   rV   rb   r9   dense_qk_prodr"   r"   r'   rd      s.   rd   c                 C   s    | | | t j| |dd  S )Nfloorrounding_mode)rP   divrR   )rE   dim_1_blockdim_2_blockr"   r"   r'   transpose_indices   s    rm   c                   @   s2   e Zd Zedd Zedd Zed	ddZdS )
MraSampledDenseMatMulc                 C   &   t ||||}| ||| || _|S N)r]   save_for_backwardrU   )ctxr^   r_   rE   rU   rD   r"   r"   r'   forward      zMraSampledDenseMatMul.forwardc                 C   sj   | j \}}}| j}|d| }|d| }t|||}t|dd|||}	t||||}
|
|	d d fS Nr   r:   r7   )saved_tensorsrU   r<   rm   rd   r@   )rr   gradr^   r_   rE   rU   rF   rG   	indices_Tgrad_key
grad_queryr"   r"   r'   backward   s   zMraSampledDenseMatMul.backwardr6   c                 C      t | |||S rp   )rn   apply)r^   r_   rE   rU   r"   r"   r'   operator_call      z#MraSampledDenseMatMul.operator_callNr6   __name__
__module____qualname__staticmethodrs   r{   r~   r"   r"   r"   r'   rn      s    


rn   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )MraSparseDenseMatMulc                 C   ro   rp   )rd   rq   rF   )rr   re   rE   r_   rF   rD   r"   r"   r'   rs      rt   zMraSparseDenseMatMul.forwardc           
      C   s`   | j \}}}| j}|d|d }t|||}t|dd|||}t|||}	|	d |d fS ru   )rv   rF   r<   rm   rd   r@   r]   )
rr   rw   re   rE   r_   rF   rG   rx   ry   rz   r"   r"   r'   r{     s   zMraSparseDenseMatMul.backwardc                 C   r|   rp   )r   r}   )re   rE   r_   rF   r"   r"   r'   r~     r   z"MraSparseDenseMatMul.operator_callNr   r"   r"   r"   r'   r      s    

	r   c                   @   s   e Zd Zedd ZdS )MraReduceSumc                 C   s  |   \}}}}t|   dkrtdt|  dkr td|   \}}}}|  \}}| jdd|| |} tj| dtj|jd}tj	||dd	 |d d d f |  || }	tj
|| |f| j| jd}
|
d|	| |||}|||| }|S )
Nr3   rc   r4   r5   r8   r   rL   rg   rh   )r<   r;   r=   sumrS   rP   rQ   rR   rN   rj   zerosrM   	index_add)re   rE   rF   rG   rV   rX   rU   ra   rY   global_idxestempoutputr"   r"   r'   r~     s$   &
zMraReduceSum.operator_callN)r   r   r   r   r~   r"   r"   r"   r'   r     s    r   c                 C   s  |   \}}}|| }d}	|durl||||jdd}
| ||||jdd|
dddddf d  }|||||jdd|
dddddf d  }|durk|||||jdd|
dddddf d  }	n5|tj||tj| jd }
| ||||jdd}|||||jdd}|dur|||||jdd}	t||	ddt
| }|jdddj}|dur|d	|
dddddf |
dddddf  d
k    }||
||	fS )z/
    Compute low resolution approximation.
    Nr:   r8   r7   ư>rL   T)r9   keepdims     @g      ?)r<   rS   r   rP   onesfloatrN   meanmatmulr@   mathsqrtr>   r?   )querykeyrU   rT   valuerV   rW   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxr"   r"   r'   get_low_resolution_logit0  s6   :r   c                 C   sT  | j \}}}|dkr3|d }tj||| jd}	tjtj|	| d|d}
| |
dddddf d  } |dkrk| ddd|ddf d | ddd|ddf< | ddddd|f d | ddddd|f< tj| |d|ddd	d
}|j}|dkr|j	j
ddj	}| |ddddf k }||fS |dkrd}||fS t| d)zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r4   rN   )diagonalNg     @r:   TF)r9   largestsortedfullr8   sparsez# is not a valid approx_model value.)rO   rP   r   rN   triltriutopkrS   rE   r?   minr   r=   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrV   total_blocks_per_rowra   offset	temp_maskdiagonal_mask
top_k_valsrE   	thresholdhigh_resolution_maskr"   r"   r'   get_block_idxesX  s.   r   c	           $      C   s  t du rt|  S |  \}	}
}}|	|
 }|| dkr!td|| }| |||} ||||}||||}|dure| |dddddf  } ||dddddf  }||dddddf  }|dkrvt| ||||\}}}}n(|dkrt  t| |||\}}}}W d   n1 sw   Y  nt	dt  || }t
|||||\}}W d   n1 sw   Y  tj| |||dt| }t||||\}}|| }|dur|dd	t||dddddddf    }t|}t||||}t||||}|dkrt|| d|  |dddddf  }t||dddddddf d	d	|d	|||}|jd
ddddddf d	d	|||}|d	d	|||| } |durq| | } t| | dk  }!||!dddddf  }||! }t|  | dk  }"||"dddddf  }||" }|| |dddddf |dddddf  d  }#n|dkr||dddddf d  }#nt	d|dur|#|dddddf  }#|#|	|
||}#|#S )z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rU   r   r   r:   r8   r   z-config.approx_mode must be "full" or "sparse")r,   rP   
zeros_likerequires_grad_r<   r=   rS   r   no_grad	Exceptionr   rn   r~   r   r   rK   rZ   expr   r   r   repeatr   r   )$r   r   r   rT   r   r   rU   r   r   rV   num_headrW   r   
meta_batchr   r   r   r   r   ra   low_resolution_logit_normalizedrE   r   high_resolution_logitrI   rJ   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layerr"   r"   r'   mra2_attention~  s   




.

"
.
.
 
r   c                       s*   e Zd ZdZ fddZdddZ  ZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|jd |j| _	t|j
|j| _tj|j|jd| _t|j| _| dt|jdd  t|dd| _| jdtj| j tj| jjd	d
d d S )N)padding_idxr4   epsposition_ids)r   r:   position_embedding_typeabsolutetoken_type_idsrL   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrP   rQ   expandgetattrr   r   r   r<   rR   rN   selfconfig	__class__r"   r'   r     s   

zMraEmbeddings.__init__Nc                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	||	 }
| jdkrn| |}|
|7 }
| |
}
| |
}
|
S )Nr:   r   r   r   rL   r   )r<   r   hasattrr   r   rP   r   rR   rN   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   r"   r"   r'   rs     s,   







zMraEmbeddings.forward)NNNNr   r   r   __doc__r   rs   __classcell__r"   r"   r   r'   r     s    r   c                       0   e Zd Zd fdd	Zdd ZdddZ  ZS )	MraSelfAttentionNc                    s  t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|d ur[|n|j| _|jd |j | _t| jt|jd d | _|j| _|j| _|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r6   r4   )r   r   r   num_attention_headsr   r=   rB   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   r   block_per_rowrX   r   r   r   r   r   r   r   r   r"   r'   r   ,  s*   

zMraSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr:   r   r4   r   r   )r<   r   r   viewpermute)r   layernew_layer_shaper"   r"   r'   transpose_for_scoresH  s   
z%MraSelfAttention.transpose_for_scoresc              
   C   s  |  |}| | |}| | |}| |}| \}}}	}
d|d  }| d|d|| |	 }d}|
|k rt|||	||
 f}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t	j
|t	j||jdgdd}t| | | | | j| j| j| jd}|
|k r|d d d d d d d |
f }||||	|
}|d	d
dd }| d d | jf }|j| }|f}|S )N      ?r   r   r6   r   r:   r8   )r   r   r   r   r4   r   r7   )r   r	  r   r   r<   squeezer   rS   rB   rP   catr   rN   r   r   rX   r   r   r   r  rA   r   r  )r   hidden_statesattention_maskmixed_query_layer	key_layervalue_layerquery_layerrV   	num_headsrW   r   gpu_warp_sizepad_sizer   new_context_layer_shapeoutputsr"   r"   r'   rs   M  s@   

  
zMraSelfAttention.forwardrp   )r   r   r   r   r	  rs   r   r"   r"   r   r'   r   +  s    r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MraSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r   r   r   r  r   denser   r   r   r   r   r   r   r"   r'   r        
zMraSelfOutput.__init__r  input_tensorreturnc                 C   &   |  |}| |}| || }|S rp   r  r   r   r   r  r  r"   r"   r'   rs        

zMraSelfOutput.forwardr   r   r   r   rP   Tensorrs   r   r"   r"   r   r'   r        $r  c                       r   )	MraAttentionNc                    s.   t    t||d| _t|| _t | _d S )N)r   )r   r   r   r   r  r   setpruned_headsr  r   r"   r'   r     s   

zMraAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r8   )r;   r   r   r   r   r(  r   r   r   r   r   r  r   union)r   headsindexr"   r"   r'   prune_heads  s   zMraAttention.prune_headsc                 C   s2   |  ||}| |d |}|f|dd   }|S Nr   r   )r   r   )r   r  r  self_outputsattention_outputr  r"   r"   r'   rs     s   zMraAttention.forwardrp   )r   r   r   r   r,  rs   r   r"   r"   r   r'   r&    s    r&  c                       2   e Zd Z fddZdejdejfddZ  ZS )MraIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rp   )r   r   r   r  r   intermediate_sizer  
isinstance
hidden_actstrr   intermediate_act_fnr   r   r"   r'   r     s
   
zMraIntermediate.__init__r  r  c                 C      |  |}| |}|S rp   )r  r6  r   r  r"   r"   r'   rs        

zMraIntermediate.forwardr#  r"   r"   r   r'   r1    s    r1  c                       r  )	MraOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r  )r   r   r   r  r2  r   r  r   r   r   r   r   r   r   r"   r'   r     r  zMraOutput.__init__r  r  r  c                 C   r  rp   r   r!  r"   r"   r'   rs     r"  zMraOutput.forwardr#  r"   r"   r   r'   r:    r%  r:  c                       s.   e Zd Z fddZdddZdd Z  ZS )	MraLayerc                    sB   t    |j| _d| _t|| _|j| _t|| _t	|| _
d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr&  	attentionadd_cross_attentionr1  intermediater:  r   r   r   r"   r'   r     s   


zMraLayer.__init__Nc                 C   sB   |  ||}|d }|dd  }t| j| j| j|}|f| }|S r-  )r?  r   feed_forward_chunkr=  r>  )r   r  r  self_attention_outputsr/  r  layer_outputr"   r"   r'   rs     s   
zMraLayer.forwardc                 C   s   |  |}| ||}|S rp   )rA  r   )r   r/  intermediate_outputrD  r"   r"   r'   rB    s   
zMraLayer.feed_forward_chunkrp   )r   r   r   r   rs   rB  r   r"   r"   r   r'   r;    s    
	r;  c                       s.   e Zd Z fddZ				dddZ  ZS )	
MraEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r"   )r;  )r#   ra   r   r"   r'   r(     r)   z'MraEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layersr  gradient_checkpointingr   r   rG  r'   r     s   
 
zMraEncoder.__init__NFTc           
      C   s   |rdnd }t | jD ]#\}}|r||f }| jr%| jr%| |j||}	n|||}	|	d }q|r6||f }|sCtdd ||fD S t||dS )Nr"   r   c                 s   s    | ]	}|d ur|V  qd S rp   r"   )r#   vr"   r"   r'   	<genexpr>  s    z%MraEncoder.forward.<locals>.<genexpr>)last_hidden_stater  )	enumerater  rK  training_gradient_checkpointing_func__call__tupler   )
r   r  r  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputsr"   r"   r'   rs     s(   



zMraEncoder.forward)NNFTr   r   r   r   rs   r   r"   r"   r   r'   rF    s    	rF  c                       r0  )MraPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S r  )r   r   r   r  r   r  r3  r4  r5  r   transform_act_fnr   r   r   r   r"   r'   r     s   
z#MraPredictionHeadTransform.__init__r  r  c                 C   s"   |  |}| |}| |}|S rp   )r  r]  r   r8  r"   r"   r'   rs   !  s   


z"MraPredictionHeadTransform.forwardr#  r"   r"   r   r'   r\    s    	r\  c                       s$   e Zd Z fddZdd Z  ZS )MraLMPredictionHeadc                    sL   t    t|| _tj|j|jdd| _t	t
|j| _| j| j_d S )NF)bias)r   r   r\  	transformr   r  r   r   decoder	ParameterrP   r   r_  r   r   r"   r'   r   *  s
   

zMraLMPredictionHead.__init__c                 C   r7  rp   )r`  ra  r8  r"   r"   r'   rs   7  r9  zMraLMPredictionHead.forwardr[  r"   r"   r   r'   r^  )  s    r^  c                       r0  )MraOnlyMLMHeadc                    s   t    t|| _d S rp   )r   r   r^  predictionsr   r   r"   r'   r   ?  s   
zMraOnlyMLMHead.__init__sequence_outputr  c                 C   s   |  |}|S rp   )rd  )r   re  prediction_scoresr"   r"   r'   rs   C  s   
zMraOnlyMLMHead.forwardr#  r"   r"   r   r'   rc  >  s    rc  c                   @   s$   e Zd ZdZeZdZdZdd ZdS )MraPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r!   Tc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )r   stdNr
  )r3  r   r  weightdatanormal_r   initializer_ranger_  zero_r   r   r   fill_)r   moduler"   r"   r'   _init_weightsS  s   

z MraPreTrainedModel._init_weightsN)	r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingrp  r"   r"   r"   r'   rg  I  s    rg  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
ak	  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare MRA Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Z fddZdd Zdd Zdd Zee	d	e
eeed
								ddeej deej deej deej deej deej dee dee deeef fddZ  ZS )MraModelc                    s2   t  | || _t|| _t|| _|   d S rp   )r   r   r   r   r   rF  encoder	post_initr   r   r"   r'   r     s
   

zMraModel.__init__c                 C   s   | j jS rp   r   r   r   r"   r"   r'   get_input_embeddings  s   zMraModel.get_input_embeddingsc                 C   s   || j _d S rp   rw  )r   r   r"   r"   r'   set_input_embeddings  s   zMraModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsru  r  r?  r,  )r   heads_to_pruner  r*  r"   r"   r'   _prune_heads  s   zMraModel._prune_headsbatch_size, sequence_length
checkpointoutput_typerq  Nr   r  r   r   rT  r   rU  rV  r  c	                 C   s|  |d ur|n| j j}|d ur|n| j j}|d ur |d ur td|d ur/| || | }	n|d ur<| d d }	ntd|	\}
}|d urK|jn|j}|d u r[tj|
|f|d}|d u rt	| j
drz| j
jd d d |f }||
|}|}n	tj|	tj|d}| ||	}| || j j}| j
||||d}| j|||||d}|d	 }|s|f|d
d   S t||j|j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer:   z5You have to specify either input_ids or inputs_embedsr   r   rL   )r   r   r   r   )r  rT  rU  rV  r   r   )rN  r  
attentionscross_attentions)r   rU  use_return_dictr=   %warn_if_padding_and_no_attention_maskr<   rN   rP   r   r   r   r   r   r   rR   get_extended_attention_maskget_head_maskrJ  ru  r   r  r  r  )r   r   r  r   r   rT  r   rU  rV  r   rV   r   rN   r   r   extended_attention_maskembedding_outputencoder_outputsre  r"   r"   r'   rs     sZ   
zMraModel.forward)NNNNNNNN)r   r   r   r   ry  rz  r}  r   MRA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rP   r$  boolr   r   rs   r   r"   r"   r   r'   rt    sL    
	

rt  z1MRA Model with a `language modeling` head on top.c                       s   e Zd ZddgZ fddZdd Zdd Zee	d	e
eeed
									ddeej deej deej deej deej deej deej dee dee deeef fddZ  ZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    s,   t  | t|| _t|| _|   d S rp   )r   r   rt  r!   rc  clsrv  r   r   r"   r'   r     s   

zMraForMaskedLM.__init__c                 C   s
   | j jjS rp   r  rd  ra  rx  r"   r"   r'   get_output_embeddings  s   
z$MraForMaskedLM.get_output_embeddingsc                 C   s   || j j_d S rp   r  )r   new_embeddingsr"   r"   r'   set_output_embeddings  s   z$MraForMaskedLM.set_output_embeddingsr~  r  Nr   r  r   r   rT  r   labelsrU  rV  r  c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur7t }||d| j j|d}|	sM|f|
dd  }|durK|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr  r   r   rT  r   rU  rV  r   r:   r   losslogitsr  r  )
r   r  r!   r  r   r  r   r   r  r  )r   r   r  r   r   rT  r   r  rU  rV  r  re  rf  masked_lm_lossloss_fctr   r"   r"   r'   rs   !  s4   
zMraForMaskedLM.forward	NNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r  r  r   r  r   r  r   rP   r$  r  r   r   rs   r   r"   r"   r   r'   r    sR    		

r  c                       s(   e Zd ZdZ fddZdd Z  ZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                    sF   t    t|j|j| _t|j| _t|j|j	| _
|| _d S rp   )r   r   r   r  r   r  r   r   r   
num_labelsout_projr   r   r   r"   r'   r   ^  s
   

zMraClassificationHead.__init__c                 K   sR   |d d dd d f }|  |}| |}t| jj |}|  |}| |}|S )Nr   )r   r  r   r   r4  r  )r   featureskwargsxr"   r"   r'   rs   f  s   



zMraClassificationHead.forwardr   r"   r"   r   r'   r  [  s    r  zMRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.c                          e Zd Z fddZeedeee	e
d									ddeej deej deej d	eej d
eej deej deej dee dee deee	f fddZ  ZS )MraForSequenceClassificationc                    s4   t  | |j| _t|| _t|| _|   d S rp   )r   r   r  rt  r!   r  
classifierrv  r   r   r"   r'   r   v  s
   

z%MraForSequenceClassification.__init__r~  r  Nr   r  r   r   rT  r   r  rU  rV  r  c
              
   C   sf  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtjksG|jtj	krLd| j _nd| j _| j jdkrnt
 }| jdkrh|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr:   r  )r   r  r!   r  problem_typer  rM   rP   rR   rB   r	   r  r   r  r   r   r  r  )r   r   r  r   r   rT  r   r  rU  rV  r  re  r  r  r  r   r"   r"   r'   rs     sR   


"


z$MraForSequenceClassification.forwardr  )r   r   r   r   r   r  r  r   r  r   r  r   rP   r$  r  r   r   rs   r   r"   r"   r   r'   r  p  sL    		

r  zMRA Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.c                       r  )MraForMultipleChoicec                    sD   t  | t|| _t|j|j| _t|jd| _| 	  d S r<  )
r   r   rt  r!   r   r  r   pre_classifierr  rv  r   r   r"   r'   r     s
   
zMraForMultipleChoice.__init__z(batch_size, num_choices, sequence_lengthr  Nr   r  r   r   rT  r   r  rU  rV  r  c
              
   C   s  |	dur|	n| j j}	|dur|jd n|jd }
|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	d}|d }|dddf }| |}t |}| 	|}|d|
}d}|durt
 }|||}|	s|f|dd  }|dur|f| S |S t|||j|jdS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r:   r7   r  r   r  )r   r  rO   r  r<   r!   r  r   ReLUr  r   r   r  r  )r   r   r  r   r   rT  r   r  rU  rV  num_choicesr  hidden_statepooled_outputr  reshaped_logitsr  r  r   r"   r"   r'   rs     sN   


zMraForMultipleChoice.forwardr  )r   r   r   r   r   r  r  r   r  r   r  r   rP   r$  r  r   r   rs   r   r"   r"   r   r'   r    sL    
	

r  zMRA Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.c                       r  )MraForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rp   )r   r   r  rt  r!   r   r   r   r   r  r   r  rv  r   r   r"   r'   r   (  s   
z"MraForTokenClassification.__init__r~  r  Nr   r  r   r   rT  r   r  rU  rV  r  c
              
   C   s  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|durdt }|durW|ddk}|d| j}t	||dt
|j|}|||}n||d| j|d}|	sz|f|
dd  }|durx|f| S |S t|||
j|
jdS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r:   r   r  )r   r  r!   r   r  r   r  r  rP   wheretensorignore_indextype_asr   r  r  )r   r   r  r   r   rT  r   r  rU  rV  r  re  r  r  r  active_lossactive_logitsactive_labelsr   r"   r"   r'   rs   3  sD   

z!MraForTokenClassification.forwardr  )r   r   r   r   r   r  r  r   r  r   r  r   rP   r$  r  r   r   rs   r   r"   r"   r   r'   r  "  sL    	

r  zMRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).c                       s   e Zd Z fddZeedeee	e
d										ddeej deej deej d	eej d
eej deej deej deej dee dee deee	f fddZ  ZS )MraForQuestionAnsweringc                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S )Nr4   )
r   r   r  rt  r!   r   r  r   
qa_outputsrv  r   r   r"   r'   r   {  s   
z MraForQuestionAnswering.__init__r~  r  Nr   r  r   r   rT  r   start_positionsend_positionsrU  rV  r  c              
   C   s>  |
dur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d}|d}d}|dur}|dur}t| dkrJ|d}t| dkrW|d}|d}|d|}|d|}t	|d}|||}|||}|| d }|
s||f|dd  }|dur|f| S |S t
||||j|jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r:   r8   )r  r4   )r  start_logits
end_logitsr  r  )r   r  r!   r  splitr  r;   r<   clampr   r   r  r  )r   r   r  r   r   rT  r   r  r  rU  rV  r  re  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   r"   r"   r'   rs     sN   








zMraForQuestionAnswering.forward)
NNNNNNNNNN)r   r   r   r   r   r  r  r   r  r   r  r   rP   r$  r  r   r   rs   r   r"   r"   r   r'   r  u  sR    	

r  r   )NN)r6   r   r   )Yr   r   pathlibr   typingr   r   r   rP   torch.utils.checkpointr   torch.nnr   r   r	   torch.utils.cpp_extensionr
   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   configuration_mrar   
get_loggerr   loggerr  r  _TOKENIZER_FOR_DOC!MRA_PRETRAINED_MODEL_ARCHIVE_LISTr2   r,   infor   ewarningrK   rZ   r]   rd   rm   autogradFunctionrn   r   r   r   r   r   Moduler   r   r  r&  r1  r:  r;  rF  r\  r^  rc  rg  MRA_START_DOCSTRINGr  rt  r  r  r  r  r  r  r"   r"   r"   r'   <module>   s     





((
(-
s:V!,/lLTTN