o
    h                     @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z  e!e"Z#dZ$dZ%g dZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-G dd dej'Z.G d d! d!ej'Z/G d"d# d#eZ0d$Z1d%Z2ed&e1G d'd( d(e0Z3G d)d* d*ej'Z4G d+d, d,ej'Z5ed-e1G d.d/ d/e0Z6eG d0d1 d1eZ7ed2e1G d3d4 d4e0Z8dS )5z PyTorch Splinter model.    N)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsModelOutputQuestionAnsweringModelOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )SplinterConfigtau/splinter-baser   )r   ztau/splinter-base-qassztau/splinter-largeztau/splinter-large-qassc                       sj   e Zd ZdZ fddZ					ddeej deej deej d	eej d
ee	 de
fddZ  ZS )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _d S )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr    selfconfig	__class__ d/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/splinter/modeling_splinter.pyr#   4   s   
zSplinterEmbeddings.__init__Nr   	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u r5tj|tj| jjd}|d u r>| |}| |}|| }	| jdkrU| 	|}
|	|
7 }	| 
|	}	| |	}	|	S )Nr   r   dtypedevicer!   )sizer   r3   zeroslongrE   r(   r,   r    r*   r-   r1   )r8   r>   r?   r   r@   rA   input_shape
seq_lengthr,   
embeddingsr*   r<   r<   r=   forwardE   s$   






zSplinterEmbeddings.forward)NNNNr   )__name__
__module____qualname____doc__r#   r   r3   
LongTensorFloatTensorintr   rL   __classcell__r<   r<   r:   r=   r   1   s*    r   c                       s   e Zd Zd fdd	ZdejdejfddZ						dd	ejd
eej deej deej deej dee	e	ej   dee
 de	ej fddZ  ZS )SplinterSelfAttentionNc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|p\t|dd| _| jdksh| jd	kry|j| _t	d
|j d | j| _|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r    r!   relative_keyrelative_key_query   r   )r"   r#   r&   num_attention_headshasattr
ValueErrorrS   attention_head_sizeall_head_sizer   Linearquerykeyvaluer/   attention_probs_dropout_probr1   r6   r    r)   r$   distance_embedding
is_decoderr8   r9   r    r:   r<   r=   r#   i   s*   

zSplinterSelfAttention.__init__xrB   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r   rZ   r   r	   )rF   r[   r^   viewpermute)r8   rh   new_x_shaper<   r<   r=   transpose_for_scores   s   
z*SplinterSelfAttention.transpose_for_scoresFhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 C   s  |  |}|d u}	|	r|d ur|d }
|d }|}nP|	r/| | |}
| | |}|}n;|d urZ| | |}
| | |}tj|d |
gdd}
tj|d |gdd}n| | |}
| | |}| |}|d u}| jrz|
|f}t||
dd}| j	dks| j	dkr	|j
d |
j
d }}|rtj|d tj|jd	dd}ntj|tj|jd	dd}tj|tj|jd	dd}|| }| || j d }|j|jd
}| j	dkrtd||}|| }n| j	dkr	td||}td|
|}|| | }|t| j }|d ur|| }tjj|dd}| |}|d ur0|| }t||}|dddd }| d d | jf }||}|rX||fn|f}| jrd||f }|S )Nr   r   rZ   dimr   rX   rY   rC   )rD   zbhld,lrd->bhlrzbhrd,lrd->bhlrr	   ) ra   rl   rb   rc   r3   catrf   matmul	transposer    shapetensorrH   rE   ri   r4   re   r)   torD   einsummathsqrtr^   r   
functionalsoftmaxr1   rj   
contiguousrF   r_   )r8   rm   rn   ro   rp   rq   rr   rs   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputsr<   r<   r=   rL      sn   









zSplinterSelfAttention.forwardNNNNNNF)rM   rN   rO   r#   r3   Tensorrl   r   rR   r   boolrL   rT   r<   r<   r:   r=   rU   h   s4    	rU   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )SplinterSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr   )r"   r#   r   r`   r&   denser-   r.   r/   r0   r1   r7   r:   r<   r=   r#         
zSplinterSelfOutput.__init__rm   input_tensorrB   c                 C   &   |  |}| |}| || }|S r   r   r1   r-   r8   rm   r   r<   r<   r=   rL         

zSplinterSelfOutput.forwardrM   rN   rO   r#   r3   r   rL   rT   r<   r<   r:   r=   r          $r   c                       s   e Zd Zd fdd	Zdd Z						ddejdeej d	eej d
eej deej dee	e	ej   dee
 de	ej fddZ  ZS )SplinterAttentionNc                    s.   t    t||d| _t|| _t | _d S )Nr    )r"   r#   rU   r8   r   outputsetpruned_headsrg   r:   r<   r=   r#      s   

zSplinterAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rt   )lenr   r8   r[   r^   r   r   ra   rb   rc   r   r   r_   union)r8   headsindexr<   r<   r=   prune_heads  s   zSplinterAttention.prune_headsFrm   rn   ro   rp   rq   rr   rs   rB   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )r8   r   )r8   rm   rn   ro   rp   rq   rr   rs   self_outputsattention_outputr   r<   r<   r=   rL     s   
	zSplinterAttention.forwardr   r   )rM   rN   rO   r#   r   r3   r   r   rR   r   r   rL   rT   r<   r<   r:   r=   r      s4    	r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )SplinterIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r"   r#   r   r`   r&   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnr7   r:   r<   r=   r#   1  s
   
zSplinterIntermediate.__init__rm   rB   c                 C   s   |  |}| |}|S r   )r   r   )r8   rm   r<   r<   r=   rL   9  s   

zSplinterIntermediate.forwardr   r<   r<   r:   r=   r   0  s    r   c                       r   )SplinterOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r"   r#   r   r`   r   r&   r   r-   r.   r/   r0   r1   r7   r:   r<   r=   r#   A  r   zSplinterOutput.__init__rm   r   rB   c                 C   r   r   r   r   r<   r<   r=   rL   G  r   zSplinterOutput.forwardr   r<   r<   r:   r=   r   @  r   r   c                       s   e Zd Z fddZ						ddejdeej deej deej d	eej d
eeeej   dee	 deej fddZ
dd Z  ZS )SplinterLayerc                    sr   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is addedr!   r   )r"   r#   chunk_size_feed_forwardseq_len_dimr   	attentionrf   add_cross_attentionr]   crossattentionr   intermediater   r   r7   r:   r<   r=   r#   P  s   


zSplinterLayer.__init__NFrm   rn   ro   rp   rq   rr   rs   rB   c              	   C   s  |d ur
|d d nd }| j |||||d}	|	d }
| jr(|	dd }|	d }n|	dd  }d }| jro|d urot| dsDtd|  d|d urN|d	d  nd }| |
||||||}|d }
||dd  }|d }|| }t| j| j| j|
}|f| }| jr||f }|S )
NrZ   )rs   rr   r   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`rv   )	r   rf   r\   r]   r   r   feed_forward_chunkr   r   )r8   rm   rn   ro   rp   rq   rr   rs   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputr<   r<   r=   rL   ^  sP   


	

zSplinterLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )r8   r   intermediate_outputr   r<   r<   r=   r     s   
z SplinterLayer.feed_forward_chunkr   )rM   rN   rO   r#   r3   r   r   rR   r   r   rL   r   rT   r<   r<   r:   r=   r   O  s4    	
Ar   c                       s   e Zd Z fddZ									ddejdeej deej d	eej d
eej deeeej   dee	 dee	 dee	 dee	 de
eej ef fddZ  ZS )SplinterEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r<   )r   ).0_r9   r<   r=   
<listcomp>  s    z,SplinterEncoder.__init__.<locals>.<listcomp>F)	r"   r#   r9   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr7   r:   r   r=   r#     s   
 
zSplinterEncoder.__init__NFTrm   rn   ro   rp   rq   past_key_valuesr   rs   output_hidden_statesreturn_dictrB   c                 C   s^  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r)dnd }t| jD ]^\}}|	r;||f }|d urC|| nd }|d urM|| nd }| jrc| jrc| |j	|||||||}n
||||||||}|d }|rz||d f7 }|r||d f }| j jr||d f }q0|	r||f }|
st
dd	 |||||fD S t|||||d
S )Nr<   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   rZ   c                 s   s    | ]	}|d ur|V  qd S r   r<   )r   vr<   r<   r=   	<genexpr>  s    z*SplinterEncoder.forward.<locals>.<genexpr>last_hidden_stater   rm   
attentionscross_attentions)r9   r   r   trainingloggerwarning_once	enumerater   _gradient_checkpointing_func__call__tupler   )r8   rm   rn   ro   rp   rq   r   r   rs   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskrr   layer_outputsr<   r<   r=   rL     sz   


zSplinterEncoder.forward)	NNNNNNFFT)rM   rN   rO   r#   r3   r   r   rR   r   r   r   r   rL   rT   r<   r<   r:   r=   r     sD    		
r   c                   @   s$   e Zd ZdZeZdZdZdd ZdS )SplinterPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    splinterTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)r   r   r`   weightdatanormal_r9   initializer_rangebiaszero_r$   r   r-   fill_)r8   moduler<   r<   r=   _init_weights  s   

z%SplinterPreTrainedModel._init_weightsN)	rM   rN   rO   rP   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r<   r<   r<   r=   r     s    r   aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SplinterConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a/
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare Splinter Model transformer outputting raw hidden-states without any specific head on top.c                !       s  e Zd ZdZ fddZdd Zdd Zdd	 Zee	
d
eeeed													ddeej deej deej deej deej deej deej deej deeej  dee dee dee dee deeef fddZ  ZS )SplinterModela*  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                    s2   t  | || _t|| _t|| _|   d S r   )r"   r#   r9   r   rK   r   encoder	post_initr7   r:   r<   r=   r#   g  s
   

zSplinterModel.__init__c                 C   s   | j jS r   rK   r(   )r8   r<   r<   r=   get_input_embeddingsq  s   z"SplinterModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r8   rc   r<   r<   r=   set_input_embeddingst  s   z"SplinterModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r8   heads_to_pruner   r   r<   r<   r=   _prune_headsw  s   zSplinterModel._prune_headsbatch_size, sequence_length
checkpointoutput_typer   Nr>   rn   r?   r   ro   r@   rp   rq   r   r   rs   r   r   rB   c                 C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| j jr-|
dur(|
n| j j}
nd}
|dur;|dur;td|durJ| || | }n|durW| dd }ntd|\}}|durf|j	n|j	}|	durv|	d d j
d nd}|du rtj||| f|d}|du rtj|tj|d	}| ||}| j jr|dur| \}}}||f}|du rtj||d}| |}nd}| || j j}| j|||||d
}| j||||||	|
|||d
}|d }|s|f|dd  S t||j|j|j|jdS )a  
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   rZ   )rE   rC   )r>   r   r?   r@   rA   )	rn   ro   rp   rq   r   r   rs   r   r   r   r   )r9   rs   r   use_return_dictrf   r   r]   %warn_if_padding_and_no_attention_maskrF   rE   rz   r3   onesrG   rH   get_extended_attention_maskinvert_attention_maskget_head_maskr   rK   r   r   r   rm   r   r   )r8   r>   rn   r?   r   ro   r@   rp   rq   r   r   rs   r   r   rI   
batch_sizerJ   rE   rA   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr<   r<   r=   rL     sx   )
zSplinterModel.forward)NNNNNNNNNNNNN)rM   rN   rO   rP   r#   r   r  r  r   SPLINTER_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r3   r   r   rR   r   r   r   rL   rT   r<   r<   r:   r=   r   \  sl    
	

r   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	SplinterFullyConnectedLayergeluc                    sD   t    || _|| _t| j| j| _t| | _t	| j| _	d S r   )
r"   r#   	input_dim
output_dimr   r`   r   r
   act_fnr-   )r8   r  r  r   r:   r<   r=   r#      s   

z$SplinterFullyConnectedLayer.__init__inputsrB   c                 C   s"   |  |}| |}| |}|S r   )r   r   r-   )r8   r!  rm   r<   r<   r=   rL   
  s   


z#SplinterFullyConnectedLayer.forward)r  r   r<   r<   r:   r=   r    s    
r  c                       s(   e Zd ZdZ fddZdd Z  ZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    sz   t    t|j|j| _t|j|j| _t|j|j| _t|j|j| _tj	|j|jdd| _
tj	|j|jdd| _d S )NF)r   )r"   r#   r  r&   query_start_transformquery_end_transformstart_transformend_transformr   r`   start_classifierend_classifierr7   r:   r<   r=   r#     s   
z'QuestionAwareSpanSelectionHead.__init__c                 C   s   |  \}}}|ddd|}tj|d|d}| |}| |}| |}	| |}
| 	|}|	
ddd}	t||	}| |}|

ddd}
t||
}||fS )Nr   r   )ru   r   r   rZ   )rF   	unsqueezerepeatr3   gatherr#  r$  r%  r&  r'  rj   rx   r(  )r8   r!  	positionsr   ru   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrm   start_logits
end_logitsr<   r<   r=   rL   "  s   





z&QuestionAwareSpanSelectionHead.forward)rM   rN   rO   rP   r#   rL   rT   r<   r<   r:   r=   r"    s    r"  z
    Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       s   e Zd Z fddZeedeee	e
d												ddeej deej deej d	eej d
eej deej deej deej dee dee dee deej deee	f fddZ  ZS )SplinterForQuestionAnsweringc                    4   t  | t|| _t|| _|j| _|   d S r   r"   r#   r   r   r"  splinter_qassquestion_token_idr   r7   r:   r<   r=   r#   ?  
   

z%SplinterForQuestionAnswering.__init__r  r  Nr>   rn   r?   r   ro   r@   start_positionsend_positionsrs   r   r   question_positionsrB   c                 C   s  |dur|n| j j}d}|du r9|dur#tjt|| j dd}ntj|dtj	|j
|jd}|d}d}| j|||||||	|
|d	}|d }| ||\}}|r`|d	|d	}}|dur~|d	| t|jj  }|d	| t|jj  }d}|dur|durt| d	kr|d}t| d	kr|d}|d	}|d| |d| t|d
}|||}|||}|| d }|s||f|d	d  }|dur|f| S |S t||||j|jdS )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr   rt   r   )rD   layoutrE   Trn   r?   r   ro   r@   rs   r   r   r   ignore_indexrZ   lossr2  r3  rm   r   )r9   r	  r3   argmaxeqr8  rS   rG   rF   rH   r=  rE   r)  r   r7  squeezefinforD   minr   clamp_r   r   rm   r   )r8   r>   rn   r?   r   ro   r@   r:  r;  rs   r   r   r<  question_positions_were_none"question_position_for_each_exampler   r  r2  r3  
total_lossignored_indexloss_fct
start_lossend_lossr   r<   r<   r=   rL   I  sj   $






z$SplinterForQuestionAnswering.forwardNNNNNNNNNNNN)rM   rN   rO   r#   r   r  r  r   r  r   r  r   r3   r   rQ   r   r   r   rL   rT   r<   r<   r:   r=   r4  7  s^    
	

r4  c                   @   sl   e Zd ZU dZdZeej ed< dZ	ejed< dZ
ejed< dZeeej  ed< dZeeej  ed< dS )SplinterForPreTrainingOutputa  
    Class for outputs of Splinter as a span selection model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NrB  r2  r3  rm   r   )rM   rN   rO   rP   rB  r   r3   rR   __annotations__r2  r3  rm   r   r   r<   r<   r<   r=   rQ    s   
 rQ  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       s   e Zd Z fddZeed												ddeej	 deej	 deej	 deej	 d	eej	 d
eej	 deej
 deej
 dee dee dee deej
 deeef fddZdej	dej	fddZ  ZS )SplinterForPreTrainingc                    r5  r   r6  r7   r:   r<   r=   r#     r9  zSplinterForPreTraining.__init__z*batch_size, num_questions, sequence_lengthNr>   rn   r?   r   ro   r@   r:  r;  rs   r   r   r<  rB   c                 C   s  |dur|n| j j}|du r|dur|durtd|du r&|du r&td|du r/| |}| j|||||||	|
|d	}|d }| \}}}| ||\}}|d}|dur}|d|||}|d| t	
|jj  }|d| t	
|jj  }d}|dur|dur|dtd|d  |dtd|d  t| j jd}|||| |||| }|||| |||| }|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedr>  r   r   r?  rZ   rA  )r9   r	  	TypeError_prepare_question_positionsr   rF   r7  r)  r5   r3   rF  rD   rG  rH  maxr   r'   ri   rQ  rm   r   )r8   r>   rn   r?   r   ro   r@   r:  r;  rs   r   r   r<  r   r  r  sequence_lengthru   r2  r3  num_questions attention_mask_for_each_questionrK  rM  rN  rO  r   r<   r<   r=   rL     sh   !


zSplinterForPreTraining.forwardc                 C   sl   t || jjk\}}t |}t j|d| f| jjt j	|j
d}t dd |D }||||f< |S )Nr   rC   c                 S   s   g | ]}t |qS r<   )r3   r4   )r   nr<   r<   r=   r   S  s    zFSplinterForPreTraining._prepare_question_positions.<locals>.<listcomp>)r3   wherer9   r8  bincountfullrF   rV  r'   rH   rE   rw   )r8   r>   rowsflat_positionsrX  r,  colsr<   r<   r=   rU  J  s   
z2SplinterForPreTraining._prepare_question_positionsrP  )rM   rN   rO   r#   r   r  r  r   r3   r   rQ   r   r   r   rQ  rL   rU  rT   r<   r<   r:   r=   rS    sZ    	
	

drS  )9rP   r~   dataclassesr   typingr   r   r   r   r3   torch.utils.checkpointr   torch.nnr   activationsr
   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_splinterr   
get_loggerrM   r   r  r  &SPLINTER_PRETRAINED_MODEL_ARCHIVE_LISTModuler   rU   r   r   r   r   r   r   r   SPLINTER_START_DOCSTRINGr  r   r  r"  r4  rQ  rS  r<   r<   r<   r=   <module>   sd   
	7 2W]2  &r