""" PyTorch CANINE model."""

import copy
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    ModelOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_canine import CanineConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/canine-s"
_CONFIG_FOR_DOC = "CanineConfig"

CANINE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/canine-s",
    "google/canine-r",
]

# Primes used for the multi-hash character embedding (supports up to 16 hash functions).
_PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211, 223]


@dataclass
class CanineModelOutputWithPooling(ModelOutput):
    """
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
            shallow Transformer encoder).
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
            Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
            weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
            encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
            config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
            initial input to each Transformer encoder. The hidden states of the shallow encoders have length
            `sequence_length`, but the hidden states of the deep encoder have length
            `sequence_length // config.downsampling_rate`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
            num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
            config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attention weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


def load_tf_weights_in_canine(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used by AdamWeightDecayOptimizer and are not required for
        # inference; the cls/autoregressive_decoder/char_output_weights variables belong to the
        # pretraining heads only.
        if any(
            n
            in [
                "adam_v",
                "adam_m",
                "AdamWeightDecayOptimizer",
                "AdamWeightDecayOptimizer_1",
                "global_step",
                "cls",
                "autoregressive_decoder",
                "char_output_weights",
            ]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        # if the first scope name is "bert", change it to "encoder"
        if name[0] == "bert":
            name[0] = "encoder"
        # remove the "embeddings" middle name of HashBucketCodepointEmbedders
        elif name[1] == "embeddings":
            name.remove(name[1])
        # rename segment_embeddings to token_type_embeddings
        elif name[1] == "segment_embeddings":
            name[1] = "token_type_embeddings"
        # rename the initial convolutional downsampling layer
        elif name[1] == "initial_char_encoder":
            name = ["chars_to_molecules"] + name[-2:]
        # rename the final convolutional projection layer
        elif name[0] == "final_char_encoder" and name[1] in ["LayerNorm", "conv"]:
            name = ["projection"] + name[1:]

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name) and "Embedder" not in m_name:
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 3:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name[-10:] in [f"Embedder_{i}" for i in range(8)]:
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)

        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")

        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)

    return model


class CanineEmbeddings(nn.Module):
    """Construct the character, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()

        self.config = config

        # character embeddings, one shard per hash function
        shard_embedding_size = config.hidden_size // config.num_hash_functions
        for i in range(config.num_hash_functions):
            name = f"HashBucketCodepointEmbedder_{i}"
            setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size))
        self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and be able to
        # load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
        """
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        """
        if num_hashes > len(_PRIMES):
            raise ValueError(f"`num_hashes` must be <= {len(_PRIMES)}")

        primes = _PRIMES[:num_hashes]

        result_tensors = []
        for prime in primes:
            hashed = ((input_ids + 1) * prime) % num_buckets
            result_tensors.append(hashed)
        return result_tensors

    def _embed_hash_buckets(self, input_ids, embedding_size: int, num_hashes: int, num_buckets: int):
        """Converts IDs (e.g. codepoints) into embeddings via multiple hashing."""
        if embedding_size % num_hashes != 0:
            raise ValueError(f"Expected `embedding_size` ({embedding_size}) % `num_hashes` ({num_hashes}) == 0")

        hash_bucket_tensors = self._hash_bucket_tensors(input_ids, num_hashes=num_hashes, num_buckets=num_buckets)
        embedding_shards = []
        for i, hash_bucket_ids in enumerate(hash_bucket_tensors):
            name = f"HashBucketCodepointEmbedder_{i}"
            shard_embeddings = getattr(self, name)(hash_bucket_ids)
            embedding_shards.append(shard_embeddings)

        return torch.cat(embedding_shards, dim=-1)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self._embed_hash_buckets(
                input_ids, self.config.hidden_size, self.config.num_hash_functions, self.config.num_hash_buckets
            )

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings

        if self.position_embedding_type == "absolute":
            position_embeddings = self.char_position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class CharactersToMolecules(nn.Module):
    """Convert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions."""

    def __init__(self, config):
        super().__init__()

        self.conv = nn.Conv1d(
            in_channels=config.hidden_size,
            out_channels=config.hidden_size,
            kernel_size=config.downsampling_rate,
            stride=config.downsampling_rate,
        )
        self.activation = ACT2FN[config.hidden_act]

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and be able to
        # load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, char_encoding: torch.Tensor) -> torch.Tensor:
        # `cls_encoding`: [batch, 1, hidden_size]
        cls_encoding = char_encoding[:, 0:1, :]

        # char_encoding has shape [batch, char_seq, hidden_size]; transpose to
        # [batch, hidden_size, char_seq] for the Conv1d
        char_encoding = torch.transpose(char_encoding, 1, 2)
        downsampled = self.conv(char_encoding)
        downsampled = torch.transpose(downsampled, 1, 2)
        downsampled = self.activation(downsampled)

        # Truncate the last molecule in order to reserve a position for [CLS], which is kept as a
        # separate molecule so that it can be used at the molecule level for classification.
        downsampled_truncated = downsampled[:, 0:-1, :]
        result = torch.cat([cls_encoding, downsampled_truncated], dim=-2)

        result = self.LayerNorm(result)

        return result

class ConvProjection(nn.Module):
    """
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.conv = nn.Conv1d(
            in_channels=config.hidden_size * 2,
            out_channels=config.hidden_size,
            kernel_size=config.upsampling_kernel_size,
            stride=1,
        )
        self.activation = ACT2FN[config.hidden_act]
        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name and be able to
        # load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        inputs: torch.Tensor,
        final_seq_char_positions: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # inputs has shape [batch, char_seq_len, molecule_hidden_size + char_hidden_final];
        # transpose for the Conv1d
        inputs = torch.transpose(inputs, 1, 2)

        # pad the input manually to emulate TF-style "same" padding
        pad_total = self.config.upsampling_kernel_size - 1
        pad_beg = pad_total // 2
        pad_end = pad_total - pad_beg

        pad = nn.ConstantPad1d((pad_beg, pad_end), 0)
        # `result`: shape (batch_size, char_seq_len, hidden_size)
        result = self.conv(pad(inputs))
        result = torch.transpose(result, 1, 2)
        result = self.activation(result)
        result = self.LayerNorm(result)
        result = self.dropout(result)
        final_char_seq = result

        if final_seq_char_positions is not None:
            # Limiting the transformer query sequence to the masked character positions (for MLM)
            # is not implemented.
            raise NotImplementedError("CanineForMaskedLM is currently not supported")
        else:
            query_seq = final_char_seq

        return query_seq


class CanineSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        from_tensor: torch.Tensor,
        to_tensor: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        mixed_query_layer = self.query(from_tensor)

        key_layer = self.transpose_for_scores(self.key(to_tensor))
        value_layer = self.transpose_for_scores(self.value(to_tensor))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = from_tensor.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            if attention_mask.ndim == 3:
                # if attention_mask is 3D, add a head dimension
                attention_mask = torch.unsqueeze(attention_mask, dim=1)
                # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked
                # positions, this creates a tensor which is 0.0 for positions we want to attend and
                # the dtype's smallest value for masked positions.
                attention_mask = (1.0 - attention_mask.float()) * torch.finfo(attention_scores.dtype).min
            # Apply the attention mask (precomputed for all layers in the CanineModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class CanineSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class CanineAttention(nn.Module):
    """
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able
          to attend to the `to_tensor`'s first position (e.g. a [CLS] position)?
        - **first_position_attends_to_all** (`bool`, *optional*, defaults to `False`) -- Should the *from_tensor*'s
          first position be able to attend to all positions within the *from_tensor*?
        - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
          `from_tensor`.
        - **attend_from_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when
          moving to the next block in `from_tensor`.
        - **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
          *to_tensor*.
        - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when
          moving to the next block in *to_tensor*.
    """

    def __init__(
        self,
        config,
        local=False,
        always_attend_to_first_position: bool = False,
        first_position_attends_to_all: bool = False,
        attend_from_chunk_width: int = 128,
        attend_from_chunk_stride: int = 128,
        attend_to_chunk_width: int = 128,
        attend_to_chunk_stride: int = 128,
    ):
        super().__init__()
        self.self = CanineSelfAttention(config)
        self.output = CanineSelfOutput(config)
        self.pruned_heads = set()

        # additional arguments related to local attention
        self.local = local
        if attend_from_chunk_width < attend_from_chunk_stride:
            raise ValueError(
                "`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped."
            )
        if attend_to_chunk_width < attend_to_chunk_stride:
            raise ValueError(
                "`attend_to_chunk_width` < `attend_to_chunk_stride` would cause sequence positions to get skipped."
            )
        self.always_attend_to_first_position = always_attend_to_first_position
        self.first_position_attends_to_all = first_position_attends_to_all
        self.attend_from_chunk_width = attend_from_chunk_width
        self.attend_from_chunk_stride = attend_from_chunk_stride
        self.attend_to_chunk_width = attend_to_chunk_width
        self.attend_to_chunk_stride = attend_to_chunk_stride

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: Tuple[torch.FloatTensor],
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        if not self.local:
            self_outputs = self.self(hidden_states, hidden_states, attention_mask, head_mask, output_attentions)
            attention_output = self_outputs[0]
        else:
            from_seq_length = to_seq_length = hidden_states.shape[1]
            from_tensor = to_tensor = hidden_states

            # Create chunks (windows) that we will attend *from* and then concatenate them.
            from_chunks = []
            if self.first_position_attends_to_all:
                from_chunks.append((0, 1))
                # We must skip this first position so that our output sequence is in the correct order.
                from_start = 1
            else:
                from_start = 0
            for chunk_start in range(from_start, from_seq_length, self.attend_from_chunk_stride):
                chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width)
                from_chunks.append((chunk_start, chunk_end))

            # Determine the chunks (windows) that will attend *to*.
            to_chunks = []
            if self.first_position_attends_to_all:
                to_chunks.append((0, to_seq_length))
            for chunk_start in range(0, to_seq_length, self.attend_to_chunk_stride):
                chunk_end = min(to_seq_length, chunk_start + self.attend_to_chunk_width)
                to_chunks.append((chunk_start, chunk_end))

            if len(from_chunks) != len(to_chunks):
                raise ValueError(
                    f"Expected to have same number of `from_chunks` ({from_chunks}) and `to_chunks` "
                    f"({to_chunks}). Check strides."
                )

            # next, compute attention scores for each pair of windows and concatenate
            attention_output_chunks = []
            attention_probs_chunks = []
            for (from_start, from_end), (to_start, to_end) in zip(from_chunks, to_chunks):
                from_tensor_chunk = from_tensor[:, from_start:from_end, :]
                to_tensor_chunk = to_tensor[:, to_start:to_end, :]
                # `attention_mask`: <float>[batch_size, from_seq, to_seq]
                # `attention_mask_chunk`: <float>[batch_size, from_seq_chunk, to_seq_chunk]
                attention_mask_chunk = attention_mask[:, from_start:from_end, to_start:to_end]
                if self.always_attend_to_first_position:
                    cls_attention_mask = attention_mask[:, from_start:from_end, 0:1]
                    attention_mask_chunk = torch.cat([cls_attention_mask, attention_mask_chunk], dim=2)

                    cls_position = to_tensor[:, 0:1, :]
                    to_tensor_chunk = torch.cat([cls_position, to_tensor_chunk], dim=1)

                attention_outputs_chunk = self.self(
                    from_tensor_chunk, to_tensor_chunk, attention_mask_chunk, head_mask, output_attentions
                )
                attention_output_chunks.append(attention_outputs_chunk[0])
                if output_attentions:
                    attention_probs_chunks.append(attention_outputs_chunk[1])

            attention_output = torch.cat(attention_output_chunks, dim=1)

        attention_output = self.output(attention_output, hidden_states)
        outputs = (attention_output,)
        if not self.local:
            outputs = outputs + self_outputs[1:]  # add attentions if we output them
        else:
            outputs = outputs + tuple(attention_probs_chunks)  # add attentions if we output them
        return outputs


class CanineIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class CanineOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor) -> torch.FloatTensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class CanineLayer(nn.Module):
    def __init__(
        self,
        config,
        local,
        always_attend_to_first_position,
        first_position_attends_to_all,
        attend_from_chunk_width,
        attend_from_chunk_stride,
        attend_to_chunk_width,
        attend_to_chunk_stride,
    ):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = CanineAttention(
            config,
            local,
            always_attend_to_first_position,
            first_position_attends_to_all,
            attend_from_chunk_width,
            attend_from_chunk_stride,
            attend_to_chunk_width,
            attend_to_chunk_stride,
        )
        self.intermediate = CanineIntermediate(config)
        self.output = CanineOutput(config)

    def forward(
        self,
        hidden_states: Tuple[torch.FloatTensor],
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]

        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class CanineEncoder(nn.Module):
    def __init__(
        self,
        config,
        local=False,
        always_attend_to_first_position=False,
        first_position_attends_to_all=False,
        attend_from_chunk_width=128,
        attend_from_chunk_stride=128,
        attend_to_chunk_width=128,
        attend_to_chunk_stride=128,
    ):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [
                CanineLayer(
                    config,
                    local,
                    always_attend_to_first_position,
                    first_position_attends_to_all,
                    attend_from_chunk_width,
                    attend_from_chunk_stride,
                    attend_to_chunk_width,
                    attend_to_chunk_stride,
                )
                for _ in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: Tuple[torch.FloatTensor],
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class CaninePooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class CaninePredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class CanineLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = CaninePredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with
        # `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class CanineOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = CanineLMPredictionHead(config)

    def forward(self, sequence_output: Tuple[torch.Tensor]) -> Tuple[torch.Tensor]:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
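
# Illustrative sketch, not part of the original module: how CanineAttention
# (with local=True) tiles a sequence into block-wise chunks, mirroring the
# loops in its forward pass above. The sequence length is an assumption.
def _demo_local_attention_chunks():
    from_seq_length, chunk_width, chunk_stride = 2048, 128, 128
    from_chunks = []
    for chunk_start in range(0, from_seq_length, chunk_stride):
        chunk_end = min(from_seq_length, chunk_start + chunk_width)
        from_chunks.append((chunk_start, chunk_end))
    # With width == stride == 128, the 2048 positions fall into 16 disjoint
    # blocks: [(0, 128), (128, 256), ..., (1920, 2048)]; each `from` block
    # attends only within its matching `to` block (plus [CLS] when
    # always_attend_to_first_position is set).
    assert len(from_chunks) == from_seq_length // chunk_stride
    return from_chunks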

class CaninePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CanineConfig
    load_tf_weights = load_tf_weights_in_canine
    base_model_prefix = "canine"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CANINE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CANINE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.",
    CANINE_START_DOCSTRING,
)
class CanineModel(CaninePreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        shallow_config = copy.deepcopy(config)
        shallow_config.num_hidden_layers = 1

        self.char_embeddings = CanineEmbeddings(config)
        # shallow/low-dim transformer encoder to get an initial character encoding
        self.initial_char_encoder = CanineEncoder(
            shallow_config,
            local=True,
            always_attend_to_first_position=False,
            first_position_attends_to_all=False,
            attend_from_chunk_width=config.local_transformer_stride,
            attend_from_chunk_stride=config.local_transformer_stride,
            attend_to_chunk_width=config.local_transformer_stride,
            attend_to_chunk_stride=config.local_transformer_stride,
        )
        self.chars_to_molecules = CharactersToMolecules(config)
        # deep transformer encoder
        self.encoder = CanineEncoder(config)
        self.projection = ConvProjection(config)
        # shallow/low-dim transformer encoder to get a final character encoding
        self.final_char_encoder = CanineEncoder(shallow_config)

        self.pooler = CaninePooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _create_3d_attention_mask_from_input_mask(self, from_tensor, to_mask):
        """
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        """
        batch_size, from_seq_length = from_tensor.shape[0], from_tensor.shape[1]

        to_seq_length = to_mask.shape[1]

        to_mask = torch.reshape(to_mask, (batch_size, 1, to_seq_length)).float()

        # We don't assume that `from_tensor` is a mask (although it could be). We don't actually
        # care if we attend *from* padding tokens (only *to* padding tokens), so we create a tensor
        # of all ones.
        broadcast_ones = torch.ones(size=(batch_size, from_seq_length, 1), dtype=torch.float32, device=to_mask.device)

        # Here we broadcast along two dimensions to create the mask.
        mask = broadcast_ones * to_mask

        return mask

    def _downsample_attention_mask(self, char_attention_mask: torch.Tensor, downsampling_rate: int):
        """Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer."""

        # first, make char_attention_mask 3D by adding a channel dim
        batch_size, char_seq_len = char_attention_mask.shape
        poolable_char_mask = torch.reshape(char_attention_mask, (batch_size, 1, char_seq_len))

        # next, apply MaxPool1d to get pooled_molecule_mask of shape (batch_size, 1, mol_seq_len)
        pooled_molecule_mask = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)(
            poolable_char_mask.float()
        )

        # finally, squeeze to get a tensor of shape (batch_size, mol_seq_len)
        molecule_attention_mask = torch.squeeze(pooled_molecule_mask, dim=-1)

        return molecule_attention_mask

    def _repeat_molecules(self, molecules: torch.Tensor, char_seq_length: torch.Tensor) -> torch.Tensor:
        """Repeats molecules to make them the same length as the char sequence."""

        rate = self.config.downsampling_rate

        molecules_without_extra_cls = molecules[:, 1:, :]
        # `repeated`: [batch_size, almost_char_seq_len, molecule_hidden_size]
        repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2)

        # So far, we've repeated the elements sufficient for any `char_seq_length` that's a multiple
        # of `downsampling_rate`. Now we account for the last n elements (n < `downsampling_rate`),
        # i.e. the remainder of floor division, by repeating the last molecule a few extra times.
        last_molecule = molecules[:, -1:, :]
        remainder_length = torch.fmod(torch.tensor(char_seq_length), torch.tensor(rate)).item()
        remainder_repeated = torch.repeat_interleave(
            last_molecule,
            # +1 molecule to compensate for truncation
            repeats=remainder_length + rate,
            dim=-2,
        )

        # `repeated`: [batch_size, char_seq_len, molecule_hidden_size]
        return torch.cat([repeated, remainder_repeated], dim=-2)

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CanineModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CanineModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # extended attention masks for the character-level and molecule-level encoders
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
        molecule_attention_mask = self._downsample_attention_mask(
            attention_mask, downsampling_rate=self.config.downsampling_rate
        )
        extended_molecule_attention_mask = self.get_extended_attention_mask(
            molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1])
        )

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # `input_char_embeddings`: shape (batch_size, char_seq, char_dim)
        input_char_embeddings = self.char_embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        # Contextualize character embeddings using a shallow Transformer with local attention,
        # which requires a 3D attention mask.
        char_attention_mask = self._create_3d_attention_mask_from_input_mask(
            input_ids if input_ids is not None else inputs_embeds, attention_mask
        )
        init_chars_encoder_outputs = self.initial_char_encoder(
            input_char_embeddings,
            attention_mask=char_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        input_char_encoding = init_chars_encoder_outputs.last_hidden_state

        # Downsample chars to molecules: (batch, molecule_seq, molecule_dim)
        init_molecule_encoding = self.chars_to_molecules(input_char_encoding)

        # Deep BERT encoder over molecules:
        # `molecule_sequence_output`: shape (batch_size, mol_seq_len, mol_dim)
        encoder_outputs = self.encoder(
            init_molecule_encoding,
            attention_mask=extended_molecule_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        molecule_sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(molecule_sequence_output) if self.pooler is not None else None

        # Upsample molecules back to characters:
        # `repeated_molecules`: shape (batch_size, char_seq_len, mol_hidden_size)
        repeated_molecules = self._repeat_molecules(molecule_sequence_output, char_seq_length=input_shape[-1])

        # Concatenate contextualized char embeddings and repeated molecules:
        # `concat`: shape (batch_size, char_seq_len, molecule_hidden_size + char_hidden_final)
        concat = torch.cat([input_char_encoding, repeated_molecules], dim=-1)

        # Project the representation dimension back to hidden_size:
        # `sequence_output`: shape (batch_size, char_seq_len, hidden_size)
        sequence_output = self.projection(concat)

        # Apply the final shallow Transformer:
        # `sequence_output`: shape (batch_size, char_seq_len, hidden_size)
        final_chars_encoder_outputs = self.final_char_encoder(
            sequence_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = final_chars_encoder_outputs.last_hidden_state

        if output_hidden_states:
            deep_encoder_hidden_states = encoder_outputs.hidden_states if return_dict else encoder_outputs[1]
            all_hidden_states = (
                all_hidden_states
                + init_chars_encoder_outputs.hidden_states
                + deep_encoder_hidden_states
                + final_chars_encoder_outputs.hidden_states
            )

        if output_attentions:
            deep_encoder_self_attentions = encoder_outputs.attentions if return_dict else encoder_outputs[-1]
            all_self_attentions = (
                all_self_attentions
                + init_chars_encoder_outputs.attentions
                + deep_encoder_self_attentions
                + final_chars_encoder_outputs.attentions
            )

        if not return_dict:
            output = (sequence_output, pooled_output)
            output += tuple(v for v in [all_hidden_states, all_self_attentions] if v is not None)
            return output

        return CanineModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
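
# Illustrative sketch, not part of the original module: the hidden-state length
# bookkeeping documented in CanineModelOutputWithPooling. Running it downloads
# the checkpoint; the input text is an arbitrary example.
def _demo_canine_model_shapes():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
    model = CanineModel.from_pretrained("google/canine-s")
    inputs = tokenizer("hello world", padding="max_length", return_tensors="pt")
    outputs = model(**inputs, output_hidden_states=True)
    seq_len = inputs["input_ids"].shape[1]
    # Character-level output keeps the full (padded) sequence length ...
    assert outputs.last_hidden_state.shape[1] == seq_len
    # ... while the deep encoder's hidden states have length seq_len // downsampling_rate.
    assert any(h.shape[1] == seq_len // model.config.downsampling_rate for h in outputs.hidden_states)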

@add_start_docstrings(
    """
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    CANINE_START_DOCSTRING,
)
class CanineForSequenceClassification(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.canine = CanineModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
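
# Illustrative sketch, not part of the original module: the `problem_type`
# inference performed in the forward pass above when `config.problem_type` is
# unset. The label tensor is an assumed toy example.
def _demo_problem_type_inference():
    num_labels = 3
    labels = torch.tensor([0, 2])  # integer class ids -> single-label classification
    if num_labels == 1:
        problem_type = "regression"
    elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
        problem_type = "single_label_classification"
    else:
        problem_type = "multi_label_classification"
    assert problem_type == "single_label_classification"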

@add_start_docstrings(
    """
    CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    CANINE_START_DOCSTRING,
)
class CanineForMultipleChoice(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.canine = CanineModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
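
# Illustrative sketch, not part of the original module: the
# (batch, choices, seq) -> (batch * choices, seq) flattening and the final
# logit reshape used by CanineForMultipleChoice.forward. Sizes are assumed.
def _demo_multiple_choice_flattening():
    input_ids = torch.zeros(2, 4, 16, dtype=torch.long)  # (batch, num_choices, seq_len)
    num_choices = input_ids.shape[1]
    flat_input_ids = input_ids.view(-1, input_ids.size(-1))
    assert flat_input_ids.shape == (8, 16)  # each choice becomes its own row
    logits = torch.randn(8, 1)  # classifier emits one score per (example, choice)
    reshaped_logits = logits.view(-1, num_choices)
    assert reshaped_logits.shape == (2, 4)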

@add_start_docstrings(
    """
    CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    CANINE_START_DOCSTRING,
)
class CanineForTokenClassification(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.canine = CanineModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather than input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    """
    CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    CANINE_START_DOCSTRING,
)
class CanineForQuestionAnswering(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.canine = CanineModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="Splend1dchan/canine-c-squad",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'nice puppet'",
        expected_loss=8.66,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )