""" PyTorch PVT model."""

import collections
import math
from typing import Iterable, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_pvt import PvtConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "PvtConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "Zetatech/pvt-tiny-224"
_EXPECTED_OUTPUT_SHAPE = [1, 50, 512]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "Zetatech/pvt-tiny-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

PVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Zetatech/pvt-tiny-224",
]


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of any rank, not just 2D ConvNet activations
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class PvtDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)
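

# A minimal sanity-check sketch for the stochastic-depth helper above (the tensor
# shapes here are illustrative, not taken from this file): in eval mode the module
# is an identity, while in training mode each sample is either zeroed out entirely
# or rescaled by 1 / keep_prob, so the expected value of the output is unchanged.
#
#     layer = PvtDropPath(drop_prob=0.1)
#     x = torch.ones(4, 16, 64)
#     layer.eval()
#     assert torch.equal(layer(x), x)
#     layer.train()
#     out = layer(x)  # each of the 4 samples is either all zeros or all 1 / 0.9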


class PvtPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Fconfig
image_size
patch_sizestridenum_channelshidden_size	cls_tokenc           	         s   t    || _t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _	|| _
ttd|rI|d n||| _|r[ttdd|nd | _tj||||d| _tj||jd| _tj|jd| _d S )Nr   r   kernel_sizerC   eps)p)r-   r.   r@   
isinstancecollectionsabcr   rA   rB   rD   num_patchesr   	Parameterr!   randnposition_embeddingszerosrF   Conv2d
projection	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropout)	r/   r@   rA   rB   rC   rD   rE   rF   rO   r0   r(   r)   r.   e   s    

 zPvtPatchEmbeddings.__init__
embeddingsheightwidthr   c                 C   sr   || }|| j j| j j kr| jS |d||ddddd}tj|||fdd}|dd|| ddd}|S )Nr   r   r
      bilinear)sizemode)r@   rA   rR   reshapepermuteFinterpolate)r/   r\   r]   r^   rO   interpolated_embeddingsr(   r(   r)   interpolate_pos_encoding   s   z+PvtPatchEmbeddings.interpolate_pos_encodingpixel_valuesc                 C   s   |j \}}}}|| jkrtd| |}|j ^ }}}|ddd}| |}| jd urc| j|dd}	t	j
|	|fdd}| | jd d dd f ||}
t	j
| jd d d df |
fdd}
n| | j||}
| ||
 }|||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r`   r   r_   dim)r   rD   
ValueErrorrU   flatten	transposerX   rF   expandr!   catri   rR   r[   )r/   rj   
batch_sizerD   r]   r^   patch_embed_r\   rF   rR   r(   r(   r)   r4      s"   



 &
zPvtPatchEmbeddings.forwardF)r7   r8   r9   r:   r   r   intr   boolr.   r!   r<   ri   r   r4   r>   r(   r(   r0   r)   r?   ^   s(    (	r?   c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	PvtSelfOutputr@   rE   c                    s*   t    t||| _t|j| _d S r,   )r-   r.   r   LineardenserY   rZ   r[   )r/   r@   rE   r0   r(   r)   r.      s   
zPvtSelfOutput.__init__r2   r   c                 C   s   |  |}| |}|S r,   )rz   r[   r3   r(   r(   r)   r4      s   

zPvtSelfOutput.forward)
r7   r8   r9   r   rv   r.   r!   r<   r4   r>   r(   r(   r0   r)   rx      s    rx   c                       sp   e Zd ZdZdedededef fddZded	ej	fd
dZ
	ddej	dededed	eej	 f
ddZ  ZS )PvtEfficientSelfAttentionzpEfficient self-attention mechanism with reduction of the sequence [PvT paper](https://arxiv.org/abs/2102.12122).r@   rE   num_attention_headssequences_reduction_ratioc                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _tj	| j| j|j
d| _tj	| j| j|j
d| _tj	| j| j|j
d| _t|j| _|| _|dkrwtj||||d| _tj||jd| _d S d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())biasr   rG   rI   )r-   r.   rE   r|   rm   rv   attention_head_sizeall_head_sizer   ry   qkv_biasquerykeyvaluerY   attention_probs_dropout_probr[   r}   rT   sequence_reductionrV   rW   rX   r/   r@   rE   r|   r}   r0   r(   r)   r.      s,   

z"PvtEfficientSelfAttention.__init__r2   r   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr_   r   r`   r   r
   )rb   r|   r   viewre   )r/   r2   	new_shaper(   r(   r)   transpose_for_scores   s   
z.PvtEfficientSelfAttention.transpose_for_scoresFr]   r^   output_attentionsc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   r`   r_   rk   r
   )r   r   r}   r   re   rd   r   rX   r   r   r!   matmulro   mathsqrtr   r   
functionalsoftmaxr[   
contiguousrb   r   r   )r/   r2   r]   r^   r   query_layerrr   seq_lenrD   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr(   r(   r)   r4      s*   




z!PvtEfficientSelfAttention.forwardru   )r7   r8   r9   r:   r   rv   r;   r.   r!   r<   r   rw   r   r4   r>   r(   r(   r0   r)   r{      s0    
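

# Rough cost intuition for the spatial-reduction attention above (illustrative,
# assuming seq_len = height * width): keys and values are shrunk by a factor of
# sequences_reduction_ratio ** 2 via the strided Conv2d, so the attention matrix is
# (seq_len) x (seq_len / R^2) instead of (seq_len) x (seq_len). This is what makes
# attention over the full-resolution early stages of the pyramid affordable.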


class PvtAttention(nn.Module):
    def __init__(
        self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
    ):
        super().__init__()
        self.self = PvtEfficientSelfAttention(
            config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequences_reduction_ratio=sequences_reduction_ratio,
        )
        self.output = PvtSelfOutput(config, hidden_size=hidden_size)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, height, width, output_attentions)

        attention_output = self.output(self_outputs[0])
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class PvtFFN(nn.Module):
    def __init__(
        self,
        config: PvtConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
    ):
        super().__init__()
        out_features = out_features if out_features is not None else in_features
        self.dense1 = nn.Linear(in_features, hidden_features)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
        self.dense2 = nn.Linear(hidden_features, out_features)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense2(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class PvtLayer(nn.Module):
    def __init__(
        self,
        config: PvtConfig,
        hidden_size: int,
        num_attention_heads: int,
        drop_path: float,
        sequences_reduction_ratio: float,
        mlp_ratio: float,
    ):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.attention = PvtAttention(
            config=config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequences_reduction_ratio=sequences_reduction_ratio,
        )
        self.drop_path = PvtDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        mlp_hidden_size = int(hidden_size * mlp_ratio)
        self.mlp = PvtFFN(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)

    def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
        self_attention_outputs = self.attention(
            hidden_states=self.layer_norm_1(hidden_states),
            height=height,
            width=width,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        attention_output = self.drop_path(attention_output)
        hidden_states = attention_output + hidden_states

        mlp_output = self.mlp(self.layer_norm_2(hidden_states))

        mlp_output = self.drop_path(mlp_output)
        layer_output = hidden_states + mlp_output

        outputs = (layer_output,) + outputs

        return outputs


class PvtEncoder(nn.Module):
    def __init__(self, config: PvtConfig):
        super().__init__()
        self.config = config

        # stochastic depth decay rule
        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()

        # patch embeddings
        embeddings = []
        for i in range(config.num_encoder_blocks):
            embeddings.append(
                PvtPatchEmbeddings(
                    config=config,
                    image_size=config.image_size if i == 0 else self.config.image_size // (2 ** (i + 1)),
                    patch_size=config.patch_sizes[i],
                    stride=config.strides[i],
                    num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
                    hidden_size=config.hidden_sizes[i],
                    cls_token=i == config.num_encoder_blocks - 1,
                )
            )
        self.patch_embeddings = nn.ModuleList(embeddings)

        # Transformer blocks
        blocks = []
        cur = 0
        for i in range(config.num_encoder_blocks):
            # each block consists of layers
            layers = []
            if i != 0:
                cur += config.depths[i - 1]
            for j in range(config.depths[i]):
                layers.append(
                    PvtLayer(
                        config=config,
                        hidden_size=config.hidden_sizes[i],
                        num_attention_heads=config.num_attention_heads[i],
                        drop_path=drop_path_decays[cur + j],
                        sequences_reduction_ratio=config.sequence_reduction_ratios[i],
                        mlp_ratio=config.mlp_ratios[i],
                    )
                )
            blocks.append(nn.ModuleList(layers))
        self.block = nn.ModuleList(blocks)

        # Layer norm applied to the final hidden states
        self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        batch_size = pixel_values.shape[0]
        num_blocks = len(self.block)
        hidden_states = pixel_values
        for idx, (embedding_layer, block_layer) in enumerate(zip(self.patch_embeddings, self.block)):
            # first, obtain patch embeddings
            hidden_states, height, width = embedding_layer(hidden_states)
            # second, send embeddings through blocks
            for block in block_layer:
                layer_outputs = block(hidden_states, height, width, output_attentions)
                hidden_states = layer_outputs[0]
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            if idx != num_blocks - 1:
                # fold the sequence back into a feature map for the next stage's patch embedding
                hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
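

# Stage-by-stage resolution sketch (illustrative, assuming pvt-tiny defaults of four
# stages with strides (4, 2, 2, 2) on a 224x224 image): the feature map shrinks
# 56x56 -> 28x28 -> 14x14 -> 7x7 while the channel width grows, and only the last
# stage prepends a [CLS] token, giving the 7 * 7 + 1 = 50 tokens of
# _EXPECTED_OUTPUT_SHAPE above.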


class PvtPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = PvtConfig
    base_model_prefix = "pvt"
    main_input_name = "pixel_values"

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data, mean=0.0, std=self.config.initializer_range
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, PvtPatchEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data, mean=0.0, std=self.config.initializer_range
            )
            if module.cls_token is not None:
                module.cls_token.data = nn.init.trunc_normal_(
                    module.cls_token.data, mean=0.0, std=self.config.initializer_range
                )


PVT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~PvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

PVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`PvtImageProcessor.__call__`]
            for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Pvt encoder outputting raw hidden-states without any specific head on top.",
    PVT_START_DOCSTRING,
)
class PvtModel(PvtPreTrainedModel):
    def __init__(self, config: PvtConfig):
        super().__init__(config)
        self.config = config

        # hierarchical Transformer encoder
        self.encoder = PvtEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )r/   heads_to_pruner   r   r(   r(   r)   _prune_heads  s   zPvtModel._prune_heads%(batch_size, channels, height, width)vision)
checkpointoutput_typer   modalityexpected_outputNrj   r   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nrj   r   r   r   r   r   r   )r@   r   r   use_return_dictr   r   r2   r   )r/   rj   r   r   r   encoder_outputssequence_outputr(   r(   r)   r4     s$   zPvtModel.forward)NNN)r7   r8   r9   r   r.   r   r   PVT_INPUTS_DOCSTRINGr5   r   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr!   r   r   rw   r   r   r4   r>   r(   r(   r0   r)   r     s2    


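

# A minimal usage sketch for the bare encoder (assumes the `Zetatech/pvt-tiny-224`
# checkpoint referenced above and a PIL `image` are available; see
# `PvtImageProcessor.__call__` for preprocessing details):
#
#     from transformers import AutoImageProcessor, PvtModel
#
#     processor = AutoImageProcessor.from_pretrained("Zetatech/pvt-tiny-224")
#     model = PvtModel.from_pretrained("Zetatech/pvt-tiny-224")
#     inputs = processor(images=image, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     # outputs.last_hidden_state has shape (batch_size, 50, 512) for 224x224 inputs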


@add_start_docstrings(
    """
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    PVT_START_DOCSTRING,
)
class PvtForImageClassification(PvtPreTrainedModel):
    def __init__(self, config: PvtConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.pvt = PvtModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(PVT_INPUTS_DOCSTRING.format("(batch_size, channels, height, width)"))
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor],
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.pvt(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # classify using the final hidden state of the [CLS] token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )