""" PyTorch CvT model."""


import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import logging
from .configuration_cvt import CvtConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "CvtConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/cvt-13"
_EXPECTED_OUTPUT_SHAPE = [1, 384, 14, 14]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/cvt-13"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


CVT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/cvt-13",
    "microsoft/cvt-13-384",
    "microsoft/cvt-13-384-22k",
    "microsoft/cvt-21",
    "microsoft/cvt-21-384",
    "microsoft/cvt-21-384-22k",
    # See all Cvt models at https://huggingface.co/models?filter=cvt
]


@dataclass
class BaseModelOutputWithCLSToken(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    """

    last_hidden_state: torch.FloatTensor = None
    cls_token_value: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output
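
# Illustrative usage sketch (not part of the upstream file): stochastic depth drops the whole
# residual branch of a sample with probability `drop_prob` and rescales the survivors by
# 1 / keep_prob, so the expected value of the activations is unchanged. At inference time the
# function is the identity.
#
#     >>> x = torch.ones(2, 3, 4)
#     >>> drop_path(x, drop_prob=0.5, training=True).shape  # per-sample masking, shape kept
#     torch.Size([2, 3, 4])
#     >>> drop_path(x, drop_prob=0.5, training=False) is x  # identity when not training
#     True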


class CvtDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class CvtEmbeddings(nn.Module):
    """
    Construct the CvT embeddings.
    """

    def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
        super().__init__()
        self.convolution_embeddings = CvtConvEmbeddings(
            patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, pixel_values):
        hidden_state = self.convolution_embeddings(pixel_values)
        hidden_state = self.dropout(hidden_state)
        return hidden_state


class CvtConvEmbeddings(nn.Module):
    """
    Image to Conv Embedding.
    """

    def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
        super().__init__()
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.patch_size = patch_size
        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
        self.normalization = nn.LayerNorm(embed_dim)

    def forward(self, pixel_values):
        pixel_values = self.projection(pixel_values)
        batch_size, num_channels, height, width = pixel_values.shape
        hidden_size = height * width
        # rearrange "b c h w -> b (h w) c"
        pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
        if self.normalization:
            pixel_values = self.normalization(pixel_values)
        # rearrange "b (h w) c -> b c h w"
        pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
        return pixel_values


class CvtSelfAttentionConvProjection(nn.Module):
    def __init__(self, embed_dim, kernel_size, padding, stride):
        super().__init__()
        self.convolution = nn.Conv2d(
            embed_dim,
            embed_dim,
            kernel_size=kernel_size,
            padding=padding,
            stride=stride,
            bias=False,
            groups=embed_dim,
        )
        self.normalization = nn.BatchNorm2d(embed_dim)

    def forward(self, hidden_state):
        hidden_state = self.convolution(hidden_state)
        hidden_state = self.normalization(hidden_state)
        return hidden_state


class CvtSelfAttentionLinearProjection(nn.Module):
    def forward(self, hidden_state):
        batch_size, num_channels, height, width = hidden_state.shape
        hidden_size = height * width
        # rearrange "b c h w -> b (h w) c"
        hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
        return hidden_state


class CvtSelfAttentionProjection(nn.Module):
    def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
        super().__init__()
        if projection_method == "dw_bn":
            self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
        self.linear_projection = CvtSelfAttentionLinearProjection()

    def forward(self, hidden_state):
        hidden_state = self.convolution_projection(hidden_state)
        hidden_state = self.linear_projection(hidden_state)
        return hidden_state
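
# Illustrative shape sketch (not part of the upstream file): with the default "dw_bn" method,
# the projection is a depthwise convolution + batch norm, so a stride of 2 shrinks the
# key/value token grid by 4x before attention. The sizes below are assumptions for the demo.
#
#     >>> proj = CvtSelfAttentionProjection(embed_dim=64, kernel_size=3, padding=1, stride=2)
#     >>> proj(torch.rand(1, 64, 14, 14)).shape  # 14x14 grid -> 7x7 grid -> 49 tokens
#     torch.Size([1, 49, 64])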


class CvtSelfAttention(nn.Module):
    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        with_cls_token=True,
        **kwargs,
    ):
        super().__init__()
        self.scale = embed_dim**-0.5
        self.with_cls_token = with_cls_token
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.convolution_projection_query = CvtSelfAttentionProjection(
            embed_dim,
            kernel_size,
            padding_q,
            stride_q,
            projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
        )
        self.convolution_projection_key = CvtSelfAttentionProjection(
            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
        )
        self.convolution_projection_value = CvtSelfAttentionProjection(
            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
        )

        self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
        self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
        self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)

        self.dropout = nn.Dropout(attention_drop_rate)

    def rearrange_for_multi_head_attention(self, hidden_state):
        batch_size, hidden_size, _ = hidden_state.shape
        head_dim = self.embed_dim // self.num_heads
        # rearrange "b t (h d) -> b h t d"
        return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)

    def forward(self, hidden_state, height, width):
        if self.with_cls_token:
            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
        batch_size, hidden_size, num_channels = hidden_state.shape
        # rearrange "b (h w) c -> b c h w"
        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)

        key = self.convolution_projection_key(hidden_state)
        query = self.convolution_projection_query(hidden_state)
        value = self.convolution_projection_value(hidden_state)

        if self.with_cls_token:
            query = torch.cat((cls_token, query), dim=1)
            key = torch.cat((cls_token, key), dim=1)
            value = torch.cat((cls_token, value), dim=1)

        head_dim = self.embed_dim // self.num_heads

        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
        value = self.rearrange_for_multi_head_attention(self.projection_value(value))

        attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
        attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
        # rearrange "b h t d -> b t (h d)"
        _, _, hidden_size, _ = context.shape
        context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
        return context


class CvtSelfOutput(nn.Module):
    """
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, embed_dim, drop_rate):
        super().__init__()
        self.dense = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, hidden_state, input_tensor):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.dropout(hidden_state)
        return hidden_state


class CvtAttention(nn.Module):
    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        drop_rate,
        with_cls_token=True,
    ):
        super().__init__()
        self.attention = CvtSelfAttention(
            num_heads,
            embed_dim,
            kernel_size,
            padding_q,
            padding_kv,
            stride_q,
            stride_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            with_cls_token,
        )
        self.output = CvtSelfOutput(embed_dim, drop_rate)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_state, height, width):
        self_output = self.attention(hidden_state, height, width)
        attention_output = self.output(self_output, hidden_state)
        return attention_output


class CvtIntermediate(nn.Module):
    def __init__(self, embed_dim, mlp_ratio):
        super().__init__()
        self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
        self.activation = nn.GELU()

    def forward(self, hidden_state):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.activation(hidden_state)
        return hidden_state


class CvtOutput(nn.Module):
    def __init__(self, embed_dim, mlp_ratio, drop_rate):
        super().__init__()
        self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, hidden_state, input_tensor):
        hidden_state = self.dense(hidden_state)
        hidden_state = self.dropout(hidden_state)
        hidden_state = hidden_state + input_tensor
        return hidden_state


class CvtLayer(nn.Module):
    """
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    """

    def __init__(
        self,
        num_heads,
        embed_dim,
        kernel_size,
        padding_q,
        padding_kv,
        stride_q,
        stride_kv,
        qkv_projection_method,
        qkv_bias,
        attention_drop_rate,
        drop_rate,
        mlp_ratio,
        drop_path_rate,
        with_cls_token=True,
    ):
        super().__init__()
        self.attention = CvtAttention(
            num_heads,
            embed_dim,
            kernel_size,
            padding_q,
            padding_kv,
            stride_q,
            stride_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            drop_rate,
            with_cls_token,
        )

        self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
        self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
        self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_before = nn.LayerNorm(embed_dim)
        self.layernorm_after = nn.LayerNorm(embed_dim)

    def forward(self, hidden_state, height, width):
        self_attention_output = self.attention(
            self.layernorm_before(hidden_state),  # in Cvt, layernorm is applied before self-attention
            height,
            width,
        )
        attention_output = self_attention_output
        attention_output = self.drop_path(attention_output)

        # first residual connection
        hidden_state = attention_output + hidden_state

        # in Cvt, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_state)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_state)
        layer_output = self.drop_path(layer_output)
        return layer_output


class CvtStage(nn.Module):
    def __init__(self, config, stage):
        super().__init__()
        self.config = config
        self.stage = stage
        if self.config.cls_token[self.stage]:
            self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))

        self.embedding = CvtEmbeddings(
            patch_size=config.patch_sizes[self.stage],
            stride=config.patch_stride[self.stage],
            num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
            embed_dim=config.embed_dim[self.stage],
            padding=config.patch_padding[self.stage],
            dropout_rate=config.drop_rate[self.stage],
        )

        drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]

        self.layers = nn.Sequential(
            *[
                CvtLayer(
                    num_heads=config.num_heads[self.stage],
                    embed_dim=config.embed_dim[self.stage],
                    kernel_size=config.kernel_qkv[self.stage],
                    padding_q=config.padding_q[self.stage],
                    padding_kv=config.padding_kv[self.stage],
                    stride_kv=config.stride_kv[self.stage],
                    stride_q=config.stride_q[self.stage],
                    qkv_projection_method=config.qkv_projection_method[self.stage],
                    qkv_bias=config.qkv_bias[self.stage],
                    attention_drop_rate=config.attention_drop_rate[self.stage],
                    drop_rate=config.drop_rate[self.stage],
                    drop_path_rate=drop_path_rates[self.stage],
                    mlp_ratio=config.mlp_ratio[self.stage],
                    with_cls_token=config.cls_token[self.stage],
                )
                for _ in range(config.depth[self.stage])
            ]
        )

    def forward(self, hidden_state):
        cls_token = None
        hidden_state = self.embedding(hidden_state)
        batch_size, num_channels, height, width = hidden_state.shape
        # rearrange "b c h w -> b (h w) c"
        hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
        if self.config.cls_token[self.stage]:
            cls_token = self.cls_token.expand(batch_size, -1, -1)
            hidden_state = torch.cat((cls_token, hidden_state), dim=1)

        for layer in self.layers:
            layer_outputs = layer(hidden_state, height, width)
            hidden_state = layer_outputs

        if self.config.cls_token[self.stage]:
            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
        return hidden_state, cls_token
zCvtStage.forwardrm   r#   r#   r:   r$   r     s    (r   c                       s&   e Zd Z fddZdddZ  ZS )
CvtEncoderc                    sF   t    || _tg | _tt|jD ]}| j	t
|| qd S r6   )r7   r8   r   r   
ModuleListstagesr   r   r   appendr   )r9   r   	stage_idxr:   r#   r$   r8     s   
zCvtEncoder.__init__FTc           	      C   sl   |rdnd }|}d }t | jD ]\}}||\}}|r ||f }q|s/tdd |||fD S t|||dS )Nr#   c                 s   s    | ]	}|d ur|V  qd S r6   r#   )r   vr#   r#   r$   	<genexpr>  s    z%CvtEncoder.forward.<locals>.<genexpr>r   r   r   )	enumerater   tupler   )	r9   rO   output_hidden_statesreturn_dictall_hidden_statesrP   r   r   stage_moduler#   r#   r$   r<     s   
zCvtEncoder.forward)FTrm   r#   r#   r:   r$   r     s    r   c                   @   s$   e Zd ZdZeZdZdZdd ZdS )CvtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CvtConfig
    base_model_prefix = "cvt"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data, mean=0.0, std=self.config.initializer_range
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, CvtStage):
            if self.config.cls_token[module.stage]:
                module.cls_token.data = nn.init.trunc_normal_(
                    torch.zeros(1, 1, self.config.embed_dim[-1]), mean=0.0, std=self.config.initializer_range
                )


CVT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    CVT_START_DOCSTRING,
)
class CvtModel(CvtPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.encoder = CvtEncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithCLSToken,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithCLSToken]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutputWithCLSToken(
            last_hidden_state=sequence_output,
            cls_token_value=encoder_outputs.cls_token_value,
            hidden_states=encoder_outputs.hidden_states,
        )


@add_start_docstrings(
    """
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    CVT_START_DOCSTRING,
)
class CvtForImageClassification(CvtPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.cvt = CvtModel(config, add_pooling_layer=False)
        self.layernorm = nn.LayerNorm(config.embed_dim[-1])
        # Classifier head
        self.classifier = (
            nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CVT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.cvt(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        cls_token = outputs[1]
        if self.config.cls_token[-1]:
            sequence_output = self.layernorm(cls_token)
        else:
            batch_size, num_channels, height, width = sequence_output.shape
            # rearrange "b c h w -> b (h w) c"
            sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
            sequence_output = self.layernorm(sequence_output)

        sequence_output_mean = sequence_output.mean(dim=1)
        logits = self.classifier(sequence_output_mean)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)