o
    h=                     @   s  d Z ddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z% e"&e'Z(dZ)dZ*g dZ+dZ,dZ-dgZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3G dd dej/Z4G dd dej/Z5G dd  d ej/Z6G d!d" d"ej/Z7G d#d$ d$ej/Z8G d%d& d&eZ9d'Z:d(Z;e d)e:G d*d+ d+e9Z<G d,d- d-ej/Z=e d.e:G d/d0 d0e9Z>e d1e:G d2d3 d3e9Z?dS )4z PyTorch ViT model.    N)DictListOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	ViTConfigr   z!google/vit-base-patch16-224-in21k)r      i   zgoogle/vit-base-patch16-224zEgyptian catc                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdeej dedejfddZ  ZS )ViTEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                    s   t    ttdd|j| _|rttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|| _d S )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropoutr   )selfr   r   r,   	__class__ Z/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/vit/modeling_vit.pyr"   G   s   
 

zViTEmbeddings.__init__
embeddingsheightwidthc                 C   sJ  |j d d }| jj d d }||kr||kr| jS | jdddf }| jddddf }|j d }|| jj }	|| jj }
|	d |
d }	}
|dtt|tt||}|dddd}t	j
j||	t| |
t| fdd	d
}t|	|j d krt|
|j d ksJ |dddddd|}tj|d|fddS )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   Nr   g?r      bicubicF)scale_factormodealign_cornersdim)shaper-   r   
patch_sizereshapeintmathsqrtpermuter   
functionalinterpolateviewr$   cat	unsqueeze)r1   r6   r7   r8   r,   num_positionsclass_pos_embedpatch_pos_embedrA   h0w0r4   r4   r5   interpolate_pos_encodingR   s*   	
$(z&ViTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posrS   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }| j|dd}tj||fdd}|rN|| 	||| }n|| j
 }| |}|S )N)rS   r   r9         ?r@   )rB   r+   r)   expandrM   type_asr'   r$   rL   rS   r-   r0   )r1   rT   rU   rS   
batch_sizenum_channelsr7   r8   r6   
seq_lengthmask_tokensmask
cls_tokensr4   r4   r5   forwards   s   


zViTEmbeddings.forwardFNF)__name__
__module____qualname____doc__r   boolr"   r$   TensorrE   rS   r   
BoolTensorr_   __classcell__r4   r4   r2   r5   r   B   s    $r   c                       s<   e Zd ZdZ fddZd
dejdedejfdd	Z  Z	S )r*   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r!   r"   
image_sizerC   rZ   r&   
isinstancecollectionsabcIterabler,   r   Conv2d
projection)r1   r   rl   rC   rZ   r&   r,   r2   r4   r5   r"      s   
 zViTPatchEmbeddings.__init__FrT   rS   r    c              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r:   )rB   rZ   
ValueErrorrl   rr   flatten	transpose)r1   rT   rS   rY   rZ   r7   r8   r6   r4   r4   r5   r_      s(   
zViTPatchEmbeddings.forwardr`   )
rb   rc   rd   re   r"   r$   rg   rf   r_   ri   r4   r4   r2   r5   r*      s    $r*   c                
       sv   e Zd Zdeddf fddZdejdejfddZ		dd
eej de	de
eejejf eej f fddZ  ZS )ViTSelfAttentionr   r    Nc                    s   t    |j|j dkr t|ds td|jf d|j d|j| _t|j|j | _| j| j | _t	j
|j| j|jd| _t	j
|j| j|jd| _t	j
|j| j|jd| _t	|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads rs   )bias)r!   r"   r&   num_attention_headshasattrru   rE   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluer.   attention_probs_dropout_probr0   r1   r   r2   r4   r5   r"      s   
zViTSelfAttention.__init__xc                 C   s6   |  d d | j| jf }||}|ddddS )Nr9   r   r:   r   r   )sizer{   r}   rK   rH   )r1   r   new_x_shaper4   r4   r5   transpose_for_scores   s   
z%ViTSelfAttention.transpose_for_scoresF	head_maskoutput_attentionsc                 C   s   |  |}| | |}| | |}| |}t||dd}|t| j	 }t
jj|dd}	| |	}	|d urA|	| }	t|	|}
|
dddd }
|
 d d | jf }|
|}
|rj|
|	f}|S |
f}|S )Nr9   r?   r@   r   r:   r   r   )r   r   r   r   r$   matmulrw   rF   rG   r}   r   rI   softmaxr0   rH   
contiguousr   r~   rK   )r1   hidden_statesr   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr4   r4   r5   r_      s$   



zViTSelfAttention.forwardra   )rb   rc   rd   r   r"   r$   rg   r   r   rf   r   r   r_   ri   r4   r4   r2   r5   rx      s    rx   c                       sF   e Zd ZdZdeddf fddZdejdejdejfd	d
Z  Z	S )ViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   r    Nc                    s.   t    t|j|j| _t|j| _d S N)	r!   r"   r   r   r&   denser.   r/   r0   r   r2   r4   r5   r"         
zViTSelfOutput.__init__r   input_tensorc                 C      |  |}| |}|S r   r   r0   r1   r   r   r4   r4   r5   r_         

zViTSelfOutput.forward)
rb   rc   rd   re   r   r"   r$   rg   r_   ri   r4   r4   r2   r5   r      s    $r   c                       s~   e Zd Zdeddf fddZdee ddfddZ			dd
ej	de
ej	 dedeeej	ej	f eej	 f fddZ  ZS )ViTAttentionr   r    Nc                    s*   t    t|| _t|| _t | _d S r   )r!   r"   rx   	attentionr   outputsetpruned_headsr   r2   r4   r5   r"     s   


zViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r@   )lenr   r   r{   r}   r   r   r   r   r   r   r   r~   union)r1   r   indexr4   r4   r5   prune_heads  s   zViTAttention.prune_headsFr   r   r   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r1   r   r   r   self_outputsattention_outputr   r4   r4   r5   r_      s   zViTAttention.forwardra   )rb   rc   rd   r   r"   r   rE   r   r$   rg   r   rf   r   r   r_   ri   r4   r4   r2   r5   r     s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	ViTIntermediater   r    Nc                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r!   r"   r   r   r&   intermediate_sizer   rm   
hidden_actstrr   intermediate_act_fnr   r2   r4   r5   r"   /  s
   
zViTIntermediate.__init__r   c                 C   r   r   )r   r   )r1   r   r4   r4   r5   r_   7  r   zViTIntermediate.forward	rb   rc   rd   r   r"   r$   rg   r_   ri   r4   r4   r2   r5   r   .  s    r   c                       sB   e Zd Zdeddf fddZdejdejdejfdd	Z  ZS )
	ViTOutputr   r    Nc                    s.   t    t|j|j| _t|j| _	d S r   )
r!   r"   r   r   r   r&   r   r.   r/   r0   r   r2   r4   r5   r"   ?  r   zViTOutput.__init__r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r4   r4   r5   r_   D  s   

zViTOutput.forwardr   r4   r4   r2   r5   r   >  s    $r   c                       sl   e Zd ZdZdeddf fddZ		ddejd	eej d
e	de
eejejf eej f fddZ  ZS )ViTLayerz?This corresponds to the Block class in the timm implementation.r   r    Nc                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r!   r"   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr&   layer_norm_epslayernorm_beforelayernorm_afterr   r2   r4   r5   r"   P  s   



zViTLayer.__init__Fr   r   r   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )N)r   r   r   )r   r   r   r   r   )r1   r   r   r   self_attention_outputsr   r   layer_outputr4   r4   r5   r_   Z  s   


zViTLayer.forwardra   )rb   rc   rd   re   r   r"   r$   rg   r   rf   r   r   r_   ri   r4   r4   r2   r5   r   M  s    r   c                       sb   e Zd Zdeddf fddZ				ddejd	eej d
ededede	e
ef fddZ  ZS )
ViTEncoderr   r    Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r4   )r   ).0_r   r4   r5   
<listcomp>{  s    z'ViTEncoder.__init__.<locals>.<listcomp>F)	r!   r"   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r2   r   r5   r"   x  s   
 
zViTEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]8\}}	|r||f }|d ur$|| nd }
| jr6| jr6| |	j||
|}n|	||
|}|d }|rI||d f }q|rQ||f }|s_tdd |||fD S t|||dS )Nr4   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r4   )r   vr4   r4   r5   	<genexpr>  s    z%ViTEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater   r   training_gradient_checkpointing_func__call__tupler   )r1   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputsr4   r4   r5   r_   ~  s6   

zViTEncoder.forward)NFFT)rb   rc   rd   r   r"   r$   rg   r   rf   r   r   r   r_   ri   r4   r4   r2   r5   r   w  s&    	
r   c                   @   sJ   e Zd ZdZeZdZdZdZddgZ	de
ejejejf dd	fd
dZd	S )ViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    vitrT   Tr   r   moduler    Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |tr|tjj|jjt	j
d| jjd|jj|j_tjj|jjt	j
d| jjd|jj|j_dS dS )zInitialize the weightsg        )meanstdNrV   )rm   r   r   rq   inittrunc_normal_weightdatator$   float32r   initializer_rangedtyperz   zero_r   fill_r   r-   r'   )r1   r   r4   r4   r5   _init_weights  s8   





z ViTPreTrainedModel._init_weights)rb   rc   rd   re   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r   rq   r   r   r4   r4   r4   r5   r     s    &r   aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare ViT Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd Zddededef fddZdefd	d
Zdee	e
e	 f ddfddZeeeeeeded							ddeej deej deej dee dee dee dee deeef fddZ  ZS )ViTModelTFr   add_pooling_layerr   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd | _|   d S )N)r   r   )r!   r"   r   r   r6   r   encoderr   r   r&   r   	layernorm	ViTPoolerpooler	post_init)r1   r   r   r   r2   r4   r5   r"     s   
zViTModel.__init__r    c                 C   s   | j jS r   )r6   r+   )r1   r4   r4   r5   get_input_embeddings  s   zViTModel.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r1   r   r   r   r4   r4   r5   _prune_heads
  s   zViTModel._prune_headsvision)
checkpointoutput_typer   modalityexpected_outputrT   rU   r   r   r   rS   r   c                 C   s
  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| || j j}| jjj	j
j}|j|kr?||}| j|||d}	| j|	||||d}
|
d }| |}| jdurd| |nd}|s{|durp||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rU   rS   )r   r   r   r   r   r   )r   pooler_outputr   r   )r   r   r   use_return_dictru   get_head_maskr   r6   r+   rr   r   r   r   r   r   r   r   r   r   )r1   rT   rU   r   r   r   rS   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputhead_outputsr4   r4   r5   r_     s@   


zViTModel.forward)TFNNNNNNN)rb   rc   rd   r   rf   r"   r*   r   r   rE   r   r   r   VIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r$   rg   rh   r   r   r_   ri   r4   r4   r2   r5   r     sH    	
	r   c                       s*   e Zd Zdef fddZdd Z  ZS )r   r   c                    s*   t    t|j|j| _t | _d S r   )r!   r"   r   r   r&   r   Tanh
activationr   r2   r4   r5   r"   Y  s   
zViTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )r1   r   first_token_tensorr  r4   r4   r5   r_   ^  s   

zViTPooler.forward)rb   rc   rd   r   r"   r_   ri   r4   r4   r2   r5   r   X  s    r   aV  ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                       s   e Zd Zdeddf fddZeeeee	d							dde
ej de
ej d	e
ej d
e
e de
e de
e de
e deeef fddZ  ZS )ViTForMaskedImageModelingr   r    Nc                    sX   t  | t|ddd| _ttj|j|jd |j	 ddt
|j| _|   d S )NFT)r   r   r:   r   )in_channelsout_channelsrj   )r!   r"   r   r   r   
Sequentialrq   r&   encoder_striderZ   PixelShuffledecoderr   r   r2   r4   r5   r"   t  s   

z"ViTForMaskedImageModeling.__init__)r   r   rT   rU   r   r   r   rS   r   c              	   C   s  |dur|n| j j}|dur%| j j| j jkr%td| j j d| j j d| j|||||||d}|d }	|	ddddf }	|	j\}
}}t|d  }}|		dd	d
|
|||}	| |	}d}|dur| j j| j j }|
d
||}|| j jd| j jd	d }tjj||dd}||  | d  | j j }|s|f|dd  }|dur|f| S |S t|||j|jdS )a=  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```NzWhen `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that the reconstructed image has the same dimensions as the input. Got `patch_size` = z and `encoder_stride` = rs   )rU   r   r   r   rS   r   r   r   g      ?r:   r9   none)	reductiongh㈵>)lossreconstructionr   r   )r   r  rC   r  ru   r   rB   rF   floorrH   rD   r  rl   repeat_interleaverM   r   r   rI   l1_losssumrZ   r   r   r   )r1   rT   rU   r   r   r   rS   r   r   r  rY   sequence_lengthrZ   r7   r8   reconstructed_pixel_valuesmasked_im_lossr   r]   reconstruction_lossr   r4   r4   r5   r_     sX   )

 z!ViTForMaskedImageModeling.forwardr
  )rb   rc   rd   r   r"   r   r  r   r   r  r   r$   rg   rh   rf   r   r   r_   ri   r4   r4   r2   r5   r  g  s8    

	r  a  
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       s   e Zd Zdeddf fddZeeeee	e
ed							ddeej deej d	eej d
ee dee dee dee deee	f fddZ  ZS )ViTForImageClassificationr   r    Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r!   r"   
num_labelsr   r   r   r   r&   Identity
classifierr   r   r2   r4   r5   r"     s
   $z"ViTForImageClassification.__init__)r   r   r   r   rT   r   labelsr   r   rS   r   c                 C   s  |dur|n| j j}| j||||||d}|d }	| |	dddddf }
d}|dur||
j}| j jdu r]| jdkrCd| j _n| jdkrY|jt	j
ksT|jt	jkrYd| j _nd| j _| j jdkr{t }| jdkru||
 | }n+||
|}n%| j jdkrt }||
d| j|d}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   rS   r   r   r   
regressionsingle_label_classificationmulti_label_classificationr9   )r  logitsr   r   )r   r  r   r(  r   deviceproblem_typer&  r   r$   longrE   r   squeezer
   rK   r	   r   r   r   )r1   rT   r   r)  r   r   rS   r   r   r  r-  r  loss_fctr   r4   r4   r5   r_     sP   	

"


z!ViTForImageClassification.forwardr
  )rb   rc   rd   r   r"   r   r  r   _IMAGE_CLASS_CHECKPOINTr   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r$   rg   rf   r   r   r_   ri   r4   r4   r2   r5   r%    sB    
	r%  )@re   collections.abcrn   rF   typingr   r   r   r   r   r   r$   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   configuration_vitr   
get_loggerrb   loggerr  r  r  r3  r4  !VIT_PRETRAINED_MODEL_ARCHIVE_LISTModuler   r*   rx   r   r   r   r   r   r   r   VIT_START_DOCSTRINGr  r   r   r  r%  r4   r4   r4   r5   <module>   sd    
P'<'*3'_	r