""" PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VitDetConfig"

VITDET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/vit-det-base",
]


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # initialize the absolute positional embedding with the pretraining image size
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if size != height or size != width:
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add the absolute position embeddings, resized to the current feature map
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings


def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # interpolate the relative position embeddings if needed
    if rel_pos.shape[0] != max_rel_dist:
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # scale the coordinates with the shorter length if query and key sizes differ
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`]):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r_   r7   r>   r%   einsumview)attnqueries	rel_pos_h	rel_pos_wrW   rX   queries_heightqueries_widthkeys_height
keys_widthrelative_heightrelative_width
batch_size_dimr_qrelative_weightr/   r/   r0   !add_decomposed_relative_positions   s      rq   c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
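

# Illustrative sketch (not part of the original module): a shape walk-through of the decomposed
# relative-position helpers above. All tensor sizes below are arbitrary example values.
def _example_decomposed_rel_pos_shapes():
    """Sanity-check the shapes produced by `get_rel_pos` and `add_decomposed_relative_positions`."""
    batch_size, height, width, dim = 2, 4, 4, 8
    queries = torch.randn(batch_size, height * width, dim)
    attn = torch.zeros(batch_size, height * width, height * width)
    # one embedding per relative offset in [-(height - 1), height - 1] and [-(width - 1), width - 1]
    rel_pos_h = torch.randn(2 * height - 1, dim)
    rel_pos_w = torch.randn(2 * width - 1, dim)
    attn = add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, (height, width), (height, width))
    assert attn.shape == (batch_size, height * width, height * width)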


class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with tensors of any rank, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x


def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (patch_height, patch_width): padded height and width before partition
    """
    batch_size, height, width, num_channels = hidden_state.shape

    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    if pad_height > 0 or pad_width > 0:
        hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
    patch_height, patch_width = height + pad_height, width + pad_width

    hidden_state = hidden_state.view(
        batch_size, patch_height // window_size, window_size, patch_width // window_size, window_size, num_channels
    )
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows, (patch_height, patch_width)


def window_unpartition(windows, window_size, pad_height_width, height_width):
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`Tuple[int]`):
            Padded height and width (patch_height, patch_width).
        height_width (`Tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    patch_height, patch_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (patch_height * patch_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, patch_height // window_size, patch_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, patch_height, patch_width, -1)

    if patch_height > height or patch_width > width:
        hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state


class VitDetLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size
        input_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # use a residual block with the bottleneck channels set to dim // 2
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)

        shortcut = hidden_states

        hidden_states = self.norm1(hidden_states)

        # window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        # first residual connection
        hidden_states = shortcut + self.drop_path(hidden_states)

        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs
ededede	e
ef fddZ  ZS )VitDetEncoderr+   rH   Nc              	      s   t    || _|j}dd td|j|D }g }t|D ]}|t	||| ||j
v r/|jnd||jv d qt|| _d| _d S )Nc                 S   s   g | ]}|  qS r/   )item).0r   r/   r/   r0   
<listcomp>  s    z*VitDetEncoder.__init__.<locals>.<listcomp>r   )r   r   r   F)r   r   r+   num_hidden_layersr%   linspacer   rangeappendr   window_block_indicesr   residual_block_indicesr   
ModuleListr   gradient_checkpointing)r*   r+   depthr   layersir-   r/   r0   r     s    
	
zVitDetEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]8\}}	|r||f }|d ur$|| nd }
| jr6| jr6| |	j||
|}n|	||
|}|d }|rI||d f }q|rQ||f }|s_tdd |||fD S t|||dS )Nr/   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r/   )r   vr/   r/   r0   	<genexpr>J  s    z(VitDetEncoder.forward.<locals>.<genexpr>last_hidden_stater   
attentions)	enumerater   r   r   _gradient_checkpointing_func__call__tupler   )r*   r   r   r   r   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputsr/   r/   r0   rK   &  s6   

zVitDetEncoder.forward)NFFT)rL   rM   rN   r   r   r%   rP   r   r   r   r   r   rK   rQ   r/   r/   r-   r0   r     s&    
r   modulec                 C   s6   t jj| jddd | jdurt j| jd dS dS )a  


def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)


class VitDetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitDetConfig
    base_model_prefix = "vitdet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # upcast to float32 for `trunc_normal_`, then cast back to the original dtype
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            )
        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # zero-init the last norm layer
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()


VITDET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitDetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITDET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.",
    VITDET_START_DOCSTRING,
)
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # prepare the head mask if needed (1.0 in head_mask indicates we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """,
    VITDET_START_DOCSTRING,
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )