""" PyTorch DINOv2 model."""

import collections.abc
import math
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_dinov2 import Dinov2Config


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Dinov2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/dinov2-base",
]


class Dinov2Embeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.patch_embeddings = Dinov2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows the model to interpolate the pre-trained position encodings, so that it can be used on
        higher-resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1
        if num_patches == num_positions and height == width:
            return self.position_embeddings
        class_pos_embed = self.position_embeddings[:, 0]
        patch_pos_embed = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        height = height // self.config.patch_size
        width = width // self.config.patch_size
        # add a small number to avoid floating point errors in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        height, width = height + 0.1, width + 0.1
        patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)),
            mode="bicubic",
            align_corners=False,
        )
        if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
            raise ValueError("Width or height does not match with the interpolated position embeddings")
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values)

        if bool_masked_pos is not None:
            embeddings = torch.where(
                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
            )

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings


class Dinov2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class Dinov2SelfAttention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads "
                f"{config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with tensors of any rank, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state


class Dinov2SwiGLUFFN(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.weights_in(hidden_state)
        x1, x2 = hidden_state.chunk(2, dim=-1)
        hidden = nn.functional.silu(x1) * x2
        return self.weights_out(hidden)


class Dinov2Layer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = Dinov2Attention(config)
        self.layer_scale1 = Dinov2LayerScale(config)
        self.drop_path1 = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if config.use_swiglu_ffn:
            self.mlp = Dinov2SwiGLUFFN(config)
        else:
            self.mlp = Dinov2MLP(config)
        self.layer_scale2 = Dinov2LayerScale(config)
        self.drop_path2 = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.norm1(hidden_states),  # in Dinov2, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]

        attention_output = self.layer_scale1(attention_output)
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection (stochastic depth is the identity when drop_path_rate is 0.0)
        hidden_states = self.drop_path1(attention_output) + hidden_states

        # in Dinov2, layernorm is also applied after self-attention
        layer_output = self.norm2(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.layer_scale2(layer_output)

        # second residual connection
        layer_output = self.drop_path2(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class Dinov2Encoder(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


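# Usage sketch (illustrative only): behaviour of the stochastic-depth helper defined above. Outside of training (or
# with `drop_prob=0`) `drop_path` is the identity; during training each sample in the batch is either zeroed out or
# rescaled by 1 / keep_prob, so the expected value of the output matches the input. The helper name is arbitrary and
# nothing in this module calls it.
def _example_drop_path_behaviour() -> None:
    torch.manual_seed(0)
    x = torch.ones(8, 4, 16)  # (batch, tokens, hidden)

    assert torch.equal(drop_path(x, drop_prob=0.5, training=False), x)  # identity outside of training

    out = drop_path(x, drop_prob=0.5, training=True)
    # every sample is either all zeros or all 1 / keep_prob = 2.0
    per_sample = out.flatten(1)
    assert all(row.eq(0).all() or row.eq(2.0).all() for row in per_sample)

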
class Dinov2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Dinov2Embeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)

            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)


DINOV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DINOV2_BASE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

DINOV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
    DINOV2_START_DOCSTRING,
)
class Dinov2Model(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config

        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            head_outputs = (sequence_output, pooled_output)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


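# Usage sketch (illustrative only): typical feature-extraction usage of `Dinov2Model`. The checkpoint name is the one
# referenced by `_CHECKPOINT_FOR_DOC` above; loading it requires network access, so the helper is only documentation
# and is never called by the library code.
def _example_dinov2_feature_extraction() -> BaseModelOutputWithPooling:
    from transformers import AutoImageProcessor

    processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
    model = Dinov2Model.from_pretrained("facebook/dinov2-base")

    image = torch.randint(0, 256, (480, 640, 3), dtype=torch.uint8).numpy()  # stand-in for a real image array
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # `last_hidden_state` is (batch, 1 + num_patches, hidden_size); index 0 along the token dimension is the
    # [CLS] token, which is also exposed (after the final layernorm) as `pooler_output`.
    assert outputs.last_hidden_state.shape == (1, 257, 768)
    return outputs

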
@add_start_docstrings(
    """
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.dinov2 = Dinov2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.dinov2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size

        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]

        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.apply_layernorm:
                    hidden_state = self.layernorm(hidden_state)
                if self.config.reshape_hidden_states:
                    # drop the [CLS] token and fold the patch tokens back into a 2D feature map
                    hidden_state = hidden_state[:, 1:]
                    batch_size, _, height, width = pixel_values.shape
                    patch_size = self.config.patch_size
                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions if output_attentions else None,
        )

