""" PyTorch BEiT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedLMOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_beit import BeitConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "BeitConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/beit-base-patch16-224",
    # See all BEiT models at https://huggingface.co/models?filter=beit
]


@dataclass
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
    """
    Class for outputs of [`BeitModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of arbitrary rank, not just 2D ConvNet inputs
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class BeitDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class BeitEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = BeitPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
        embeddings, (patch_height, patch_width) = self.patch_embeddings(
            pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
        )
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        if self.position_embeddings is not None:
            cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]

        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class BeitPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(
                0, 3, 1, 2
            )
            position_embedding = nn.functional.interpolate(
                position_embedding, size=(patch_height, patch_width), mode="bicubic"
            )
            embeddings = embeddings + position_embedding

        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class BeitSelfAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        if window_size:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present.
        if self.relative_position_bias is not None:
            attention_scores = attention_scores + self.relative_position_bias().unsqueeze(0)

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
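

# Shape sketch (illustrative, assuming a base-sized model: hidden_size=768, 12 heads, 224x224 input
# with 16x16 patches): `hidden_states` enters as (batch, 197, 768), is projected and reshaped to
# (batch, 12, 197, 64) per head, and the relative position bias added to the attention scores has
# shape (num_heads, 197, 197), broadcast over the batch dimension.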


class BeitSelfOutput(nn.Module):
    """
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = BeitSelfAttention(config, window_size=window_size)
        self.output = BeitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions, relative_position_bias)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BeitIntermediate(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class BeitOutput(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BeitAttention(config, window_size=window_size)
        self.intermediate = BeitIntermediate(config)
        self.output = BeitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = BeitDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["BeitRelativePositionBias"] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in BEiT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in BEiT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class BeitRelativePositionBias(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(
            size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype
        )
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

    def forward(self) -> torch.Tensor:
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1
        )  # Wh*Ww, Wh*Ww, nH

        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww


class BeitEncoder(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.use_shared_relative_position_bias:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layer = nn.ModuleList(
            [
                BeitLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                relative_position_bias = (
                    self.relative_position_bias() if self.relative_position_bias is not None else None
                )
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, relative_position_bias)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
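

# Note on the relative position bias above (illustrative): for a patch grid of size (H, W), every
# ordered pair of patches is bucketed by its relative offset into one of (2H - 1) * (2W - 1) learned
# bias vectors (one scalar per attention head); three extra table entries handle cls->patch,
# patch->cls and cls->cls interactions, which is why num_relative_distance = (2H - 1) * (2W - 1) + 3.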


class BeitPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BeitConfig
    base_model_prefix = "beit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


BEIT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BEIT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
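

# Usage sketch (illustrative only; not part of the original file's doc examples). Given a PIL
# `image`, the base model and the classification head defined below can be used roughly as:
#
#     >>> from transformers import AutoImageProcessor, BeitModel, BeitForImageClassification
#     >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
#     >>> inputs = processor(images=image, return_tensors="pt")
#     >>> model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224")
#     >>> model(**inputs).last_hidden_state.shape            # (1, 197, 768) for a 224x224 image
#     >>> classifier = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
#     >>> logits = classifier(**inputs).logits                # (1, num_labels)
#     >>> classifier.config.id2label[logits.argmax(-1).item()]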


@add_start_docstrings(
    "The bare Beit Model transformer outputting raw hidden-states without any specific head on top.",
    BEIT_START_DOCSTRING,
)
class BeitModel(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig, add_pooling_layer: bool = True) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = BeitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BeitModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BeitModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BeitModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class BeitPooler(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output


@add_start_docstrings(
    """Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.""",
    BEIT_START_DOCSTRING,
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # Classifier head
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.beit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.layernorm(sequence_output)
        prediction_scores = self.lm_head(sequence_output[:, 1:])

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    """,
    BEIT_START_DOCSTRING,
)
class BeitForImageClassification(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class BeitConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class BeitPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            BeitConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class BeitPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = BeitPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class BeitUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = BeitPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = BeitConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = BeitConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class BeitFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self, config: BeitConfig, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            BeitConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                BeitConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = BeitConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@add_start_docstrings(
    """
    Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    """,
    BEIT_START_DOCSTRING,
)
class BeitForSemanticSegmentation(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # FPNs
        if len(self.config.out_indices) != 4:
            raise ValueError(
                "BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                "a base-sized architecture."
            )
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = BeitUperHead(config)
        self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep certain features, and reshape
        # note that we do +1 as the encoder_hidden_states also include the initial embeddings
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        batch_size = pixel_values.shape[0]
        patch_resolution = self.config.image_size // self.config.patch_size
        features = [
            x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
        ]

        # apply FPNs
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        logits = self.decode_head(features)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    BEIT_START_DOCSTRING,
)
class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        if config.add_fpn:
            if len(self.config.out_indices) != 4:
                raise ValueError(
                    "BeitBackbone requires config.out_indices to be a list of 4 integers, "
                    "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                    "a base-sized architecture."
                )
            hidden_size = config.hidden_size
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
                nn.BatchNorm2d(hidden_size, eps=config.batch_norm_eps),
                nn.GELU(),
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2))
            self.fpn3 = nn.Identity()
            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        batch_size = pixel_values.shape[0]
        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.reshape_hidden_states:
                    hidden_state = hidden_state[:, 1:, :]
                    hidden_state = hidden_state.permute(0, 2, 1)
                    hidden_state = hidden_state.reshape(batch_size, -1, patch_height, patch_width)

                feature_maps += (hidden_state,)

        if self.config.add_fpn:
            feature_maps = [
                self.fpn1(feature_maps[0]),
                self.fpn2(feature_maps[1]),
                self.fpn3(feature_maps[2]),
                self.fpn4(feature_maps[3]),
            ]
            feature_maps = tuple(feature_maps)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )