""" PyTorch Swin Transformer model."""

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_swin import SwinConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SwinConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/swin-tiny-patch4-window7-224",
]


@dataclass
class SwinEncoderOutput(ModelOutput):
    """
    Swin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class SwinModelOutput(ModelOutput):
    """
    Swin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
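
    Example (an illustrative shape check; `microsoft/swin-tiny-patch4-window7-224` is the checkpoint used for the
    code samples elsewhere in this file):

    ```python
    >>> import torch
    >>> from transformers import SwinModel

    >>> model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
    >>> with torch.no_grad():
    ...     outputs = model(torch.randn(1, 3, 224, 224))
    >>> outputs.last_hidden_state.shape  # 49 = 7 * 7 patches in the final stage, 768 = final hidden size
    torch.Size([1, 49, 768])
    >>> outputs.pooler_output.shape
    torch.Size([1, 768])
    ```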
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class SwinMaskedImageModelingOutput(ModelOutput):
    """
    Swin masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    reconstruction: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None

    @property
    def logits(self):
        warnings.warn(
            "logits attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the reconstruction attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.reconstruction


@dataclass
class SwinImageClassifierOutput(ModelOutput):
    """
    Swin outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


def window_partition(input_feature, window_size):
    """
    Partitions the given input into windows.
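
    Example (an illustrative shape check; a 14x14 feature map with window size 7 yields 2 * 2 = 4 windows):

    ```python
    >>> import torch
    >>> x = torch.randn(1, 14, 14, 96)  # (batch_size, height, width, num_channels)
    >>> window_partition(x, window_size=7).shape
    torch.Size([4, 7, 7, 96])
    ```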
    """
    batch_size, height, width, num_channels = input_feature.shape
    input_feature = input_feature.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
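
    Example (an illustrative shape check; this inverts the `window_partition` call shown above):

    ```python
    >>> import torch
    >>> windows = torch.randn(4, 7, 7, 96)  # (num_windows * batch_size, window_size, window_size, num_channels)
    >>> window_reverse(windows, window_size=7, height=14, width=14).shape
    torch.Size([1, 14, 14, 96])
    ```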
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows


class SwinEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = SwinPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            self.position_embeddings = None

        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings, output_dimensions


class SwinPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
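
    Example (an illustrative shape check, assuming the default `SwinConfig` with `image_size=224`, `patch_size=4`
    and `embed_dim=96`):

    ```python
    >>> import torch
    >>> from transformers import SwinConfig

    >>> patch_embeddings = SwinPatchEmbeddings(SwinConfig())
    >>> embeddings, output_dimensions = patch_embeddings(torch.randn(1, 3, 224, 224))
    >>> embeddings.shape, output_dimensions  # 3136 = 56 * 56 patches on the post-projection grid
    (torch.Size([1, 3136, 96]), (56, 56))
    ```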
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.embed_dim
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # pad the input to be divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, output_dimensions


class SwinPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
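
    Example (an illustrative shape check; merging halves the resolution and doubles the channel count):

    ```python
    >>> import torch
    >>> from torch import nn

    >>> merge = SwinPatchMerging(input_resolution=(14, 14), dim=96, norm_layer=nn.LayerNorm)
    >>> merge(torch.randn(1, 14 * 14, 96), input_dimensions=(14, 14)).shape
    torch.Size([1, 49, 192])
    ```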
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # [batch_size, height/2 * width/2, 4 * num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
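
    Example (an illustrative sketch; with `training=False` the input passes through unchanged, while during training
    each sample survives with probability `1 - drop_prob` and is rescaled by `1 / (1 - drop_prob)`):

    ```python
    >>> import torch
    >>> x = torch.ones(2, 3)
    >>> torch.equal(drop_path(x, drop_prob=0.5, training=False), x)
    True
    ```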
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class SwinDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class SwinSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )

        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in SwinModel forward() function)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class SwinSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class SwinAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        self.self = SwinSelfAttention(config, dim, num_heads, window_size)
        self.output = SwinSelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class SwinIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class SwinOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SwinLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = SwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = SwinIntermediate(config, dim)
        self.output = SwinOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(input_resolution)

    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # calculate attention mask for shifted window multi-head self-attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            pass
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)

        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class SwinStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                SwinLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class SwinEncoder(nn.Module):
    def __init__(self, config, grid_size):
        super().__init__()
        self.num_layers = len(config.depths)
        self.config = config
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        self.layers = nn.ModuleList(
            [
                SwinStage(
                    config=config,
                    dim=int(config.embed_dim * 2**i_layer),
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    depth=config.depths[i_layer],
                    num_heads=config.num_heads[i_layer],
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=SwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, SwinEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
                )

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w, using the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return SwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


class SwinPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    swinr_   Tc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNrb   )rv   r   r   rz   weightdatanormal_r[   initializer_ranger   zero_rV   fill_)r2   moduler'   r'   r(   _init_weightsu  s   
z!SwinPreTrainedModel._init_weightsN)
r    r!   r"   r#   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrV  r'   r'   r'   r(   rK  j  s    rK  aG  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
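
    Example (an illustrative sketch of the pattern described above; initializing from a configuration yields random
    weights, while [`~PreTrainedModel.from_pretrained`] loads pretrained ones):

    ```python
    >>> from transformers import SwinConfig, SwinModel

    >>> configuration = SwinConfig()
    >>> model = SwinModel(configuration)
    ```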
"""

SWIN_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN_START_DOCSTRING,
    """
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
                Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
                Whether or not to create and apply mask tokens in the embedding layer.
    """,
)
class SwinModel(SwinPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = SwinEncoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SwinModelOutput]:
        r"""
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r`   r   r   r?  rA  r   r   r6   )r   r*   r   r   r   )r[   r   r?  use_return_dictr   get_head_maskr   r7  rg   r]  r^  r`  r   r$   r   r)   r   r   r   )r2   r_   r`   r   r   r?  rA  embedding_outputr   encoder_outputssequence_outputpooled_outputr   r'   r'   r(   rm     s@   	

zSwinModel.forward)TFNNNNNN)r    r!   r"   rJ   re  rj  r   SWIN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr)   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r$   r%   ro   r   r   r   rm   rq   r'   r'   r]   r(   r[    sB    	
r[  aW  Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    SWIN_START_DOCSTRING,
)
class SwinForMaskedImageModeling(SwinPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.swin = SwinModel(config, add_pooling_layer=False, use_mask_token=True)

        num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SwinMaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
        >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.swin(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return SwinMaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@add_start_docstrings(
    """
    the [CLS] token) e.g. for ImageNet.
    c                       s   e Zd Z fddZeeeeee	e
d						ddeej deej deej dee d	ee d
ee deeef fddZ  ZS )SwinForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r   )rI   rJ   
num_labelsr[  rL  r   r   r\  r   
classifierra  )r2   r[   r]   r'   r(   rJ     s   
"z#SwinForImageClassification.__init__)rl  rm  rW  ro  Nr_   r   labelsr   r?  rA  ra   c                 C   sd  |dur|n| j j}| j|||||d}|d }| |}	d}
|dur| j jdu rM| jdkr3d| j _n| jdkrI|jtjksD|jtj	krId| j _nd| j _| j jdkrkt
 }| jdkre||	 | }
n+||	|}
n%| j jdkrt }||	d| j|d}
n| j jdkrt }||	|}
|s|	f|dd  }|
dur|
f| S |S t|
|	|j|j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
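
        Example (an illustrative sketch mirroring the code sample injected by `@add_code_sample_docstrings`; the
        expected label string is the one recorded in `_IMAGE_CLASS_EXPECTED_OUTPUT`):

        ```python
        >>> import torch
        >>> from transformers import AutoImageProcessor, SwinForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
        >>> model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> model.config.id2label[logits.argmax(-1).item()]
        'tabby, tabby cat'
        ```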
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.swin(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SwinImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@add_start_docstrings(
    """
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    SWIN_START_DOCSTRING,
)
class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
    def __init__(self, config: SwinConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
        self.embeddings = SwinEmbeddings(config)
        self.encoder = SwinEncoder(config, self.embeddings.patch_grid)

        # initialize hidden states norms
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 7, 7]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output, input_dimensions = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=True,
            output_hidden_states_before_downsampling=True,
            always_partition=True,
            return_dict=True,
        )

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                batch_size, num_channels, height, width = hidden_state.shape
                hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )