""" PyTorch MobileViTV2 model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_mobilevitv2 import MobileViTV2Config


logger = logging.get_logger(__name__)


# General docstring
_CONFIG_FOR_DOC = "MobileViTV2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "apple/mobilevitv2-1.0-imagenet1k-256"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 8, 8]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "apple/mobilevitv2-1.0-imagenet1k-256"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


MOBILEVITV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "apple/mobilevitv2-1.0-imagenet1k-256",
]


def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
    """
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
    # Make sure that rounding down does not go down by more than 10%.
    if new_value < 0.9 * value:
        new_value += divisor
    return int(new_value)


def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float:
    return max(min_val, min(max_val, value))
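
# A quick illustration of the two helpers above (not part of the public API;
# shown only as a sanity check of the rounding behavior):
#
#     >>> make_divisible(60, divisor=8)   # rounds up to the nearest multiple of 8
#     64
#     >>> make_divisible(67, divisor=8)   # 64 is within 10% of 67, so round down
#     64
#     >>> clip(2.5, min_val=0.0, max_val=1.0)
#     1.0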
edededeeef ddf fddZde	j
de	j
fddZ  ZS )MobileViTV2ConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r-   r.   r/   r0   paddingr3   r1   r2   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr
   
activation
hidden_act)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r7   	__class__r!   r"   r?   X   sB   



zMobileViTV2ConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S r'   )rB   rD   rG   )rI   rL   r!   r!   r"   forward   s   




zMobileViTV2ConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   rF   r?   torchTensorrM   __classcell__r!   r!   rJ   r"   r+   W   s>    	

6r+   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTV2InvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   r,   r-   r.   r0   r3   r   Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r   zInvalid stride .r   )r-   r.   r/   r	   )r-   r.   r/   r0   r1   r3   Fr-   r.   r/   r5   )r>   r?   r#   r   roundexpand_ratior@   use_residualr+   
expand_1x1conv_3x3
reduce_1x1)rI   r,   r-   r.   r0   r3   expanded_channelsrJ   r!   r"   r?      s0   

z$MobileViTV2InvertedResidual.__init__rL   c                 C   s4   |}|  |}| |}| |}| jr|| S |S r'   )r[   r\   r]   rZ   )rI   rL   residualr!   r!   r"   rM      s
   


z#MobileViTV2InvertedResidual.forward)r   rN   rO   rP   __doc__r   r   r?   rR   rS   rM   rT   r!   r!   rJ   r"   rU      s"    !rU   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTV2MobileNetLayerr   r,   r-   r.   r0   
num_stagesr   Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r-   r.   r0   )r>   r?   r   
ModuleListlayerrangerU   append)rI   r,   r-   r.   r0   rc   ire   rJ   r!   r"   r?      s   

z"MobileViTV2MobileNetLayer.__init__rL   c                 C      | j D ]}||}q|S r'   re   )rI   rL   layer_moduler!   r!   r"   rM         

z!MobileViTV2MobileNetLayer.forward)r   r   
rN   rO   rP   r   r   r?   rR   rS   rM   rT   r!   r!   rJ   r"   rb      s     rb   c                       sD   e Zd ZdZdededdf fddZdejdejfd	d
Z	  Z
S )MobileViTV2LinearSelfAttentionaq  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://arxiv.org/abs/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    """

    def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None:
        super().__init__()

        self.qkv_proj = MobileViTV2ConvLayer(
            config=config,
            in_channels=embed_dim,
            out_channels=1 + (2 * embed_dim),
            bias=True,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        self.attn_dropout = nn.Dropout(p=config.attn_dropout)
        self.out_proj = MobileViTV2ConvLayer(
            config=config,
            in_channels=embed_dim,
            out_channels=embed_dim,
            bias=True,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )
        self.embed_dim = embed_dim

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # (batch_size, embed_dim, num_pixels_in_patch, num_patches) ->
        # (batch_size, 1 + 2 * embed_dim, num_pixels_in_patch, num_patches)
        qkv = self.qkv_proj(hidden_states)

        # query -> (batch_size, 1, num_pixels_in_patch, num_patches)
        # key, value -> (batch_size, embed_dim, num_pixels_in_patch, num_patches)
        query, key, value = torch.split(qkv, split_size_or_sections=[1, self.embed_dim, self.embed_dim], dim=1)

        # apply softmax along the num_patches dimension
        context_scores = torch.nn.functional.softmax(query, dim=-1)
        context_scores = self.attn_dropout(context_scores)

        # compute a single context vector per pixel position
        context_vector = key * context_scores
        context_vector = torch.sum(context_vector, dim=-1, keepdim=True)

        # combine the context vector with the values
        out = torch.nn.functional.relu(value) * context_vector.expand_as(value)
        out = self.out_proj(out)
        return out
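
# Shape sketch for the block above (illustrative only): for an input of shape
# (batch_size=1, embed_dim=64, pixels_per_patch=4, num_patches=256), `qkv_proj`
# produces 1 + 2 * 64 = 129 channels, which `torch.split` divides into a
# 1-channel query and 64-channel key/value maps. Because the softmax runs over
# `num_patches` and is reduced to a single context vector, the cost is linear
# in the number of patches rather than quadratic as in standard self-attention:
#
#     >>> attn = MobileViTV2LinearSelfAttention(MobileViTV2Config(), embed_dim=64)
#     >>> attn(torch.randn(1, 64, 4, 256)).shape
#     torch.Size([1, 64, 4, 256])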
 fdd	Zd
ejdejfddZ	  Z
S )MobileViTV2FFN        r,   ro   ffn_latent_dimffn_dropoutr   Nc              
      sZ   t    t|||dddddd| _t|| _t|||dddddd| _t|| _d S )Nr   TF)r,   r-   r.   r/   r0   r2   r4   r5   )	r>   r?   r+   conv1r   rs   dropout1conv2dropout2)rI   r,   ro   r   r   rJ   r!   r"   r?      s.   


zMobileViTV2FFN.__init__rv   c                 C   s,   |  |}| |}| |}| |}|S r'   )r   r   r   r   )rI   rv   r!   r!   r"   rM   @  s
   



zMobileViTV2FFN.forwardr   rN   rO   rP   r   r   floatr?   rR   rS   rM   rT   r!   r!   rJ   r"   r     s     r   c                       r   )MobileViTV2TransformerLayerr   r,   ro   r   dropoutr   Nc                    sb   t    tjd||jd| _t||| _tj|d| _	tjd||jd| _
t||||j| _d S )Nr   
num_groupsnum_channelsr:   rp   )r>   r?   r   	GroupNormlayer_norm_epslayernorm_beforern   	attentionrs   r   layernorm_afterr   r   ffn)rI   r,   ro   r   r   rJ   r!   r"   r?   I  s   
z$MobileViTV2TransformerLayer.__init__rv   c                 C   s<   |  |}| |}|| }| |}| |}|| }|S r'   )r   r   r   r   )rI   rv   layernorm_1_outattention_outputlayer_outputr!   r!   r"   rM   W  s   



z#MobileViTV2TransformerLayer.forwardr   r   r!   r!   rJ   r"   r   H  s    r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTV2Transformerr,   n_layersd_modelr   Nc                    sf   t    |j}|| g| }dd |D }t | _t|D ]}t|||| d}| j| qd S )Nc                 S   s   g | ]
}t |d  d  qS )   )r   ).0dr!   r!   r"   
<listcomp>l  s    z3MobileViTV2Transformer.__init__.<locals>.<listcomp>)ro   r   )	r>   r?   ffn_multiplierr   rd   re   rf   r   rg   )rI   r,   r   r   r   ffn_dims	block_idxtransformer_layerrJ   r!   r"   r?   d  s   


zMobileViTV2Transformer.__init__rv   c                 C   ri   r'   rj   )rI   rv   rk   r!   r!   r"   rM   u  rl   zMobileViTV2Transformer.forwardrm   r!   r!   rJ   r"   r   c  s    r   c                       s   e Zd ZdZ			ddededededed	ed
eddf fddZdejde	eje	eef f fddZ
dejde	eef dejfddZdejdejfddZ  ZS )MobileViTV2Layerz=
    MobileViTV2 layer: https://arxiv.org/abs/2206.02680
    """

    def __init__(
        self,
        config: MobileViTV2Config,
        in_channels: int,
        out_channels: int,
        attn_unit_dim: int,
        n_attn_blocks: int = 2,
        dilation: int = 1,
        stride: int = 2,
    ) -> None:
        super().__init__()
        self.patch_width = config.patch_size
        self.patch_height = config.patch_size

        cnn_out_dim = attn_unit_dim

        if stride == 2:
            self.downsampling_layer = MobileViTV2InvertedResidual(
                config,
                in_channels=in_channels,
                out_channels=out_channels,
                stride=stride if dilation == 1 else 1,
                dilation=dilation // 2 if dilation > 1 else 1,
            )
            in_channels = out_channels
        else:
            self.downsampling_layer = None

        # local representations
        self.conv_kxk = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=config.conv_kernel_size,
            groups=in_channels,
        )
        self.conv_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=cnn_out_dim,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
        )

        # global representations
        self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks)
        self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps)

        # fusion
        self.conv_projection = MobileViTV2ConvLayer(
            config,
            in_channels=cnn_out_dim,
            out_channels=in_channels,
            kernel_size=1,
            use_normalization=True,
            use_activation=False,
        )

    def unfolding(self, feature_map: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]:
        batch_size, in_channels, img_height, img_width = feature_map.shape
        patches = nn.functional.unfold(
            feature_map,
            kernel_size=(self.patch_height, self.patch_width),
            stride=(self.patch_height, self.patch_width),
        )
        patches = patches.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1)
        return patches, (img_height, img_width)

    def folding(self, patches: torch.Tensor, output_size: Tuple[int, int]) -> torch.Tensor:
        batch_size, in_dim, patch_size, n_patches = patches.shape
        patches = patches.reshape(batch_size, in_dim * patch_size, n_patches)
        feature_map = nn.functional.fold(
            patches,
            output_size=output_size,
            kernel_size=(self.patch_height, self.patch_width),
            stride=(self.patch_height, self.patch_width),
        )
        return feature_map

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        # reduce spatial dimensions if needed
        if self.downsampling_layer:
            features = self.downsampling_layer(features)

        # local representation
        features = self.conv_kxk(features)
        features = self.conv_1x1(features)

        # convert the feature map to patches
        patches, output_size = self.unfolding(features)

        # learn global representations
        patches = self.transformer(patches)
        patches = self.layernorm(patches)

        # convert the patches back to a feature map
        features = self.folding(patches, output_size)

        features = self.conv_projection(features)
        return features
edee	e
f fddZ  ZS )MobileViTV2Encoderr,   r   Nc                    s  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}tt	d|j
 dddddd	}td|j
 dd
}td|j
 dd
}td|j
 dd
}td|j
 dd
}	td|j
 dd
}
t|||ddd}| j| t|||ddd}| j| t|||t|jd |j
 dd
|jd d}| j| |r|d9 }t|||	t|jd |j
 dd
|jd |d}| j| |r|d9 }t||	|
t|jd |j
 dd
|jd |d}| j| d S )NFr   Tr   r       @   r)   r   r   r         i  r   )r-   r.   r0   rc   r   r   )r-   r.   r   r   )r-   r.   r   r   r3   )r>   r?   r,   r   rd   re   gradient_checkpointingoutput_strider#   r*   width_multiplierrb   rg   r   base_attn_unit_dimsr   )rI   r,   dilate_layer_4dilate_layer_5r3   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rJ   r!   r"   r?     s   



zMobileViTV2Encoder.__init__FTrv   output_hidden_statesreturn_dictc                 C   sx   |rdnd }t | jD ]\}}| jr| jr| |j|}n||}|r(||f }q|s6tdd ||fD S t||dS )Nr!   c                 s   s    | ]	}|d ur|V  qd S r'   r!   )r   vr!   r!   r"   	<genexpr>T  s    z-MobileViTV2Encoder.forward.<locals>.<genexpr>)last_hidden_staterv   )	enumeratere   r   training_gradient_checkpointing_func__call__tupler   )rI   rv   r   r   all_hidden_statesrh   rk   r!   r!   r"   rM   ?  s   
zMobileViTV2Encoder.forward)FT)rN   rO   rP   r   r?   rR   rS   rQ   r   r   r   rM   rT   r!   r!   rJ   r"   r     s    T
r   c                   @   sB   e Zd ZdZeZdZdZdZde	e
je
je
jf ddfdd	ZdS )
MobileViTV2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = MobileViTV2Config
    base_model_prefix = "mobilevitv2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


MOBILEVITV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

MOBILEVITV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare MobileViTV2 model outputting raw hidden-states without any specific head on top.",
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2Model(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config, expand_output: bool = True):
        super().__init__(config)
        self.config = config
        self.expand_output = expand_output

        layer_0_dim = make_divisible(
            clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
        )

        self.conv_stem = MobileViTV2ConvLayer(
            config,
            in_channels=config.num_channels,
            out_channels=layer_0_dim,
            kernel_size=3,
            stride=2,
            use_normalization=True,
            use_activation=True,
        )
        self.encoder = MobileViTV2Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        for layer_index, heads in heads_to_prune.items():
            mobilevitv2_layer = self.encoder.layer[layer_index]
            if isinstance(mobilevitv2_layer, MobileViTV2Layer):
                for transformer_layer in mobilevitv2_layer.transformer.layer:
                    transformer_layer.attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.conv_stem(pixel_values)
        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.expand_output:
            last_hidden_state = encoder_outputs[0]
            # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels)
            pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False)
        else:
            last_hidden_state = encoder_outputs[0]
            pooled_output = None

        if not return_dict:
            output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,)
            return output + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@add_start_docstrings(
    """
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilevitv2 = MobileViTV2Model(config)

        out_channels = make_divisible(512 * config.width_multiplier, divisor=8)  # layer 5 output dimension
        # Classifier head
        self.classifier = (
            nn.Linear(in_features=out_channels, out_features=config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
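
# How the loss above is selected (illustrative note, not part of the API):
# with `problem_type` unset, integer labels and `num_labels > 1` fall into
# cross-entropy, `num_labels == 1` into MSE regression, and float multi-hot
# labels into BCE-with-logits. For example, passing `labels=torch.tensor([281])`
# to an ImageNet-sized head computes
# `CrossEntropyLoss()(logits.view(-1, 1000), labels.view(-1))`.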

class MobileViTV2ASPPPooling(nn.Module):
    def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None:
        super().__init__()

        self.global_pool = nn.AdaptiveAvgPool2d(output_size=1)
        self.conv_1x1 = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_normalization=True,
            use_activation="relu",
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        spatial_size = features.shape[-2:]
        features = self.global_pool(features)
        features = self.conv_1x1(features)
        features = nn.functional.interpolate(features, size=spatial_size, mode="bilinear", align_corners=False)
        return features


class MobileViTV2ASPP(nn.Module):
    """
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    """

    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__()

        encoder_out_channels = make_divisible(512 * config.width_multiplier, divisor=8)  # layer 5 output dimension
        in_channels = encoder_out_channels
        out_channels = config.aspp_out_channels

        if len(config.atrous_rates) != 3:
            raise ValueError("Expected 3 values for atrous_rates")

        self.convs = nn.ModuleList()

        in_projection = MobileViTV2ConvLayer(
            config,
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_activation="relu",
        )
        self.convs.append(in_projection)

        self.convs.extend(
            [
                MobileViTV2ConvLayer(
                    config,
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    dilation=rate,
                    use_activation="relu",
                )
                for rate in config.atrous_rates
            ]
        )

        pool_layer = MobileViTV2ASPPPooling(config, in_channels, out_channels)
        self.convs.append(pool_layer)

        self.project = MobileViTV2ConvLayer(
            config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu"
        )

        self.dropout = nn.Dropout(p=config.aspp_dropout_prob)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        pyramid = []
        for conv in self.convs:
            pyramid.append(conv(features))
        pyramid = torch.cat(pyramid, dim=1)

        pooled_features = self.project(pyramid)
        pooled_features = self.dropout(pooled_features)
        return pooled_features
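
# A shape sketch of the ASPP head above (illustrative only, for the 1.0-width
# model whose layer-5 output has 512 channels and the default
# aspp_out_channels=512): the five parallel branches — one 1x1 projection,
# three dilated 3x3 convolutions, and one pooled branch — each map
# (1, 512, 8, 8) to (1, 512, 8, 8); they are concatenated to (1, 2560, 8, 8)
# and projected back to (1, 512, 8, 8).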

class MobileViTV2DeepLabV3(nn.Module):
    """
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    """

    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__()
        self.aspp = MobileViTV2ASPP(config)
        self.dropout = nn.Dropout2d(config.classifier_dropout_prob)
        self.classifier = MobileViTV2ConvLayer(
            config,
            in_channels=config.aspp_out_channels,
            out_channels=config.num_labels,
            kernel_size=1,
            use_normalization=False,
            use_activation=False,
            bias=True,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        features = self.aspp(hidden_states[-1])
        features = self.dropout(features)
        features = self.classifier(features)
        return features


@add_start_docstrings(
    """
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    """,
    MOBILEVITV2_START_DOCSTRING,
)
class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
    def __init__(self, config: MobileViTV2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.mobilevitv2 = MobileViTV2Model(config, expand_output=False)
        self.segmentation_head = MobileViTV2DeepLabV3(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MOBILEVITV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mobilevitv2(
            pixel_values,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        logits = self.segmentation_head(encoder_hidden_states)

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                # upsample logits to the images' original size
                upsampled_logits = nn.functional.interpolate(
                    logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
                loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
                loss = loss_fct(upsampled_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )