o
    hޜ                  	   @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZm Z  ddl!m"Z" e#e$Z%dZ&dZ'g dZ(dZ)dZ*g dZ+dEde,de,dee, de,fddZ-G dd de
j.Z/G dd de
j.Z0G dd de
j.Z1G d d! d!e
j.Z2G d"d# d#e
j.Z3G d$d% d%e
j.Z4G d&d' d'e
j.Z5G d(d) d)e
j.Z6G d*d+ d+e
j.Z7G d,d- d-e
j.Z8G d.d/ d/e
j.Z9G d0d1 d1e
j.Z:G d2d3 d3eZ;d4Z<d5Z=ed6e<G d7d8 d8e;Z>ed9e<G d:d; d;e;Z?G d<d= d=e
j.Z@G d>d? d?e
j.ZAG d@dA dAe
j.ZBedBe<G dCdD dDe;ZCdS )Fz PyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )MobileViTConfigr   apple/mobilevit-small)r   i     r   ztabby, tabby cat)r   zapple/mobilevit-x-smallzapple/mobilevit-xx-smallzapple/deeplabv3-mobilevit-smallz!apple/deeplabv3-mobilevit-x-smallz"apple/deeplabv3-mobilevit-xx-smallr   valuedivisor	min_valuereturnc                 C   sF   |du r|}t |t| |d  | | }|d|  k r||7 }t|S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_value r%   f/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibleI   s   r'   c                       sv   e Zd Z						ddededededed	ed
edededeeef ddf fddZde	j
de	j
fddZ  ZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr    Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r!   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r*   r+   r,   r-   paddingr0   r.   r/   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r#   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r4   	__class__r%   r&   r<   Y   sB   



zMobileViTConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S N)r?   rA   rD   )rF   rI   r%   r%   r&   forward   s   




zMobileViTConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r#   boolr   rC   r<   torchTensorrK   __classcell__r%   r%   rG   r&   r(   X   s>    	

6r(   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   r)   r*   r+   r-   r0   r    Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r!   zInvalid stride .r   r*   r+   r,   r   )r*   r+   r,   r-   r.   r0   Fr*   r+   r,   r2   )r;   r<   r'   r#   roundexpand_ratior=   use_residualr(   
expand_1x1conv_3x3
reduce_1x1)rF   r)   r*   r+   r-   r0   expanded_channelsrG   r%   r&   r<      s0   

z"MobileViTInvertedResidual.__init__rI   c                 C   s4   |}|  |}| |}| |}| jr|| S |S rJ   )rZ   r[   r\   rY   )rF   rI   residualr%   r%   r&   rK      s
   


z!MobileViTInvertedResidual.forwardr   )rL   rM   rN   __doc__r   r#   r<   rP   rQ   rK   rR   r%   r%   rG   r&   rS      s"    !rS   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTMobileNetLayerr   r)   r*   r+   r-   
num_stagesr    Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r*   r+   r-   )r;   r<   r   
ModuleListlayerrangerS   append)rF   r)   r*   r+   r-   rb   ird   rG   r%   r&   r<      s   

z MobileViTMobileNetLayer.__init__rI   c                 C      | j D ]}||}q|S rJ   rd   )rF   rI   layer_moduler%   r%   r&   rK         

zMobileViTMobileNetLayer.forward)r   r   
rL   rM   rN   r   r#   r<   rP   rQ   rK   rR   r%   r%   rG   r&   ra      s     ra   c                       sV   e Zd Zdededdf fddZdejdejfdd	Zd
ejdejfddZ	  Z
S )MobileViTSelfAttentionr)   hidden_sizer    Nc                    s   t    ||j dkrtd|f d|j d|j| _t||j | _| j| j | _tj|| j|j	d| _
tj|| j|j	d| _tj|| j|j	d| _t|j| _d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rT   )r/   )r;   r<   num_attention_headsr=   r#   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrF   r)   rn   rG   r%   r&   r<      s   

zMobileViTSelfAttention.__init__xc                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r!   r   r   )sizero   rp   viewpermute)rF   rz   new_x_shaper%   r%   r&   transpose_for_scores   s   
z+MobileViTSelfAttention.transpose_for_scoreshidden_statesc           
      C   s   |  |}| | |}| | |}| |}t||dd}|t| j	 }t
jj|dd}| |}t||}|dddd }| d d | jf }	|j|	 }|S )Nr{   dimr   r!   r   r   )rt   r   ru   r   rP   matmul	transposemathsqrtrp   r   
functionalsoftmaxrx   r~   
contiguousr|   rq   r}   )
rF   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shaper%   r%   r&   rK      s   



zMobileViTSelfAttention.forward)rL   rM   rN   r   r#   r<   rP   rQ   r   rK   rR   r%   r%   rG   r&   rm      s    rm   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
MobileViTSelfOutputr)   rn   r    Nc                    s*   t    t||| _t|j| _d S rJ   r;   r<   r   rr   denserv   hidden_dropout_probrx   ry   rG   r%   r&   r<        
zMobileViTSelfOutput.__init__r   c                 C      |  |}| |}|S rJ   r   rx   rF   r   r%   r%   r&   rK     rk   zMobileViTSelfOutput.forwardrl   r%   r%   rG   r&   r     s    r   c                       sV   e Zd Zdededdf fddZdee ddfdd	Zd
ej	dej	fddZ
  ZS )MobileViTAttentionr)   rn   r    Nc                    s.   t    t||| _t||| _t | _d S rJ   )r;   r<   rm   	attentionr   outputsetpruned_headsry   rG   r%   r&   r<     s   
zMobileViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   r   ro   rp   r   r   rt   ru   r   r   r   rq   union)rF   r   indexr%   r%   r&   prune_heads%  s   zMobileViTAttention.prune_headsr   c                 C   s   |  |}| |}|S rJ   )r   r   )rF   r   self_outputsattention_outputr%   r%   r&   rK   7  rk   zMobileViTAttention.forward)rL   rM   rN   r   r#   r<   r   r   rP   rQ   rK   rR   r%   r%   rG   r&   r     s    r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTIntermediater)   rn   intermediate_sizer    Nc                    s@   t    t||| _t|jtrt|j | _	d S |j| _	d S rJ   )
r;   r<   r   rr   r   rB   rE   rC   r   intermediate_act_fnrF   r)   rn   r   rG   r%   r&   r<   >  s
   
zMobileViTIntermediate.__init__r   c                 C   r   rJ   )r   r   r   r%   r%   r&   rK   F  rk   zMobileViTIntermediate.forwardrl   r%   r%   rG   r&   r   =      r   c                       sJ   e Zd Zdedededdf fddZdejd	ejdejfd
dZ  Z	S )MobileViTOutputr)   rn   r   r    Nc                    s*   t    t||| _t|j| _d S rJ   r   r   rG   r%   r&   r<   M  r   zMobileViTOutput.__init__r   input_tensorc                 C   s    |  |}| |}|| }|S rJ   r   )rF   r   r   r%   r%   r&   rK   R  s   

zMobileViTOutput.forwardrl   r%   r%   rG   r&   r   L  s    $r   c                       r   )MobileViTTransformerLayerr)   rn   r   r    Nc                    sZ   t    t||| _t|||| _t|||| _tj	||j
d| _tj	||j
d| _d S )Nr7   )r;   r<   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   rG   r%   r&   r<   Z  s   
z"MobileViTTransformerLayer.__init__r   c                 C   s<   |  | |}|| }| |}| |}| ||}|S rJ   )r   r   r   r   r   )rF   r   r   layer_outputr%   r%   r&   rK   b  s   

z!MobileViTTransformerLayer.forwardrl   r%   r%   rG   r&   r   Y  r   r   c                       r   )MobileViTTransformerr)   rn   rb   r    Nc                    sJ   t    t | _t|D ]}t||t||j d}| j	| qd S )N)rn   r   )
r;   r<   r   rc   rd   re   r   r#   	mlp_ratiorf   )rF   r)   rn   rb   _transformer_layerrG   r%   r&   r<   m  s   

zMobileViTTransformer.__init__r   c                 C   rh   rJ   ri   )rF   r   rj   r%   r%   r&   rK   y  rk   zMobileViTTransformer.forwardrl   r%   r%   rG   r&   r   l  s    r   c                       s   e Zd ZdZ	ddedededededed	ed
df fddZdejd
e	eje
f fddZdejde
d
ejfddZdejd
ejfddZ  ZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r   r)   r*   r+   r-   rn   rb   r0   r    Nc                    s   t    |j| _|j| _|dkr,t||||dkr|nd|dkr$|d ndd| _|}nd | _t||||jd| _	t|||dddd| _
t|||d| _tj||jd| _t|||dd| _t|d| ||jd| _d S )	Nr!   r   )r*   r+   r-   r0   rU   F)r*   r+   r,   r1   r2   )rn   rb   r   )r;   r<   
patch_sizepatch_widthpatch_heightrS   downsampling_layerr(   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)rF   r)   r*   r+   r-   rn   rb   r0   rG   r%   r&   r<     sN   

	zMobileViTLayer.__init__rI   c                 C   s  | j | j}}t|| }|j\}}}}tt|| | }	tt|| | }
d}|
|ks4|	|krBtjj||	|
fddd}d}|
| }|	| }|| }|	|| | |||}|
dd}|	||||}|
dd}|	|| |d}||f||||||d	}||fS )
NFbilinearr|   modealign_cornersTr   r!   r   r{   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r#   shaper   ceilr   r   r   reshaper   )rF   rI   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dictr%   r%   r&   	unfolding  s<   	zMobileViTLayer.unfoldingr   r   c                 C   s   | j | j}}t|| }|d }|d }|d }|d }	|d }
| |||d}|dd}||| |	 |
||}|dd	}||||	| |
| }|d
 r_tjj	||d ddd}|S )Nr   r   r   r   r   r{   r   r   r!   r   r   r   Fr   )
r   r   r#   r   r}   r   r   r   r   r   )rF   r   r   r   r   r   r   r   r   r   r   rI   r%   r%   r&   folding  s*   zMobileViTLayer.foldingc                 C   s|   | j r|  |}|}| |}| |}| |\}}| |}| |}| ||}| |}| t	j
||fdd}|S Nr   r   )r   r   r   r   r   r   r   r   r   rP   cat)rF   rI   r^   r   r   r%   r%   r&   rK     s   





zMobileViTLayer.forwardr_   )rL   rM   rN   r`   r   r#   r<   rP   rQ   r   r   r   r   rK   rR   r%   r%   rG   r&   r     s.    	:+r   c                       sP   e Zd Zdeddf fddZ		ddejd	ed
edee	e
f fddZ  ZS )MobileViTEncoderr)   r    Nc           
   	      sX  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}t||j	d |j	d ddd}| j
| t||j	d |j	d dd	d}| j
| t||j	d |j	d	 d|jd dd
}| j
| |rp|d9 }t||j	d	 |j	d d|jd d|d}| j
| |r|d9 }t||j	d |j	d d|jd d	|d}	| j
|	 d S )NFr   T   r   r   )r*   r+   r-   rb   r!   r   )r*   r+   r-   rn   rb      )r*   r+   r-   rn   rb   r0      )r;   r<   r)   r   rc   rd   gradient_checkpointingoutput_stridera   neck_hidden_sizesrf   r   hidden_sizes)
rF   r)   dilate_layer_4dilate_layer_5r0   layer_1layer_2layer_3layer_4layer_5rG   r%   r&   r<   !  sx   



		zMobileViTEncoder.__init__FTr   output_hidden_statesreturn_dictc                 C   sx   |rdnd }t | jD ]\}}| jr| jr| |j|}n||}|r(||f }q|s6tdd ||fD S t||dS )Nr%   c                 s   s    | ]	}|d ur|V  qd S rJ   r%   ).0vr%   r%   r&   	<genexpr>  s    z+MobileViTEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )	enumeraterd   r   training_gradient_checkpointing_func__call__tupler   )rF   r   r   r   all_hidden_statesrg   rj   r%   r%   r&   rK   k  s   
zMobileViTEncoder.forward)FT)rL   rM   rN   r   r<   rP   rQ   rO   r   r   r   rK   rR   r%   r%   rG   r&   r      s    M
r   c                   @   sB   e Zd ZdZeZdZdZdZde	e
je
je
jf ddfdd	ZdS )
MobileViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	mobilevitpixel_valuesTmoduler    Nc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)rB   r   rr   r>   weightdatanormal_r)   initializer_ranger/   zero_r   fill_)rF   r  r%   r%   r&   _init_weights  s   
z&MobileViTPreTrainedModel._init_weights)rL   rM   rN   r`   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr   r   rr   r>   r   r
  r%   r%   r%   r&   r     s    &r   aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zWThe bare MobileViT model outputting raw hidden-states without any specific head on top.c                       s   e Zd Zddedef fddZdd Zeee	e
eeded		
	
	
ddeej dee dee deeef fddZ  ZS )MobileViTModelTr)   expand_outputc                    sn   t  | || _|| _t||j|jd ddd| _t|| _	| jr1t||jd |jd dd| _
|   d S )	Nr   r   r!   )r*   r+   r,   r-   r      r   rU   )r;   r<   r)   r  r(   num_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)rF   r)   r  rG   r%   r&   r<     s&   
zMobileViTModel.__init__c                 C   sF   |  D ]\}}| jj| }t|tr |jjD ]}|j| qqdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  rd   rB   r   r   r   r   )rF   heads_to_prunelayer_indexr   mobilevit_layerr   r%   r%   r&   _prune_heads  s   
zMobileViTModel._prune_headsvision)
checkpointoutput_typer  modalityexpected_outputNr   r   r   r    c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}| jr>| |d }tj	|ddgdd}n|d }d }|sY|d urN||fn|f}||dd   S t
|||jd	S )
Nz You have to specify pixel_valuesr   r   r   r   r{   F)r   keepdimr   )r   pooler_outputr   )r)   r   use_return_dictr=   r  r  r  r  rP   r  r   r   )	rF   r   r   r   embedding_outputencoder_outputsr   pooled_outputr   r%   r%   r&   rK     s0   
zMobileViTModel.forward)T)NNN)rL   rM   rN   r   rO   r<   r  r   MOBILEVIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rP   rQ   r   r   rK   rR   r%   r%   rG   r&   r    s.    
	
r  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                       s   e Zd Zdeddf fddZeeeee	e
ed				ddeej dee d	eej d
ee deee	f f
ddZ  ZS )MobileViTForImageClassificationr)   r    Nc                    sd   t  | |j| _t|| _tj|jdd| _|jdkr't	|j
d |jnt | _|   d S )NT)inplacer   r{   )r;   r<   
num_labelsr  r   r   rv   classifier_dropout_probrx   rr   r   Identity
classifierr  rF   r)   rG   r%   r&   r<     s   
$z(MobileViTForImageClassification.__init__)r  r  r  r   r   r   labelsr   c                 C   sh  |dur|n| j j}| j|||d}|r|jn|d }| | |}d}|dur| j jdu rS| jdkr9d| j _n| jdkrO|jt	j
ksJ|jt	jkrOd| j _nd| j _| j jdkrqt }	| jdkrk|	| | }n+|	||}n%| j jdkrt }	|	|d| j|d}n| j jdkrt }	|	||}|s|f|dd  }
|dur|f|
 S |
S t|||jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr!  r   
regressionsingle_label_classificationmulti_label_classificationr{   r!   )losslogitsr   )r)   r$  r   r#  r1  rx   problem_typer.  dtyperP   longr#   r
   squeezer	   r}   r   r   r   )rF   r   r   r3  r   outputsr'  r8  r7  loss_fctr   r%   r%   r&   rK   '  s>   

"


z'MobileViTForImageClassification.forwardNNNN)rL   rM   rN   r   r<   r   r(  r   _IMAGE_CLASS_CHECKPOINTr   r*  _IMAGE_CLASS_EXPECTED_OUTPUTr   rP   rQ   rO   r   r   rK   rR   r%   r%   rG   r&   r,    s0    
r,  c                       r   )MobileViTASPPPoolingr)   r*   r+   r    Nc              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )output_sizeTrelu)r*   r+   r,   r-   r1   r2   )r;   r<   r   AdaptiveAvgPool2dglobal_poolr(   r   )rF   r)   r*   r+   rG   r%   r&   r<   f  s   
zMobileViTASPPPooling.__init__rI   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nr   r   Fr   )r   rF  r   r   r   r   )rF   rI   spatial_sizer%   r%   r&   rK   u  s
   

zMobileViTASPPPooling.forwardrl   r%   r%   rG   r&   rB  e  s    rB  c                       @   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r)   r    Nc                    s   t     jd  jt jdkrtdt | _	t
 ddd}| j	| | j	 fdd jD  t }| j	| t
 d	 ddd| _tj jd
| _d S )Nr   r   z"Expected 3 values for atrous_ratesr   rD  rV   c              
      s    g | ]}t  d |ddqS )r   rD  )r*   r+   r,   r0   r2   )r(   )r   rater)   r*   r+   r%   r&   
<listcomp>  s    	z*MobileViTASPP.__init__.<locals>.<listcomp>r   )p)r;   r<   r   aspp_out_channelsr   atrous_ratesr=   r   rc   convsr(   rf   extendrB  projectrv   aspp_dropout_probrx   )rF   r)   in_projection
pool_layerrG   rK  r&   r<     s2   


	zMobileViTASPP.__init__rI   c                 C   sD   g }| j D ]	}||| qtj|dd}| |}| |}|S r   )rP  rf   rP   r   rR  rx   )rF   rI   pyramidconvpooled_featuresr%   r%   r&   rK     s   


zMobileViTASPP.forward
rL   rM   rN   r`   r   r<   rP   rQ   rK   rR   r%   r%   rG   r&   rI  }  s    +rI  c                       rH  )
MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r)   r    Nc              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r*   r+   r,   r1   r2   r/   )r;   r<   rI  asppr   	Dropout2dr/  rx   r(   rN  r.  r1  r2  rG   r%   r&   r<     s   

zMobileViTDeepLabV3.__init__r   c                 C   s&   |  |d }| |}| |}|S )Nr{   )r[  rx   r1  )rF   r   rI   r%   r%   r&   rK     s   

zMobileViTDeepLabV3.forwardrY  r%   r%   rG   r&   rZ    s    rZ  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       s~   e Zd Zdeddf fddZeeeee	d				dde
ej de
ej d	e
e d
e
e deeef f
ddZ  ZS ) MobileViTForSemanticSegmentationr)   r    Nc                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r  )r;   r<   r.  r  r   rZ  segmentation_headr  r2  rG   r%   r&   r<     s
   
z)MobileViTForSemanticSegmentation.__init__)r  r  r   r3  r   r   c                 C   s  |dur|n| j j}|dur|n| j j}| j|d|d}|r!|jn|d }| |}d}|durU| j jdkr:tdtj	j
||jdd ddd	}	t| j jd
}
|
|	|}|sw|rc|f|dd  }n	|f|dd  }|duru|f| S |S t|||r|jddS dddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```NTr!  r   z/The number of labels should be greater than oner   r   Fr   )ignore_indexr!   )r7  r8  r   
attentions)r)   r   r$  r   r   r^  r.  r=   r   r   r   r   r	   semantic_loss_ignore_indexr   )rF   r   r3  r   r   r=  encoder_hidden_statesr8  r7  upsampled_logitsr>  r   r%   r%   r&   rK     sB   '

z(MobileViTForSemanticSegmentation.forwardr?  )rL   rM   rN   r   r<   r   r(  r   r   r*  r   rP   rQ   rO   r   r   rK   rR   r%   r%   rG   r&   r]    s&    


r]  )r   N)Dr`   r   typingr   r   r   r   r   rP   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   configuration_mobilevitr   
get_loggerrL   loggerr*  r)  r+  r@  rA  'MOBILEVIT_PRETRAINED_MODEL_ARCHIVE_LISTr#   r'   Moduler(   rS   ra   rm   r   r   r   r   r   r   r   r   r   MOBILEVIT_START_DOCSTRINGr(  r  r,  rB  rI  rZ  r]  r%   r%   r%   r&   <module>   sn   
 @03 "eWN;