""" PyTorch FocalNet model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "FocalNetConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/focalnet-tiny"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/focalnet-tiny"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

FOCALNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/focalnet-tiny",
]


@dataclass
class FocalNetEncoderOutput(ModelOutput):
    """
    FocalNet encoder's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_statehidden_statesreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r    r#   r#   d/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/focalnet/modeling_focalnet.pyr   ?   s
   
 r   c                   @   s^   e Zd ZU dZdZejed< dZe	ej ed< dZ
e	eej  ed< dZe	eej  ed< dS )FocalNetModelOutputa  
    FocalNet model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class FocalNetMaskedImageModelingOutput(ModelOutput):
    """
    FocalNet masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlossreconstructionr   r   )r   r   r   r   r)   r   r    r!   r"   r*   r   r   r   r#   r#   r#   r$   r(   w      
 r(   c                   @   r'   )FocalNetImageClassifierOutputaS  
    FocalNet outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class FocalNetEmbeddings(nn.Module):
    """
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = FocalNetPatchEmbeddings(
            config=config,
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.embed_dim,
            use_conv_embed=config.use_conv_embed,
            is_stem=True,
        )
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        embeddings = self.dropout(embeddings)
        return embeddings, output_dimensions


class FocalNetPatchEmbeddings(nn.Module):
    def __init__(
        self,
        config,
        image_size,
        patch_size,
        num_channels,
        embed_dim,
        add_norm=False,
        use_conv_embed=False,
        is_stem=False,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.projection = nn.Conv2d(
                num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
            )
        else:
            self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        if add_norm:
            self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        else:
            self.norm = None

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # pad the input to make it divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        if self.norm is not None:
            embeddings = self.norm(embeddings)

        return embeddings, output_dimensions


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class FocalNetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)
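
# Sanity sketch for stochastic depth (illustrative only): in eval mode the layer
# is the identity; in train mode each sample survives with probability
# 1 - drop_prob and survivors are rescaled by 1 / (1 - drop_prob), so the
# expected activation is unchanged.
#
#     >>> layer = FocalNetDropPath(drop_prob=0.5)
#     >>> hidden = torch.ones(8, 16)
#     >>> _ = layer.eval()
#     >>> bool(torch.equal(layer(hidden), hidden))
#     True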
    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	FocalNetModulationra   Try   c           	         s$  t    || _|j| | _|j| | _|| _|j| _|j	| _	t
j|d| | jd  |d| _t
j||dd|d| _t
 | _t
||| _t
|| _t
 | _g | _t| jD ](}| j| | j }| jt
t
j|||d||d ddt
  | j| qY| jrt
j||jd| _d S d S )Nra   r   )bias)rc   rd   r   F)rc   rd   groupsre   r   r6   )r8   r9   dimfocal_windowsfocal_windowfocal_levelsfocal_levelfocal_factor use_post_layernorm_in_modulationnormalize_modulatorr   Linearprojection_inrk   projection_contextGELU
activationprojection_outrD   projection_dropout
ModuleListfocal_layerskernel_sizesrangeappend
SequentialrA   rB   	layernorm)	rG   r/   indexr   r   r   r   krc   rI   r#   r$   r9   E  s8   
 

zFocalNetModulation.__init__c           
      C   s.  |j d }| |dddd }t|||| jd fd\}}| _d}t| jD ]}| j	| |}||| jdd||d f   }q*| 
|jdddjddd}||| jdd| jdf   }| jrn|| jd  }| || _|| j }	|	dddd }	| jr| |	}	| |	}	| |	}	|	S )	z
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        """
        num_channels = hidden_state.shape[-1]

        # pre linear projection
        x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
        q, ctx, self.gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

        # context aggregation
        ctx_all = 0
        for level in range(self.focal_level):
            ctx = self.focal_layers[level](ctx)
            ctx_all = ctx_all + ctx * self.gates[:, level : level + 1]
        ctx_global = self.activation(ctx.mean(2, keepdim=True).mean(3, keepdim=True))
        ctx_all = ctx_all + ctx_global * self.gates[:, self.focal_level :]

        # normalize context
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        # focal modulation
        self.modulator = self.projection_context(ctx_all)
        x_out = q * self.modulator
        x_out = x_out.permute(0, 2, 3, 1).contiguous()
        if self.use_post_layernorm_in_modulation:
            x_out = self.layernorm(x_out)

        # post linear projection
        x_out = self.projection_out(x_out)
        x_out = self.projection_dropout(x_out)
        return x_out
d S r\   )r8   r9   r   r   fc1r   
hidden_actr   fc2rD   drop)rG   r/   in_featureshidden_featuresout_featuresr   rI   r#   r$   r9     s   
zFocalNetMlp.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r\   )r   r   r   r   )rG   r   r#   r#   r$   r[     s   




zFocalNetMlp.forward)NNry   r   r#   r#   rI   r$   r     s    	r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )FocalNetLayera  Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`Tuple[int]`):
            Input resulotion.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    ry   c                    s   t    || _|| _|| _|j| _|j| _tj	||j
d| _t|||| jd| _|dkr1t|nt | _tj	||j
d| _t||j }t|||| jd| _d| _d| _|jrwtj|jt| dd| _tj|jt| dd| _d S d S )Nr6   )r/   r   r   r   ry   )r/   r   r   r   rO   T)requires_grad)r8   r9   r/   r   input_resolutionrE   r   use_post_layernormr   rA   rB   norm1r   
modulationr   Identityr   norm2rx   	mlp_ratior   mlpgamma_1gamma_2use_layerscaler>   layerscale_valuer    ones)rG   r/   r   r   r   r   mlp_hidden_dimrI   r#   r$   r9     s.   
 zFocalNetLayer.__init__c           	   	   C   s   |\}}|j \}}}|}| jr|n| |}|||||}| |||| |}| js/|n| |}|| | j|  }|| | j| jrN| | 	|n| 	| |  }|S r\   )
rt   r   r   viewr   r   r   r   r   r   )	rG   r   input_dimensionsrp   rq   rV   rX   r2   shortcutr#   r#   r$   r[     s   $zFocalNetLayer.forward)ry   )r   r   r   r   r9   r[   r_   r#   r#   rI   r$   r     s     r   c                       sB   e Zd Z fddZdejdeeef deej fddZ  Z	S )FocalNetStagec              
      s  t     | _t j| _ fddt| jD }| | jd k r+|d  nd }| jd k r6tnd }dd t	d j
t jD }|t jd  t jd d   t fddt j D | _|d ur| d|d jd	d
| _nd | _d	| _d S )Nc                    s   g | ]	} j d |  qS )ra   )r3   .0i)r/   r#   r$   
<listcomp>  s    z*FocalNetStage.__init__.<locals>.<listcomp>r   c                 S   s   g | ]}|  qS r#   )item)r   r   r#   r#   r$   r     s    r   c              
      s0   g | ]}t  ttr| nd qS ))r/   r   r   r   r   )r   rf   listr   r/   r   r   r   r   r#   r$   r     s    ra   TF)r/   r0   r1   r2   r3   rm   r4   r5   )r8   r9   r/   lendepths
num_stagesr   r:   r    linspacedrop_path_ratesumr   r   layersr4   
downsamplepointing)rG   r/   r   r   r3   out_dimr   dprrI   r   r$   r9     s6   
 ,

zFocalNetStage.__init__r   r   rM   c           	      C   s|   |\}}| j D ]}|||}q|}| jd ur1|\}}|dd|jd d||}| |\}}n||||f}|||f}|S )Nr   ra   r   rN   )r   r   rw   reshapert   )	rG   r   r   rp   rq   layer_module!hidden_states_before_downsamplingrU   stage_outputsr#   r#   r$   r[     s   


zFocalNetStage.forward)
r   r   r   r9   r    r^   r   rx   r[   r_   r#   r#   rI   r$   r     s    .,r   c                       sd   e Zd Z fddZ			ddejdeeef dee	 dee	 d	ee	 d
e
eef fddZ  ZS )FocalNetEncoderc                    sH   t    t j| _ | _t fddt| jD | _	d| _
d S )Nc              	      s6   g | ]}t  |d  d|  d d|  fdqS )r   ra   r   )r/   r   r   )r   )r   i_layerr/   r<   r#   r$   r   ,  s    z,FocalNetEncoder.__init__.<locals>.<listcomp>F)r8   r9   r   r   r   r/   r   r   r   stagesgradient_checkpointing)rG   r/   r<   rI   r   r$   r9   &  s   

zFocalNetEncoder.__init__FTr   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrM   c                 C   s  |rdnd }|r
dnd }|r1|j \}}	}
|j|g||
R  }|dddd}||f7 }||f7 }t| jD ]\}}| jrI| jrI| |j||}n|||}|d }|d }|d }|d |d f}|r|r|j \}}	}
|j|g|d |d f|
R  }|dddd}||f7 }||f7 }q6|r|s|j \}}	}
|j|g||
R  }|dddd}||f7 }||f7 }q6|st	dd	 ||fD S t
|||d
S )Nr#   r   r
   r   ra   rN   c                 s   s    | ]	}|d ur|V  qd S r\   r#   )r   vr#   r#   r$   	<genexpr>n  s    z*FocalNetEncoder.forward.<locals>.<genexpr>)r   r   r   )rt   r   r   	enumerater   r   r|   _gradient_checkpointing_func__call__tupler   )rG   r   r   r   r   r   all_hidden_statesall_reshaped_hidden_statesrV   rX   hidden_sizereshaped_hidden_stater   stage_moduler   r   rU   r#   r#   r$   r[   8  s\   





zFocalNetEncoder.forward)FFT)r   r   r   r9   r    r^   r   rx   r   boolr   r   r[   r_   r#   r#   rI   r$   r   %  s$    

r   c                   @   s(   e Zd ZdZeZdZdZdZdd Z	dS )FocalNetPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = FocalNetConfig
    base_model_prefix = "focalnet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


FOCALNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`FocalNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

FOCALNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zVThe bare FocalNet Model outputting raw hidden-states without any specific head on top.c                       s   e Zd Zd fdd	Zdd Zeeeee	e
ded								dd
eej deej dee dee deee	f f
ddZ  ZS )FocalNetModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd | _|   d S )Nra   r   )rH   r6   )r8   r9   r/   r   r   r   rx   r3   num_featuresr.   rT   r   r=   encoderr   rA   rB   r   AdaptiveAvgPool1dpooler	post_init)rG   r/   add_pooling_layerrH   rI   r#   r$   r9     s   zFocalNetModel.__init__c                 C   s   | j jS r\   )rT   r;   r   r#   r#   r$   get_input_embeddings  s   z"FocalNetModel.get_input_embeddingsvision)
checkpointoutput_typer
  modalityexpected_outputNrK   rL   r   r   rM   c                 C   s   |dur|n| j j}|dur|n| j j}|du rtd| j||d\}}| j||||d}|d }| |}d}	| jdurM| |dd}	t	
|	d}	|s[||	f|dd  }
|
S t||	|j|jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        return FocalNetModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )


@add_start_docstrings(
    """FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)

        self.num_stages = len(config.depths)
        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FocalNetMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetMaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return FocalNetMaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@add_start_docstrings(
    """
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.focalnet = FocalNetModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=FocalNetImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return FocalNetImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@add_start_docstrings(
    """
    FocalNet backbone, to be used with frameworks like X-Decoder.
    """,
    FOCALNET_START_DOCSTRING,
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
    def __init__(self, config: FocalNetConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + config.hidden_sizes
        self.focalnet = FocalNetModel(config)

        # initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FOCALNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )