""" PyTorch ConvNextV2 model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ConvNextV2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

CONVNEXTV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/convnextv2-tiny-1k-224",
]


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # shape broadcasts over everything but the batch dimension, so whole samples are dropped
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize to 0 or 1
    output = input.div(keep_prob) * random_tensor
    return output
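
# Behavior sketch (illustrative comment, not upstream code): in training mode each
# sample's residual branch is either zeroed or rescaled by 1 / keep_prob, so the
# expected activation is unchanged; in eval mode the input passes through untouched.
#
#   >>> x = torch.ones(4, 3, 8, 8)
#   >>> out = drop_path(x, drop_prob=0.25, training=True)
#   >>> # each of the 4 samples is now all zeros or all 1 / 0.75
#   >>> drop_path(x, drop_prob=0.25, training=False) is x
#   True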


class ConvNextV2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class ConvNextV2GRN(nn.Module):
    """GRN (Global Response Normalization) layer"""

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # Compute and normalize global spatial feature maps
        global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True)
        norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states
        return hidden_states
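
# Descriptive note (not upstream code): `global_features` is the spatial L2 norm of each
# channel and `norm_features` rescales it by its mean over channels, so channels with an
# above-average response are amplified and the rest damped. Since `weight` and `bias` are
# zero-initialized, the layer starts as an exact identity via the residual `+ hidden_states`.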
 fdd	Zdejdejfdd	Z  ZS )ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class ConvNextV2Embeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
        )
        self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings
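
# Shape sketch (illustrative comment; numbers assume the tiny config, patch_size=4 and
# hidden_sizes[0]=96): pixel_values of shape (1, 3, 224, 224) become patch embeddings of
# shape (1, 96, 56, 56), one position per non-overlapping 4x4 patch, followed by a
# channels_first layer norm.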


class ConvNextV2Layer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), 1x1 Conv, GELU, 1x1 Conv],
    all in (N, C, H, W); (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear],
    then Permute back.

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0):
        super().__init__()
        # depthwise conv
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = ACT2FN[config.hidden_act]
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        input = hidden_states
        x = self.dwconv(hidden_states)
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        x = x.permute(0, 2, 3, 1)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        x = x.permute(0, 3, 1, 2)
        x = input + self.drop_path(x)
        return x
 fdd	Zdejdejfdd	Z  Z	S )ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates (`List[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.Sequential(
                ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
            )
        else:
            self.downsampling_layer = nn.Identity()
        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = nn.Sequential(
            *[ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        hidden_states = self.downsampling_layer(hidden_states)
        hidden_states = self.layers(hidden_states)
        return hidden_states


class ConvNextV2Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        drop_path_rates = [
            x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
        ]
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = ConvNextV2Stage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        for i, layer_module in enumerate(self.stages):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            hidden_states = layer_module(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)


class ConvNextV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ConvNextV2Config
    base_model_prefix = "convnextv2"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # normal weights, zero bias for Linear/Conv2d; LayerNorm gets unit weight, zero bias
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CONVNEXTV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CONVNEXTV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`ConvNextImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Model(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)

        # final layernorm layer
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)
        encoder_outputs = self.encoder(
            embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
        )
        last_hidden_state = encoder_outputs[0]

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@add_start_docstrings(
    """
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnextv2 = ConvNextV2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
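
# Descriptive note (not upstream code): the loss selection above follows the shared
# transformers convention; num_labels == 1 gives MSE regression, integer labels with
# num_labels > 1 give cross-entropy, and float multi-hot labels give BCEWithLogitsLoss
# for multi-label classification.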


@add_start_docstrings(
    """
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)
        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=return_dict)
        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=None,
        )