# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""

from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ... import AutoBackbone
from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from .configuration_upernet import UperNetConfig


UPERNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "openmmlab/upernet-convnext-tiny",
]

# General docstring
_CONFIG_FOR_DOC = "UperNetConfig"


class UperNetConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
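
    With `padding` matched to the kernel (e.g. `kernel_size=3`, `padding=1`), the spatial size of
    the input is preserved and only the channel dimension changes from `in_channels` to
    `out_channels`.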
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.batch_norm(output)
        output = self.activation(output)
        return output

class UperNetPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            UperNetConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state

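# Note: each UperNetPyramidPoolingBlock reduces its input to a fixed `pool_scale` x `pool_scale`
# grid via AdaptiveAvgPool2d, then projects it to `channels` with a 1x1 UperNetConvModule; the
# upsampling back to the input resolution happens in UperNetPyramidPoolingModule below.
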

class UperNetPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`Tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
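
    Each scale yields a fixed-size pooled map that is projected to `channels` and bilinearly
    upsampled back to the input resolution, so the per-scale outputs can be concatenated with the
    original feature map.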
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = UperNetPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs

ej	dej	fddZ
  ZS )UperNetHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).
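
    The last backbone feature map is summarized by a Pyramid Pooling Module, while the earlier
    maps are fused through lateral 1x1 convolutions and a top-down path, FPN-style, before a final
    3x3 bottleneck and a 1x1 classifier.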
    """

    def __init__(self, config, in_channels):
        super().__init__()

        self.config = config
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = in_channels
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = UperNetPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = UperNetConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = UperNetConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = UperNetConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output

dZdd Zdd Z	de
jde
jfddZ  ZS )UperNetFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config:
            Configuration.
        in_index (int):
            Index of the backbone feature map that the head operates on. Default: 2.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
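
    Note: the number of input channels for the convolutions comes from
    `config.auxiliary_in_channels`, and the number of stacked conv blocks from
    `config.auxiliary_num_convs`.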
    """

    def __init__(
        self, config, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()

        self.config = config
        self.in_channels = config.auxiliary_in_channels
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            UperNetConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                UperNetConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = UperNetConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output

zUperNetFCNHead.forward)rF   r   r   )r*   r+   r,   r-   r.   r   r   r   rf   rd   r1   r2   r)   r3   r$   r$   r"   r%   r      s    $r   c                   @   s(   e Zd ZdZeZdZdd Zdd ZdS )UperNetPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
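
    Weight initialization is delegated: `_init_weights` calls `init_weights` on the backbone and
    on each head rather than initializing submodules directly.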
    """

    config_class = UperNetConfig
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        if isinstance(module, UperNetPreTrainedModel):
            module.backbone.init_weights()
            module.decode_head.init_weights()
            if module.auxiliary_head is not None:
                module.auxiliary_head.init_weights()

    def init_weights(self):
        """Initialize the weights"""
        self.backbone.init_weights()
        self.decode_head.init_weights()
        if self.auxiliary_head is not None:
            self.auxiliary_head.init_weights()


UPERNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UPERNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    """UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
    UPERNET_START_DOCSTRING,
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
ej de
e de
e d	e
ej d
e
e deeef fddZ  ZS )UperNetForSemanticSegmentationc                    sL   t  | t|j| _t|| jjd| _|j	rt
|nd | _|   d S )N)r   )r   r   r	   from_configbackbone_configr   rQ   r6   r   use_auxiliary_headr   r   	post_init)r!   rT   r"   r$   r%   r   \  s
   z'UperNetForSemanticSegmentation.__init__zbatch_size, sequence_length)output_typer   Nr   output_attentionsoutput_hidden_stateslabelsreturn_dictr   c                 C   sd  |dur|n| j j}|dur|n| j j}|dur|n| j j}| jj|||d}|j}| |}tj	j
||jdd ddd}d}	| jdurY| |}	tj	j
|	|jdd ddd}	d}
|dur| j jdkritdt| j jd	}|||}
|	dur||	|}|
| j j| 7 }
|s|r|f|dd  }n	|f|dd  }|
dur|
f| S |S t|
||j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
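            When `config.use_auxiliary_head` is set, the auxiliary head's cross-entropy loss is
            added to the main loss, scaled by `config.auxiliary_loss_weight`.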

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
        >>> model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
        >>> list(logits.shape)
        [1, 150, 512, 512]
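
        >>> # Illustrative post-processing (an assumption, not part of the original example):
        >>> # a per-pixel class map can be obtained by taking the argmax over the label dimension.
        >>> segmentation = logits.argmax(dim=1)  # shape (batch_size, height, width)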
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        features = outputs.feature_maps

        logits = self.decode_head(features)
        logits = nn.functional.interpolate(logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)
            auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False
            )

        loss = None
        if labels is not None:
            if self.config.num_labels == 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                # compute weighted loss
                loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
                loss = loss_fct(logits, labels)
                if auxiliary_logits is not None:
                    auxiliary_loss = loss_fct(auxiliary_logits, labels)
                    loss += self.config.auxiliary_loss_weight * auxiliary_loss

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )