"""PyTorch ViTMatte model."""

from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn

from ... import AutoBackbone
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from .configuration_vitmatte import VitMatteConfig


VITMATTE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "hustvl/vitmatte-small-composition-1k",
]

_CONFIG_FOR_DOC = "VitMatteConfig"


@dataclass
class ImageMattingOutput(ModelOutput):
    """
    Class for outputs of image matting models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss.
        alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Estimated alpha values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r    r   r   d/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   ,   s   
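    # Note: at inference time only `alphas` is populated; `loss` stays `None`
    # because training is not implemented yet (see `VitMatteForImageMatting.forward`).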


class VitMattePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitMatteConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
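
# Note: `_init_weights` only touches plain `nn.Conv2d` layers -- the ViT backbone
# arrives with its own (pretrained) weights, and the decoder defined below is
# convolution-only, so no other module types need custom initialization here.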


class VitMatteBasicConv3x3(nn.Module):
    """
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
    """

    def __init__(self, config, in_channels, out_channels, stride=2, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
        self.relu = nn.ReLU()

    def forward(self, hidden_state):
        hidden_state = self.conv(hidden_state)
        hidden_state = self.batch_norm(hidden_state)
        hidden_state = self.relu(hidden_state)

        return hidden_state


class VitMatteConvStream(nn.Module):
    """
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    """

    def __init__(self, config):
        super().__init__()

        in_channels = config.backbone_config.num_channels
        out_channels = config.convstream_hidden_sizes

        self.convs = nn.ModuleList()
        self.conv_chans = [in_channels] + out_channels

        for i in range(len(self.conv_chans) - 1):
            in_chan_ = self.conv_chans[i]
            out_chan_ = self.conv_chans[i + 1]
            self.convs.append(VitMatteBasicConv3x3(config, in_chan_, out_chan_))

    def forward(self, pixel_values):
        out_dict = {"detailed_feature_map_0": pixel_values}
        embeddings = pixel_values
        for i in range(len(self.convs)):
            embeddings = self.convs[i](embeddings)
            name_ = "detailed_feature_map_" + str(i + 1)
            out_dict[name_] = embeddings

        return out_dict


class VitMatteFusionBlock(nn.Module):
    """
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    """

    def __init__(self, config, in_channels, out_channels):
        super().__init__()
        self.conv = VitMatteBasicConv3x3(config, in_channels, out_channels, stride=1, padding=1)

    def forward(self, features, detailed_feature_map):
        upscaled_features = nn.functional.interpolate(features, scale_factor=2, mode="bilinear", align_corners=False)
        out = torch.cat([detailed_feature_map, upscaled_features], dim=1)
        out = self.conv(out)

        return out


class VitMatteHead(nn.Module):
    """
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    """

    def __init__(self, config):
        super().__init__()

        in_channels = config.fusion_hidden_sizes[-1]
        mid_channels = 16

        self.matting_convs = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(True),
            nn.Conv2d(mid_channels, 1, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, hidden_state):
        hidden_state = self.matting_convs(hidden_state)

        return hidden_state


class VitMatteDetailCaptureModule(nn.Module):
    """
    Simple and lightweight Detail Capture Module for ViT Matting.
    """

    def __init__(self, config):
        super().__init__()
        if len(config.fusion_hidden_sizes) != len(config.convstream_hidden_sizes) + 1:
            raise ValueError(
                "The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1."
            )

        self.config = config
        self.convstream = VitMatteConvStream(config)
        self.conv_chans = self.convstream.conv_chans

        self.fusion_blocks = nn.ModuleList()
        self.fusion_channels = [config.hidden_size] + config.fusion_hidden_sizes

        for i in range(len(self.fusion_channels) - 1):
            self.fusion_blocks.append(
                VitMatteFusionBlock(
                    config=config,
                    in_channels=self.fusion_channels[i] + self.conv_chans[-(i + 1)],
                    out_channels=self.fusion_channels[i + 1],
                )
            )

        self.matting_head = VitMatteHead(config)

    def forward(self, features, pixel_values):
        detail_features = self.convstream(pixel_values)
        for i in range(len(self.fusion_blocks)):
            detailed_feature_map_name = "detailed_feature_map_" + str(len(self.fusion_blocks) - i - 1)
            features = self.fusion_blocks[i](features, detail_features[detailed_feature_map_name])

        alphas = torch.sigmoid(self.matting_head(features))

        return alphas
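
# Channel arithmetic for the fusion blocks, assuming the small-model defaults
# (hidden_size=384, fusion_hidden_sizes=[256, 128, 64, 32],
# convstream_hidden_sizes=[48, 96, 192]; hedged -- check `VitMatteConfig`):
#   block 0: 384 + 192 -> 256    block 2: 128 + 48 -> 64
#   block 1: 256 +  96 -> 128    block 3:  64 +  4 -> 32
# Detail maps are consumed deepest-first, so the last block works at full input
# resolution, right before the matting head.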


VITMATTE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitMatteConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITMATTE_INPUTS_DOCSTRING = r"""
    Args:
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`VitMatteImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    """ViTMatte framework leveraging any vision backbone to perform image matting.""",
    VITMATTE_START_DOCSTRING,
)
class VitMatteForImageMatting(VitMattePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.backbone = AutoBackbone.from_config(config.backbone_config)
        self.decoder = VitMatteDetailCaptureModule(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VITMATTE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=ImageMattingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Returns:

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )

        features = outputs.feature_maps[-1]
        alphas = self.decoder(features, pixel_values)

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        if not return_dict:
            output = (alphas,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageMattingOutput(
            loss=loss,
            alphas=alphas,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
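

# End-to-end dataflow (a sketch; the sizes assume a patch-16 ViT-small backbone
# with hidden_size 384, as in "hustvl/vitmatte-small-composition-1k" -- treat
# them as illustrative):
#
#   pixel_values (B, 4, H, W)  --backbone-->  features (B, 384, H/16, W/16)
#                              --decoder--->  alphas   (B, 1, H, W)
#
# The four input channels are the RGB image plus the trimap that
# `VitMatteImageProcessor` stacks onto it.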