[Binary artifact: this section is the raw byte dump of a CPython 3.10 bytecode cache
(modeling_clipseg.cpython-310.pyc) compiled from
transformers/models/clipseg/modeling_clipseg.py ("PyTorch CLIPSeg model").
The compiled function bodies are not human-readable; only identifiers, constants,
and docstrings survive. Recoverable details:

Module:      transformers.models.clipseg.modeling_clipseg (PyTorch CLIPSeg model)
Checkpoint:  CIDAS/clipseg-rd64-refined
Helpers:     contrastive_loss, clipseg_loss
Outputs:     CLIPSegOutput, CLIPSegDecoderOutput, CLIPSegImageSegmentationOutput
Layers:      CLIPSegVisionEmbeddings, CLIPSegTextEmbeddings, CLIPSegAttention,
             CLIPSegMLP, CLIPSegEncoderLayer, CLIPSegEncoder
Models:      CLIPSegPreTrainedModel, CLIPSegTextTransformer, CLIPSegTextModel,
             CLIPSegVisionTransformer, CLIPSegVisionModel, CLIPSegModel
Decoder:     CLIPSegDecoderLayer, CLIPSegDecoder, CLIPSegForImageSegmentation

The readable implementation is the source file shipped with the transformers
package at transformers/models/clipseg/modeling_clipseg.py.