# coding=utf-8
""" PyTorch GroupViT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"

GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "nvidia/groupvit-gcc-yfcc",
]


# contrastive loss function, adapted from CLIP
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def hard_softmax(logits: torch.Tensor, dim: int) -> torch.Tensor:
    y_soft = logits.softmax(dim)
    # Straight-through: hard one-hot in the forward pass, soft gradients in the backward pass.
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
    ret = y_hard - y_soft.detach() + y_soft

    return ret


def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
    gumbel_dist = torch.distributions.gumbel.Gumbel(
        torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
        torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
    )
    gumbels = gumbel_dist.sample(logits.shape)

    gumbels = (logits + gumbels) / tau  # ~Gumbel(logits, tau)

    y_soft = gumbels.softmax(dim)

    if hard:
        # Straight-through.
        index = y_soft.max(dim, keepdim=True)[1]
        y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
        ret = y_hard - y_soft.detach() + y_soft
    else:
        # Reparametrization trick.
        ret = y_soft

    return ret


def resize_attention_map(attentions, height, width, align_corners=False):
    """
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
       g      ?r   r   bilinear)sizemodealign_corners)rD   intnproundreshaper   r   interpolate)	
attentionsheightwidthrL   scale
feat_widthfeat_height
batch_sizegroupsr!   r!   r"   resize_attention_mapa   s   

rZ   c                 C   s   g }t  7 d}| D ]*}|ddd }|du r|}n|| }t|ddd g|R  }|| qW d   n1 s@w   Y  |d }|S )a1  
    Args:
        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   rH   r   r:   )r   no_gradpermute
contiguousrZ   append)rR   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupingr!   r!   r"   get_grouping_from_attentions   s   	
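# A minimal illustrative sketch (not part of the library API): both `hard_softmax` and
# `gumbel_softmax(..., hard=True)` return one-hot assignments in the forward pass, while the
# straight-through term `y_hard - y_soft.detach() + y_soft` keeps gradients flowing through the
# soft distribution. Assuming an arbitrary tensor of logits:
#
#     logits = torch.randn(2, 5, 7, requires_grad=True)   # [batch, tokens, groups]
#     assignments = hard_softmax(logits, dim=-1)           # exactly one 1.0 per token
#     assert torch.equal(assignments.sum(dim=-1), torch.ones(2, 5))
#     assignments.sum().backward()                         # gradients reach `logits` via the soft term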
class GroupViTCrossAttentionLayer(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.attn = GroupViTAttention(config)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, query, key):
        x = query
        # group tokens (queries) attend over the image tokens (keys/values)
        x = x + self.attn(query, encoder_hidden_states=key)[0]
        x = x + self.mlp(self.norm2(x))
        x = self.norm_post(x)
        return x
class GroupViTAssignAttention(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.scale = config.hidden_size**-0.5

        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.assign_eps = config.assign_eps

    def get_attn(self, attn, gumbel=True, hard=True):
        if gumbel and self.training:
            attn = gumbel_softmax(attn, dim=-2, hard=hard)
        else:
            if hard:
                attn = hard_softmax(attn, dim=-2)
            else:
                attn = nn.functional.softmax(attn, dim=-2)

        return attn

    def forward(self, query, key):
        value = key
        # [batch_size, query_length, channels]
        query = self.q_proj(query)
        # [batch_size, key_length, channels]
        key = self.k_proj(key)
        # [batch_size, key_length, channels]
        value = self.v_proj(value)

        # [batch_size, query_length, key_length]
        raw_attn = (query @ key.transpose(1, 2)) * self.scale

        attn = self.get_attn(raw_attn)
        soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)

        attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)

        out = attn @ value

        out = self.proj(out)

        return out, soft_attn
class GroupViTTokenAssign(nn.Module):
    def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
        super().__init__()
        self.num_output_group = num_output_group
        # norm on group tokens
        self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        assign_mlp_ratio = (
            config.assign_mlp_ratio
            if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
            else (config.assign_mlp_ratio, config.assign_mlp_ratio)
        )
        tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
        self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
        self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # norm on x
        self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pre_assign_attn = GroupViTCrossAttentionLayer(config)

        self.assign = GroupViTAssignAttention(config)
        self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim)

    def project_group_token(self, group_tokens):
        """
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        """
        # [batch_size, num_output_groups, channels] <- [batch_size, num_group_tokens, channels]
        projected_group_tokens = self.mlp_inter(group_tokens)
        projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
        return projected_group_tokens

    def forward(self, image_tokens, group_tokens):
        """
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        """

        group_tokens = self.norm_tokens(group_tokens)
        image_tokens = self.norm_x(image_tokens)
        # [batch_size, num_output_groups, channels]
        projected_group_tokens = self.project_group_token(group_tokens)
        projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
        new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
        new_image_tokens += projected_group_tokens

        new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))

        return new_image_tokens, attention
ejed< dZejed< dZejed< dZejed< dZeed	< dZeed
< dee fddZdS )GroupViTModelOutputa\  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTVisionModel`].
    Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r   r   N)getattrto_tuple)r   krw   r!   r"   	<genexpr>@  s
    
z/GroupViTModelOutput.to_tuple.<locals>.<genexpr>)tuplekeysr   r!   r   r"   r   ?  s   zGroupViTModelOutput.to_tuple)r   r   r   __doc__r   r   r   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   r     s   
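# Illustrative sketch (hypothetical `model`/`inputs`, not part of the library API): the output behaves
# like a dict/tuple hybrid, so the contrastive logits and per-modality embeddings can be read directly:
#
#     output = model(**inputs)                              # `model` is a GroupViTModel
#     probs = output.logits_per_image.softmax(dim=1)        # image-to-text similarity per text prompt
#     text_embeds, image_embeds = output.text_embeds, output.image_embeds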
class GroupViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x
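# Illustrative sketch (assumed default sizes): with a 224x224 input and 16x16 patches the module
# produces (224 // 16) ** 2 = 196 patch tokens, each projected to `embed_dim` channels:
#
#     patcher = GroupViTPatchEmbeddings(image_size=224, patch_size=16, num_channels=3, embed_dim=768)
#     tokens = patcher(torch.randn(1, 3, 224, 224))
#     assert tokens.shape == (1, 196, 768)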
  ZS )GroupViTVisionEmbeddingsrg   c                    sp   t    t|j|j|j|jd| _| jj}t	
td||j| _t	|j| _t	j|j|jd| _|| _d S )N)r   r   r   r   r   ri   )rk   rl   r   r   r   r   rp   patch_embeddingsr   r   	Parameterr   zerosposition_embeddingsDropoutdropoutro   rq   	layernormrg   )rw   rg   r   rx   r!   r"   rl   i  s   

z!GroupViTVisionEmbeddings.__init__
embeddingsrS   rT   r   c                 C   s   |j d }|| jj d kr||kr| jS | j}|j d }|j d }|| jj }|| jj }	|d |	d }}	t| }
}|dt|
t||dddd}||
 |	| f}t	j
j||ddd	}|dddddd|}|S )
a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   r:   g?r   r   rH   bicubicF)scale_factorrK   rL   )rD   r   rg   r   mathsqrtrP   rM   r\   r   r   rQ   view)rw   r   rS   rT   npatchpatch_pos_embednum_original_pos_embedr)   rW   rV   original_heightoriginal_widthreshaped_patch_pos_embedr   r!   r!   r"   r   x  s,   
	

z1GroupViTVisionEmbeddings.interpolate_pos_encodingFr   r   c           
      C   sd   |j \}}}}| j||d}| |}| \}}}	|r&|| ||| }n|| j }| |}|S )N)r   )rD   r   r   rJ   r   r   r   )
rw   r   r   rX   r   rS   rT   r   seq_len_r!   r!   r"   r~     s   


z GroupViTVisionEmbeddings.forwardr   )r   r   r   r   rl   r   r   rM   r   r   r~   r   r!   r!   rx   r"   r   h  s    $"r   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )GroupViTTextEmbeddingsrg   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nposition_ids)r   r:   F)
persistent)rk   rl   rp   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr   r   expandrw   rg   r   rx   r!   r"   rl     s   

zGroupViTTextEmbeddings.__init__N	input_idsr   inputs_embedsr   c                 C   sb   |d ur	|j d n|j d }|d u r| jd d d |f }|d u r&| |}| |}|| }|S )Nr:   r   )rD   r   r   r   )rw   r   r   r   
seq_lengthr   r   r!   r!   r"   r~     s   

zGroupViTTextEmbeddings.forwardNNN)r   r   r   r   rl   r   r   
LongTensorr   r   r~   r   r!   r!   rx   r"   r     s    r   c                
       s   e Zd ZdZdededededef
 fddZed	d
 Zdd Z	dde
jdee
j de
jfddZ		dde
jdee
j dee dee
j fddZ  ZS )GroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rg   depthnum_prev_group_tokenr   r   c                    s   t    || _|| _|dkrttd| j| _	nd | _	t
 fddt|D | _|dkr;t ||d| _nd | _|dkr^|dkr^ttj j jdt | jd || _d S d | _d S )Nr   r   c                       g | ]}t  qS r!   GroupViTEncoderLayerr   r   r   r!   r"   r         z*GroupViTStage.__init__.<locals>.<listcomp>)rg   r   r   ri   rH   )rk   rl   r  r   r   r   r   r   rp   group_token
ModuleListrangelayersr   
downsample
Sequentialro   rq   r   group_projector)rw   rg   r  r  r   r   rx   r   r"   rl     s(   



zGroupViTStage.__init__c                 C   s
   | j d uS N)r	  r   r!   r!   r"   with_group_token  s   
zGroupViTStage.with_group_tokenc                 C   s>   | j r|d d d | j f |d d | j d f fS |d fS r  )r  r   rw   r}   r!   r!   r"   split_x  s   0zGroupViTStage.split_xNr}   r	  r   c                 C   s   |d u r|S t j||gddS )Nr   r   )r   cat)rw   r}   r	  r!   r!   r"   concat_x  s   zGroupViTStage.concat_xFhidden_statesprev_group_tokenoutput_attentionsc                 C   s   | j r| j|ddd}| jdur|| | }nd}|}| ||}| jD ]}||ddd}|d }q(| |\}}d}	| jdurL| ||\}}	||f}
|rW|
|	f }
|
S )a  
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        """
        if self.with_group_token:
            group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
            if self.group_projector is not None:
                group_token = group_token + self.group_projector(prev_group_token)
        else:
            group_token = None

        x = hidden_states

        cat_x = self.concat_x(x, group_token)
        for layer in self.layers:
            layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
            cat_x = layer_out[0]

        x, group_token = self.split_x(cat_x)

        attention = None
        if self.downsample is not None:
            x, attention = self.downsample(x, group_token)

        outputs = (x, group_token)

        if output_attentions:
            outputs = outputs + (attention,)

        return outputs


class GroupViTMLP(nn.Module):
    def __init__(
        self,
        config: GroupViTVisionConfig,
        hidden_size: Optional[int] = None,
        intermediate_size: Optional[int] = None,
        output_size: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        output_size = output_size if output_size is not None else hidden_size
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, output_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class GroupViTMixerMLP(GroupViTMLP):
    def forward(self, x):
        # mix along the token dimension instead of the channel dimension
        x = super().forward(x.transpose(1, 2))
        return x.transpose(1, 2)


class GroupViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()
        is_cross_attention = encoder_hidden_states is not None

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        if is_cross_attention:
            key_states = self._shape(self.k_proj(encoder_hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(encoder_hidden_states), -1, bsz)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal attention mask first, then the padding attention mask
        if causal_attention_mask is not None:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # keep a reshaped copy of the attention weights so they can be returned
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class GroupViTEncoderLayer(nn.Module):
    def __init__(self, config: GroupViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GroupViTAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class GroupViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GroupViTConfig
    base_model_prefix = "groupvit"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        init_range = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=init_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        factor = self.config.initializer_factor
        if isinstance(module, GroupViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, GroupViTAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, GroupViTMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)


GROUPVIT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

GROUPVIT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class GroupViTVisionEncoder(nn.Module):
    def __init__(self, config: GroupViTVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.stages = nn.ModuleList(
            [
                GroupViTStage(
                    config=config,
                    depth=config.depths[i],
                    num_group_token=config.num_group_tokens[i],
                    num_output_group=config.num_output_groups[i],
                    num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
                )
                for i in range(len(config.depths))
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        all_hidden_states = () if output_hidden_states else None
        all_groupings = () if output_attentions else None

        group_tokens = None

        for stage in self.stages:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = stage(hidden_states, group_tokens, output_attentions)

            hidden_states = layer_outputs[0]
            group_tokens = layer_outputs[1]

            if output_attentions and layer_outputs[2] is not None:
                all_groupings = all_groupings + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
        )


class GroupViTTextEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    """

    def __init__(self, config: GroupViTTextConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated
                vectors than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class GroupViTTextTransformer(nn.Module):
    def __init__(self, config: GroupViTTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = GroupViTTextEmbeddings(config)
        self.encoder = GroupViTTextEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # GroupViT's text model uses a causal mask; prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # take features from the end-of-sequence token embedding (the highest token id in each sequence)
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # take features from the first occurrence of the configured eos token id
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class GroupViTTextModel(GroupViTPreTrainedModel):
    config_class = GroupViTTextConfig

    def __init__(self, config: GroupViTTextConfig):
        super().__init__(config)
        self.text_model = GroupViTTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class GroupViTVisionTransformer(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = GroupViTVisionEmbeddings(config)
        self.encoder = GroupViTVisionEncoder(config)
        self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            hidden_states=hidden_states,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        # normalize the last hidden state
        last_hidden_state = self.layernorm(last_hidden_state)
        pooled_output = last_hidden_state.mean(dim=1)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class GroupViTVisionModel(GroupViTPreTrainedModel):
    config_class = GroupViTVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: GroupViTVisionConfig):
        super().__init__(config)
        self.vision_model = GroupViTVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> GroupViTPatchEmbeddings:
        return self.vision_model.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(GROUPVIT_START_DOCSTRING)
class GroupViTModel(GroupViTPreTrainedModel):
    config_class = GroupViTConfig

    def __init__(self, config: GroupViTConfig):
        super().__init__(config)

        if not isinstance(config.text_config, GroupViTTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type GroupViTTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, GroupViTVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.projection_intermediate_dim = config.projection_intermediate_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = GroupViTTextTransformer(text_config)
        self.vision_model = GroupViTVisionTransformer(vision_config)

        self.visual_projection = nn.Sequential(
            nn.Linear(self.vision_embed_dim, self.projection_intermediate_dim, bias=True),
            nn.BatchNorm1d(self.projection_intermediate_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
        )
        self.text_projection = nn.Sequential(
            nn.Linear(self.text_embed_dim, self.projection_intermediate_dim, bias=True),
            nn.BatchNorm1d(self.projection_intermediate_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
        )
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTTextModel`].

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GroupViTModelOutput, config_class=GroupViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_segmentation: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, GroupViTModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_segmentation = (
            output_segmentation if output_segmentation is not None else self.config.output_segmentation
        )
        if output_segmentation:
            output_attentions = True
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        seg_logits = None
        if output_segmentation:
            # grouped features: [batch_size_image, num_group, hidden_size]
            image_group_embeds = vision_outputs[0]
            # [batch_size_image x num_group, hidden_size]
            image_group_embeds = self.visual_projection(image_group_embeds.reshape(-1, image_group_embeds.shape[-1]))
            if output_hidden_states:
                attentions = vision_outputs[3]
            else:
                attentions = vision_outputs[2]
            # [batch_size_image, num_group, height, width]
            grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])

            # normalized features
            image_group_embeds = image_group_embeds / image_group_embeds.norm(dim=-1, keepdim=True)
            # [batch_size_image x num_group, batch_size_text]
            logits_per_image_group = torch.matmul(image_group_embeds, text_embeds.t()) * logit_scale
            # [batch_size_image, batch_size_text, num_group]
            logits_per_image_group = logits_per_image_group.reshape(
                image_embeds.shape[0], -1, text_embeds.shape[0]
            ).permute(0, 2, 1)

            # [batch_size_image, batch_size_text, height x width]
            flatten_grouping = grouping.reshape(grouping.shape[0], grouping.shape[1], -1)

            # [batch_size_image, batch_size_text, height, width]
            seg_logits = torch.matmul(logits_per_image_group, flatten_grouping) * logit_scale
            seg_logits = seg_logits.reshape(
                seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3]
            )

        loss = None
        if return_loss:
            loss = groupvit_loss(logits_per_text)

        if not return_dict:
            if seg_logits is not None:
                output = (
                    logits_per_image,
                    logits_per_text,
                    seg_logits,
                    text_embeds,
                    image_embeds,
                    text_outputs,
                    vision_outputs,
                )
            else:
                output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return GroupViTModelOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            segmentation_logits=seg_logits,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )