""" PyTorch GLPN model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_glpn import GLPNConfig


logger = logging.get_logger(__name__)


# General docstring
_CONFIG_FOR_DOC = "GLPNConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "vinvino02/glpn-kitti"
_EXPECTED_OUTPUT_SHAPE = [1, 512, 15, 20]

GLPN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "vinvino02/glpn-kitti",
    # See all GLPN models at https://huggingface.co/models?filter=glpn
]


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of any dimensionality, not just 4D conv feature maps
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output
class GLPNDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class GLPNOverlapPatchEmbeddings(nn.Module):
    """Construct the overlapping patch embeddings."""

    def __init__(self, patch_size, stride, num_channels, hidden_size):
        super().__init__()
        self.proj = nn.Conv2d(
            num_channels,
            hidden_size,
            kernel_size=patch_size,
            stride=stride,
            padding=patch_size // 2,
        )
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, pixel_values):
        embeddings = self.proj(pixel_values)
        _, _, height, width = embeddings.shape
        # (batch_size, num_channels, height, width) -> (batch_size, height*width, num_channels)
        # so the embeddings can be fed to a Transformer layer
        embeddings = embeddings.flatten(2).transpose(1, 2)
        embeddings = self.layer_norm(embeddings)
        return embeddings, height, width
class GLPNEfficientSelfAttention(nn.Module):
    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
    paper](https://arxiv.org/abs/2102.12122)."""

    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads

        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
                f"heads ({self.num_attention_heads})"
            )

        self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.sr_ratio = sequence_reduction_ratio
        if sequence_reduction_ratio > 1:
            self.sr = nn.Conv2d(
                hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
            )
            self.layer_norm = nn.LayerNorm(hidden_size)

    def transpose_for_scores(self, hidden_states):
        new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        hidden_states = hidden_states.view(new_shape)
        return hidden_states.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        height,
        width,
        output_attentions=False,
    ):
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        if self.sr_ratio > 1:
            batch_size, seq_len, num_channels = hidden_states.shape
            # reshape to (batch_size, num_channels, height, width)
            hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
            # apply sequence reduction
            hidden_states = self.sr(hidden_states)
            # reshape back to (batch_size, seq_len, num_channels)
            hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
            hidden_states = self.layer_norm(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class GLPNSelfOutput(nn.Module):
    def __init__(self, config, hidden_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GLPNAttention(nn.Module):
    def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
        super().__init__()
        self.self = GLPNEfficientSelfAttention(
            config=config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequence_reduction_ratio=sequence_reduction_ratio,
        )
        self.output = GLPNSelfOutput(config, hidden_size=hidden_size)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, height, width, output_attentions=False):
        self_outputs = self.self(hidden_states, height, width, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs
class GLPNDWConv(nn.Module):
    def __init__(self, dim=768):
        super().__init__()
        # depthwise 3x3 convolution applied on the (height, width) grid of tokens
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, hidden_states, height, width):
        batch_size, seq_len, num_channels = hidden_states.shape
        hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
        hidden_states = self.dwconv(hidden_states)
        hidden_states = hidden_states.flatten(2).transpose(1, 2)

        return hidden_states


class GLPNMixFFN(nn.Module):
    def __init__(self, config, in_features, hidden_features=None, out_features=None):
        super().__init__()
        out_features = out_features or in_features
        self.dense1 = nn.Linear(in_features, hidden_features)
        self.dwconv = GLPNDWConv(hidden_features)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
        self.dense2 = nn.Linear(hidden_features, out_features)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, height, width):
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.dwconv(hidden_states, height, width)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense2(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class GLPNLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.attention = GLPNAttention(
            config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequence_reduction_ratio=sequence_reduction_ratio,
        )
        self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        mlp_hidden_size = int(hidden_size * mlp_ratio)
        self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)

    def forward(self, hidden_states, height, width, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layer_norm_1(hidden_states),  # in GLPN, layernorm is applied before self-attention
            height,
            width,
            output_attentions=output_attentions,
        )

        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection (with stochastic depth)
        attention_output = self.drop_path(attention_output)
        hidden_states = attention_output + hidden_states

        mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)

        # second residual connection (with stochastic depth)
        mlp_output = self.drop_path(mlp_output)
        layer_output = mlp_output + hidden_states

        outputs = (layer_output,) + outputs

        return outputs
zGLPNLayer.forwardr   rS   r'   r'   r0   r(   r   #  s    r   c                       s,   e Zd Z fddZ			dddZ  ZS )GLPNEncoderc           	         sT  t     | _dd td jt jD }g }t j	D ]"}|
t j|  j| |dkr3 jn j|d   j| d qt|| _g }d}t j	D ]@}g }|dkrb| j|d  7 }t j| D ]}|
t  j|  j| |||   j|  j| d qi|
t| qQt|| _t fddt j	D | _d S )Nc                 S   s   g | ]}|  qS r'   )item).0xr'   r'   r(   
<listcomp>T  s    z(GLPNEncoder.__init__.<locals>.<listcomp>r   r   )rI   rC   rJ   rK   )rK   rV   r)   rf   r   c                    s   g | ]
}t  j| qS r'   )r   rG   hidden_sizes)r   ire   r'   r(   r   |  s    )r-   r.   re   r    linspacedrop_path_ratesumdepthsrangenum_encoder_blocksappendr?   patch_sizesstridesrJ   r   r   
ModuleListpatch_embeddingsr   rV   	sr_ratios
mlp_ratiosblockrH   )	r/   re   dprrO   r   blockscurlayersjr0   r   r(   r.   O  sH   
 


zGLPNEncoder.__init__FTc                 C   s   |rdnd }|r
dnd }|j d }|}tt| j| j| jD ]H\}	}
|
\}}}||\}}}t|D ]\}}|||||}|d }|rJ||d f }q2||}||||ddddd }|rf||f }q|sut	dd |||fD S t
|||d	S )
Nr'   r   r   rg   r   r@   c                 s   s    | ]	}|d ur|V  qd S r,   r'   )r   vr'   r'   r(   	<genexpr>  s    z&GLPNEncoder.forward.<locals>.<genexpr>last_hidden_stater3   
attentions)r   	enumeratezipr   r   rH   rp   rj   rv   tupler	   )r/   rN   rw   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsry   r3   idxr   embedding_layerblock_layer
norm_layerrQ   rR   r   blklayer_outputsr'   r'   r(   r4     s2   

 
zGLPNEncoder.forward)FFTr   r'   r'   r0   r(   r   N  s    3r   c                   @   s$   e Zd ZdZeZdZdZdd ZdS )GLPNPreTrainedModelz
class GLPNPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GLPNConfig
    base_model_prefix = "glpn"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


GLPN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`GLPNConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

GLPN_INPUTS_DOCSTRING = r"""

    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`GLPNImageProcessor.__call__`] for details.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
    GLPN_START_DOCSTRING,
)
class GLPNModel(GLPNPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # hierarchical Transformer encoder
        self.encoder = GLPNEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(GLPN_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
class GLPNSelectiveFeatureFusion(nn.Module):
    """
    Selective Feature Fusion module, as explained in the [paper](https://arxiv.org/abs/2201.07436) (section 3.4). This
    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
    """

    def __init__(self, in_channel=64):
        super().__init__()

        self.convolutional_layer1 = nn.Sequential(
            nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(in_channel),
            nn.ReLU(),
        )

        self.convolutional_layer2 = nn.Sequential(
            nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(int(in_channel / 2)),
            nn.ReLU(),
        )

        self.convolutional_layer3 = nn.Conv2d(
            in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1, padding=1
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, local_features, global_features):
        # concatenate local and global features along the channel dimension
        features = torch.cat((local_features, global_features), dim=1)
        # pass through the convolutional layers to obtain a two-channel attention map
        features = self.convolutional_layer1(features)
        features = self.convolutional_layer2(features)
        features = self.convolutional_layer3(features)
        attn = self.sigmoid(features)
        # weigh each feature map by its attention channel and add them element-wise
        hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[
            :, 1, :, :
        ].unsqueeze(1)

        return hybrid_features


class GLPNDecoderStage(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        should_skip = in_channels == out_channels
        self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity()
        self.fusion = GLPNSelectiveFeatureFusion(out_channels)
        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)

    def forward(self, hidden_state, residual=None):
        hidden_state = self.convolution(hidden_state)
        if residual is not None:
            hidden_state = self.fusion(hidden_state, residual)
        hidden_state = self.upsample(hidden_state)

        return hidden_state
zGLPNDecoderStage.forwardr,   r   r'   r'   r0   r(   r  K  s    r  c                       s:   e Zd Z fddZdeej deej fddZ  ZS )GLPNDecoderc                    s\   t    |jd d d }|j t fdd|D | _d | jd _tjdddd| _	d S )	Nrg   c                    s   g | ]}t | qS r'   )r  )r   rK   r  r'   r(   r   g  s    z(GLPNDecoder.__init__.<locals>.<listcomp>r   r@   r  Fr  )
r-   r.   r   decoder_hidden_sizer   r   stagesr  r  final_upsample)r/   re   reserved_hidden_sizesr0   r$  r(   r.   `  s   
zGLPNDecoder.__init__r3   r   c                 C   sN   g }d }t |d d d | jD ]\}}|||}|| q| ||d< |S )Nrg   )r   r&  r   r'  )r/   r3   stage_hidden_statesstage_hidden_stater!  stager'   r'   r(   r4   n  s   
zGLPNDecoder.forward	r7   r8   r9   r.   r   r    r<   r4   r>   r'   r'   r0   r(   r#  _  s    &r#  c                       r  )	SiLogLossz
    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283).

    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
    y_{i}^{*}$.

          ?c                    r+   r,   )r-   r.   lambd)r/   r/  r0   r'   r(   r.     r2   zSiLogLoss.__init__c                 C   sX   |dk  }t|| t||  }tt|d | jt| d  }|S )Nr   r@   )detachr    logrs   powr   r/  )r/   predtarget
valid_maskdiff_loglossr'   r'   r(   r4     s   ,zSiLogLoss.forward)r.  rS   r'   r'   r0   r(   r-  z  s    r-  c                       s6   e Zd Z fddZdeej dejfddZ  ZS )GLPNDepthEstimationHeadc                    sR   t    || _|j}ttj||ddddtjddtj|ddddd| _d S )Nr   r   rA   F)inplace)	r-   r.   re   r%  r   r  rE   r  head)r/   re   channelsr0   r'   r(   r.     s   


z GLPNDepthEstimationHead.__init__r3   r   c                 C   s8   || j j }| |}t|| j j }|jdd}|S )Nr   rn   )re   head_in_indexr:  r    r  	max_depthsqueeze)r/   r3   predicted_depthr'   r'   r(   r4     s
   
zGLPNDepthEstimationHead.forwardr,  r'   r'   r0   r(   r8    s    "r8  z]GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.c                       s   e Zd Z fddZeedeee	d				dde
jdee
j dee d	ee d
ee deee
j ef fddZ  ZS )GLPNForDepthEstimationc                    s6   t  | t|| _t|| _t|| _|   d S r,   )	r-   r.   r   r   r#  decoderr8  r:  r   r   r0   r'   r(   r.     s
   


zGLPNForDepthEstimation.__init__zbatch_size, sequence_length)r   r   NrN   labelsrw   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}| j||d|d}|r"|jn|d }| |}| |}	d}
|dur>t }||	|}
|s`|rL|	f|dd  }n	|	f|dd  }|
dur^|
f| S |S t|
|	|rh|jnd|j	dS )a  
        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     predicted_depth = outputs.predicted_depth

        >>> # interpolate to original size
        >>> prediction = torch.nn.functional.interpolate(
        ...     predicted_depth.unsqueeze(1),
        ...     size=image.size[::-1],
        ...     mode="bicubic",
        ...     align_corners=False,
        ... )

        >>> # visualize the prediction
        >>> output = prediction.squeeze().cpu().numpy()
        >>> formatted = (output * 255 / np.max(output)).astype("uint8")
        >>> depth = Image.fromarray(formatted)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.glpn(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states for the decoder
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        out = self.decoder(hidden_states)
        predicted_depth = self.head(out)

        loss = None
        if labels is not None:
            loss_fct = SiLogLoss()
            loss = loss_fct(predicted_depth, labels)

        if not return_dict:
            if output_hidden_states:
                output = (predicted_depth,) + outputs[1:]
            else:
                output = (predicted_depth,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return DepthEstimatorOutput(
            loss=loss,
            predicted_depth=predicted_depth,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )