o
    hO                     @   s8  d dl mZ d dlmZmZmZmZ d dlZeG dd dZ	ddeej	 deej
eef dej	d	ed
ee f
ddZ	ddeej	 deej
eef dej	d	ed
ee f
ddZddej	dejdee fddZddej	dejdee fddZ	 	ddeej
eef dejdejd	ed
ee deej	 fddZdS )    )	dataclass)ListOptionalTupleUnionNc                   @   s  e Zd ZU dZeed< eed< d"dedee fddZ	d#ded	ed
ede	j
dee	jdf dee	j fddZ	d"de	jd	ede	j
d
ee de	jf
ddZe		d$de	jde	j
de	jdedee f
ddZed"de	jde	j
dee fddZede	jde	jdeeef fd d!ZdS )%AttentionMaskConvertera9  
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with slided window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
    	is_causalsliding_windowNc                 C   s:   || _ || _| jd ur| jdkrtd| j dd S d S )Nr   zaMake sure that when passing `sliding_window` that its value is a strictly positive integer, not ``)r   r	   
ValueError)selfr   r	    r   [/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py__init__7   s   zAttentionMaskConverter.__init__cpu
batch_sizequery_lengthkey_value_lengthdtypedevicestrreturnc           	      C   s\   | j std| j d||f}|| }d}|d dks!| jdur,| j||||| jd}|S )z
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        z"Please use `to_causal_4d` only if z has `is_causal` set to True.N   r   past_key_values_lengthr	   )r   r   	__class__r	   _make_causal_mask)	r   r   r   r   r   r   input_shaper   causal_4d_maskr   r   r   to_causal_4d@   s   z#AttentionMaskConverter.to_causal_4dattention_mask_2dc           
      C   s   |j d |f}d}|d dks| jdur0| jr0|du rtd|| }| j|||j|| jd}n	| jdur9td| j|||d d|j}|durW|	|
 t|j}|}	|	S )	a  
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        r   Nr   r   zpThis attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask.r   z?Sliding window is currently only implemented for causal masking)tgt_len)shaper	   r   r   r   r   NotImplementedError_expand_masktomasked_fillbooltorchfinfomin)
r   r!   r   r   r   r   r   r   expanded_attn_maskexpanded_4d_maskr   r   r   to_4da   s0   
zAttentionMaskConverter.to_4dr   input_ids_shaper   c                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrFt j	t j
||||d|gdd}|durk|| d }	dt jt j|t jd|	d	 }
||
 t |j |ddddddf |d||| S )
zJ
        Make causal mask used for bi-directional self-attention.
        )r   r   r   r   r   r   )dimN)r   )diagonal)r)   fullr*   r+   arangesizemasked_fill_viewr&   catzerostriu	ones_likeintr(   expand)r/   r   r   r   r	   bszr"   mask	mask_condr2   context_maskr   r   r   r      s   "
 (z(AttentionMaskConverter._make_causal_maskr?   r"   c                 C   sj   |   \}}|dur|n|}| ddddddf |d|||}d| }||tjt|jS )zg
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        Nr   g      ?)r5   r=   r&   r'   r)   r(   r*   r+   )r?   r   r"   r>   src_lenexpanded_maskinverted_maskr   r   r   r%      s
   *z#AttentionMaskConverter._expand_maskrC   attention_maskunmasked_valuec           
      C   s&  t |jd dd}t j| | ddd}t |dkd }|jd dkr(| S || }t |}t |d}||	dd}d|||k< | 
 dkr| jd }|dkrc|dddf d|f}	n*|ddddf t |ddddf |dddddf f}	n
|dddf |f}	|| |	< | S )aC  
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `attention_mask` is
        ```
        [[0, 0, 1],
         [1, 1, 1],
         [0, 1, 1]]
        ```
        and `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        r   r   r   T)keepdim   N)r)   r4   r#   argmaxr   wheremax	unsqueezerepeatr5   r1   )
rC   rE   rF   tmpindicesleft_masked_rowsmax_lenrange_tensor	num_masks
mask_slicer   r   r   _unmask_unattended   s*   2

z)AttentionMaskConverter._unmask_unattendedN)r   r   N)__name__
__module____qualname____doc__r(   __annotations__r<   r   r   r)   r   r   r   Tensorr    r.   staticmethodSizer   r%   floatrU   r   r   r   r   r      sn   
 
&
."
r   rE   r   inputs_embedsr   r	   c                 C   s`   t d|d}|d | }| dur|j| |d ||jd} | S |j|d |d ||j|jd} | S )a  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Tr   r	   r   N)r   r   r   r0   )r   r.   r   r    r   )rE   r   ra   r   r	   attn_mask_converterr   r   r   r   !_prepare_4d_causal_attention_mask  s   rd   c                 C   s   t d|d}|d | }|\}}tj }	| dur3t| dkr2|	r#n!|dkr*d} n||kr1d} n	 n|dkr>||kr>d} n|	rDtd| du rLd}
|
S | du rb|j|d |d ||j|jd}
|
S |j	| |d |j|d	}
|dkrzt j
|
| d
d}
|
S )a  
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
    Trb   r   Nr   zAttention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.r   r0   )r   r   g        )rF   )r   r)   jit
is_tracingallr   r    r   r   r.   rU   )rE   r   ra   r   r	   rc   r   r   r   rf   r-   r   r   r   *_prepare_4d_causal_attention_mask_for_sdpa>  sN   
	rh   r?   r   r"   c                 C   s   t j| ||dS )  
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    r?   r   r"   )r   r%   rj   r   r   r   _prepare_4d_attention_mask  s   rk   c                 C   sr   | j \}}|dur|n|}tj }t| dkr1|rdS |dkr#dS ||kr)dS tj| ||dS tj| ||dS )ri   Nr   rj   )r#   r)   re   rf   rg   r   r%   )r?   r   r"   r   r   rf   r   r   r   #_prepare_4d_attention_mask_for_sdpa  s   

rl   r   r   c                 C   s8   t d|d}|| d  }|j| d | d |||d}|S )a/  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`int`):
            The torch device the created mask shall have.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Trb   r   r   r0   )r   r    )r   r   r   r   r	   rc   r   rE   r   r   r    _create_4d_causal_attention_mask  s   rm   rV   rW   )dataclassesr   typingr   r   r   r   r)   r   r]   r_   r<   rd   rh   r   rk   rl   r   rm   r   r   r   r   <module>   s`     
.
 G *