o
    h@t                  
   @   sP  d Z ddlZddlZddlZddlmZmZmZmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ eeeeef  eeeeeef  eeeeef   eeeeeef   f ZG dd deZdeeeeef dedeeef fddZdededefddZdd Zdd Z dd Z!dddZ"dS ) zProcessor class for KOSMOS-2.    N)ListOptionalTupleUnion   )BatchFeature)
ImageInput
is_batched)ProcessorMixin)
AddedToken)BatchEncodingPaddingStrategy	TextInputTruncationStrategy)
TensorTypec                "       s  e Zd ZdZddgZdZdZd3 fdd	Z							
																	d4dede	e
ee
 f dedee dee dedede	eeef de	eeef dee dee dee dededee	eef  def ddZdd  Zd!d" Z					
d5d#e	e
ee
 f dededee de	eee f f
d$d%Zd&d' Zd(d) Zd6d*d+Zed,d- Zdede	eee  eee  f defd.d/Z d0e	eeef eeeeef f deeef fd1d2Z!  Z"S )7Kosmos2Processora,  
    Constructs an KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
    processor.

    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
    for more information.

    Args:
        image_processor (`CLIPImageProcessor`):
            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
        tokenizer (`XLMRobertaTokenizerFast`):
            An instance of ['XLMRobertaTokenizerFast`]. The tokenizer is a required input.
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
    image_processor	tokenizerCLIPImageProcessor)XLMRobertaTokenizerXLMRobertaTokenizerFast   c                    s   d|_ d| _d| _d| _d| _d| _d| _d| _d	| _d
| _	d| _
d| _| j| j| j| j| j| j| j| j| j	| j
| jg| _|| _dd t| jD }g }| j| D ]}|t|dddd qQ|| t || d S )NFz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding>c                 S   s"   g | ]}d t |d dqS )<patch_index_   >)strzfill.0x r!   d/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/kosmos2/processing_kosmos2.py
<listcomp>`   s   " z-Kosmos2Processor.__init__.<locals>.<listcomp>T)lstriprstrip
normalized)return_token_type_ids	eod_token	boi_token	eoi_token	eoc_token	eol_token	bop_token	eop_token	boo_token	eoo_token	dom_token	grd_token
tag_tokensnum_patch_index_tokensrangeappendr   
add_tokenssuper__init__)selfr   r   r4   patch_index_tokenstokens_to_addtoken	__class__r!   r"   r9   <   s>   
zKosmos2Processor.__init__N@   TFimagestextbboxesnum_image_tokensfirst_image_token_idadd_special_tokensadd_eos_tokenpadding
truncation
max_lengthpad_to_multiple_ofreturn_attention_maskreturn_lengthverbosereturn_tensorsreturnc           !         sh  |du r|du rt dt }|durj||d}|| |duruj||||d}|rL|sLt|tr>jj | }nt|t	rLfdd|D }jd||oS||oX|du |	|
|du r`|n||||du ri|ndd	|}|| |dur|dur|du rjj
d }|}t|d }t	t||| }d	gdg|  d	g }g }g }|d
 }t|tr|g}|d g|d< |D ]5}|d| | ||| d  }|| t|}|rd	g| }|d	gt|t|  7 }|| qt|t	rtdd t|jD dd d}|d	 \}}|d \} }jd||  g|o!|||	|
||dd|}t|jd	  | krjjdkra fdd|D } fdd|D } fdd|d D |d< n'jjdkr fdd|D } fdd|D } fdd|d D |d< t|tr|du r|d	 }|d d	 |d< |d	 }|t||d |d|d |S )a  
        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2Processor`.

        Args:
            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, defaults to 64):
                The number of (consecutive) places that are used to mark the placeholders to store image information.
                This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
            first_image_token_id (`int`, *optional*):
                The token id that will be used for the first place of the subsequence that is reserved to store image
                information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
            add_eos_token (`bool`, defaults to `False`):
                Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
        Nz*You have to specify either images or text.)rO   )rD   c                    s   g | ]
} j j | qS r!   )r   	bos_token)r   s)r:   r!   r"   r#          z-Kosmos2Processor.__call__.<locals>.<listcomp>)	rB   rF   rH   rI   rJ   rK   rL   rN   rO      r   	input_idsattention_maskc                 S   s   g | ]
\}}|t |fqS r!   len)r   idxr    r!   r!   r"   r#      rS   c                 S   s   | d S Nr!   )r    r!   r!   r"   <lambda>   s    z+Kosmos2Processor.__call__.<locals>.<lambda>)keyr[   )rB   rF   rH   rI   rJ   rK   rN   rO   rightc                    s&   g | ]}|j jg t|   qS r!   r   pad_token_idrX   r   max_len_paddedr:   r!   r"   r#         & c                    "   g | ]}|d g t |   qS r   rW   r   rb   r!   r"   r#          c                    rd   re   rW   r   rf   r!   r"   r#      rg   leftc                    s&   g | ]}j jg t|  | qS r!   r_   r   ra   r!   r"   r#      rc   c                    "   g | ]}d g t |  | qS re   rW   r   rf   r!   r"   r#      rg   c                    ri   re   rW   r   rf   r!   r"   r#      rg   )rU   rV   image_embeds_position_mask)datatensor_typer!   )
ValueErrorr   r   updatepreprocess_examples
isinstancer   r   rQ   listunk_token_idintr5   r6   copyrX   sorted	enumeraterU   padding_sider   )!r:   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   kwargsencodingimage_encodingtext_encodingwith_bosstart_indeximage_token_idsbase_image_embeds_position_maskrU   rj   all_input_idstext_idsmasksorted_length_min_len_not_paddedrY   r!   ra   r"   __call__i   s   '






 


	





zKosmos2Processor.__call__c                 C   s   |du rdS t |tstd|D ];}|du rqt |ts |g}|D ])}t |trGt|dkr8tdd |D sKt|dkrGtdd |D sKtdq"qdS )	a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c                 s       | ]}t |tV  qd S N)rp   rs   r   r!   r!   r"   	<genexpr>)      zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>r   c                 s   r   r   )rp   floatr   r!   r!   r"   r   *  r   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)rp   rq   rm   tuplerX   all)r:   rC   bboxelementr!   r!   r"   _check_bboxes_for_single_text  s&   


z.Kosmos2Processor._check_bboxes_for_single_textc                 C   s.   |  }|d ur| d| }| ||}|S )N )strip_insert_patch_index_tokens)r:   rB   imagerC   img_info_tokensr!   r!   r"   _preprocess_single_example3  s
   z+Kosmos2Processor._preprocess_single_exampletextsc           	         s@  j g| }dj g| jg  d}t|trd}|g}|du r+dgt| }nt|s2|g}t|t|krItdt| dt| d|sT| |g}n|durlt|t	satd|D ]}| qcndgt| }t|t|krtd	t| dt| d fd
dt
|||D }|s|d }|S )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, List[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, List[TextInput]]`: The processed texts with image and patch index tokens.
        r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got c                    s"   g | ]\}}} ||| qS r!   )r   )r   rB   r   r   r   r:   r!   r"   r#   u  s    z8Kosmos2Processor.preprocess_examples.<locals>.<listcomp>r   )r)   joinr*   rp   r   rX   r	   rm   r   rq   zip)	r:   r   rA   rC   rD   
img_tokensbatchedr    resultr!   r   r"   ro   =  sD   



z$Kosmos2Processor.preprocess_examplesc                 O      | j j|i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r   batch_decoder:   argsrx   r!   r!   r"   r        zKosmos2Processor.batch_decodec                 O   r   )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoder   r!   r!   r"   r     r   zKosmos2Processor.decodec                 C   s    | | jd }|rt|S |S rZ   )splitr*   +clean_text_and_extract_entities_with_bboxes)r:   rB   cleanup_and_extractcaptionr!   r!   r"   post_process_generation  s   z(Kosmos2Processor.post_process_generationc                 C   s"   | j j}| jj}tt|| S r   )r   model_input_namesr   rq   dictfromkeys)r:   tokenizer_input_namesimage_processor_input_namesr!   r!   r"   r     s   z"Kosmos2Processor.model_input_namesc                 C   sP  |d u s
t |dkr|S ttjd|d}t |t |kr,tdt | dt | dd}g }t||D ]\\}}| \}}	||||	  |	}|d u rOq5t|t	rW|g}g }
t
dd |D sftd	|D ]}| |\}}|
| d
|  qht |
dkrq5d|
}|d| d q5|t |k r|||d   d|}|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c                 s   s    | ]}|d uV  qd S r   r!   )r   boxr!   r!   r"   r     s    z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rX   rq   refinditerrm   r   spanr6   rp   r   r   #_convert_bbox_to_patch_index_tokensr   )r:   rB   rC   matched_phrasescurr_posbuffermatchedr   r   endpatch_index_stringsr   patch_index_1patch_index_2position_strr!   r!   r"   r     sB   


z+Kosmos2Processor._insert_patch_index_tokensr   c                 C   sh   t |dkr|\}}ntt| j}t||\}}dt|d d}dt|d d}||fS )Nr   r   r   r   )rX   rs   mathsqrtr4   coordinate_to_patch_indexr   r   )r:   r   idx_1idx_2num_patches_per_sidetoken_1token_2r!   r!   r"   r     s   
z4Kosmos2Processor._convert_bbox_to_patch_index_tokens)r   )NNNr@   NTFFNNNNFTN)NNr@   )T)#__name__
__module____qualname____doc__
attributesimage_processor_classtokenizer_classr9   r   r   r   r   	BboxInputr   rs   boolr   r   r   r   r   r   r   r   ro   r   r   r   propertyr   r   r   r   r   __classcell__r!   r!   r>   r"   r   &   s    /	

 (#
C

.-
r   r   r   rP   c                 C   s   | \}}}}||kr||kst dt|| }t|| }t|| d }t|| d }	|| | }
|	| | }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`Tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.rT   )rm   r   floorceil)r   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxr!   r!   r"   r     s   r   r   r   c                 C   s   d| }| | }| | }|| }|| }| |kr-|| }|| }	|| | }
|| | }n=||ks5||krJ|| }|| }	|| | }
|| | }n || |d  }|| |d  }	|| |d  }
|| |d  }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `Tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?r   r!   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   r!   r!   r"   patch_index_to_coordinate  s(   r   c              	   C   s4  d}t || }g }|D ]}|d}| \}}}|s,d}|dd |dd f}|d}	g }
|	D ];}t d|}t d|dd }|rp|rp|r_|
t|dt|df q5|
t|dt|df q5|r|||||
f q|
D ]}d|d  d	|d  d
}||||gf q~q|S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This functioin is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>r   Nr   r   z<patch_index_(\d+)>rT   r   z><patch_index_r   )	r   r   r   groupsr   searchr6   rs   group)rB   patternmatchesentities_with_patch_indicesmatchr   
phrase_tagphrasematch_contentpatch_index_pairsentity_bboxespairr    yr   entityr!   r!   r"   #extract_entities_with_patch_indices'  s4   

$"r   c                 C   sP   | \}\}}t tdd|d| }t tdd|d| }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rX   r   sub)r   rB   entity_namestartr   adjusted_startadjusted_endadjusted_entityr!   r!   r"   adjust_entity_positionsa  s
   r   c                 C   s   |   }t| t|   }g }|D ]5\}\}}}t|t|  }	t|t|  }
|| |	 }|| |
 }|  }||||f|f q||fS )z9Remove the spaces around the text and the entities in it.)r   rX   r$   r%   r6   )rB   entitiesnew_textleading_spacesnew_entitiesr   r   r   rC   entity_name_leading_spacesentity_name_trailing_spacesr!   r!   r"   _cleanup_spacesk  s   r       c           
         sp   t dd| }t| }g }|D ]#}|dd |d }}t|| } fdd|D }	|||	f  qt||S )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r   r   r   r   c                    s    g | ]}t |d  |d  qS )r   rT   )r   )r   r   r   r!   r"   r#     s     z?clean_text_and_extract_entities_with_bboxes.<locals>.<listcomp>)r   r   r   r   r6   r   )
rB   r   processed_textr   r   itemr   rC   r   bboxes_in_coordsr!   r  r"   r     s   

r   )r  )#r   rt   r   r   typingr   r   r   r   image_processing_utilsr   image_utilsr   r	   processing_utilsr
   tokenization_utilsr   tokenization_utils_baser   r   r   r   utilsr   rs   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   <module>   s8      *8-:
