"""
Image/Text processor class for Fuyu.
"""
import re
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, TruncationStrategy
from ...utils import TensorType, is_torch_available, logging, requires_backends


if is_torch_available():
    from .image_processing_fuyu import FuyuBatchFeature

logger = logging.get_logger(__name__)

if is_torch_available():
    import torch


# Text representations of the box/point tags that users write in prompts, and the
# reserved vocabulary tokens they are mapped to before tokenization.
TEXT_REPR_BBOX_OPEN = "<box>"
TEXT_REPR_BBOX_CLOSE = "</box>"
TEXT_REPR_POINT_OPEN = "<point>"
TEXT_REPR_POINT_CLOSE = "</point>"

TOKEN_BBOX_OPEN_STRING = "<0x00>"  # <bbox>
TOKEN_BBOX_CLOSE_STRING = "<0x01>"  # </bbox>
TOKEN_POINT_OPEN_STRING = "<0x02>"  # <point>
TOKEN_POINT_CLOSE_STRING = "<0x03>"  # </point>
BEGINNING_OF_ANSWER_STRING = "<0x04>"  # <boa>


def full_unpacked_stream_to_tensor(
    all_bi_tokens_to_place: List[int],
    full_unpacked_stream: List["torch.Tensor"],
    fill_value: int,
    batch_size: int,
    new_seq_len: int,
    offset: int,
) -> "torch.Tensor":
    """Takes an unpacked stream of tokens (i.e. a list of tensors, one for each item in the batch) and does
    the required padding to create a single tensor for the batch of shape batch_size x new_seq_len.
    """
    assert len(all_bi_tokens_to_place) == batch_size
    assert len(full_unpacked_stream) == batch_size

    # Create the full batch tensor filled with the padding value, then copy each
    # unpacked stream into its row.
    new_padded_tensor = torch.full(
        [batch_size, new_seq_len],
        fill_value=fill_value,
        dtype=full_unpacked_stream[0].dtype,
        device=full_unpacked_stream[0].device,
    )

    # Place each batch entry into the batch tensor.
    for bi in range(batch_size):
        tokens_to_place = all_bi_tokens_to_place[bi]
        new_padded_tensor[bi, :tokens_to_place] = full_unpacked_stream[bi][offset : tokens_to_place + offset]

    return new_padded_tensor
  |tj|dd q|S )a  Takes an input_stream tensor of shape B x S x ?. For each subsequence, adds any required
    padding to account for images and then unpacks the subsequences to create a single sequence per item in the batch.
    Returns a list of tensors, one for each item in the batch."""

    all_bi_stream = []

    for batch_index in range(batch_size):
        all_si_stream = []

        # Construct the full token stream (image placeholder tokens followed by the
        # text tokens) for the single subsequence of this batch entry. Lists are used
        # because each subsequence is variable-sized.
        image_adjustment = image_tokens[batch_index][0]
        subsequence_stream = torch.cat([image_adjustment, input_stream[batch_index, 0]], dim=0)
        num_real_tokens = image_adjustment.shape[0] + num_real_text_tokens[batch_index][0]
        all_si_stream.append(subsequence_stream[:num_real_tokens])

        all_bi_stream.append(torch.cat(all_si_stream, dim=0))
    return all_bi_stream


def _replace_string_repr_with_token_tags(prompt: str) -> str:
    prompt = prompt.replace(TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING)
    prompt = prompt.replace(TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING)
    prompt = prompt.replace(TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING)
    prompt = prompt.replace(TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING)
    return prompt


def _segment_prompt_into_text_token_conversions(prompt: str) -> List:
    """
    Given a string prompt, converts the prompt into a list of TextTokenConversions.
    """
    # Wherever a tag token appears, split the prompt on it.
    prompt_text_list: List = []
    regex_pattern = re.compile(
        f"({TOKEN_BBOX_OPEN_STRING}|{TOKEN_BBOX_CLOSE_STRING}|{TOKEN_POINT_OPEN_STRING}|{TOKEN_POINT_CLOSE_STRING})"
    )
    prompt_split = regex_pattern.split(prompt)
    for i, elem in enumerate(prompt_split):
        if len(elem) == 0 or elem in [
            TOKEN_BBOX_OPEN_STRING,
            TOKEN_BBOX_CLOSE_STRING,
            TOKEN_POINT_OPEN_STRING,
            TOKEN_POINT_CLOSE_STRING,
        ]:
            continue
        prompt_text_list.append(
            (elem, i > 1 and prompt_split[i - 1] in [TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING])
        )
    return prompt_text_list
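
# Illustrative sketch (an editor's addition, not part of the original module): the two
# helpers above turn a tagged prompt into segments, marking which segments hold
# coordinates. Hypothetical values:
#
#   >>> _replace_string_repr_with_token_tags("Locate <box>10, 20, 30, 40</box>")
#   'Locate <0x00>10, 20, 30, 40<0x01>'
#   >>> _segment_prompt_into_text_token_conversions("Locate <0x00>10, 20, 30, 40<0x01>")
#   [('Locate ', False), ('10, 20, 30, 40', True)]
#
# The boolean flags segments that sit inside a tag pair and therefore need coordinate
# scaling instead of plain tokenization.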


def _transform_coordinates_and_tokenize(prompt: str, scale_factor: float, tokenizer) -> List[int]:
    """
    This function transforms the prompt in the following fashion:
    - <box> <point> and </box> </point> to their respective token mappings
    - extract the coordinates from the tag
    - transform the coordinates into the transformed image space
    - return the prompt tokens with the transformed coordinates and new tags

    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> <point>x, y</point> The spaces
    and punctuation added above are NOT optional.
    """
    # First, replace the text tag representations with their reserved token equivalents.
    prompt = _replace_string_repr_with_token_tags(prompt)
    # Split the prompt into plain-text segments and coordinate segments.
    prompt_text_list = _segment_prompt_into_text_token_conversions(prompt)
    transformed_prompt_tokens: List[int] = []
    for elem in prompt_text_list:
        if elem[1]:
            # This is a location: scale the coordinates and tokenize them surrounded by
            # the open and close tag tokens.
            within_tag_tokenized = _transform_within_tags(elem[0], scale_factor, tokenizer)
            transformed_prompt_tokens.extend(within_tag_tokenized)
        else:
            transformed_prompt_tokens.extend(tokenizer(elem[0], add_special_tokens=False).input_ids)
    return transformed_prompt_tokens


def _transform_within_tags(text: str, scale_factor: float, tokenizer) -> List[int]:
    """
    Given a bounding box of the fashion <box>1, 2, 3, 4</box> | <point>1, 2</point> This function is responsible for
    converting 1, 2, 3, 4 into tokens of 1 2 3 4 without any commas.
    """
    # Convert the text into a list of coordinate strings.
    num_int_strs = text.split(",")
    if len(num_int_strs) == 2:
        # A pair of coordinates is a point; four coordinates are a bounding box.
        token_space_open_string = tokenizer.vocab[TOKEN_POINT_OPEN_STRING]
        token_space_close_string = tokenizer.vocab[TOKEN_POINT_CLOSE_STRING]
    else:
        token_space_open_string = tokenizer.vocab[TOKEN_BBOX_OPEN_STRING]
        token_space_close_string = tokenizer.vocab[TOKEN_BBOX_CLOSE_STRING]

    # Remove all spaces from num_ints
    num_ints = [float(num.strip()) for num in num_int_strs]
    # Scale the coordinates into the transformed image space.
    if len(num_ints) == 2:
        num_ints_translated = scale_point_to_transformed_image(x=num_ints[0], y=num_ints[1], scale_factor=scale_factor)
    elif len(num_ints) == 4:
        num_ints_translated = scale_bbox_to_transformed_image(
            top=num_ints[0],
            left=num_ints[1],
            bottom=num_ints[2],
            right=num_ints[3],
            scale_factor=scale_factor,
        )
    else:
        raise ValueError(f"Invalid number of ints: {len(num_ints)}")
    # Each scaled coordinate becomes a single vocabulary token.
    tokens = [tokenizer.vocab[str(num)] for num in num_ints_translated]
    return [token_space_open_string] + tokens + [token_space_close_string]
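
# Illustrative sketch (an editor's addition, not part of the original module): for the
# segment "10, 20, 30, 40" taken from inside a <box> tag with scale_factor=1.0, the
# coordinates scale to [5, 10, 15, 20] (the model works in half coordinates), and the
# returned ids are roughly:
#
#   [vocab["<0x00>"], vocab["5"], vocab["10"], vocab["15"], vocab["20"], vocab["<0x01>"]]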


def _tokenize_prompts_with_image_and_batch(
    tokenizer,
    prompts: List[List[str]],
    scale_factors: Optional[List[List["torch.Tensor"]]],
    max_tokens_to_generate: int,
    max_position_embeddings: int,
    add_BOS: bool,
    add_beginning_of_answer_token: bool,
) -> Tuple["torch.Tensor", "torch.Tensor"]:
    """
    Given a set of prompts and number of tokens to generate:
    - tokenize prompts
    - set the sequence length to be the max of length of prompts plus the number of tokens we would like to generate
    - pad all the sequences to this length so we can convert them into a 3D tensor.
    """
    # If scale factors are provided, transform coordinates while tokenizing.
    if scale_factors is not None:
        transformed_prompt_tokens = []
        for prompt_seq, scale_factor_seq in zip(prompts, scale_factors):
            transformed_prompt_tokens.append(
                [
                    _transform_coordinates_and_tokenize(prompt, scale_factor.item(), tokenizer)
                    for prompt, scale_factor in zip(prompt_seq, scale_factor_seq)
                ]
            )
    else:
        transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]

    prompts_tokens = transformed_prompt_tokens

    if add_BOS:
        bos_token = tokenizer.vocab["<s>"]
    else:
        bos_token = tokenizer.vocab["|ENDOFTEXT|"]
    prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]
    if add_beginning_of_answer_token:
        boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
        # Only add the beginning-of-answer token to the last subsequence, since that is
        # the one that will be completed.
        for token_seq in prompts_tokens:
            token_seq[-1].append(boa)

    # Now we have lists of tokens, each of a different size. Extend them to
    # incorporate the tokens that need to be generated and make all sequences the
    # same length.
    prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]
    # Get the max prompts length.
    max_prompt_len: int = np.max(prompts_length)
    # Number of tokens in each sample of the batch.
    samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)
    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
        logger.warning(
            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate} "
            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible."
        )
    # Pad every sequence to samples_length.
    for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
        for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
            if len(prompt_tokens) > samples_length:
                raise ValueError("Length of subsequence prompt exceeds sequence length.")
            padding_size = samples_length - prompt_length
            prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)

    # Now we are in a structured format, we can convert to tensors.
    prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.int64)
    prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.int64)

    return prompts_tokens_tensor, prompts_length_tensor


def original_to_transformed_h_coords(original_coords, scale_h):
    return np.round(original_coords * scale_h).astype(np.int32)


def original_to_transformed_w_coords(original_coords, scale_w):
    return np.round(original_coords * scale_w).astype(np.int32)


def scale_point_to_transformed_image(x: float, y: float, scale_factor: float) -> List[int]:
    x_scaled = original_to_transformed_w_coords(np.array([x / 2]), scale_factor)[0]
    y_scaled = original_to_transformed_h_coords(np.array([y / 2]), scale_factor)[0]
    return [x_scaled, y_scaled]


def scale_bbox_to_transformed_image(
    top: float, left: float, bottom: float, right: float, scale_factor: float
) -> List[int]:
    top_scaled = original_to_transformed_w_coords(np.array([top / 2]), scale_factor)[0]
    left_scaled = original_to_transformed_h_coords(np.array([left / 2]), scale_factor)[0]
    bottom_scaled = original_to_transformed_w_coords(np.array([bottom / 2]), scale_factor)[0]
    right_scaled = original_to_transformed_h_coords(np.array([right / 2]), scale_factor)[0]
    return [top_scaled, left_scaled, bottom_scaled, right_scaled]
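
# Illustrative sketch (an editor's addition, not part of the original module): the
# helpers above halve each coordinate and then apply the resize scale factor, rounding
# to integers. Hypothetical values:
#
#   scale_point_to_transformed_image(x=440.0, y=220.0, scale_factor=0.5)
#       -> [110, 55]
#   scale_bbox_to_transformed_image(top=10.0, left=20.0, bottom=30.0, right=40.0, scale_factor=1.0)
#       -> [5, 10, 15, 20]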


class FuyuProcessor(ProcessorMixin):
    r"""
    Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.

    [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.

    Args:
        image_processor ([`FuyuImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`]):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FuyuImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_tokens_to_generate = 10
        self.max_position_embeddings = 16384
        self.pad_token_id = 0
        self.dummy_image_index = -1

    def _left_pad_inputs_with_attention_mask(self, model_inputs: List[Dict], return_attention_mask: bool):
        max_length_input_ids = max(entry["input_ids"].shape[1] for entry in model_inputs)
        max_length_image_patch_indices = max(entry["image_patches_indices"].shape[1] for entry in model_inputs)

        batched_inputs = {"input_ids": [], "image_patches": [], "image_patches_indices": [], "attention_mask": []}

        for entry in model_inputs:
            for key, tensor in entry.items():
                if key == "input_ids":
                    # Left-pad the token ids and build the matching attention mask.
                    num_padding_tokens = max_length_input_ids - tensor.shape[1]
                    padded_input_ids = torch.cat(
                        [
                            torch.full((tensor.shape[0], num_padding_tokens), self.pad_token_id, dtype=torch.long),
                            tensor,
                        ],
                        dim=1,
                    )
                    batched_inputs[key].append(padded_input_ids)

                    attention_mask = torch.cat(
                        [torch.zeros(tensor.shape[0], num_padding_tokens, dtype=torch.long), torch.ones_like(tensor)],
                        dim=1,
                    )
                    batched_inputs["attention_mask"].append(attention_mask)

                elif key == "image_patches":
                    # For image_patches we do not pad; just collect them in a list.
                    batched_inputs[key].append(tensor)

                else:  # image_patches_indices
                    num_padding_indices = max_length_image_patch_indices - tensor.shape[1]
                    padded_indices = torch.cat(
                        [
                            torch.full(
                                (tensor.shape[0], num_padding_indices), self.dummy_image_index, dtype=torch.long
                            ),
                            tensor,
                        ],
                        dim=1,
                    )
                    batched_inputs[key].append(padded_indices)
        batched_keys = ["input_ids", "image_patches_indices"]
        if return_attention_mask:
            batched_keys.append("attention_mask")
        for key in batched_keys:
            batched_inputs[key] = torch.cat(batched_inputs[key], dim=0)

        return batched_inputs

    def get_sample_encoding(
        self,
        prompts,
        scale_factors,
        image_unpadded_heights,
        image_unpadded_widths,
        image_placeholder_id,
        image_newline_id,
        tensor_batch_images,
    ):
        image_present = torch.ones(1, 1, 1)
        model_image_input = self.image_processor.preprocess_with_tokenizer_info(
            image_input=tensor_batch_images,
            image_present=image_present,
            image_unpadded_h=image_unpadded_heights,
            image_unpadded_w=image_unpadded_widths,
            image_placeholder_id=image_placeholder_id,
            image_newline_id=image_newline_id,
            variable_sized=True,
        )
        prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
            tokenizer=self.tokenizer,
            prompts=prompts,
            scale_factors=scale_factors,
            max_tokens_to_generate=self.max_tokens_to_generate,
            max_position_embeddings=self.max_position_embeddings,
            add_BOS=True,
            add_beginning_of_answer_token=True,
        )
        # self.subsequence_length is set in __call__ before this method runs.
        image_padded_unpacked_tokens = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=prompt_tokens,
            image_tokens=model_image_input["image_input_ids"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        # Construct inputs for image patch indices.
        unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
            num_real_text_tokens=prompts_length,
            input_stream=torch.full_like(prompt_tokens, -1),
            image_tokens=model_image_input["image_patch_indices_per_batch"],
            batch_size=1,
            num_sub_sequences=self.subsequence_length,
        )
        max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
        max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
        tokens_to_place = min(max_seq_len_batch, max(0, image_padded_unpacked_tokens[0].shape[0]))

        # Use the same packing logic for the image patch indices.
        image_patch_input_indices = full_unpacked_stream_to_tensor(
            all_bi_tokens_to_place=[tokens_to_place],
            full_unpacked_stream=unpacked_image_patch_indices_per_batch,
            fill_value=-1,
            batch_size=1,
            new_seq_len=max_seq_len_batch,
            offset=0,
        )
        image_patches_tensor = torch.stack([img[0] for img in model_image_input["image_patches"]])
        batch_encoding = {
            "input_ids": image_padded_unpacked_tokens[0].unsqueeze(0),
            "image_patches": image_patches_tensor,
            "image_patches_indices": image_patch_input_indices,
        }
        return batch_encoding

    def __call__(
        self,
        text=None,
        images=None,
        add_special_tokens: bool = True,
        return_attention_mask: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_token_type_ids: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> "FuyuBatchFeature":
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `List[PIL.Image.Image]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.

        Returns:
            [`FuyuBatchFeature`]: A [`FuyuBatchFeature`] with the following fields:

            - **input_ids** -- Tensor of token ids to be fed to a model. Returned when `text` is not `None`.
            - **image_patches** -- List of Tensor of image patches. Returned when `images` is not `None`.
            - **image_patches_indices** -- Tensor of indices where patch embeddings have to be inserted by the model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model when
              `return_attention_mask=True`.
        """
        requires_backends(self, ["torch"])

        # --- Check input validity ---
        if not return_attention_mask:
            raise ValueError("`return_attention_mask=False` is not supported for this model.")
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be None.")
        if text is not None and images is None:
            logger.warning("You are processing a text with no associated image. Make sure it is intended.")
            self.current_processor = self.tokenizer
            text_encoding = self.tokenizer(
                text=text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_token_type_ids=return_token_type_ids,
                return_length=return_length,
                verbose=verbose,
                return_tensors=return_tensors,
                **kwargs,
            )
            return text_encoding

        if text is None and images is not None:
            logger.warning("You are processing an image with no associated text. Make sure it is intended.")
            prompts = [[""]]
        if text is not None and images is not None:
            if isinstance(text, str):
                prompts = [[text]]
            elif isinstance(text, list):
                prompts = [[text_seq] for text_seq in text]

        # --- Preprocess images using self.image_processor ---

        # We hard-code "pt" here because the rest of the processing assumes torch tensors.
        image_encoding = self.image_processor.preprocess(images, return_tensors="pt")
        batch_images = image_encoding["images"]
        image_unpadded_heights = image_encoding["image_unpadded_heights"]
        image_unpadded_widths = image_encoding["image_unpadded_widths"]
        scale_factors = image_encoding["image_scale_factors"]
        self.subsequence_length = 1  # Each batch contains only one sequence.
        self.batch_size = len(batch_images)

        # --- Use self.tokenizer to get the ids of special tokens to insert into image ids ---

        image_placeholder_id = self.tokenizer("|SPEAKER|", add_special_tokens=False)["input_ids"][1]
        image_newline_id = self.tokenizer("|NEWLINE|", add_special_tokens=False)["input_ids"][1]
        tensor_batch_images = torch.stack([img[0] for img in batch_images]).unsqueeze(1)

        # --- Use self.image_processor again to obtain the full token ids and batch inputs ---
        all_encodings = []

        for prompt, scale_factor, image_unpadded_height, image_unpadded_width, tensor_batch_image in zip(
            prompts, scale_factors, image_unpadded_heights, image_unpadded_widths, tensor_batch_images
        ):
            sample_encoding = self.get_sample_encoding(
                prompts=[prompt],
                scale_factors=[scale_factor],
                image_unpadded_heights=torch.tensor([image_unpadded_height]),
                image_unpadded_widths=torch.tensor([image_unpadded_width]),
                image_placeholder_id=image_placeholder_id,
                image_newline_id=image_newline_id,
                tensor_batch_images=tensor_batch_image.unsqueeze(0),
            )
            all_encodings.append(sample_encoding)
        batch_encoding = self._left_pad_inputs_with_attention_mask(
            model_inputs=all_encodings, return_attention_mask=return_attention_mask
        )
        return FuyuBatchFeature(data=batch_encoding)

    def post_process_box_coordinates(self, outputs, target_sizes=None):
        """
        Transforms raw coordinates detected by [`FuyuForCausalLM`] to the original images' coordinate space.
        Coordinates will be returned in "box" format, with the following pattern:
            `<box>top, left, bottom, right</box>`

        Point coordinates are not supported yet.

        Args:
            outputs ([`GenerateOutput`]):
                Raw outputs from `generate`.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, found coordinates in the output sequence are rescaled to the target sizes. If left
                to None, coordinates will not be rescaled.

        Returns:
            `GenerateOutput`: Same output type returned by `generate`, with output token ids replaced with
                boxed and possibly rescaled coordinates.
        """

        def scale_factor_to_fit(original_size, target_size=None):
            height, width = original_size
            if target_size is None:
                max_height = self.image_processor.size["height"]
                max_width = self.image_processor.size["width"]
            else:
                max_height, max_width = target_size
            if width <= max_width and height <= max_height:
                return 1.0
            return min(max_height / height, max_width / width)

        def find_delimiters_pair(tokens, start_token, end_token):
            start_id = self.tokenizer.convert_tokens_to_ids(start_token)
            end_id = self.tokenizer.convert_tokens_to_ids(end_token)

            starting_positions = (tokens == start_id).nonzero(as_tuple=True)[0]
            ending_positions = (tokens == end_id).nonzero(as_tuple=True)[0]

            if torch.any(starting_positions) and torch.any(ending_positions):
                return (starting_positions[0], ending_positions[0])
            return (None, None)

        def tokens_to_boxes(tokens, original_size):
            while (pair := find_delimiters_pair(tokens, TOKEN_BBOX_OPEN_STRING, TOKEN_BBOX_CLOSE_STRING)) != (
                None,
                None,
            ):
                start, end = pair
                if end != start + 5:
                    continue

                # Retrieve the transformed coordinates from the tokens.
                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])

                # Scale back to the original image size and multiply by 2.
                scale = scale_factor_to_fit(original_size)
                top, left, bottom, right = [2 * int(float(c)) / scale for c in coords]

                # Replace the ids so they get detokenized correctly.
                replacement = f" {TEXT_REPR_BBOX_OPEN}{top}, {left}, {bottom}, {right}{TEXT_REPR_BBOX_CLOSE}"
                replacement = self.tokenizer.tokenize(replacement)[1:]
                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
                replacement = torch.tensor(replacement).to(tokens)

                tokens = torch.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
            return tokens

        def tokens_to_points(tokens, original_size):
            while (pair := find_delimiters_pair(tokens, TOKEN_POINT_OPEN_STRING, TOKEN_POINT_CLOSE_STRING)) != (
                None,
                None,
            ):
                start, end = pair
                if end != start + 3:
                    continue

                # Retrieve the transformed coordinates from the tokens.
                coords = self.tokenizer.convert_ids_to_tokens(tokens[start + 1 : end])

                # Scale back to the original image size and multiply by 2.
                scale = scale_factor_to_fit(original_size)
                x, y = [2 * int(float(c)) / scale for c in coords]

                # Replace the ids so they get detokenized correctly.
                replacement = f" {TEXT_REPR_POINT_OPEN}{x}, {y}{TEXT_REPR_POINT_CLOSE}"
                replacement = self.tokenizer.tokenize(replacement)[1:]
                replacement = self.tokenizer.convert_tokens_to_ids(replacement)
                replacement = torch.tensor(replacement).to(tokens)

                tokens = torch.cat([tokens[:start], replacement, tokens[end + 1 :]], 0)
            return tokens

        if target_sizes is None:
            target_sizes = ((self.image_processor.size["height"], self.image_processor.size["width"]),) * len(outputs)
        elif target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        if len(outputs) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as output sequences")

        results = []
        for seq, size in zip(outputs, target_sizes):
            seq = tokens_to_boxes(seq, size)
            seq = tokens_to_points(seq, size)
            results.append(seq)

        return results

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
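
# Illustrative usage sketch (an editor's addition, not part of the original module).
# The checkpoint id, image file, and prompt below are assumptions; see the Fuyu model
# card for the canonical usage.
#
#   >>> from PIL import Image
#   >>> from transformers import FuyuForCausalLM, FuyuProcessor
#   >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
#   >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")
#   >>> image = Image.open("bus.png")
#   >>> inputs = processor(text="Generate a coco-style caption.\n", images=image)
#   >>> generated = model.generate(**inputs, max_new_tokens=10)
#   >>> processor.batch_decode(generated[:, -10:], skip_special_tokens=True)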