from typing import List, Union

from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import PIPELINE_INIT_ARGS, Pipeline


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(PIPELINE_INIT_ARGS)
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```
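
    Some checkpoints also accept a text `prompt` for conditional captioning (see `preprocess` below for the model
    types that support it). A minimal sketch, assuming a GIT checkpoint such as `microsoft/git-base-coco`
    (checkpoint name illustrative):

    ```python
    >>> captioner = pipeline(model="microsoft/git-base-coco")
    >>> outputs = captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", prompt="a photo of")
    ```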

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This image-to-text pipeline can currently be loaded from `pipeline()` using the following task identifier:
    `"image-to-text"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        # Accept both the PyTorch and TensorFlow vision-to-sequence model classes.
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        # Route user kwargs to the pipeline stage that consumes them.
        forward_kwargs = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if generate_kwargs is not None:
            forward_kwargs["generate_kwargs"] = generate_kwargs
        if max_new_tokens is not None:
            if "generate_kwargs" not in forward_kwargs:
                forward_kwargs["generate_kwargs"] = {}
            if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
                raise ValueError(
                    "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
                    " please use only one"
                )
            forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens

        return preprocess_params, forward_kwargs, {}

    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
        """
        Generate a caption for the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing an HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, the default value of `generate` is used.

            generate_kwargs (`Dict`, *optional*):
                A dictionary of arguments passed directly to `generate`, allowing full control of that function.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.
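
        A short sketch of passing generation controls through these parameters, reusing the checkpoint from the
        class-level example above (any image-to-text checkpoint works):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> outputs = captioner(
        ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
        ...     max_new_tokens=10,
        ...     generate_kwargs={"do_sample": False},
        ... )
        ```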

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
        """
        return super().__call__(images, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                # GIT conditions generation on the prompt token ids, prefixed with the CLS token.
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})
            elif model_type == "pix2struct":
                # Pix2Struct renders the prompt onto the image itself as header text.
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)
            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")
        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, generate_kwargs=None):
        # GIT models set `input_ids` to `None` in `preprocess` when no prompt is given. Under batching, these
        # `None` values are collected into a list that `generate` cannot handle, so collapse it back to `None`.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        if generate_kwargs is None:
            generate_kwargs = {}
        # Pop the main input (e.g. `pixel_values`) so it is passed positionally to `generate`; the remaining
        # entries of `model_inputs` are forwarded as keyword arguments.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {"generated_text": self.tokenizer.decode(output_ids, skip_special_tokens=True)}
            records.append(record)
        return records