from typing import List, Union

from ..utils import is_torch_available
from .base import Pipeline


if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
    from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan

DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"


class TextToAudioPipeline(Pipeline):
    """
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
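
    The pipeline can also be given an explicit `vocoder` and/or `sampling_rate` at construction time. As an
    illustrative sketch (the `microsoft/speecht5_tts` checkpoint is assumed here for demonstration and is not
    referenced by this module):

    ```python
    >>> from transformers import SpeechT5HifiGan, pipeline

    >>> # assumed spectrogram checkpoint; without `vocoder=...` the pipeline loads this same default vocoder itself
    >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    >>> synthesizer = pipeline(task="text-to-speech", model="microsoft/speecht5_tts", vocoder=vocoder)
    ```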
    N)vocodersampling_ratec                   s   t  j|i | | jdkrtdd | _| jjt v r.|d u r+t	
t| jjn|| _|| _| jd ur<| jjj| _| jd u rj| jj}| jjdd }|d urX||  dD ]}t||d }|d uri|| _qZd S d S )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   )super__init__	framework
ValueErrorr   model	__class__r   valuesr	   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattr)selfr   r   argskwargsr   
gen_configsampling_rate_namer    Z/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/pipelines/text_to_audio.pyr   L   s0   


zTextToAudioPipeline.__init__c                 K   sh   t |tr|g}| jjjdkr%| jjjddddddd}|| |}| j	|fi |dd	i}|S )
Nbarkmax_input_semantic_length   FT
max_length)r,   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)

isinstancestrr   r   
model_typer   semantic_configr   r   	tokenizer)r!   textr#   
new_kwargsoutputr'   r'   r(   
preprocessk   s   

	zTextToAudioPipeline.preprocessc                 K   s   | j || jd}|d }|d }| j r.| j || jd}|| | jjdi ||}nt|r;td|  | jdi ||d }| j	d urQ| 	|}|S )N)r   forward_paramsgenerate_kwargsa\  You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty.
                                 For forward-only TTA models, please use `forward_params` instead of of
                                 `generate_kwargs`. For reference, here are the `generate_kwargs` used here:
                                 r   r'   )
_ensure_tensor_on_devicer   r   can_generater   generatelenr   keysr   )r!   model_inputsr#   r<   r=   r:   r'   r'   r(   _forward   s"   



zTextToAudioPipeline._forwardtext_inputsc                    s   t  j|fi |S )a  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `List[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
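
        Example (an illustrative sketch reusing the `facebook/musicgen-small` checkpoint from the class-level
        example above):

        ```python
        >>> from transformers import pipeline

        >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

        >>> # `generate_kwargs` reaches `generate` because musicgen is a generative model
        >>> outputs = music_generator(
        ...     "Techno music with high melodic riffs", generate_kwargs={"do_sample": True, "max_new_tokens": 35}
        ... )
        >>> audio, sampling_rate = outputs["audio"], outputs["sampling_rate"]
        ```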
        """
        return super().__call__(text_inputs, **forward_params)

    def _sanitize_parameters(
        self,
        preprocess_params=None,
        forward_params=None,
        generate_kwargs=None,
    ):
        params = {
            "forward_params": forward_params if forward_params else {},
            "generate_kwargs": generate_kwargs if generate_kwargs else {},
        }

        if preprocess_params is None:
            preprocess_params = {}
        postprocess_params = {}

        return preprocess_params, params, postprocess_params

    def postprocess(self, waveform):
        output_dict = {}

        output_dict["audio"] = waveform.cpu().float().numpy()
        output_dict["sampling_rate"] = self.sampling_rate

        return output_dict