""" PyTorch VITS model."""

import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_vits import VitsConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VitsConfig"

VITS_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/mms-tts-eng"]


@dataclass
class VitsModelOutput(ModelOutput):
    """
    Describes the outputs for the VITS model, with potential hidden states and attentions.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
            The length in samples of each element in the `waveform` batch.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
            GAN decoder model to obtain the final audio waveform.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    waveform: torch.FloatTensor = None
    sequence_lengths: torch.FloatTensor = None
    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class VitsTextEncoderOutput(ModelOutput):
    """
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    prior_means: torch.FloatTensor = None
    prior_log_variances: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
    # Gated activation used by the WaveNet blocks: tanh over the first `num_channels`
    # channels, sigmoid over the remaining ones, multiplied together.
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :num_channels, :])
    s_act = torch.sigmoid(in_act[:, num_channels:, :])
    acts = t_act * s_act
    return acts


def _unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    reverse=False,
    tail_bound=5.0,
    min_bin_width=1e-3,
    min_bin_height=1e-3,
    min_derivative=1e-3,
):
    """
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional*, defaults to 5.0):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    """
    ...


def _rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    reverse,
    tail_bound,
    min_bin_width,
    min_bin_height,
    min_derivative,
):
    """
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    """
    ...


class VitsWaveNet(torch.nn.Module):
    def __init__(self, config: VitsConfig, num_layers: int):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None):
        ...

    def remove_weight_norm(self):
        ...


class VitsPosteriorEncoder(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None):
        ...


class HifiGanResidualBlock(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
        ...

    def get_padding(self, kernel_size, dilation=1):
        ...

    def apply_weight_norm(self):
        ...

    def remove_weight_norm(self):
        ...

    def forward(self, hidden_states):
        ...


class VitsHifiGan(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def apply_weight_norm(self):
        ...

    def remove_weight_norm(self):
        ...

    def forward(
        self, spectrogram: torch.FloatTensor, global_conditioning: Optional[torch.FloatTensor] = None
    ) -> torch.FloatTensor:
        """
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape `(batch_size, 1, num_frames)` containing the speech waveform.
        """
        ...


class VitsResidualCouplingLayer(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        ...


class VitsResidualCouplingBlock(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        ...


class VitsDilatedDepthSeparableConv(nn.Module):
    def __init__(self, config: VitsConfig, dropout_rate=0.0):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None):
        ...


class VitsConvFlow(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        ...


class VitsElementwiseAffine(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        ...


class VitsStochasticDurationPredictor(nn.Module):
    def __init__(self, config):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None, durations=None, reverse=False, noise_scale=1.0):
        ...


class VitsDurationPredictor(nn.Module):
    def __init__(self, config):
        ...

    def forward(self, inputs, padding_mask, global_conditioning=None):
        ...


class VitsAttention(nn.Module):
    """Multi-headed attention with relative positional representation."""

    def __init__(self, config: VitsConfig):
        ...

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        ...

    def _get_relative_embeddings(self, relative_embeddings, length):
        ...

    def _relative_position_to_absolute_position(self, x):
        ...

    def _absolute_position_to_relative_position(self, x):
        ...


class VitsFeedForward(nn.Module):
    def __init__(self, config):
        ...

    def forward(self, hidden_states, padding_mask):
        ...


class VitsEncoderLayer(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        ...


class VitsEncoder(nn.Module):
    def __init__(self, config: VitsConfig):
        ...

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        ...


class VitsTextEncoder(nn.Module):
    """
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    """

    def __init__(self, config: VitsConfig):
        ...

    def get_input_embeddings(self):
        ...

    def set_input_embeddings(self, value):
        ...

    def forward(
        self,
        input_ids: torch.Tensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], VitsTextEncoderOutput]:
        ...


class VitsPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitsConfig
    base_model_prefix = "vits"
    main_input_name = "input_ids"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        ...


VITS_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`VitsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITS_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The complete VITS model, for text-to-speech synthesis.",
    VITS_START_DOCSTRING,
)
class VitsModel(VitsPreTrainedModel):
    def __init__(self, config: VitsConfig):
        super().__init__(config)
        self.config = config
        self.text_encoder = VitsTextEncoder(config)
        self.flow = VitsResidualCouplingBlock(config)
        self.decoder = VitsHifiGan(config)

        if config.use_stochastic_duration_prediction:
            self.duration_predictor = VitsStochasticDurationPredictor(config)
        else:
            self.duration_predictor = VitsDurationPredictor(config)

        if config.num_speakers > 1:
            self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size)

        self.posterior_encoder = VitsPosteriorEncoder(config)

        self.speaking_rate = config.speaking_rate
        self.noise_scale = config.noise_scale
        self.noise_scale_duration = config.noise_scale_duration

        self.post_init()

    def get_encoder(self):
        return self.text_encoder

    @add_start_docstrings_to_model_forward(VITS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VitsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        speaker_id: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple[Any], VitsModelOutput]:
        """
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Returns:

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        """
        ...
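

# Usage sketch (illustrative, not part of the upstream module): end-to-end text-to-speech
# with `VitsModel`, mirroring the example in the `forward` docstring above. Writing the
# result with `scipy.io.wavfile` and the output filename are assumptions for illustration.
if __name__ == "__main__":
    from transformers import VitsTokenizer, set_seed

    tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")

    inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

    set_seed(555)  # the flow and the duration predictor sample noise, so fix the seed
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

    # `waveform` has shape (batch_size, num_samples); `sequence_lengths` gives the valid
    # length of each waveform in samples, and `model.config.sampling_rate` its sample rate.
    waveform = outputs.waveform[0].cpu().numpy()

    import scipy.io.wavfile

    scipy.io.wavfile.write("vits_speech.wav", rate=model.config.sampling_rate, data=waveform)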