o
    h$                     @   s   d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZ e r3ddlZeeZddiZdd	d
iiZd	diZdd ZG dd deZdS )zTokenization class for VITS.    N)AnyDictListOptionalTupleUnion   )PreTrainedTokenizer)is_phonemizer_availablelogging
vocab_filez
vocab.jsonzfacebook/mms-tts-engzChttps://huggingface.co/facebook/mms-tts-eng/resolve/main/vocab.jsoni   c                 C   s    t d}|| }|d u}|S )Nz[^\x00-\x7F])recompilesearch)input_stringnon_roman_patternmatchhas_non_roman r   `/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/vits/tokenization_vits.pyhas_non_roman_characters/   s   

r   c                       s  e Zd ZdZeZeZeZ	ddgZ
							d&	d' fd
dZedd Zdd Zdd Zdd Z	d(dededee d	eeeeef f fddZded	ee fddZdee d	efddZdd Zd d! Zd)d"ed#ee d	eee df fd$d%Z  ZS )*VitsTokenizera  
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    	input_idsattention_mask<pad><unk>NTFreturnc	              
      s   t |dd}
t|
| _W d    n1 sw   Y  dd | j D | _|| _|| _|| _|| _	|| _
t jd|||||||d|	 d S )Nutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>c   s    z*VitsTokenizer.__init__.<locals>.<dictcomp>)	pad_token	unk_tokenlanguage	add_blank	normalize	phonemize	is_uromanr   )openjsonloadencoderitemsdecoderr&   r'   r(   r)   r*   super__init__)selfr   r$   r%   r&   r'   r(   r)   r*   kwargsvocab_handle	__class__r   r   r2   T   s(   
zVitsTokenizer.__init__c                 C   s
   t | jS N)lenr.   r3   r   r   r   
vocab_sizev   s   
zVitsTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )convert_ids_to_tokens)r    ir:   r   r   r#   {   s    z+VitsTokenizer.get_vocab.<locals>.<dictcomp>)ranger;   updateadded_tokens_encoder)r3   vocabr   r:   r   	get_vocabz   s   zVitsTokenizer.get_vocabc                 C   s   t | j t | j  }d}d}|t|k rMd}|D ]}|||t|  |kr8||7 }|t|7 }d} nq|sG|||  7 }|d7 }|t|k s|S )zfLowercase the input string, respecting any special token ids that may be part or entirely upper-cased. r   FT   )listr.   keysr@   r9   lower)r3   r   all_vocabularyfiltered_textr=   found_matchwordr   r   r   normalize_text   s"   zVitsTokenizer.normalize_textc                 C   s   | j dkr|dd}|S )z4Special treatment of characters in certain languagesronu   țu   ţ)r&   replace)r3   textr   r   r   _preprocess_char   s   
zVitsTokenizer._preprocess_charrO   is_split_into_wordsr(   c                    s   |dur|n j }|r |} |}t|r! jr!td  jrAt s+t	dt
j|dddddd}tdd	|}||fS |rSd
tt fdd| }||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        NzText to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is applied to the text prior to passing it to the tokenizer. See `https://github.com/isi-nlp/uroman` for details.zEPlease install the `phonemizer` Python package to use this tokenizer.zen-usespeakT)r&   backendstrippreserve_punctuationwith_stressz\s+ rC   c                    s
   |  j v S r8   )r.   )charr:   r   r   <lambda>   s   
 z8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>)r(   rL   rP   r   r*   loggerwarningr)   r
   ImportError
phonemizerr   subjoinrE   filterrT   )r3   rO   rQ   r(   r4   rI   r   r:   r   prepare_for_tokenization   s0   

 z&VitsTokenizer.prepare_for_tokenizationc                 C   s@   t |}| jr| dgt|d d  }||ddd< |}|S )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      rD   N)rE   r'   _convert_id_to_tokenr9   )r3   rO   tokensinterspersedr   r   r   	_tokenize   s   zVitsTokenizer._tokenizerd   c                 C   s*   | j rt|dkr|dd d }d|S )NrD   rb   rC   )r'   r9   r_   )r3   rd   r   r   r   convert_tokens_to_string   s   
z&VitsTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r.   getr%   )r3   tokenr   r   r   _convert_token_to_id   s   z"VitsTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r0   rh   )r3   indexr   r   r   rc      s   z"VitsTokenizer._convert_id_to_tokensave_directoryfilename_prefixc              	   C   s   t j|std| d d S t j||r|d ndtd  }t|ddd}|t	j
| jd	d
ddd  W d    |fS 1 sEw   Y  |fS )NzVocabulary path (z) should be a directory-rC   r   wr   r   rb   TF)indent	sort_keysensure_ascii
)ospathisdirrZ   errorr_   VOCAB_FILES_NAMESr+   writer,   dumpsr.   )r3   rl   rm   r   fr   r   r   save_vocabulary   s    
zVitsTokenizer.save_vocabulary)r   r   NTTTF)r   N)FNr8   ) __name__
__module____qualname____doc__rx   vocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesmodel_input_namesr2   propertyr;   rB   rL   rP   strboolr   r   r   r   ra   r   rf   rg   rj   rc   r   r|   __classcell__r   r   r6   r   r   9   sH    "

<0r   )r   r,   rt   r   typingr   r   r   r   r   r   tokenization_utilsr	   utilsr
   r   r]   
get_loggerr}   rZ   rx   r   r   r   r   r   r   r   r   <module>   s&    

