o
    h5                     @   s   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZ eeZdd	d
Zddddddd
ZdddZdd ZG dd de
ZdS )z! Tokenization classes for PhoBERT    N)copyfile)ListOptionalTuple   )PreTrainedTokenizer)loggingz	vocab.txtz	bpe.codes)
vocab_filemerges_filez@https://huggingface.co/vinai/phobert-base/resolve/main/vocab.txtzAhttps://huggingface.co/vinai/phobert-large/resolve/main/vocab.txt)zvinai/phobert-basezvinai/phobert-largez@https://huggingface.co/vinai/phobert-base/resolve/main/bpe.codeszAhttps://huggingface.co/vinai/phobert-large/resolve/main/bpe.codes   c                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchar r   f/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/phobert/tokenization_phobert.py	get_pairs4   s   r   c                
       s&  e Zd ZdZeZeZeZ								d) fdd	Z
		d*d
ee deee  dee fddZ	d+d
ee deee  dedee f fddZ		d*d
ee deee  dee fddZedd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd*d#ed$ee dee fd%d&Zd'd( Z  ZS ),PhobertTokenizeraO	  
    Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        bos_token (`st`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    <s></s><unk><pad><mask>c
              
      s   || _ || _i | _d| jt|< d| jt|< d| jt|< d| jt|< | | dd | j D | _t|dd}| 	d	d d
 }W d    n1 sQw   Y  dd |D }t
t|tt|| _i | _t jd|||||||	d|
 d S )Nr   r      r   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s    z-PhobertTokenizer.__init__.<locals>.<dictcomp>utf-8encoding
c                 S   s    g | ]}t | d d qS )Nr%   )tuplesplit)r   merger   r   r   
<listcomp>   s     z-PhobertTokenizer.__init__.<locals>.<listcomp>)	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_tokenr   )r	   r
   encoderstradd_from_fileitemsdecoderopenreadr'   dictziprangelen	bpe_rankscachesuper__init__)selfr	   r
   r*   r+   r-   r.   r,   r/   r0   kwargsmerges_handlemerges	__class__r   r   r?   y   s4   

zPhobertTokenizer.__init__Ntoken_ids_0token_ids_1returnc                 C   sD   |du r| j g| | jg S | j g}| jg}|| | | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A PhoBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)cls_token_idsep_token_id)r@   rF   rG   clssepr   r   r    build_inputs_with_special_tokens   s
   z1PhobertTokenizer.build_inputs_with_special_tokensFalready_has_special_tokensc                    sh   |rt  j||ddS |du rdgdgt|  dg S dgdgt|  ddg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rF   rG   rN   Nr   r   )r>   get_special_tokens_maskr;   )r@   rF   rG   rN   rD   r   r   rO      s   0z(PhobertTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )rJ   rI   r;   )r@   rF   rG   rL   rK   r   r   r   $create_token_type_ids_from_sequences   s
   "z5PhobertTokenizer.create_token_type_ids_from_sequencesc                 C   s
   t | jS N)r;   r1   r@   r   r   r   
vocab_size   s   
zPhobertTokenizer.vocab_sizec                 C   s   t | jfi | jS rQ   )r8   r1   added_tokens_encoderrR   r   r   r   	get_vocab   s   zPhobertTokenizer.get_vocabc           
         s  | j v r
 j | S t|}tt|d d |d d g }t|}|s'|S 	 t| fddd}| jvr8ny|\}}g }d}|t|k rz|||}	W n ty`   |	||d   Y n?w |	|||	  |	}|| |kr|t|d k r||d  |kr|
||  |d	7 }n|
||  |d7 }|t|k sFt|}|}t|dkrnt|}q(d
|}|d d }| j |< |S )Nr%   z</w>Tc                    s    j | tdS )Ninf)r<   getfloat)pairrR   r   r   <lambda>  s    z&PhobertTokenizer.bpe.<locals>.<lambda>)keyr   r   r   @@ )r=   r&   listr   minr<   r;   index
ValueErrorextendappendjoin)
r@   tokenr   r   bigramfirstsecondnew_wordijr   rR   r   bpe   sN   

"
,


zPhobertTokenizer.bpec                 C   s8   g }t d|}|D ]}|t| |d q
|S )zTokenize a string.z\S+\n? )refindallrb   r^   rl   r'   )r@   textsplit_tokenswordsre   r   r   r   	_tokenize&  s
   zPhobertTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r1   rW   r,   )r@   re   r   r   r   _convert_token_to_id0  s   z%PhobertTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r5   rW   r,   )r@   r`   r   r   r   _convert_id_to_token4  s   z%PhobertTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rm   r\    )rd   replacestrip)r@   tokens
out_stringr   r   r   convert_tokens_to_string8  s   z)PhobertTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s  t j|std| d d S t j||r|d ndtd  }t j||r,|d ndtd  }t j| jt j|krNt j	| jrNt
| j| n&t j	| jstt|d}| j }|| W d    n1 sow   Y  t j| jt j|krt
| j| ||fS )NzVocabulary path (z) should be a directory-rv   r	   r
   wb)ospathisdirloggererrorrd   VOCAB_FILES_NAMESabspathr	   isfiler   r6   sp_modelserialized_model_protowriter
   )r@   r|   r}   out_vocab_fileout_merge_fileficontent_spiece_modelr   r   r   save_vocabulary=  s&   (
z PhobertTokenizer.save_vocabularyc           	   
   C   s   t |trCz!t|ddd}| | W d   W dS 1 sw   Y  W dS  ty4 } z|d}~w tyB   td| dw | }|D ]!}| }|	d}|dkr\t
d	|d| }t| j| j|< qIdS )
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr!   r"   NzIncorrect encoding detected in z, please rebuild the datasetrm   r%   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer2   r6   r3   FileNotFoundErrorUnicodeError	Exception	readlinesrx   rfindra   r;   r1   )	r@   ffdfnfelineslineTmplineidxr   r   r   r   r3   Z  s.   

zPhobertTokenizer.add_from_file)r   r   r   r   r   r   r   rQ   )NF)__name__
__module____qualname____doc__r   vocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_map&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesr?   r   intr   rM   boolrO   rP   propertyrS   rU   rl   rs   rt   ru   r{   r2   r   r   r3   __classcell__r   r   rD   r   r   D   sb    0-





,
 r   )r   r   rn   shutilr   typingr   r   r   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   r   r   r   r   r   r   r   <module>   s.   
