"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/xglm-564M": "https://huggingface.co/facebook/xglm-564M/resolve/main/sentencepiece.bpe.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/xglm-564M": 2048,
}

class XGLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
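
    Examples (illustrative usage; the actual token ids depend on the checkpoint's SentencePiece model):

    ```python
    >>> from transformers import XGLMTokenizer

    >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    >>> input_ids = tokenizer("Hello world")["input_ids"]

    >>> # `sp_model_kwargs` is forwarded to SentencePiece, e.g. to enable subword regularization:
    >>> sampling_tokenizer = XGLMTokenizer.from_pretrained(
    ...     "facebook/xglm-564M", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
    ... )
    ```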
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Compatibility with the original fairseq vocabulary, which reserves extra
        # "madeup word" symbols at the end of the embedding matrix.
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The fairseq vocabulary is shifted by one with respect to the spm vocabulary
        self.fairseq_offset = 1

        sp_size = len(self.sp_model)
        madeup_words = {f"<madeupword{i}>": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
        self.fairseq_tokens_to_ids.update(madeup_words)

        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def __getstate__(self):
        # Drop the (unpicklable) SentencePiece processor and keep its serialized proto instead
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # For backward compatibility with pickles created before `sp_model_kwargs` existed
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
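
        Example (an illustrative sketch; assumes the checkpoint is available and uses the fact that this method
        simply prepends the separator id for a single sequence):

        ```python
        >>> from transformers import XGLMTokenizer

        >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
        >>> ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello"))
        >>> tokenizer.build_inputs_with_special_tokens(ids) == [tokenizer.sep_token_id] + ids
        True
        ```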
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
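
        Example (illustrative; this tokenizer adds only a single leading separator, so only position 0 is special):

        ```python
        >>> from transformers import XGLMTokenizer

        >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
        >>> tokenizer.get_special_tokens_mask([10, 20, 30])
        [1, 0, 0, 0]
        ```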
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed, to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
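
        Example (illustrative; the returned list covers the added separator plus every sequence token):

        ```python
        >>> from transformers import XGLMTokenizer

        >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
        >>> tokenizer.create_token_type_ids_from_sequences([10, 20, 30])
        [0, 0, 0, 0]
        ```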

        """
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    @property
    def vocab_size(self):
        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)
| j | S | j|}|r|| j S | jS )z0Converts a token (str) in an id using the vocab.)r8   r5   	PieceToIdr$   unk_token_id)r%   tokenspm_idr   r   r   _convert_token_to_id  s   

z"XGLMTokenizer._convert_token_to_idc                 C   s&   || j v r
| j | S | j|| j S )z=Converts an index (integer) in a token (str) using the vocab.)r<   r5   	IdToPiecer$   )r%   indexr   r   r   _convert_id_to_token  s   

z"XGLMTokenizer._convert_id_to_tokenc                 C   s   d |td }|S )zIConverts a sequence of tokens (strings for sub-words) in a single string.  )joinreplaceSPIECE_UNDERLINEstrip)r%   tokens
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)