"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"hf-internal-testing/llama-tokenizer": 2048}

SPIECE_UNDERLINE = "▁"

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure \
that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""


class LlamaTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Llama should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.

    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        legacy=None,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is expected, and simply"
                " means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to"
                " use the new behaviour, set `legacy=False`. This should only be set if you understand what it means,"
                " and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            **kwargs,
        )

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
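
        For instance (illustrative only; the exact pieces depend on the loaded vocabulary):

        ```python
        >>> tokenizer._tokenize("▁Hey")  # encodes "<unk>▁Hey", then strips the leading `unk_token` pieces
        ['▁Hey']
        ```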
        """
        tokens = self.sp_model.encode(text, out_type=str)
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. Encode string + prefix ex: "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE):
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0 and self.legacy:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
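
        Example (illustrative, assuming `./saved` exists):

        ```python
        >>> tokenizer.save_vocabulary("./saved")
        ('./saved/tokenizer.model',)
        ```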
        zVocabulary path (z) should be a directoryN-rr   r   wb)ospathisdirr(   errorjoinVOCAB_FILES_NAMESabspathr   isfiler   r=   r-   rO   write)r0   save_directoryrz   out_vocab_fileficontent_spiece_modelr%   r%   r3   save_vocabulary2  s"   (

zLlamaTokenizer.save_vocabularyc                 C   sL   | j r| jgng }| jr| jgng }|| | }|d ur$|| | | }|S r4   )r   bos_token_idr    eos_token_idr0   token_ids_0token_ids_1r   r   outputr%   r%   r3    build_inputs_with_special_tokensM  s   z/LlamaTokenizer.build_inputs_with_special_tokensr   r   already_has_special_tokensc                    s   |rt  j||ddS | jrdgng }| jrdgng }|du r*|dgt|  | S |dgt|  | | dgt|  | S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
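
        Example (illustrative, with the defaults `add_bos_token=True` and `add_eos_token=False`):

        ```python
        >>> tokenizer.get_special_tokens_mask([3923, 374], None)
        [1, 0, 0]
        ```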
        T)r   r   r   rb   Nr   )r.   get_special_tokens_maskr   r    r5   )r0   r   r   r   r   r   r2   r%   r3   r   X  s(   z&LlamaTokenizer.get_special_tokens_maskc                 C   s`   | j r| jgng }| jr| jgng }dgt|| |  }|dur.|dgt|| |  7 }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
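
        Example (illustrative, with the defaults `add_bos_token=True` and `add_eos_token=False`):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([3923, 374], [264, 1296])
        [0, 0, 0, 1, 1, 1]
        ```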
        r   Nrb   )r   r   r    r   r5   r   r%   r%   r3   $create_token_type_ids_from_sequences}  s   z3LlamaTokenizer.create_token_type_ids_from_sequencesc                 C   sT   t d| jj d d}|d| jrdnd}tddd	d
}|d|}|S )aA  
        LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
        Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
        user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
        rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
        results in an unusual token ordering when it is present. This template should definitely be changed if you wish
        to fine-tune a model with more flexible role ordering!

        The output should look something like:

        <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
        <bos>[INST] Prompt [/INST]

        The reference for this chat template is [this code
        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
        in the original repository.
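
        Example (illustrative, with `use_default_system_prompt=False`):

        ```python
        >>> chat = [{"role": "user", "content": "Hello!"}]
        >>> tokenizer.apply_chat_template(chat, tokenize=False)
        '<s>[INST] Hello! [/INST]'
        ```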
        """
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        template = (
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present
            "{% set system_message = messages[0]['content'] %}"
            "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
            "{% set loop_messages = messages %}"  # Or use the default system message if the flag is set
            "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
            "{% else %}"
            "{% set loop_messages = messages %}"
            "{% set system_message = false %}"
            "{% endif %}"
            "{% for message in loop_messages %}"  # Loop over all non-system messages
            "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
            "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
            "{% endif %}"
            "{% if loop.index0 == 0 and system_message != false %}"  # Embed system message in first message
            "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
            "{% else %}"
            "{% set content = message['content'] %}"
            "{% endif %}"
            "{% if message['role'] == 'user' %}"  # After all of that, handle messages/roles in a fairly normal way
            "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
            "{% elif message['role'] == 'system' %}"
            "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
            "{% elif message['role'] == 'assistant' %}"
            "{{ ' ' + content.strip() + ' ' + eos_token }}"
            "{% endif %}"
            "{% endfor %}"
        )
        template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
        default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
        template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)

        return template
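

# A minimal usage sketch (illustrative; not part of the original module). It assumes a
# SentencePiece model file named "tokenizer.model" is available in the working directory.
if __name__ == "__main__":
    tokenizer = LlamaTokenizer("tokenizer.model", legacy=False)  # legacy=False requires protobuf for the normalizer patch
    ids = tokenizer.encode("Hello world")        # bos_token_id is prepended since add_bos_token=True
    print(tokenizer.convert_ids_to_tokens(ids))  # e.g. ['<s>', '▁Hello', '▁world'] (vocabulary-dependent)
    print(tokenizer.decode(ids))                 # round-trips back to the input text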