# coding=utf-8
"""Tokenization classes for CPMAnt."""
import collections
import os
from typing import List, Optional, Tuple

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba


from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "openbmb/cpm-ant-10b": "https://huggingface.co/openbmb/cpm-ant-10b/blob/main/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "openbmb/cpm-ant-10b": 1024,
}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class WordpieceTokenizer(object):
    """Runs greedy longest-match-first WordPiece tokenization on a single word, given a vocabulary."""

    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        # Overly long words are mapped to the unknown token wholesale.
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            # Find the longest substring starting at `start` that is in the vocab.
            end = len(chars)
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                # No prefix matched: emit the unknown token for this character and move on.
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens


class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
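
    Example (a minimal usage sketch, not part of the original docstring; assumes `jieba` is installed and that
    `vocab.txt` is a local CPMAnt-format vocabulary file — the path and text are illustrative only):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer("vocab.txt")  # hypothetical local vocab path
    >>> tokens = tokenizer.tokenize("今天天气真好")  # jieba word cut, then WordPiece within each word
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```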
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Spaces and newlines are stored in the vocab file as dedicated tokens; remap them so that
        # plain " " and "\n" in text hit the right ids.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        # Restore the on-disk names of the remapped space/newline tokens before writing.
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence to which special tokens will be added.
            token_ids_1 (`List[int]`, *optional*): The optional second tokenized sequence to which special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
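
        Example (an illustrative sketch, not part of the original docstring; the input ids are made up):

        ```python
        >>> ids = tokenizer.build_inputs_with_special_tokens([5, 6, 7])
        >>> # ids == [tokenizer.bos_token_id, 5, 6, 7]
        ```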
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
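
        Example (an illustrative sketch, not part of the original docstring; the input ids are made up):

        ```python
        >>> mask = tokenizer.get_special_tokens_mask([5, 6, 7])
        >>> # mask == [1, 0, 0, 0]: one slot for the prepended BOS token, then the sequence tokens
        ```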
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))