o
    h9                     @   s   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z
ddlmZmZmZ ddlmZ eeZdZdd	iZdd
ddiZdddZg dZG dd deZdS )    N)copyfile)AnyDictListOptionalTuple   )
AddedTokenBatchEncodingPreTrainedTokenizer)loggingu   ▁
vocab_filezsentencepiece.bpe.modelzVhttps://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.modelzUhttps://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model)zfacebook/mbart-large-en-rozfacebook/mbart-large-cc25i   )ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                       s$  e Zd ZU dZeZeZeZ	ddgZ
g Zee ed< g Zee ed< 								
					dGdeeeef  f fddZdd Zdd Zedd ZedefddZejdeddfddZ	dHdee deee  dedee f fddZ	dIdee deee  dee fd d!Z	dIdee deee  dee fd"d#Zd$ed%ee d&ee fd'd(Zd)d* Z d+edee fd,d-Z!d.d/ Z"d0d1 Z#d2d3 Z$dId4ed5ee de%e fd6d7Z&	8		9dJd:ee d%ed;eee  d&ede'f
 fd<d=Z(d>d? Z)d@dA Z*dKdBdCZ+dDeddfdEdFZ,  Z-S )LMBartTokenizeruT  
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```	input_idsattention_maskprefix_tokenssuffix_tokens<s></s><unk><pad><mask>Nsp_model_kwargsc                    s~  t |trt|dddn|}|d u ri n|_tjdi j_jt| |_ddddd_	d_
tj_fd	d
ttD _dd
 j D _tjtj j
 j	d< j	j dd
 j	 D _tj  |d ur  fdd|D  t jd|||||||d |
| jd| |
d ur|
nd_jj _|_j d S )NTF)lstrip
normalizedr         r   )r,   r/   r-   r.   c                    s"   i | ]\}}| j |  j qS  )sp_model_sizefairseq_offset).0icodeselfr6   b/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/mbart/tokenization_mbart.py
<dictcomp>y   s    z+MBartTokenizer.__init__.<locals>.<dictcomp>c                 S      i | ]\}}||qS r6   r6   r9   kvr6   r6   r>   r?   |       r0   c                 S   r@   r6   r6   rA   r6   r6   r>   r?      rD   c                    s   g | ]}| vr|qS r6   r6   )r9   t)_additional_special_tokensr6   r>   
<listcomp>       z+MBartTokenizer.__init__.<locals>.<listcomp>)	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_tokentokenizer_filesrc_langtgt_langadditional_special_tokensr1   r   r6   )
isinstancestrr	   r1   spmSentencePieceProcessorsp_modelLoadr   fairseq_tokens_to_idsr8   lenr7   	enumerateFAIRSEQ_LANGUAGE_CODESlang_code_to_iditemsid_to_lang_codeupdatefairseq_ids_to_tokenslistkeysextendsuper__init__	_src_langcur_lang_code_idrR   set_src_lang_special_tokens)r=   r   rI   rJ   rL   rM   rK   rN   rO   rP   rQ   rR   r1   rS   kwargs	__class__)rF   r=   r>   rg   P   sR   	
 zMBartTokenizer.__init__c                 C   s$   | j  }d |d< | j |d< |S )NrX   sp_model_proto)__dict__copyrX   serialized_model_proto)r=   stater6   r6   r>   __getstate__   s   
zMBartTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr1   r6   )ro   hasattrr1   rV   rW   rX   LoadFromSerializedProtorn   )r=   dr6   r6   r>   __setstate__   s
   
zMBartTokenizer.__setstate__c                 C   s   t | jt | j | j d S )Nr4   )r[   rX   r^   r8   r<   r6   r6   r>   
vocab_size   s   zMBartTokenizer.vocab_sizereturnc                 C   s   | j S N)rh   r<   r6   r6   r>   rQ      s   zMBartTokenizer.src_langnew_src_langc                 C   s   || _ | | j  d S rz   )rh   rj   )r=   r{   r6   r6   r>   rQ      s   Ftoken_ids_0token_ids_1already_has_special_tokensc                    sx   |rt  j||ddS dgt| j }dgt| j }|du r*|dgt|  | S |dgt|  dgt|  | S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r|   r}   r~   r4   Nr   )rf   get_special_tokens_maskr[   r*   r+   )r=   r|   r}   r~   prefix_onessuffix_onesrl   r6   r>   r      s   $z&MBartTokenizer.get_special_tokens_maskc                 C   s,   |du r| j | | j S | j | | | j S )ab  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)r*   r+   )r=   r|   r}   r6   r6   r>    build_inputs_with_special_tokens   s   z/MBartTokenizer.build_inputs_with_special_tokensc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        Nr   )sep_token_idcls_token_idr[   )r=   r|   r}   sepclsr6   r6   r>   $create_token_type_ids_from_sequences   s
   "z3MBartTokenizer.create_token_type_ids_from_sequencesreturn_tensorsrQ   rR   c                 K   sJ   |du s|du rt d|| _| |fd|d|}| |}||d< |S )zIUsed by translation pipeline, to prepare inputs for the generate functionNzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensr   forced_bos_token_id)
ValueErrorrQ   convert_tokens_to_ids)r=   
raw_inputsr   rQ   rR   extra_kwargsinputstgt_lang_idr6   r6   r>   _build_translation_inputs  s   
z(MBartTokenizer._build_translation_inputsc                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r6   )convert_ids_to_tokens)r9   r:   r<   r6   r>   r?     rH   z,MBartTokenizer.get_vocab.<locals>.<dictcomp>)rangerx   ra   added_tokens_encoder)r=   vocabr6   r<   r>   	get_vocab  s   zMBartTokenizer.get_vocabtextc                 C   s   | j j|tdS )N)out_type)rX   encoderU   )r=   r   r6   r6   r>   	_tokenize  s   zMBartTokenizer._tokenizec                 C   s4   || j v r
| j | S | j|}|r|| j S | jS )z0Converts a token (str) in an id using the vocab.)rZ   rX   	PieceToIdr8   unk_token_id)r=   tokenspm_idr6   r6   r>   _convert_token_to_id!  s   

z#MBartTokenizer._convert_token_to_idc                 C   s&   || j v r
| j | S | j|| j S )z=Converts an index (integer) in a token (str) using the vocab.)rb   rX   	IdToPiecer8   )r=   indexr6   r6   r>   _convert_id_to_token*  s   

z#MBartTokenizer._convert_id_to_tokenc                 C   s   d |td }|S )zIConverts a sequence of tokens (strings for sub-words) in a single string.  )joinreplaceSPIECE_UNDERLINEstrip)r=   tokens
out_stringr6   r6   r>   convert_tokens_to_string0  s   z'MBartTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-r   r   wb)ospathisdirloggererrorr   VOCAB_FILES_NAMESabspathr   isfiler   openrX   rq   write)r=   r   r   out_vocab_fileficontent_spiece_modelr6   r6   r>   save_vocabulary5  s"   (

zMBartTokenizer.save_vocabularyr   r!   	src_texts	tgt_textsc                    s"   || _ || _t j||fi |S rz   )rQ   rR   rf   prepare_seq2seq_batch)r=   r   rQ   r   rR   rk   rl   r6   r>   r   F  s   z$MBartTokenizer.prepare_seq2seq_batchc                 C      |  | jS rz   )rj   rQ   r<   r6   r6   r>   _switch_to_input_modeR     z$MBartTokenizer._switch_to_input_modec                 C   r   rz   )set_tgt_lang_special_tokensrR   r<   r6   r6   r>   _switch_to_target_modeU  r   z%MBartTokenizer._switch_to_target_modec                 C   $   | j | | _g | _| j| jg| _dS )z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].Nr^   cur_lang_coder*   eos_token_idr+   )r=   rQ   r6   r6   r>   rj   X     z*MBartTokenizer.set_src_lang_special_tokenslangc                 C   r   )zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].Nr   )r=   r   r6   r6   r>   r   ^  r   z*MBartTokenizer.set_tgt_lang_special_tokens)r,   r-   r-   r,   r.   r/   r0   NNNNN)NFrz   )r   Nr!   )ry   N).__name__
__module____qualname____doc__r   vocab_files_names&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_mapmodel_input_namesr*   r   int__annotations__r+   r   r   rU   r   rg   rs   rw   propertyrx   rQ   setterboolr   r   r   r   r   r   r   r   r   r   r   r
   r   r   r   rj   r   __classcell__r6   r6   rl   r>   r'   3   s   
 N







	 

r'   )r   shutilr   typingr   r   r   r   r   sentencepiecerV   tokenization_utilsr	   r
   r   utilsr   
get_loggerr   r   r   r   r   r   r]   r'   r6   r6   r6   r>   <module>   s&   
