o
    h\E                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZddlmZ ddlmZ eeZdd	d
dddZddiddiddiddidZddiZi ZdZG dd deZdede
ee	f dejfddZdeddfddZ dedee
ef fdd Z!dS )!    N)Path)copyfile)AnyDictListOptionalTupleUnion   )PreTrainedTokenizer)loggingz
source.spmz
target.spmz
vocab.jsonztarget_vocab.jsonztokenizer_config.json)
source_spm
target_spmvocabtarget_vocab_filetokenizer_config_filezHelsinki-NLP/opus-mt-en-dezIhttps://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spmzIhttps://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spmzIhttps://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.jsonzThttps://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json)r   r   r   r      u   ▁c                
       s  e Zd ZdZeZeZeZ	e
ZddgZedZ										
dEdeeeef  ddf fddZdd ZdedefddZdd ZdefddZdedee fddZdedefddZ fddZ fd d!Zd"ee defd#d$ZdFdee fd%d&Z d'd( Z!d)d* Z"e#defd+d,Z$dFd-ed.ee de%e fd/d0Z&defd1d2Z'd3d4 Z(d5d6 Z)defd7d8Z*d9eddfd:d;Z+d<d= Z,d>d? Z-	
dGd@edAee dBe.dee fdCdDZ/  Z0S )HMarianTokenizeraB  
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
    ```	input_idsattention_maskz>>.+<<N<unk></s><pad>r   Fsp_model_kwargsreturnc                    s.  |d u ri n|| _ t| sJ d| || _t|| _t|| jvr)tdt|	| jv s2J |rHt|| _dd | j	 D | _
g | _ndd | j	 D | _
dd | jD | _|| _|| _||g| _t|| j | _t|| j | _| j| _| j| _|   t jd	|||||	|
| j ||d	| d S )
Nzcannot find spm source z <unk> token must be in the vocabc                 S      i | ]\}}||qS  r   .0kvr   r   d/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py
<dictcomp>       z,MarianTokenizer.__init__.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r!   r"      r#   c                 S   s$   g | ]}| d r|dr|qS )z>>z<<)
startswithendswithr   r   r   r   r!   
<listcomp>   s   $ z,MarianTokenizer.__init__.<locals>.<listcomp>)	source_langtarget_lang	unk_token	eos_token	pad_tokenmodel_max_lengthr   r   separate_vocabsr   )r   r   existsr.   	load_jsonencoderstrKeyErrortarget_encoderitemsdecodersupported_language_codesr(   r)   	spm_filesload_spm
spm_source
spm_targetcurrent_spmcurrent_encoder_setup_normalizersuper__init__)selfr   r   r   r   r(   r)   r*   r+   r,   r-   r   r.   kwargs	__class__r   r!   r@      sD   



zMarianTokenizer.__init__c              	   C   sN   zddl m} || jj| _W d S  ttfy&   td dd | _Y d S w )Nr   )MosesPunctNormalizerz$Recommended: pip install sacremoses.c                 S   s   | S Nr   )xr   r   r!   <lambda>   s    z3MarianTokenizer._setup_normalizer.<locals>.<lambda>)	
sacremosesrE   r(   	normalizepunc_normalizerImportErrorFileNotFoundErrorwarningswarn)rA   rE   r   r   r!   r>      s   
z!MarianTokenizer._setup_normalizerrG   c                 C   s   |r|  |S dS )zHCover moses empty string edge case. They return empty list for '' input! )rK   )rA   rG   r   r   r!   rJ      s   zMarianTokenizer.normalizec                 C   s   | j || j | j S rF   )r=   getr*   )rA   tokenr   r   r!   _convert_token_to_id   s   z$MarianTokenizer._convert_token_to_idtextc                 C   s2   | j |}|r|dgng }|| j d|fS )z6Remove language codes like >>fr<< before sentencepiecer   rP   )language_code_rematchgroupsub)rA   rT   rV   coder   r   r!   remove_language_code   s   z$MarianTokenizer.remove_language_codec                 C   s&   |  |\}}| jj|td}|| S )N)out_type)rZ   r<   encoder2   )rA   rT   rY   piecesr   r   r!   	_tokenize   s   zMarianTokenizer._tokenizeindexc                 C   s   | j || jS )z?Converts an index (integer) in a token (str) using the decoder.)r6   rQ   r*   )rA   r_   r   r   r!   _convert_id_to_token   s   z$MarianTokenizer._convert_id_to_tokenc                       t  j|fi |S )ao  
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        )r?   batch_decode)rA   	sequencesrB   rC   r   r!   rb      s   zMarianTokenizer.batch_decodec                    ra   )a  
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        )r?   decode)rA   	token_idsrB   rC   r   r!   rd      s   zMarianTokenizer.decodetokensc                 C   sv   | j r| jn| j}g }d}|D ]}|| jv r$|||| d 7 }g }q|| q|||7 }|td}| S )zQUses source spm if _decode_use_source_tokenizer is True, and target spm otherwiserP    )	_decode_use_source_tokenizerr:   r;   all_special_tokensdecode_piecesappendreplaceSPIECE_UNDERLINEstrip)rA   rf   sp_modelcurrent_sub_tokens
out_stringrR   r   r   r!   convert_tokens_to_string  s   
z(MarianTokenizer.convert_tokens_to_stringc                 C   s$   |du r
|| j g S || | j g S )z=Build model inputs from a sequence by appending eos_token_id.N)eos_token_id)rA   token_ids_0token_ids_1r   r   r!    build_inputs_with_special_tokens  s   z0MarianTokenizer.build_inputs_with_special_tokensc                 C   s   | j | _| j| _d S rF   )r:   r<   r1   r=   rA   r   r   r!   _switch_to_input_mode&  s   z%MarianTokenizer._switch_to_input_modec                 C   s   | j | _| jr| j| _d S d S rF   )r;   r<   r.   r4   r=   rw   r   r   r!   _switch_to_target_mode*  s   z&MarianTokenizer._switch_to_target_modec                 C   s
   t | jS rF   )lenr1   rw   r   r   r!   
vocab_size/  s   
zMarianTokenizer.vocab_sizesave_directoryfilename_prefixc              	   C   s  t j|std| d d S g }| jrOt j||r |d ndtd  }t j||r1|d ndtd  }t| j	| t| j
| || || nt j||rY|d ndtd  }t| j	| || ttd td g| j| j| jgD ]Z\}}}	t j||r|d nd| }
t j|t j|
krt j|rt||
 ||
 q|t j|st|
d	}|	 }|| W d    n1 sw   Y  ||
 q|t|S )
NzVocabulary path (z) should be a directory-rP   r   r   r   r   wb)ospathisdirloggererrorr.   joinVOCAB_FILES_NAMES	save_jsonr1   r4   rk   zipr8   r:   r;   abspathisfiler   openserialized_model_protowritetuple)rA   r|   r}   saved_filesout_src_vocab_fileout_tgt_vocab_fileout_vocab_filespm_save_filenamespm_orig_path	spm_modelspm_save_pathficontent_spiece_modelr   r   r!   save_vocabulary3  sR   


$

zMarianTokenizer.save_vocabularyc                 C   s   |   S rF   )get_src_vocabrw   r   r   r!   	get_vocab`  s   zMarianTokenizer.get_vocabc                 C      t | jfi | jS rF   )dictr1   added_tokens_encoderrw   r   r   r!   r   c     zMarianTokenizer.get_src_vocabc                 C   r   rF   )r   r4   added_tokens_decoderrw   r   r   r!   get_tgt_vocabf  r   zMarianTokenizer.get_tgt_vocabc                 C   s"   | j  }|dd dD  |S )Nc                 S   s   i | ]}|d qS rF   r   r&   r   r   r!   r"   l  s    z0MarianTokenizer.__getstate__.<locals>.<dictcomp>)r:   r;   r<   rK   r   )__dict__copyupdate)rA   stater   r   r!   __getstate__i  s
   
zMarianTokenizer.__getstate__dc                    sF   | _ t dsi  _ fdd jD \ _ _ j _   d S )Nr   c                 3   s    | ]	}t | jV  qd S rF   )r9   r   )r   frw   r   r!   	<genexpr>w  s    z/MarianTokenizer.__setstate__.<locals>.<genexpr>)r   hasattrr   r8   r:   r;   r<   r>   )rA   r   r   rw   r!   __setstate__p  s   
zMarianTokenizer.__setstate__c                 O   s   dS )zJust EOS   r   )rA   argsrB   r   r   r!   num_special_tokens_to_add{  s   z)MarianTokenizer.num_special_tokens_to_addc                    s(   t | j  | j  fdd|D S )Nc                    s   g | ]
}| v r
d ndqS )r   r   r   )r   rG   all_special_idsr   r!   r'     s    z7MarianTokenizer._special_token_mask.<locals>.<listcomp>)setr   removeunk_token_id)rA   seqr   r   r!   _special_token_mask  s   
z#MarianTokenizer._special_token_maskrt   ru   already_has_special_tokensc                 C   s:   |r|  |S |du r|  |dg S |  || dg S )zCGet list where entries are [1] if a token is [eos] or [pad] else 0.Nr   )r   )rA   rt   ru   r   r   r   r!   get_special_tokens_mask  s
   
z'MarianTokenizer.get_special_tokens_mask)	NNNr   r   r   r   NFrF   )NF)1__name__
__module____qualname____doc__r   vocab_files_namesPRETRAINED_VOCAB_FILES_MAPpretrained_vocab_files_mapPRETRAINED_INIT_CONFIGURATIONpretrained_init_configuration&PRETRAINED_POSITIONAL_EMBEDDINGS_SIZESmax_model_input_sizesmodel_input_namesrecompilerU   r   r   r2   r   r@   r>   rJ   rS   rZ   r   r^   intr`   rb   rd   rr   rv   rx   ry   propertyr{   r   r   r   r   r   r   r   r   r   boolr   __classcell__r   r   rC   r!   r   ?   sl    :
>	 -r   r   r   r   c                 C   s   t jdi |}||  |S )Nr   )sentencepieceSentencePieceProcessorLoad)r   r   spmr   r   r!   r9     s   
r9   c                 C   s@   t |d}tj| |dd W d    d S 1 sw   Y  d S )Nw   )indent)r   jsondump)datar   r   r   r   r!   r     s   "r   c                 C   s8   t | d}t|W  d    S 1 sw   Y  d S )Nr)r   r   load)r   r   r   r   r!   r0     s   $r0   )"r   r   r   rN   pathlibr   shutilr   typingr   r   r   r   r   r	   r   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   r   r   rm   r   r2   r   r9   r   r0   r   r   r   r!   <module>   sH    

   S