"""Tokenization classes for ESM."""
import os
from typing import List, Optional, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/esm2_t6_8M_UR50D": "https://huggingface.co/facebook/esm2_t6_8M_UR50D/resolve/main/vocab.txt",
        "facebook/esm2_t12_35M_UR50D": "https://huggingface.co/facebook/esm2_t12_35M_UR50D/resolve/main/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/esm2_t6_8M_UR50D": 1024,
    "facebook/esm2_t12_35M_UR50D": 1024,
}


def load_vocab_file(vocab_file):
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]


class EsmTokenizer(PreTrainedTokenizer):
    """
    Constructs an ESM tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text, **kwargs):
        return text.split()

    def get_vocab_size(self, with_added_tokens=False):
        return len(self._id_to_token)

    def get_vocab(self):
        return {token: i for i, token in enumerate(self.all_tokens)}

    def token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        cls = [self.cls_token_id]
        sep = [self.eos_token_id]  # No sep token in the ESM vocabulary; EOS is used instead
        if token_ids_1 is None:
            if self.eos_token_id is None:
                return cls + token_ids_0
            else:
                return cls + token_ids_0 + sep
        elif self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def save_vocabulary(self, save_directory, filename_prefix):
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
        with open(vocab_file, "w") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)

    @property
    def vocab_size(self) -> int:
        return self.get_vocab_size(with_added_tokens=False)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        return super()._add_tokens(new_tokens, special_tokens=True)
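
# ---------------------------------------------------------------------------
# Usage sketch (an illustrative addition, not part of the library module
# above). It shows how the tokenizer splits a protein sequence into one token
# per residue and wraps single sequences as <cls> ... <eos>. The tiny
# vocabulary written below is an assumption made for self-containment; real
# ESM-2 checkpoints ship a full vocab.txt covering all amino acids, which you
# would normally load via EsmTokenizer.from_pretrained(
# "facebook/esm2_t6_8M_UR50D"). The example sequence is arbitrary.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Minimal vocabulary: special tokens plus the residues used in the example.
    tokens = ["<cls>", "<pad>", "<eos>", "<unk>", "A", "I", "K", "M", "Q", "R", "T", "Y", "<mask>"]
    with open("vocab.txt", "w") as f:
        f.write("\n".join(tokens))

    tokenizer = EsmTokenizer(vocab_file="vocab.txt")

    # Each residue becomes one token; build_inputs_with_special_tokens() then
    # prepends <cls> and appends <eos> to the single sequence.
    encoded = tokenizer("MKTAYIAKQR")
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
    # -> ['<cls>', 'M', 'K', 'T', 'A', 'Y', 'I', 'A', 'K', 'Q', 'R', '<eos>']

    # get_special_tokens_mask() marks the two positions those special tokens
    # would occupy with 1 and the residue positions with 0.
    ids = tokenizer.encode("MKTAYIAKQR", add_special_tokens=False)
    print(tokenizer.get_special_tokens_mask(ids))
    # -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]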