o
    hH                     @   s2  d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZmZ dd	lmZ dddZG dd dZdedefddZG dd dZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd  d eZ$G d!d" d"eZ%G d#d$ d$eZ&G d%d& d&eZ'G d'd( d(eZ(G d)d* d*eZ)G d+d, d,e)Z*G d-d. d.e)Z+G d/d0 d0e)Z,G d1d2 d2e)Z-G d3d4 d4e)Z.G d5d6 d6e)Z/G d7d8 d8e)Z0G d9d: d:e)Z1G d;d< d<e)Z2G d=d> d>e)Z3G d?d@ d@e)Z4G dAdB dBe)Z5G dCdD dDe)Z6G dEdF dFe)Z7G dGdH dHe)Z8G dIdJ dJeZ9G dKdL dLe)Z:G dMdN dNeZ;G dOdP dPeZ<G dQdR dReZ=G dSdT dTe)Z>G dUdV dVe)Z?G dWdX dXeZ@i dYe*dZe&d[e+d\ed]e:d^e=d_e,d`e;dae$dbedce(dde-deedfedgedhediei dje*dke!dle$dme%dnedoedpe&dqe2dre&dse&dtedue@dve.dwe/dxe"dyedze&i d{e0d|e#d}e7d~ede4de5dede&de'de1dede8de9de2de3de de>e?e?dZAdefddZBdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)DictListTuple)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availablerequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sT   t  r#dd l}t|jjtdk rddlm} |S ddlm} |S t	t
| )Nr   z4.0.0)sentencepiece_model_pb2)sentencepiece_model_pb2_new)r   google.protobufr   parseprotobuf__version__transformers.utilsr   r   ImportErrorr   format)error_messagegoogler    r    Y/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf!   s   r"   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nsentencepiecer   )SentencePieceProcessor)r   r%   r&   spLoad)selfr$   r&   r    r    r!   __init__3   s   
zSentencePieceExtractor.__init__Nreturnc           
         s   | j   fddt  D |durt|d}}nd}}g }| D ]<\}}g }tdt|D ]}|d| ||d }}	|v rS|	v rS|||	|f q4t|fddd	}|| q't|d
d |d}dd |D }|fS )z
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                    s   i | ]}  ||qS r    )id_to_piece).0index)r'   r    r!   
<dictcomp>@       z2SentencePieceExtractor.extract.<locals>.<dictcomp>NTFr   c                    s    | d   | d  fS )Nr   r   r    )x)vocabr    r!   <lambda>N   r0   z0SentencePieceExtractor.extract.<locals>.<lambda>)keyc                 S   s   | d S )N   r    )valr    r    r!   r3   Q   s    )r4   reversec                 S   s   g | ]
}|d  |d fqS )r   r   r    )r-   r6   r    r    r!   
<listcomp>R       z2SentencePieceExtractor.extract.<locals>.<listcomp>)	r'   rangeGetPieceSizedictitemslenappendsortedextend)
r)   vocab_scoresr7   mergesmergepiece_scorelocalr.   piece_lpiece_rr    )r'   r2   r!   extract:   s$   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strr*   r   r   intr   rI   r    r    r    r!   r#   .   s    (r#   piecer+   c                 C   s&   t | dk p| d dkp| d   S )Nr5   ,)r>   isdigit)rQ   r    r    r!   check_number_commaV   s   &rV   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S rJ   )original_tokenizer)r)   rX   r    r    r!   r*   [   s   
zConverter.__init__r+   c                 C   s   t  rJ   )NotImplementedErrorr)   r    r    r!   	converted^   s   zConverter.convertedN)rK   rL   rM   r*   r   r[   r    r    r    r!   rW   Z   s    rW   c                   @      e Zd ZdefddZdS )BertConverterr+   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixrX   r2   r   r   rO   r`   hasattrra   tokenize_chinese_charsre   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r)   r2   	tokenizerrt   re   ru   clssepr|   r}   r    r    r!   r[   c   :   



zBertConverter.convertedNrK   rL   rM   r   r[   r    r    r    r!   r]   b       r]   c                   @   r\   )SplinterConverterr+   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nr_   Fra   Trb   .rightrg    ri   rj   rh   rk   ro   rp   )rX   r2   r   r   rO   r`   rs   ra   rt   re   ru   r
   rv   rw   r   rx   ry   rz   r{   question_tokenr|   r}   question_token_idconvert_tokens_to_idspadding_sider   r~   r   r	   r   )r)   r2   r   rt   re   ru   r   r   questiondotr|   r}   r   dot_token_idrm   r    r    r!   r[      sL   



$"
zSplinterConverter.convertedNr   r    r    r    r!   r      r   r   c                   @   r\   )FunnelConverterr+   c           
      C   r^   )Nr_   Fra   Trb   z:2 $A:0 rh   ri   rj   rk   ro   rp   rr   r   r    r    r!   r[      r   zFunnelConverter.convertedNr   r    r    r    r!   r      r   r   c                   @   r\   )MPNetConverterr+   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nr_   Fra   Trb   rg   rh   z:0 ri   rj   rk   ro   rp   rr   r   r    r    r!   r[      s:   



zMPNetConverter.convertedNr   r    r    r    r!   r      r   r   c                   @   r\   )OpenAIGPTConverterr+   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r2   rC   dropoutr`   end_of_word_suffixfuse_unkT)rf   suffix)rX   encoderlist	bpe_rankskeysr`   r   r   rO   token_to_idadd_special_tokensr
   rv   rw   r   rx   ry   r	   
BPEDecoderr   r)   r2   rC   r`   r   r    r    r!   r[   
  s&   
zOpenAIGPTConverter.convertedNr   r    r    r    r!   r   	  r   r   c                   @   r\   )GPT2Converterr+   c              	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j jrF| j j}| j j}tj| d| d||fgd|_|S tjdd|_|S )	Nr   Fr2   rC   r   continuing_subword_prefixr   r   add_prefix_spacez:0 $A:0z:0 $A:0 $B:1rk   )trim_offsets)rX   r   r   r   r   r   r   r   	ByteLevelr   ry   r	   r   add_bos_token	bos_tokenbos_token_idr   r~   r   )r)   r2   rC   r   bosr   r    r    r!   r[   %  s4   
zGPT2Converter.convertedNr   r    r    r    r!   r   $  r   r   c                   @   r\   )HerbertConverterr+   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   r`   r   F)rf   re   r   )r   r   )rX   r   r   r   r   r   r   r`   r
   rv   rw   r   rx   ry   r	   r   r   r   BertProcessingr{   r}   rz   r|   r   )r)   tokenizer_info_strtoken_suffixr2   rC   r   r    r    r!   r[   H  s.   

zHerbertConverter.convertedNr   r    r    r    r!   r   G  r   r   c                   @   r\   )RobertaConverterr+   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r   r   )rX   r   r   r   r   r   r   r   r   r   ry   r	   r   r   RobertaProcessingr{   r}   rz   r|   r   r)   otr2   rC   r   r    r    r!   r[   g  s,   


zRobertaConverter.convertedNr   r    r    r    r!   r   f  r   r   c                   @   r\   )RoFormerConverterr+   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerr_   Fra   Trb   rg   rh   ri   rj   rk   ro   rp   )"models.roformer.tokenization_utilsr   rX   r2   r   r   rO   r`   rs   ra   re   ru   r
   rv   rw   r   PreTokenizercustomry   rz   r{   r|   r}   r   r~   r   r	   r   )
r)   r   r2   r   re   ru   r   r   r|   r}   r    r    r!   r[     s8   

zRoFormerConverter.convertedNr   r    r    r    r!   r     r   r   c                   @   r\   )DebertaConverterr+   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]rk   )rX   r   r   r   r   r   r   r   r   r   ry   r	   r   r   r~   r   r   r   r    r    r!   r[     s.   
	zDebertaConverter.convertedNr   r    r    r    r!   r     r   r   c                       sb   e Zd Z fddZdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
defddZ  ZS )SpmConverterc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrEt| dd sGtd d S d S d S )Nr   rbhandle_byte_fallbacka  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr*   r"   
ModelProtoopenrX   
vocab_fileParseFromStringreadprototrainer_specbyte_fallbackgetattrwarningswarn)r)   args	model_pb2mf	__class__r    r!   r*     s   

zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r    rQ   scorer-   rQ   r    r    r!   r8     r0   z&SpmConverter.vocab.<locals>.<listcomp>piecesr)   r   r    r    r!   r2        zSpmConverter.vocabc                 C   s   |j jS rJ   )r   unk_idr   r    r    r!   r        zSpmConverter.unk_idc           	      C   s   |j j}| |}| |}|dkrtt||}|S |dkr@t| jj	 \}}dd t
|D }tt|||j jdd}|S td)Nr   r5   c                 S      i | ]	\}\}}||qS r    r    )r-   iwordr   r    r    r!   r/         z*SpmConverter.tokenizer.<locals>.<dictcomp>T)r`   r   ]You're trying to run a `Unigram` model but you're file was trained with a different algorithm)r   
model_typer2   r   r   r   r#   rX   r   rI   	enumerater   	unk_piece	Exception)	r)   r   r   rB   r   r   _rC   	bpe_vocabr    r    r!   r     s*   

zSpmConverter.tokenizerc                 C   sD   |j j}|stttddgS tt|ttddgS N {2,}r   )normalizer_specprecompiled_charsmapr
   SequenceReplacer   Precompiled)r)   r   r   r    r    r!   rw     s   zSpmConverter.normalizerc                 C      t j||dS Nreplacementr   )r   	Metaspacer)   r   r   r    r    r!   ry        zSpmConverter.pre_tokenizerc                 C      d S rJ   r    rZ   r    r    r!   r        zSpmConverter.post_processorc                 C   r   r   )r	   r   r   r    r    r!   r     r   zSpmConverter.decoderr+   c                 C   sl   |  | j}| | j}|d ur||_d}d}| ||}|d ur$||_| |||_|  }|r4||_|S )N   ▁T)r   r   rw   ry   r   r   )r)   r   rw   r   r   ry   r   r    r    r!   r[     s   zSpmConverter.converted)rK   rL   rM   r*   r2   r   r   rw   ry   r   r   r   r[   __classcell__r    r    r   r!   r     s    	r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r   )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   rV   rQ   r   r   r    r    r!   r8   )      $z)AlbertConverter.vocab.<locals>.<listcomp>r   r   r    r    r!   r2   (     zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r   r   r
   r   rX   keep_accentsr?   NFKDStripAccentsru   	Lowercaser   r   r   r   r   r)   r   list_normalizersr   r    r    r!   rw   .     


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   rk   r   r~   rX   r   rZ   r    r    r!   r   A     zAlbertConverter.post_processorNrK   rL   rM   r2   rw   r   r    r    r    r!   r   '      r   c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S N   r    r)   r   r   r    r    r!   r   M     zBarthezConverter.unk_idc                 C   r  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>rk   r  rZ   r    r    r!   r   Q  r  zBarthezConverter.post_processorN)rK   rL   rM   r   r   r    r    r    r!   r  L  s    r  c                   @   r   )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        z<pad>r$  )z</s>NOTUSEDr$  <unk>r$  )z<unk>NOTUSEDic                 S   r   r    r   r   r    r    r!   r8   f  r0   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>r$  r   r)   r   r2   r    r    r!   r2   ]  s   
zCamembertConverter.vocabc                 C      dS r  r    r   r    r    r!   r   j     zCamembertConverter.unk_idc                 C   r  r  r  rZ   r    r    r!   r   n  r  z!CamembertConverter.post_processorNrK   rL   rM   r2   r   r   r    r    r    r!   r"  \  s    r"  c                   @   r   )DebertaV2Converterc                 C   s<   g }| j jr|tjdd |tj||d t|S )Nisolated)behaviorr   )rX   split_by_punctr?   r   Punctuationr   r   )r)   r   r   list_pretokenizersr    r    r!   ry   z  s
   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S r   )rX   ru   r?   r
   r  Stripr   r   r   r   r   r   r  r    r    r!   rw     s   
zDebertaV2Converter.normalizerc                 C   r  r  r  rZ   r    r    r!   r     r  z!DebertaV2Converter.post_processorN)rK   rL   rM   ry   rw   r   r    r    r    r!   r-  y  s    r-  c                   @   r   )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr   r$  r%  r!  r$  r&  c                 S   r   r    r   r   r    r    r!   r8     r0   z(MBartConverter.vocab.<locals>.<listcomp>r  )ar_ARr$  cs_CZr$  de_DEr$  en_XXr$  es_XXr$  et_EEr$  fi_FIr$  fr_XXr$  gu_INr$  hi_INr$  it_ITr$  ja_XXr$  kk_KZr$  ko_KRr$  lt_LTr$  lv_LVr$  my_MMr$  ne_NPr$  nl_XXr$  ro_ROr$  ru_RUr$  si_LKr$  tr_TRr$  vi_VNr$  zh_CNr$  r(  r   r)  r    r    r!   r2     s
   
zMBartConverter.vocabc                 C   r*  r  r    r   r    r    r!   r     r   zMBartConverter.unk_idc                 C   r  )Nz$A </s> en_XXz$A $B </s> en_XXr@  r!  rk   r  rZ   r    r    r!   r     r  zMBartConverter.post_processorNr,  r    r    r    r!   r4    s    &r4  c                   @   r   )MBart50Converterc                 C   r5  )Nr6  c                 S   r   r    r   r   r    r    r!   r8     r0   z*MBart50Converter.vocab.<locals>.<listcomp>r  )4r9  r;  r=  r?  rA  rC  rE  rG  rI  rK  rM  rO  rQ  rS  rU  rW  rY  r[  r]  r_  ra  rc  re  rg  ri  )af_ZAr$  )az_AZr$  )bn_INr$  )fa_IRr$  )he_ILr$  )hr_HRr$  )id_IDr$  )ka_GEr$  )km_KHr$  )mk_MKr$  )ml_INr$  )mn_MNr$  )mr_INr$  )pl_PLr$  )ps_AFr$  )pt_XXr$  )sv_SEr$  )sw_KEr$  )ta_INr$  )te_INr$  )th_THr$  )tl_XXr$  )uk_UAr$  )ur_PKr$  )xh_ZAr$  )gl_ESr$  )sl_SIr$  r(  r   r)  r    r    r!   r2     
   
zMBart50Converter.vocabc                 C   r*  r  r    r   r    r    r!   r     r   zMBart50Converter.unk_idc                 C   r  )Nzen_XX $A </s>zen_XX $A $B </s>r@  r!  rk   r  rZ   r    r    r!   r     r  zMBart50Converter.post_processorNr,  r    r    r    r!   rk        rk  c                   @   r   )NllbConverterc                 C   r5  )Nr6  c                 S   r   r    r   r   r    r    r!   r8     r0   z'NllbConverter.vocab.<locals>.<listcomp>r  ))ace_Arabr$  )ace_Latnr$  )acm_Arabr$  )acq_Arabr$  )aeb_Arabr$  )afr_Latnr$  )ajp_Arabr$  )aka_Latnr$  )amh_Ethir$  )apc_Arabr$  )arb_Arabr$  )ars_Arabr$  )ary_Arabr$  )arz_Arabr$  )asm_Bengr$  )ast_Latnr$  )awa_Devar$  )ayr_Latnr$  )azb_Arabr$  )azj_Latnr$  )bak_Cyrlr$  )bam_Latnr$  )ban_Latnr$  )bel_Cyrlr$  )bem_Latnr$  )ben_Bengr$  )bho_Devar$  )bjn_Arabr$  )bjn_Latnr$  )bod_Tibtr$  )bos_Latnr$  )bug_Latnr$  )bul_Cyrlr$  )cat_Latnr$  )ceb_Latnr$  )ces_Latnr$  )cjk_Latnr$  )ckb_Arabr$  )crh_Latnr$  )cym_Latnr$  )dan_Latnr$  )deu_Latnr$  )dik_Latnr$  )dyu_Latnr$  )dzo_Tibtr$  )ell_Grekr$  )eng_Latnr$  )epo_Latnr$  )est_Latnr$  )eus_Latnr$  )ewe_Latnr$  )fao_Latnr$  )pes_Arabr$  )fij_Latnr$  )fin_Latnr$  )fon_Latnr$  )fra_Latnr$  )fur_Latnr$  )fuv_Latnr$  )gla_Latnr$  )gle_Latnr$  )glg_Latnr$  )grn_Latnr$  )guj_Gujrr$  )hat_Latnr$  )hau_Latnr$  )heb_Hebrr$  )hin_Devar$  )hne_Devar$  )hrv_Latnr$  )hun_Latnr$  )hye_Armnr$  )ibo_Latnr$  )ilo_Latnr$  )ind_Latnr$  )isl_Latnr$  )ita_Latnr$  )jav_Latnr$  )jpn_Jpanr$  )kab_Latnr$  )kac_Latnr$  )kam_Latnr$  )kan_Kndar$  )kas_Arabr$  )kas_Devar$  )kat_Georr$  )knc_Arabr$  )knc_Latnr$  )kaz_Cyrlr$  )kbp_Latnr$  )kea_Latnr$  )khm_Khmrr$  )kik_Latnr$  )kin_Latnr$  )kir_Cyrlr$  )kmb_Latnr$  )kon_Latnr$  )kor_Hangr$  )kmr_Latnr$  )lao_Laoor$  )lvs_Latnr$  )lij_Latnr$  )lim_Latnr$  )lin_Latnr$  )lit_Latnr$  )lmo_Latnr$  )ltg_Latnr$  )ltz_Latnr$  )lua_Latnr$  )lug_Latnr$  )luo_Latnr$  )lus_Latnr$  )mag_Devar$  )mai_Devar$  )mal_Mlymr$  )mar_Devar$  )min_Latnr$  )mkd_Cyrlr$  )plt_Latnr$  )mlt_Latnr$  )mni_Bengr$  )khk_Cyrlr$  )mos_Latnr$  )mri_Latnr$  )zsm_Latnr$  )mya_Mymrr$  )nld_Latnr$  )nno_Latnr$  )nob_Latnr$  )npi_Devar$  )nso_Latnr$  )nus_Latnr$  )nya_Latnr$  )oci_Latnr$  )gaz_Latnr$  )ory_Oryar$  )pag_Latnr$  )pan_Gurur$  )pap_Latnr$  )pol_Latnr$  )por_Latnr$  )prs_Arabr$  )pbt_Arabr$  )quy_Latnr$  )ron_Latnr$  )run_Latnr$  )rus_Cyrlr$  )sag_Latnr$  )san_Devar$  )sat_Bengr$  )scn_Latnr$  )shn_Mymrr$  )sin_Sinhr$  )slk_Latnr$  )slv_Latnr$  )smo_Latnr$  )sna_Latnr$  )snd_Arabr$  )som_Latnr$  )sot_Latnr$  )spa_Latnr$  )als_Latnr$  )srd_Latnr$  )srp_Cyrlr$  )ssw_Latnr$  )sun_Latnr$  )swe_Latnr$  )swh_Latnr$  )szl_Latnr$  )tam_Tamlr$  )tat_Cyrlr$  )tel_Telur$  )tgk_Cyrlr$  )tgl_Latnr$  )tha_Thair$  )tir_Ethir$  )taq_Latnr$  )taq_Tfngr$  )tpi_Latnr$  )tsn_Latnr$  )tso_Latnr$  )tuk_Latnr$  )tum_Latnr$  )tur_Latnr$  )twi_Latnr$  )tzm_Tfngr$  )uig_Arabr$  )ukr_Cyrlr$  )umb_Latnr$  )urd_Arabr$  )uzn_Latnr$  )vec_Latnr$  )vie_Latnr$  )war_Latnr$  )wol_Latnr$  )xho_Latnr$  )ydd_Hebrr$  )yor_Latnr$  )yue_Hantr$  )zho_Hansr$  )zho_Hantr$  )zul_Latnr$  r(  r   r)  r    r    r!   r2     r  zNllbConverter.vocabc                 C   r*  r  r    r   r    r    r!   r     r   zNllbConverter.unk_idc                 C   r  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>r  r!  rk   r  rZ   r    r    r!   r     r  zNllbConverter.post_processorNr,  r    r    r    r!   r    r  r  c                   @   r   )SeamlessM4TConverterc                 C   (   g d}|dd |j dd  D 7 }|S )N)r%  r&  r7  r8  c                 S   r   r    r   r   r    r    r!   r8     r0   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r  r   r)  r    r    r!   r2     s   zSeamlessM4TConverter.vocabc                 C   s   | j jS rJ   )rX   unk_token_idr   r    r    r!   r     r   zSeamlessM4TConverter.unk_idc                 C   r  )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r!  rk   r  rZ   r    r    r!   r     r  z#SeamlessM4TConverter.post_processorNr,  r    r    r    r!   rT    s    
rT  c                   @   r   )XLMRobertaConverterc                 C   r#  )Nr6  c                 S   r   r    r   r   r    r    r!   r8   %  r0   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r  r(  r   r)  r    r    r!   r2     s   
zXLMRobertaConverter.vocabc                 C   r  r  r    r  r    r    r!   r   )  r  zXLMRobertaConverter.unk_idc                 C   r  r  r  rZ   r    r    r!   r   -  r  z"XLMRobertaConverter.post_processorNr,  r    r    r    r!   rX        rX  c                   @   r   )XLNetConverterc                 C   r   )Nc                 S   r  r  r  r   r    r    r!   r8   :  r  z(XLNetConverter.vocab.<locals>.<listcomp>r   r   r    r    r!   r2   9  r  zXLNetConverter.vocabc                 C   r  r  r
  r  r    r    r!   rw   ?  r  zXLNetConverter.normalizerc                 C   r  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>rk   r  rZ   r    r    r!   r   R  r  zXLNetConverter.post_processorNr  r    r    r    r!   rZ  8  r  rZ  c                   @      e Zd ZdS )ReformerConverterNrK   rL   rM   r    r    r    r!   r\  ]      r\  c                   @   r  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S r  )r
   r   r   rX   r  r?   r  r  ru   r  r   r   r   r   r  r    r    r!   rw   c  s   


zRemBertConverter.normalizerc                 C   r  r  r  rZ   r    r    r!   r   v  r  zRemBertConverter.post_processorN)rK   rL   rM   rw   r   r    r    r    r!   r_  a  s    r_  c                   @   r[  )BertGenerationConverterNr]  r    r    r    r!   r`    r^  r`  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nr$  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr    r-   r   r    r    r!   r8     r9   z*PegasusConverter.vocab.<locals>.<listcomp>r5   c                 S   r   r    r   r   r    r    r!   r8     r0   )	rX   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetr:   r   r)  r    r    r!   r2     s   

zPegasusConverter.vocabc                 C   s   |j j| jj S rJ   )r   r   rX   rj  r   r    r    r!   r     r   zPegasusConverter.unk_idc                 C   s   t t  t j||dgS r   )r   r   WhitespaceSplitr   r   r    r    r!   ry     s
   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Brk   )rX   rf  eos_token_idr   r~   )r)   eosrn   r    r    r!   r     s   
zPegasusConverter.post_processorN)rK   rL   rM   r2   r   ry   r   r    r    r    r!   ra    s
    ra  c                   @   r  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r   r    r   r   r    r    r!   r8     r0   z%T5Converter.vocab.<locals>.<listcomp>c                 S   rb  )z
<extra_id_rc  r$  r    rd  r    r    r!   r8     r9   r   rR   )rX   
_extra_idsr   r:   )r)   r   num_extra_idsr2   r    r    r!   r2     s   zT5Converter.vocabc                 C   s&   t jddgg dd| jdfgdS )Nrl  r!  )rl  r!  rm  r!  rk   r  rZ   r    r    r!   r     s   zT5Converter.post_processorN)rK   rL   rM   r2   r   r    r    r    r!   rp    s    rp  c                   @   r\   )WhisperConverterr+   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )rh   r    )r-   tokenr    r    r!   r8     s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 rh   z $A:0 $B:1 rj   rk   )rX   r   r   r   r   r   r   r   r   r   ry   r	   r   prefix_tokensconvert_ids_to_tokensrf  rn  joinr   r~   zipr   )	r)   r2   rC   r   prefix_token_idsprefixesro  rn  prefix_templater    r    r!   r[     s8   
	zWhisperConverter.convertedNr   r    r    r    r!   rs    r   rs  c                   @   s   e Zd Zdd ZdS )BigBirdConverterc                 C   r  r  r  rZ   r    r    r!   r     r  zBigBirdConverter.post_processorN)rK   rL   rM   r   r    r    r    r!   r|    s    r|  c                   @   r\   )CLIPConverterr+   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr2   rC   r   r   r   r   r`   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedT)r/  invertr   r   )rX   r   r   r   r   r`   r   r   rO   r
   r   NFCr   r   r  rw   r   Splitr   ry   r	   r   r   r   rf  rn  r   r   r   r   r    r    r!   r[     sD   


zCLIPConverter.convertedNr   r    r    r    r!   r}    r   r}  c                   @   r\   )LayoutLMv2Converterr+   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nr_   FTra   rb   rg   rh   ri   rj   rk   ro   rp   rr   r   r    r    r!   r[     r   zLayoutLMv2Converter.convertedNr   r    r    r    r!   r    r   r  c                   @   r\   )BlenderbotConverterr+   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 rh   )rl   rn   )rX   r   r   r   r   r   r   r   r   r   ry   r	   r   r   r~   rf  rn  r   r   r    r    r!   r[   @  s*   

zBlenderbotConverter.convertedNr   r    r    r    r!   r  ?  r   r  c                   @   r   )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nr6  c                 S   r   r    r   r   r    r    r!   r8   d  r0   z'XGLMConverter.vocab.<locals>.<listcomp>r  ))z<madeupword0>r$  )z<madeupword1>r$  )z<madeupword2>r$  )z<madeupword3>r$  )z<madeupword4>r$  )z<madeupword5>r$  )z<madeupword6>r$  r   r)  r    r    r!   r2   ]  s   zXGLMConverter.vocabc                 C   r  r  r    r  r    r    r!   r   h  r  zXGLMConverter.unk_idc                 C   r  )Nz</s> $Az</s> $A </s> </s> $Br   r!  rk   r  rZ   r    r    r!   r   l  r  zXGLMConverter.post_processorNr,  r    r    r    r!   r  \  rY  r  c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )LlamaConverterTc                 C   rU  )N)r&  r7  r8  c                 S   r   r    r   r   r    r    r!   r8     r0   z(LlamaConverter.vocab.<locals>.<listcomp>r  r   r)  r    r    r!   r2   z  s   zLlamaConverter.vocabc                 C   r  )Nr   r    r  r    r    r!   r     r  zLlamaConverter.unk_idc              	   C   s,   t t ddt  t  t jdddgS )Nr   r   r   )contentleft)r	   r   r   ByteFallbackFuser3  r   r    r    r!   r     s   
zLlamaConverter.decoderc           	   	   C   s   |j j}| |}|dkr0dd l}t|jtdk r%tt|d}|S tt|ddd}|S |dkrnt	| j
j|\}}dd t|D }tt|||j jddd	}|td
dddtddddtddddg |S td)Nr   r   z0.14.0T)r   r5   c                 S   r   r    r    )r-   r   r   _scorer    r    r!   r/     r   z,LlamaConverter.tokenizer.<locals>.<dictcomp>)r`   r   r   r'  F)
normalizedspecialr   r!  r   )r   r   r2   
tokenizersr   r   r   r   r   r#   rX   r   rI   r   r   r   r   r   r   )	r)   r   r   rB   r  r   r   rC   r   r    r    r!   r     s2   
zLlamaConverter.tokenizerc                 C   s    t t jddt jdddgS )Nr   )prependr   )patternr  )r
   r   Prependr   r   r    r    r!   rw     s
   
zLlamaConverter.normalizerc                 C   r   rJ   r    r   r    r    r!   ry     r   zLlamaConverter.pre_tokenizerc                 C   r   rJ   r    rZ   r    r    r!   r     r+  zLlamaConverter.post_processorN)rK   rL   rM   r   r2   r   r   r   rw   ry   r   r    r    r    r!   r  w  s    	
r  c                   @   r\   )MarkupLMConverterr+   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr~  r   z $A z $B rk   )rX   r   r   r   r   r   r   r`   r   r   r   ry   r	   r   rO   rz   r{   r|   r}   r   r~   r   )	r)   r   r2   rC   r   r   r   r|   r}   r    r    r!   r[     s8   
	zMarkupLMConverter.convertedNr   r    r    r    r!   r    r   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizer)LlamaTokenizerCodeLlamaTokenizerc                 C   s@   | j j}|tvrtd| dtt  t| }||  S )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zAn instance of tokenizer class zv cannot be converted in a Fast tokenizer instance. No converter was found. Currently available slow->fast convertors: )r   rK   SLOW_TO_FAST_CONVERTERS
ValueErrorr   r   r[   )transformer_tokenizertokenizer_class_nameconverter_classr    r    r!   convert_slow_tokenizer  s   
r  )r   )CrN   r   typingr   r   r   	packagingr   r  r   r   r   r	   r
   r   r   tokenizers.modelsr   r   r   utilsr   r   utils.import_utilsr   r"   r#   rO   boolrV   rW   r]   r   r   r   r   r   r   r   r   r   r   r   r  r"  r-  r4  rk  r  rT  rX  rZ  r\  r_  r`  ra  rp  rs  r|  r}  r  r  r  r  r  r  r  r    r    r    r!   <module>   s2  $
('2''#'^% 5% '$+'I&	
 !"#$%&'()*+,-./012349