o
    h_                     @   st   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizerstrainers)BPE)NFKC   )BaseTokenizerc                       sb  e Zd ZdZ							d$deeeeeef f  deeeee	eef e	eef f f  d	eee
f d
ededee dee f fddZededefddZdddgdg dfdeeee f dededeeee
f  dedee defddZdddgdg ddfd eee eee  f dededeeee
f  dedee ded!ee fd"d#Z  ZS )%SentencePieceBPETokenizerzrSentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    N<unk>   ▁TFvocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc           
         s   |d ur|d urt t|||||d}n	t t|||d}|t|d ur.|t|g t |_tj||d|_	t
j||d|_d||||d}	t ||	 d S )N)r   r   r   )r   r   SentencePieceBPE)modelr   r   r   r   )r	   r   token_to_idstradd_special_tokensr   
normalizerr   	Metaspacepre_tokenizerr
   decodersuper__init__)
selfr   r   r   r   r   r   r   	tokenizer
parameters	__class__ b/var/www/html/ai/venv/lib/python3.10/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr%      s   
z"SentencePieceBPETokenizer.__init__vocab_filenamemerges_filenamec                 K   s"   t | |\}}t||fi |S )N)r   	read_filer   )r-   r.   kwargsr   r   r+   r+   r,   	from_file0   s   z#SentencePieceBPETokenizer.from_filei0u     i  files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc           	      C   s:   t j||||||d}t|tr|g}| jj||d dS )z%Train the model using the given filesr4   r5   r6   r7   r8   r9   )trainerN)r   
BpeTrainer
isinstancer   
_tokenizertrain)	r&   r3   r4   r5   r6   r7   r8   r9   r;   r+   r+   r,   r?   5   s   
zSentencePieceBPETokenizer.trainiteratorlengthc	           
      C   s,   t j||||||d}	| jj||	|d dS )z(Train the model using the given iteratorr:   )r;   rA   N)r   r<   r>   train_from_iterator)
r&   r@   r4   r5   r6   r7   r8   r9   rA   r;   r+   r+   r,   rB   M   s   
z-SentencePieceBPETokenizer.train_from_iterator)NNr   r   TNF)__name__
__module____qualname____doc__r   r   r   r   intr   r   boolfloatr%   staticmethodr1   r   r?   r   rB   __classcell__r+   r+   r)   r,   r   
   s    &
 
	r   N)typingr   r   r   r   r   r   
tokenizersr   r	   r
   r   r   tokenizers.modelsr   tokenizers.normalizersr   base_tokenizerr   r   r+   r+   r+   r,   <module>   s     