o
    hS                     @   sN  U d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% e rsddl&m'Z' ndZ'e(e)Z*ere Z+ee,ee
e, e
e, f f e-d< neg de rdnde rdndffdde rdndffdde rdndffdde rdnde rdndffddde rdndffde rd nddffd!d"d#e rd$nde rd%ndffd&d'e rd(ndffd)d*d+d,de rdndffd-d.e rd/ndffd0de r(d1ndffd2d3e r4d4ndffd5de r@dndffd6d7e rMd8nde rTd9ndffd:d;de rbdndffd<d3e rnd4ndffd=d>e rzd?ndffd@d>e rd?ndffdAdBe rdCnde rdDndffdEdFe rdGndffdHdIe rdJndffdKe rdLnde rdMndffdNdOdPdQd3e rd4ndffdRdSe rdTndffdUe rdVnde rdWndffdXdYe r dZndffd[d\e rd]ndffd^d_e rd`ndffdade r$dndffdbe r/dcnddffdddede r>dfndffdgdhdie rLdjndffdkdldme rZdnndffdode rfdndffdpe rqdqnddffdrd.e r~d/ndffdsd.e rd/ndffdtd.e rd/ndffdude rdvndffdwdxd.e rd/ndffdydzd>e rd?ndffd{d|e rd}ndffd~dd3e rd4ndffdde rdndffdd.e rd/ndffdde rdnde rdndffdde rdndffdde rdndffdde r(dndffdde r4dndffdde r@dndffdde rLdndffde rWdnde r^dndffdde rjdndffdde rvdndffde rdnde rdndffddde rdndffde rdnddffde rdnddffde rdnde rdndffde rdnde rdndffdd3e rd4ndffdde rdndffdde rdnde rdndffde r	dnde rdndffde rdnddffdde r(dndffdde r4dndffdde r@dvndffdd3e rLd4ndffde rWdnde r^dndffdde rjdndffdde rvdndffdde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffdd>e rd?ndffdde rdndffdd.e rd/ndffdd>e rd?ndffdd>e rd?ndffde rd'nde rd(ndffde rd'nde rd(ndffdӑde r%dnde r,dndffddFe r8dGndffd֑dde rFdndffde rQdnddffdڑdde r`dndffdܑdde rndndffde rydnde rdndffde rdnde rdndffdde rdndffdd3e rd4ndffdd3e rd4ndffddde rdndffdde rdvndffde rdnde rdndffde rdnde rdndffde rdnddffdde rdnddffddde rdndffde r'dnde r.dndffde r9dnde r@dndffdd ddde rUdndffde radnde rhdndffdde rudndffdde rdndffdddd	d
de rdndffdd>e rd?ndffde rdnde rdndffdde rϐdnddffde rdnde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffde r*dnde r1dndffZ+ee!e+Z.dd e!/ D Z0de,fddZ1		 	 				 	!d/d"ee,ej2f d#e
ee,ej2f  d$e3d%e3d&e
e	e,e,f  d'e
ee3e,f  d(e
e, d)e3d*e,fd+d,Z4G d-d. d.Z5dS (0  z Auto Tokenizer class.    N)OrderedDict)TYPE_CHECKINGDictOptionalTupleUnion   )PretrainedConfig)get_class_from_dynamic_moduleresolve_trust_remote_code)PreTrainedTokenizer)TOKENIZER_CONFIG_FILE)cached_fileextract_commit_hashis_sentencepiece_availableis_tokenizers_availablelogging   )EncoderDecoderConfig   )_LazyAutoMapping)CONFIG_MAPPING_NAMES
AutoConfigconfig_class_to_model_typemodel_type_to_module_name!replace_list_option_in_docstrings)PreTrainedTokenizerFastTOKENIZER_MAPPING_NAMESalbertAlbertTokenizerAlbertTokenizerFastalignBertTokenizerBertTokenizerFastbark)bart)BartTokenizerBartTokenizerFastbarthezBarthezTokenizerBarthezTokenizerFast)bartpho)BartphoTokenizerNbertzbert-generationBertGenerationTokenizer)zbert-japanese)BertJapaneseTokenizerN)bertweet)BertweetTokenizerNbig_birdBigBirdTokenizerBigBirdTokenizerFastbigbird_pegasusPegasusTokenizerPegasusTokenizerFast)biogpt)BioGptTokenizerN)
blenderbot)BlenderbotTokenizerBlenderbotTokenizerFast)zblenderbot-small)BlenderbotSmallTokenizerNblipzblip-2GPT2TokenizerGPT2TokenizerFastbloomBloomTokenizerFastbridgetowerRobertaTokenizerRobertaTokenizerFastbros)byt5)ByT5TokenizerN	camembertCamembertTokenizerCamembertTokenizerFast)canine)CanineTokenizerNchinese_clipclapclipCLIPTokenizerCLIPTokenizerFastclipseg)clvp)ClvpTokenizerN
code_llamaCodeLlamaTokenizerCodeLlamaTokenizerFastcodegenCodeGenTokenizerCodeGenTokenizerFastconvbertConvBertTokenizerConvBertTokenizerFastcpmCpmTokenizerCpmTokenizerFast)cpmant)CpmAntTokenizerN)ctrl)CTRLTokenizerN)zdata2vec-audioWav2Vec2CTCTokenizerNzdata2vec-textdebertaDebertaTokenizerDebertaTokenizerFastz
deberta-v2DebertaV2TokenizerDebertaV2TokenizerFast
distilbertDistilBertTokenizerDistilBertTokenizerFastdprDPRQuestionEncoderTokenizerDPRQuestionEncoderTokenizerFastelectraElectraTokenizerElectraTokenizerFasternieernie_mErnieMTokenizer)esm)EsmTokenizerNfalconr   )flaubert)FlaubertTokenizerNfnetFNetTokenizerFNetTokenizerFast)fsmt)FSMTTokenizerNfunnelFunnelTokenizerFunnelTokenizerFastgitzgpt-sw3GPTSw3Tokenizergpt2gpt_bigcodegpt_neogpt_neoxGPTNeoXTokenizerFast)gpt_neox_japanese)GPTNeoXJapaneseTokenizerNgptj)zgptsan-japanese)GPTSanJapaneseTokenizerNgroupvitherbertHerbertTokenizerHerbertTokenizerFast)hubertrf   ibertideficsLlamaTokenizerFastinstructblip)jukebox)JukeboxTokenizerNzkosmos-2XLMRobertaTokenizerXLMRobertaTokenizerFastlayoutlmLayoutLMTokenizerLayoutLMTokenizerFast
layoutlmv2LayoutLMv2TokenizerLayoutLMv2TokenizerFast
layoutlmv3LayoutLMv3TokenizerLayoutLMv3TokenizerFast	layoutxlmLayoutXLMTokenizerLayoutXLMTokenizerFastledLEDTokenizerLEDTokenizerFastliltllamaLlamaTokenizerllava
longformerLongformerTokenizerLongformerTokenizerFastlongt5T5TokenizerT5TokenizerFast)luke)LukeTokenizerNlxmertLxmertTokenizerLxmertTokenizerFastm2m_100M2M100TokenizermarianMarianTokenizermbartMBartTokenizerMBartTokenizerFastmbart50MBart50TokenizerMBart50TokenizerFastmegazmegatron-bert)zmgp-str)MgpstrTokenizerNmistralmixtralmlukeMLukeTokenizer
mobilebertMobileBertTokenizerMobileBertTokenizerFastmpnetMPNetTokenizerMPNetTokenizerFastmptmramt5MT5TokenizerMT5TokenizerFastmusicgenmvpMvpTokenizerMvpTokenizerFastnezhanllbNllbTokenizerNllbTokenizerFastznllb-moenystromformer	oneformerz
openai-gptOpenAIGPTTokenizerOpenAIGPTTokenizerFastoptowlv2owlvitpegasus	pegasus_x)	perceiver)PerceiverTokenizerN	persimmonphi)phobert)PhobertTokenizerN
pix2structplbartPLBartTokenizer)
prophetnet)ProphetNetTokenizerNqdqbert)rag)RagTokenizerNrealmRealmTokenizerRealmTokenizerFastreformerReformerTokenizerReformerTokenizerFastrembertRemBertTokenizerRemBertTokenizerFast	retribertRetriBertTokenizerRetriBertTokenizerFastrobertazroberta-prelayernorm)roc_bert)RoCBertTokenizerNroformerRoFormerTokenizerRoFormerTokenizerFastrwkvseamless_m4tSeamlessM4TTokenizerSeamlessM4TTokenizerFastseamless_m4t_v2speech_to_textSpeech2TextTokenizer)speech_to_text_2)Speech2Text2TokenizerNspeecht5SpeechT5Tokenizer)splinter)SplinterTokenizerSplinterTokenizerFastsqueezebertSqueezeBertTokenizerSqueezeBertTokenizerFastswitch_transformerst5)tapas)TapasTokenizerN)tapex)TapexTokenizerN)z
transfo-xl)TransfoXLTokenizerNtvpumt5viltvisual_bert)vits)VitsTokenizerN)wav2vec2rf   )zwav2vec2-conformerrf   )wav2vec2_phoneme)Wav2Vec2PhonemeCTCTokenizerNwhisperWhisperTokenizerWhisperTokenizerFastxclipxglmXGLMTokenizerXGLMTokenizerFast)xlm)XLMTokenizerNzxlm-prophetnetXLMProphetNetTokenizerzxlm-robertazxlm-roberta-xlxlnetXLNetTokenizerXLNetTokenizerFastxmodyosoc                 C   s   i | ]\}}||qS  r8  ).0kvr8  r8  `/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py
<dictcomp>  s    r=  
class_namec              	   C   s   | dkrt S t D ]'\}}| |v r1t|}td| d}zt|| W   S  ty0   Y q
w q
tj	 D ]\}}|D ]}t|dd | krM|    S q=q7td}t
|| r^t|| S d S )Nr   .ztransformers.models__name__transformers)r   r   itemsr   	importlibimport_modulegetattrAttributeErrorTOKENIZER_MAPPING_extra_contenthasattr)r>  module_name
tokenizersmoduleconfig	tokenizermain_moduler8  r8  r<  tokenizer_class_from_name  s,   	


rP  F pretrained_model_name_or_path	cache_dirforce_downloadresume_downloadproxiestokenrevisionlocal_files_only	subfolderc	                 K   s   |	 dd}
|
durtdt |durtd|
}|	dd}t| t||||||||dd|d}|du r<t	d i S t
||}t|d	d
}t|}W d   n1 sWw   Y  ||d< |S )a  
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
              under a user or organization name, like `dbmdz/bert-base-german-cased`.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download (`bool`, *optional*, defaults to `False`):
            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```use_auth_tokenNrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`._commit_hashF)rS  rT  rU  rV  rW  rX  rY  rZ  %_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errorsr^  z\Could not locate the tokenizer configuration file, will try to use the model config instead.zutf-8)encoding)popwarningswarnFutureWarning
ValueErrorgetr   r   loggerinfor   openjsonload)rR  rS  rT  rU  rV  rW  rX  rY  rZ  kwargsr[  commit_hashresolved_config_filereaderresultr8  r8  r<  get_tokenizer_config  sD   I

rr  c                   @   s6   e Zd ZdZdd Zeeedd Zd
dd	Z	dS )AutoTokenizera  
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    c                 C   s   t d)Nz}AutoTokenizer is designed to be instantiated using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.)EnvironmentError)selfr8  r8  r<  __init__[  s   zAutoTokenizer.__init__c              	   O   s  | dd}|dur tdt |dddurtd||d< | dd}d|d< | d	d}| d
d}| dd}|durd}	t|d}
|
du ratd| dddd t D  d|
\}}|ru|durpt	|}	nt
d |	du r}t	|}	|	du rtd| d|	j|g|R i |S t|fi |}d|v r|d |d< |d}d}d|v rt|d ttfr|d }n|d dd}|du rt|tstj|fd|i|}|j}t|drd|jv r|jd }|du}|dupt|tv }t||||}|rB|rB|r|d dur|d }n|d }t||fi |}	| dd}tj|r6|	  |	j|g|R i |S |durd}	|r[|ds[| d}t	|}	|	du rf|}t	|}	|	du rstd| d|	j|g|R i |S t|trt|j t|j!urt
d|j!j" d|j j" d  |j!}t#t|j$}|durtt| \}}|r|s|du r|j|g|R i |S |dur|j|g|R i |S td!td"|j" d#dd$d t D  d)%a8  
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*)
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                file exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
        ```r[  Nr\  rW  r]  rM  T
_from_autouse_fasttokenizer_typetrust_remote_codezPassed `tokenizer_type` z3 does not exist. `tokenizer_type` should be one of z, c                 s   s    | ]}|V  qd S Nr8  r9  cr8  r8  r<  	<genexpr>  s    z0AutoTokenizer.from_pretrained.<locals>.<genexpr>r?  zt`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.zTokenizer class z is not currently imported.r^  tokenizer_classauto_maprs  r   r   code_revisionFastz- does not exist or is not currently imported.z The encoder model config class: z3 is different from the decoder model config class: z. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.zzThis tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.z!Unrecognized configuration class z8 to build an AutoTokenizer.
Model type should be one of c                 s   s    | ]}|j V  qd S r{  )r@  r|  r8  r8  r<  r~  1  s    )%rb  rc  rd  re  rg  rf  r   joinkeysrP  rh  warningfrom_pretrainedrr  
isinstancetuplelistr	   r   r  rI  r  typerG  r   r
   ospathisdirregister_for_auto_classendswithr   decoderencoder	__class__r   r@  )clsrR  inputsrm  r[  rM  rx  ry  rz  r  tokenizer_class_tupletokenizer_class_nametokenizer_fast_class_nametokenizer_configconfig_tokenizer_classtokenizer_auto_maphas_remote_codehas_local_code	class_ref_tokenizer_class_candidate
model_typetokenizer_class_pytokenizer_class_fastr8  r8  r<  r  a  s   O













zAutoTokenizer.from_pretrainedNFc                 C   s   |du r|du rt d|durt|trt d|dur&t|tr&t d|durD|durDt|trD|j|krDt d|j d| d| tjv r[t|  \}}|du rU|}|du r[|}tj| ||f|d dS )	a  
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        NzKYou need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_classz:You passed a fast tokenizer in the `slow_tokenizer_class`.z:You passed a slow tokenizer in the `fast_tokenizer_class`.zThe fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not consistent with the slow tokenizer class you passed (fast tokenizer has z and you passed z!. Fix one of those so they match!)exist_ok)rf  
issubclassr   r   slow_tokenizer_classrG  rH  register)config_classr  fast_tokenizer_classr  existing_slowexisting_fastr8  r8  r<  r  4  s2   

zAutoTokenizer.register)NNF)
r@  
__module____qualname____doc__rv  classmethodr   r   r  r  r8  r8  r8  r<  rs  S  s     Rrs  )NFFNNNFrQ  )6r  rC  rk  r  rc  collectionsr   typingr   r   r   r   r   configuration_utilsr	   dynamic_module_utilsr
   r   tokenization_utilsr   tokenization_utils_baser   utilsr   r   r   r   r   encoder_decoderr   auto_factoryr   configuration_autor   r   r   r   r   tokenization_utils_fastr   
get_loggerr@  rh  r   str__annotations__rG  rB  CONFIG_TO_TYPErP  PathLikeboolrr  rs  r8  r8  r8  r<  <module>   sJ  	
*	
 !"#$%&')/029@FHNOQWXYZ[]ceklmnopqrstuvwxyz{|}~                               "  #  $  %  '  .  4  5  6  8  ?  E  F  G  H  I  K  Q  R  S  U  \  c  i  j  k  l  m  o  v  |                                               !    "    $    '    (    )    +    2    8    9    :    ;    =    A    H    N    O    P    Q    S    Y    Z    [    \    ]    ^    _    `    b    h    i    k    r    y                
	n