o
    h                     @   s   d Z ddlZddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ dZeeZd	ed
edejfddZG dd dZG dd dZdS )z%REALM Retriever model implementation.    N)OptionalUnion)hf_hub_download   )AutoTokenizer)loggingzblock_records.npyblock_records_pathnum_block_recordsreturnc                 C   sF   dd l m  m} |jj| dd}|j|dd}t|d }|S )Nr   i    )buffer_sizeT)drop_remainder   )	tensorflow.compat.v1compatv1dataTFRecordDatasetbatchnexttakeas_numpy_iterator)r   r	   tfblocks_dataset	np_record r   _/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/realm/retrieval_realm.pyconvert_tfrecord_to_np!   s
   r   c                   @   s*   e Zd ZdZ				dddZdd	 Zd
S )ScaNNSearcherztNote that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included.     d   順 c           	      C   sD   ddl m} |||dd}|j|||d}|j|d}| | _dS )zBuild scann searcher.r   )builderdot_product)dbnum_neighborsdistance_measure)
num_leavesnum_leaves_to_searchtraining_sample_size)dimensions_per_blockN)#scann.scann_ops.py.scann_ops_pybindr"   treescore_ahbuildsearcher)	selfr$   r%   r*   r'   r(   r)   Builderr"   r   r   r   __init__.   s   zScaNNSearcher.__init__c                 C   s"   | j |  \}}|dS )Nint64)r/   search_batcheddetachcpuastype)r0   question_projectionretrieved_block_ids_r   r   r   r4   C   s   
zScaNNSearcher.search_batchedN)r   r   r    r!   )__name__
__module____qualname____doc__r2   r4   r   r   r   r   r   +   s    
r   c                       sZ   e Zd ZdZ fddZdddZedeee	e
jf  fd	d
Zdd Zdd Z  ZS )RealmRetrieverah  The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
    positions."

        Parameters:
            block_records (`np.ndarray`):
                A numpy array which cantains evidence texts.
            tokenizer ([`RealmTokenizer`]):
                The tokenizer to encode retrieved texts.
    c                    s   t    || _|| _d S N)superr2   block_records	tokenizer)r0   rB   rC   	__class__r   r   r2   S   s   

zRealmRetriever.__init__Nptc                 C   s   t j| j|dd}| jj|d dd}g }g }	|D ]}
|| |	|
  q| j||	ddd|d}||}|d urE| |||f S d d d |fS )Nr   )indicesaxisT)skip_special_tokens)padding
truncationreturn_special_tokens_mask
max_length)npr   rB   rC   decodeappendconvert_to_tensorsblock_has_answer)r0   r9   question_input_ids
answer_idsrM   return_tensorsretrieved_blocksquestiontext	text_pairretrieved_blockconcat_inputsconcat_inputs_tensorsr   r   r   __call__X   s   

zRealmRetriever.__call__pretrained_model_name_or_pathc                 O   s`   t j|rt j|t}n
td|td|}tj|dd}tj	|g|R i |}| ||S )N)repo_idfilenameT)allow_pickler   )
ospathisdirjoin_REALM_BLOCK_RECORDS_FILENAMEr   rN   loadr   from_pretrained)clsr^   init_inputskwargsr   rB   rC   r   r   r   rh   m   s   
zRealmRetriever.from_pretrainedc                 C   s(   t tj|t| j | j| d S r@   )	rN   saverb   rc   re   rf   rB   rC   save_pretrained)r0   save_directoryr   r   r   rm   {   s   zRealmRetriever.save_pretrainedc                 C   sd  g }g }g }d}|j D ]}| }|| jj}	|	d ||	d d | jj }
|g  |g  |D ]4}t|	d |
D ]*}|d || krh|||t|  |krh|d | |d |t| d  q>q5t|d dkrx|d q|d t|d |krt|d }qt||D ]\}}t||k rdg|t|  }||7 }||7 }q|||fS )z&check if retrieved_blocks has answers.r   r   NFT)		input_idstolistindexrC   sep_token_idrP   rangelenzip)r0   r[   rT   has_answers	start_posend_posmax_answersinput_idinput_id_listfirst_sep_idxsecond_sep_idxansweridx
start_pos_end_pos_paddedr   r   r   rR      s>   
"



zRealmRetriever.block_has_answer)NrF   )r;   r<   r=   r>   r2   r]   classmethodr   r   strrb   PathLikerh   rm   rR   __classcell__r   r   rD   r   r?   H   s    

r?   )r>   rb   typingr   r   numpyrN   huggingface_hubr    r   utilsr   rf   
get_loggerr;   loggerr   intndarrayr   r   r?   r   r   r   r   <module>   s   

