o
    h\                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ eeZdZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdS )    N)DictListOptional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c                
   @   sL   e Zd ZdZ		ddedededee fdd	Zd
d Z	de
jfddZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN	tokenizer	file_path
block_size	cache_dirc              
   C   s4  t tdt tj|du rtd| d||j	dd }tj
|\}}tj|d ur2|n|d|jj d| d| }|d }	t|	 tj|r|st }
t|d	}t|| _W d    n1 slw   Y  td
| dt |
  ntd|  g | _t|dd}| }W d    n1 sw   Y  |||}tdt|| d |D ]}| j|||||   qt }
t|d}tj| j|tjd W d    n1 sw   Y  td| dt |
 dd W d    d S W d    d S 1 sw   Y  d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_texti rR   b/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/data/datasets/language_modeling.py__init__-   sX   

$zTextDataset.__init__c                 C   
   t | jS NrA   r:   rF   rR   rR   rS   __len__j      
zTextDataset.__len__returnc                 C   s   t j| j| t jdS )Ndtype)torchtensorr:   longrF   rQ   rR   rR   rS   __getitem__m   s   zTextDataset.__getitem__)FN)r4   
__module____qualname____doc__r   strintr   rT   rY   r^   Tensorrb   rR   rR   rR   rS   r
   (   s    	
=r
   c                   @   sF   e Zd ZdZdededefddZdd Zd	e	ee
jf fd
dZdS )LineByLineTextDatasetr   r   r   r   c                 C   s   t tdt tj|du rtd| dt	
d|  t|dd}dd	 |  D }W d    n1 s=w   Y  ||d
d
|d}|d | _dd	 | jD | _d S )Nr   Fr   r   r   r   r   c                 S   s$   g | ]}t |d kr| s|qS r   )rA   isspace.0linerR   rR   rS   
<listcomp>   s   $ z2LineByLineTextDataset.__init__.<locals>.<listcomp>Tadd_special_tokens
truncation
max_length	input_idsc                 S       g | ]}d t j|t jdiqS rt   r\   r^   r_   r`   rm   erR   rR   rS   ro           )r'   r(   r)   r*   r+   r,   r-   r.   r/   r;   r<   r7   r=   
splitlinesr:   )rF   r   r   r   rN   linesbatch_encodingrR   rR   rS   rT   v   s   
zLineByLineTextDataset.__init__c                 C   rU   rV   rW   rX   rR   rR   rS   rY      rZ   zLineByLineTextDataset.__len__r[   c                 C   
   | j | S rV   r:   ra   rR   rR   rS   rb      rZ   z!LineByLineTextDataset.__getitem__Nr4   rc   rd   re   r   rf   rg   rT   rY   r   r^   r_   rb   rR   rR   rR   rS   ri   q   s
    ri   c                   @   sJ   e Zd ZdZdedededefddZdd	 Zd
e	ee
jf fddZdS )LineByLineWithRefDatasetr   r   r   r   ref_pathc              
   C   s  t tdt tj|du rtd| dtj|du r)td| dt	
d|  t	
d|  t|dd	}| }W d    n1 sNw   Y  d
d |D }t|dd	}dd |  D }W d    n1 svw   Y  t|t|krtd| dt| d| dt| ||dd|d}|d | _dd | jD | _t| j}	t|	D ]}
tj||
 tjd| j|
 d< qd S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   c                 S   s(   g | ]}t |d kr| s| qS rj   )rA   rk   striprl   rR   rR   rS   ro      s   ( z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>c                 S   s*   g | ]}t |d kr| st|qS rj   )rA   rk   jsonloadsrl   rR   rR   rS   ro      s   * zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trp   rt   c                 S   ru   rv   rw   rx   rR   rR   rS   ro      rz   r\   chinese_ref)r'   r(   r)   r*   r+   r,   r-   r.   r/   r;   r<   r7   	readlinesr=   r{   rA   r:   r@   r^   r_   r`   )rF   r   r   r   r   rN   datarefr}   nrQ   rR   rR   rS   rT      sD   


 z!LineByLineWithRefDataset.__init__c                 C   rU   rV   rW   rX   rR   rR   rS   rY      rZ   z LineByLineWithRefDataset.__len__r[   c                 C   r~   rV   r   ra   rR   rR   rS   rb      rZ   z$LineByLineWithRefDataset.__getitem__Nr   rR   rR   rR   rS   r      s
    $r   c                   @   sP   e Zd ZdZdededefddZddd	Zd
d Z	de
eejf fddZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    r   file_dirr   c              	      s8  t tdt tj|du rt| dt	
d|  g | _t|D ]l}tj||}tj|du r@t| dd}t|ddD}| }g }	|D ]3}
d|
v rZd	}qQd
|
v r}d} fdd|	dd  D }| || }| j| g }	qQ|r|	|
 qQW d    n1 sw   Y  q(t	
d d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>c                    s0   g | ]}t |d kr| s  |qS rj   )rA   rk   r>   r?   rl   r   rR   rS   ro      s
    z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>r   zDataset parse finished.)r'   r(   r)   r*   r+   r,   r-   isdirr/   r;   r<   r:   listdirr2   r.   r7   r   create_examples_from_documentextendrB   )rF   r   r   r   	file_namer   article_openrN   original_linesarticle_linesrn   documentr:   rR   r   rS   rT      sH   


z%LineByLineWithSOPTextDataset.__init__皙?c                 C   s  ||j dd }|}t |k rtd|}g }g }d}	d}
|
t|k r||
 }|s/|
d7 }
q|| |	t|7 }	|
t|d ksF|	|kr|rd}t|dkrZtdt|d }g }t|D ]	}|||  q`g }t|t|D ]	}|||  qst|dkst|dkrqt dk rd}||}}nd}dd	 }|||| t|dkstd
t| dt|dkstdt| d|||}|	||}t
j|t
jdt
j|t
jdt
j|rdndt
jdd}|| g }d}	|
d7 }
|
t|k s$|S )'Creates examples for a single document.Tr      r   r         ?Fc                 S   sh   	 t | t | }||krdS t | t |kr| n|}t |dks%tdt dk r/|d= n|  q)z;Truncates a pair of sequences to a maximum sequence length.Tr   z8Sequence length to be truncated must be no less than oner   r   N)rA   r/   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokensrR   rR   rS   truncate_seq_pair-  s   zULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pairLength of sequence a is  which must be no less than 1Length of sequence b is r\   )rt   token_type_idssentence_order_label)r0   r   randintrA   rB   r@   r   r/   rC   $create_token_type_ids_from_sequencesr^   r_   r`   )rF   r   r   r   short_seq_probr   target_seq_lengthr:   current_chunkcurrent_lengthrQ   segmenta_endr   jr   is_nextr   rt   r   examplerR   rR   rS   r      sd   	

Gz:LineByLineWithSOPTextDataset.create_examples_from_documentc                 C   rU   rV   rW   rX   rR   rR   rS   rY   S  rZ   z$LineByLineWithSOPTextDataset.__len__r[   c                 C   r~   rV   r   ra   rR   rR   rS   rb   V  rZ   z(LineByLineWithSOPTextDataset.__getitem__N)r   )r4   rc   rd   re   r   rf   rg   rT   r   rY   r   r^   r_   rb   rR   rR   rR   rS   r      s    
)cr   c                   @   s\   e Zd ZdZ			ddededefdd	Zd
eee  dedefddZ	dd Z
dd ZdS )$TextDatasetForNextSentencePredictionr   Fr   r   r   r   r   c              	   C   sz  t tdt tj|std| d|| _	|| _
tj|\}}tj|d|jj d| d| }	|| _|	d }
t|
 tj|	ry|syt }t|	d}t|| _W d    n1 sew   Y  td|	 d	t |  ntd
|  g g| _t|dd:}	 | }|sn*| }|st| jd dkr| jg  ||}||}|r| jd | qW d    n1 sw   Y  tdt| j d g | _t | jD ]\}}| !||| qt }t|	d}tj"| j|tj#d W d    n	1 sw   Y  td|	 dt | dd W d    d S W d    d S 1 s6w   Y  d S )Nr   r   r   cached_nsp_r   r   r   r   r   r   r   r   Tr   zCreating examples from z documents.r    r!   r#   r$   r%   r&   )$r'   r(   r)   r*   r+   r,   r-   r.   r/   short_seq_probabilitynsp_probabilityr1   r2   r3   r4   r   r   r5   r6   r7   r8   r9   r:   r;   r<   	documentsreadliner   rA   rB   r?   r>   	enumerater   rD   rE   )rF   r   r   r   rG   r   r   rH   rI   rJ   rK   rL   rM   rN   rn   tokens	doc_indexr   rR   rR   rS   rT   _  sr   	


$z-TextDatasetForNextSentencePrediction.__init__r   r   c                 C   s|  || j jdd }|}t | jk rtd|}g }d}d}|t|k r<|| }	||	 |t|	7 }|t|d ksA||kr0|r,d}
t|dkrVtdt|d }
g }t|
D ]	}|||  q\g }t|dksut | j	k rd}|t| }tdD ]}tdt| j
d }||kr nq| j
| }tdt|d }t|t|D ]}|||  t||kr nqt||
 }||8 }nd}t|
t|D ]	}|||  qt|dkstdt| d	t|dkstd
t| d	| j ||}| j ||}tj|tjdtj|tjdtj|rdndtjdd}| j| g }d}|d7 }|t|k s%dS dS )r   Tr   r   r   r   
   Fr   r   r   r\   )rt   r   next_sentence_labelN)r   r0   r   r   r   rA   rB   r@   r   r   r   r/   rC   r   r^   r_   r`   r:   )rF   r   r   r   r   r   r   r   rQ   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsrt   r   r   rR   rR   rS   r     sn   	


zBTextDatasetForNextSentencePrediction.create_examples_from_documentc                 C   rU   rV   rW   rX   rR   rR   rS   rY     rZ   z,TextDatasetForNextSentencePrediction.__len__c                 C   r~   rV   r   ra   rR   rR   rS   rb     rZ   z0TextDatasetForNextSentencePrediction.__getitem__N)Fr   r   )r4   rc   rd   re   r   rf   rg   rT   r   r   rY   rb   rR   rR   rR   rS   r   Z  s    	
UZr   )r   r,   r8   r   r6   r'   typingr   r   r   r^   filelockr   torch.utils.datar   tokenization_utilsr   utilsr	   
get_loggerr4   r;   r)   r
   ri   r   r   r   rR   rR   rR   rS   <module>   s*   
I!0 