o
    h                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlZd dlm  m	Z
 d dlmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZmZ d	dlmZmZ e rZd dlmZmZmZ e rd dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZB d dl=mZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZSmTZTmUZU d dlVmWZWmXZXmYZYmZZZ d?ddZ[dd  Z\G d!d" d"Z]d#d$ Z^G d%d& d&eZ_d'd( Z`G d)d* d*ZaG d+d, d,eZbd-d. ZcG d/d0 d0eZdG d1d2 d2edZeG d3d4 d4edZfG d5d6 d6edZgdi fd7d8ZhG d9d: d:ejjiZjd;d< Zkd=d> ZldS )@    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_availableis_transformers_available)recursively_applysend_to_device)!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSequenceClassifierOutput)get_argsget_num_microbatchesget_tensorboard_writer
get_timersget_tokenizermpuprint_rank_0print_rank_last)_add_data_args_add_validation_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint) MegatronPretrainingRandomSamplerMegatronPretrainingSampler)set_global_variables)_compile_dependencies_init_autoresume_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)	BertModelFloat16ModuleGPTModel	ModelTypeT5Model)Classification)get_megatron_optimizer)get_forward_backward_func)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)_vocab_size_with_padding)	get_modelget_optimizer_param_schedulertraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelTc                 C   s   t  }|jrdnd}|jdkrtd|j d| d td |jdkrE|jr:|jr,d	nd}t||jd
| |d}|S t|jd	| |d}|S |jdkrTt	dd
| |d}|S |jdkret
dd
| |||d}|S td|j )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr   T)num_tokentypesadd_binary_headparallel_outputpre_processpost_process)num_classesr?   rB   rC   gpt)r?   rA   rB   rC   t5)r?   rA   rB   rC   add_encoderadd_decoderUnsupported model type: )r   pretraining_flagrankprintmodel_type_namebert_binary_headr*   r/   
num_labelsr,   r.   
ValueError)rB   rC   rG   rH   argsmoder?   model rT   R/var/www/html/ai/venv/lib/python3.10/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcR   sH   





rV   c                 C   s   |  d t }| jjjd ur(| jjjd u rtd| jjj}| jj|}|S |jdv r1tj	}n|jdkrItj
}|jd u rI|jdkrI|jd |_tt|}|S )NzPreparing modelzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)r>   rE   rF   r   r   )rL   r   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrP   rM   r-   encoder_or_decoderencoder_and_decoder"pipeline_model_parallel_split_rankpipeline_model_parallel_sizer7   rV   )acceleratorrQ   custom_model_provider_funcrS   
model_typerT   rT   rU   prepare_modelz   s$   

	


rb   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                 K   sH   t  }t|}t|}| }t|d | _| j| d| jd< d S )Nr   Tmegatron_dataset_flag)argparseArgumentParserr   r   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argsrT   rT   rU   __init__   s   z"MegatronLMDummyDataLoader.__init__c                 C   s*   t  }| j D ]
\}}t||| qd S N)r   ri   itemssetattr)rk   rQ   keyvaluerT   rT   rU   set_megatron_data_args   s   z0MegatronLMDummyDataLoader.set_megatron_data_argsc                 C      dd }|S )Nc                 S   s   t  }|j|j|j| |j |jd}|jdkr&||j|j	|j
|jd n+|jdkr4|d|ji n|jdkrI||j|j|j	|j
dd ntd|j |jdkr]d	d
lm} nd	d
lm} |di |\}}}|||fS )z&Build train, valid, and test datasets.)data_prefix	data_implsplits_stringtrain_valid_test_num_samplesskip_warmupseedr>   )max_seq_lengthmasked_lm_probshort_seq_probbinary_headrE   
seq_lengthrF   )r}   max_seq_length_decr~   r   dataset_typerI   r   )build_train_valid_test_datasetsNrT   )r   	data_pathrx   splitmmap_warmupr|   rM   rj   r   	mask_probr   rN   encoder_seq_lengthdecoder_seq_lengthrP   megatron.data.gpt_datasetr   megatron.data.dataset_utils)train_val_test_num_samplesrQ   ri   r   train_dsvalid_dstest_dsrT   rT   rU   "train_valid_test_datasets_provider   sH   





zlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_providerrT   )rk   r   rT   rT   rU   &get_train_valid_test_datasets_provider   s   -z@MegatronLMDummyDataLoader.get_train_valid_test_datasets_providerc              	   C   s   |d u rd S t  }|j|j }|jdkr$tt|||t t d}n |jdkr<t	|t|||t t |j
d}ntd|jtjjj|||jddS )Nsingle)total_samplesconsumed_samplesmicro_batch_sizedata_parallel_rankdata_parallel_sizecyclic)r   r   r   r   r   data_shardingz${} dataloader type is not supported.T)batch_samplernum_workers
pin_memory)r   r   num_micro_batchesdataloader_typer#   lenr   get_data_parallel_rankget_data_parallel_world_sizer"   r   	Exceptionformattorchutilsdata
DataLoaderr   )rk   datasetr   rQ   r   r   rT   rT   rU   build_pretraining_data_loader   s4   



z7MegatronLMDummyDataLoader.build_pretraining_data_loaderc                 C   s  dd }t  }d\}}}td |jdkr*|jdkr*|jd u s#J d|j|j |_|jdkrF|jdkrF|jd u rF|j|j |j |j |_t	
 dkr|jrS|j}n|j|j }|j|j d |j }|j}|||j ||j g}	td td	|	d  td
|	d  td|	d  |  }
|
|	\}}}| ||j}| ||j}| |d}|d uo|jdk}|d uo|jdk}|d uo|jdk}tjt|t|t|g}ntjg d}tjj|t	 t	 d |d  |_|d  |_|d  |_|j}|dv sJ |d ur$|dkrt|nt||}nd }|d ur;|dkr4t|nt||}nd }|d urR|dkrKt|nt||}nd }|||fS )Nc                 s   s    	 | D ]}|V  qqrp   rT   )iterxrT   rT   rU   cyclic_iter   s
   zTMegatronLMDummyDataLoader.build_train_valid_test_data_iterators.<locals>.cyclic_iter)NNNz3> building train, validation, and test datasets ...r   z?only backward compatiblity support for iteration-based trainingr   z( > datasets target sizes (minimum size):z    train:      {}z    validation: {}z    test:       {}r   r   r   r   group)r   r   r   )r   r   	iterationconsumed_train_samplestrain_samplesglobal_batch_sizeconsumed_valid_sampleseval_interval
eval_itersr   get_tensor_model_parallel_ranktrain_itersr   r   r   r   cuda
LongTensorintdistributed	broadcast"get_tensor_model_parallel_src_rankget_tensor_model_parallel_groupitemdo_traindo_validdo_testr   r   )rk   r   rQ   train_dataloadervalid_dataloadertest_dataloaderr   r   
test_itersr   r   r   r   r   r   r   r   flagsdl_typetrain_data_iteratorvalid_data_iteratortest_data_iteratorrT   rT   rU   %build_train_valid_test_data_iterators   sn   

 


 
z?MegatronLMDummyDataLoader.build_train_valid_test_data_iteratorsN)	__name__
__module____qualname____doc__ro   ru   r   r   r   rT   rT   rT   rU   rc      s    	0!rc   c           	   
      s.  |  d t }|jstddlm m} t }|j|j } fdd D }|d d u rKt|d t	j
jjr<||d _n|d= |d= |d= ||d	 _n|d	= ||d< t	j
jjjfi ||| jt t | jd
| j | jdS |jd ur|j\|_|_|_nd\|_|_|_ \}}}|||fS )NzPreparing dataloaderr   )_PYTORCH_DATALOADER_KWARGSprepare_data_loaderc                    s   i | ]}|t | | qS rT   )getattr).0kr   
dataloaderrT   rU   
<dictcomp>\  s    z'prepare_data_loader.<locals>.<dictcomp>
batch_sizesamplershuffler   T)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batchesr   )rL   r   rd   data_loaderr   r   r   r   
isinstancer   r   r   BatchSamplerr   r   r   devicer   r   r   r   r   copyr   r   r   r   consumed_test_samplesr   )	r_   r   rQ   r   r   kwargsr   r   r   rT   r   rU   r   T  sP   


r   c                       s:   e Zd Z fddZd
ddZdd Zedd	 Z  ZS )MegatronLMOptimizerWrapperc                    s   t  j|dd d d S )NF)device_placementscalersuperro   )rk   	optimizer	__class__rT   rU   ro     s   z#MegatronLMOptimizerWrapper.__init__Nc                 C      d S rp   rT   )rk   set_to_nonerT   rT   rU   	zero_grad     z$MegatronLMOptimizerWrapper.zero_gradc                 C   r   rp   rT   rk   rT   rT   rU   step  r   zMegatronLMOptimizerWrapper.stepc                 C   s   | j jS )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)r   skipped_iterr   rT   rT   rU   step_was_skipped  s   z+MegatronLMOptimizerWrapper.step_was_skippedrp   )	r   r   r   ro   r   r   propertyr   __classcell__rT   rT   r   rU   r     s    
r   c                 C   s(   |  d t }t||j|j|j}|S )NzPreparing optimizer)rL   r   r0   no_wd_decay_condscale_lr_condlr_mult)r_   rS   rQ   r   rT   rT   rU   prepare_optimizer  s   
r   c                   @   s   e Zd ZdZdddZdS )MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs:
            Other arguments.
    Nr   c                 K   s   || _ || _|| _|| _d S rp   )r   total_num_stepswarmup_num_stepsr   )rk   r   r  r  r   rT   rT   rU   ro     s   
z!MegatronLMDummyScheduler.__init__Nr   )r   r   r   r   ro   rT   rT   rT   rU   r    s    r  c                       s$   e Zd Z fddZdd Z  ZS )MegatronLMSchedulerWrapperc                    s   t  || d S rp   r   )rk   	scheduler
optimizersr   rT   rU   ro     s   z#MegatronLMSchedulerWrapper.__init__c                 O   r   rp   rT   )rk   rQ   r   rT   rT   rU   r     r   zMegatronLMSchedulerWrapper.step)r   r   r   ro   r   r   rT   rT   r   rU   r    s    r  c                 C   s   |  d t|}|S )NzPreparing scheduler)rL   r8   )r_   r   r  rT   rT   rU   prepare_scheduler  s   
r  c                       8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                    s   t    || _d S rp   )r   ro   name)rk   r  r   rT   rU   ro     s   

zAbstractTrainStep.__init__c                 C   r   rp   rT   r   rT   rT   rU   get_batch_func  r   z AbstractTrainStep.get_batch_funcc                 C   r   rp   rT   r   rT   rT   rU   get_forward_step_func  r   z'AbstractTrainStep.get_forward_step_funcc                 C   r   rp   rT   r   rT   rT   rU   get_loss_func  r   zAbstractTrainStep.get_loss_func)	r   r   r   r   ro   r  r  r  r   rT   rT   r   rU   r
    s    r
  c                       r	  )
BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sX   t  d | |j| _| |j|j| _| 	|j|j
| _|js'd | _d S t| _d S )Nr  )r   ro   r  rd   	get_batchr  rJ   rO   	loss_funcr  rN   forward_stepmodel_return_dictmodel_output_classr   rk   rQ   r   rT   rU   ro     s   

zBertTrainStep.__init__c                 C      dd }dd }|r|S |S )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d  }	|d  }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr  r  r  r  r  r  r   int64nextr   broadcast_datalongfloat)data_iteratorkeysdatatyper   data_btokensr  sentence_orderr  	lm_labelsr  rT   rT   rU   get_batch_megatron  s   
z8BertTrainStep.get_batch_func.<locals>.get_batch_megatronc                 S   s   t | }t|tj }|d  }|d  }d|v r#|d  }nd}d|v r:|d  }|d dktj}nd}d}d|v rI|d  }nd}||||||fS )r  	input_idsattention_masktoken_type_idsNr  next_sentence_label)r   r   r   r   current_devicer"  tor#  )r$  r   r(  r  r  r*  r  r)  rT   rT   rU   get_batch_transformer  s    z;BertTrainStep.get_batch_func.<locals>.get_batch_transformerrT   rk   rd   r+  r3  rT   rT   rU   r    s
   zBertTrainStep.get_batch_funcc                    s"   dd } fdd}|r|S |S )Nc           	      S   s   |\}}|  }|   } t|d| d |   }|d urKtj|dd  |ddd}|  }|| }t||g}||d |d dfS |}t|g}|d|d ifS )Nr   )ignore_indexr   r   )lm losszsop lossr7  )r#  r   sumviewreshapeFcross_entropyr:   )	r  r)  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossesrT   rT   rU   loss_func_pretrain  s   ""
z7BertTrainStep.get_loss_func.<locals>.loss_func_pretrainc                    s    dkrt  }||d| d}n&jdkr1| jtjtjfv r1t }||d | d}nt }||| }t	|g}|d|d ifS )Nr   r5  rB  r   )
r   r9  rO   dtyper   r"  r   r   r   r:   )r  logitsloss_fctrB  rC  rO   rk   rT   rU   loss_func_finetune1  s   

z7BertTrainStep.get_loss_func.<locals>.loss_func_finetunerT   )rk   rJ   rO   rD  rI  rT   rH  rU   r    s
   zBertTrainStep.get_loss_funcc                    s    fdd}|S )Nc           
         sb    | \}}}}}} sd}r"|||||d}|tj||fS ||||d}	|	tj|fS )Forward step.Ntokentype_idsr*  )rL  r  r   r  )
r$  rS   r(  r  r)  r  r  r  r=  rF  rN   rJ   rk   rT   rU   r  E  s   z9BertTrainStep.get_forward_step_func.<locals>.forward_steprT   )rk   rJ   rN   r  rT   rN  rU   r  D  s   z#BertTrainStep.get_forward_step_func	r   r   r   r   ro   r  r  r  r   rT   rT   r   rU   r    s    
7'r  c                       r	  )
GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    s   t  d | |j| _|  | _|  | _|j	d | _
|jd ur)t }|j| _
|j| _|j| _|j| _|js=d | _d S t| _d S )NrP  r   )r   ro   r  rd   r  r  r  r  r  padded_vocab_size	eod_token
vocab_filer   eodreset_position_idsreset_attention_maskeod_mask_lossr  r  r   )rk   rQ   	tokenizerr   rT   rU   ro   ]  s   




zGPTTrainStep.__init__c                    s$    fdd} fdd}|r|S |S )Nc                    s   dg}t j}| durt| }nd}t|||}|d  }|ddddf  }|ddddf  }t| j j	 j
 j\}}	}
|||	||
fS )zGenerate a batchr  Nr   r5  )r   r  r   r   r!  r"  
contiguousr<   rR  rU  rV  rW  )r$  r%  r&  r   r'  tokens_r  r(  r-  r  position_idsr   rT   rU   r+  o  s   

z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatronc           	         s   t | }d|d i}t|tj }|d  }tj|jd df|j|j	d j
 }tj||gdd}|d d dd f  }|d d d df  }t| j
 j jd\}}}|||||fS )Nr,  r   r   )rE  r   dimr5  T)r   r   r   r   r1  r"  zerosshaperE  r   rR  concatrY  r<   rU  rV  )	r$  r   rZ  paddingr  r(  r-  r  r[  r   rT   rU   r3    s   $
z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformerrT   r4  rT   r   rU   r  n  s
   zGPTTrainStep.get_batch_funcc                    s   t    fdd}|S )Nc                    sx    j r|\}}n|}| }| d } t|d|  |   }t|g}d|d i} j r8|d|i ||fS )Nr5  r7  r   rF  )return_logitsr#  r9  r   r8  r:   rj   )r  r=  lossesrF  rB  averaged_lossoutput_dictrQ   rT   rU   r    s   

z-GPTTrainStep.get_loss_func.<locals>.loss_func)r   rk   r  rT   rf  rU   r    s   zGPTTrainStep.get_loss_funcc                        fdd}|S )Nc                    s4     | \}}}}}|||||d}|t j|fS )rJ  )r  rM  )r$  rS   r(  r  r  r-  r[  r=  r   rT   rU   r    s   z8GPTTrainStep.get_forward_step_func.<locals>.forward_steprT   rk   r  rT   r   rU   r    s   z"GPTTrainStep.get_forward_step_funcrO  rT   rT   r   rU   rP  U  s    /rP  c                       s\   e Zd ZdZ fddZedd Zedd Zedd	 Zd
d Z	dd Z
dd Z  ZS )T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sH   t  d | |j| _|  | _|  | _|j	sd | _
d S t| _
d S )Nrj  )r   ro   r  rd   r  r  r  r  r  r  r  r   r  r   rT   rU   ro     s   



zT5TrainStep.__init__c                 C   s(   |  d}|  d}|| }|dk }|S )Nr   r         ?)	unsqueeze)r-  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_maskrT   rT   rU   attn_mask_postprocess  s
   

z!T5TrainStep.attn_mask_postprocessc                 C   s&   t t jd| | f|d}|dk }|S Nr   r   rk  )r   trilones)r   r   r-  rT   rT   rU   get_decoder_mask  s   zT5TrainStep.get_decoder_maskc           	      C   s<   | j \}}| d}tj||df|d}|| }|dk }|S rr  )r_  rl  r   ru  )	r-  dec_seq_lengthr   r   _rm  rn  ro  rp  rT   rT   rU   get_enc_dec_mask  s   

zT5TrainStep.get_enc_dec_maskc                 C   r  )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d dk }	|d	 dk }
|d
 dk }|||||	|
|fS )r  )text_enctext_decr  r  enc_maskdec_maskenc_dec_maskNrz  r{  r  r  r|  rk  r}  r~  r  )r$  r%  r&  r   r'  
tokens_enc
tokens_decr  r  r|  r}  r~  rT   rT   rU   r+    s   
z6T5TrainStep.get_batch_func.<locals>.get_batch_megatronc           	      S   s   t | }t|tj }|d  }|d  }|dktj}d|v r+|d  }n'|j|j	|j
tjd}|dddf  |dd	df< d
|d< ||dkd
 t|d  }t|j	d	 |j
}t|d  |j	d	 |j
}|||||||fS )r  r,  r  r/  decoder_input_ids)r   rE  .Nr5  r   r   ).r   r-  )r   r   r   r   r1  r"  r2  r#  	new_zerosr_  r   clonemasked_fill_rj  rq  rv  ry  )	r$  r   r  r  r  r  r|  r}  r~  rT   rT   rU   r3  
  s"    z9T5TrainStep.get_batch_func.<locals>.get_batch_transformerrT   r4  rT   rT   rU   r    s
   zT5TrainStep.get_batch_funcc                 C   rv   )Nc                 S   sH   |  }t|d| d |   }|}t|g}|d|d ifS )Nr5  r7  r   )r#  r   r8  r9  r:  r:   )r  r=  r>  r@  rB  rC  rT   rT   rU   r  '  s
   "
z,T5TrainStep.get_loss_func.<locals>.loss_funcrT   rg  rT   rT   rU   r  &  s   	zT5TrainStep.get_loss_funcc                    rh  )Nc           
   	      s>     | \}}}}}}}||||||d|d}	|	t j|fS )rJ  NrK  rM  )
r$  rS   r  r  r  r*  r|  r}  r~  r=  r   rT   rU   r  3  s   z7T5TrainStep.get_forward_step_func.<locals>.forward_steprT   ri  rT   r   rU   r  2  s   z!T5TrainStep.get_forward_step_func)r   r   r   r   ro   staticmethodrq  rv  ry  r  r  r  r   rT   rT   r   rU   rj    s    



6rj  c                 C   s&  |  d tj sJ dt|dd}| D ]&\}}t||d d ur8|jdkr8t dj|t|||ddd t	||| q|j
sH|d	d
rU|jd usQJ dt| t| t| dd }t }|  t  t  t  t }t|j||_|jdkr|jr|jdkrd|_nd
|_d|_d S )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z[WARNING: overriding default arguments for {key}:{v}                         with {key}:{v2})rs   vv2)flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentc                  S   s   t  } tj }tj | _tj | _|dkrD| j| }| j	d ur,| j	|ks+J dn|| _	t
 r8td nt
| j| j| j| j | jdkrQtd| j t| j| j d S )Nr   z:expected local-rank to be the same as rank % device-count.z%model parallel is already initializedz > setting random seeds to {} ...)r   r   r   device_countr   get_rankrK   get_world_size
world_size
local_rankr   model_parallel_is_initializedrL   initialize_model_paralleltensor_model_parallel_sizer^   $virtual_pipeline_model_parallel_sizer]   r   r|   r'   data_parallel_random_init)rQ   r  r   rT   rT   rU   finish_mpu_initc  s(   




z#initialize.<locals>.finish_mpu_initr>   r   )rL   r   r   is_availabler   rq   r   rK   r   rr   r  getloadr   r   r$   r   r&   r%   r(   r6   orig_vocab_sizerQ  rM   rJ   rO   rN   r   )r_   extra_args_providerargs_defaultsrQ   rs   rt   r  rT   rT   rU   
initializeD  s>   


r  c                       sz   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Z								dddZ  ZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    s   t t|   || _|d | _|| _|| _t }|jj	j
d ur/|jj	j
|fi |jj	j| _n)|jdkr:t|| _n|jdkrEt|| _n|jdkrPt|| _ntd|j d| j_i | _i | _d| _d| _|jd urrt  d S d S )Nr   r>   rE   rF   rI   FT)r   r  ro   module
base_modelr   r  r   rW   rX   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrM   r  rP  rj  rP   r   total_loss_dicteval_total_loss_dictr   report_memory_flagtensorboard_dirr)   )rk   r_   rS   r   r  rQ   r   rT   rU   ro     s6   






zMegatronEngine.__init__c                 C   s    | j D ]}|  q|   d S rp   )r  trainlog_eval_resultsrk   model_modulerT   rT   rU   r    s   

zMegatronEngine.trainc                 C   s   | j D ]}|  qd S rp   )r  evalr  rT   rT   rU   r    s   

zMegatronEngine.evalc                    s  t   t }t|dkr/g  jdkr,td jD ] fdd| D  qn|gt| jdkrSt|dkrJfddtt| jD ndgt| j }nt|dkr]tnd} j	dkrq j
rq| jD ]}|  qj| j  t }|| jj|| j| jdd	d
} jdkrtj  |d  | j | |d  |d  | j |\}}}	|d  |r|d  | j | |d  |r| jdurt  j  j }
| jj|
d d}nd}| | j_ jdkrtj    jt !  j t  7  _t j"ddrFi }|d D ])fdd|D }t|d j#dkr7t$|t| |< qt%||< q||||	fS i |||	fS )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        r   r   c                    .   i | ]\}}|| j  d   j   qS r   r   r   r   r  rQ   irT   rU   r     s    z-MegatronEngine.train_step.<locals>.<dictcomp>c                       g | ]}t  qS rT   r   r   rx  data_chunksrT   rU   
<listcomp>      z-MegatronEngine.train_step.<locals>.<listcomp>NlocalF)forward_onlyzbackward-reduce-model-gradsr   zbackward-gather-model-params)	incrementr   Tignore_virtualc                       g | ]}|  qS rT   rT   r   r   rs   rT   rU   r  %  r  )&r   r   r   r   rangeappendrq   r  r   DDP_impl#use_contiguous_buffers_in_local_ddpzero_grad_bufferr   r   r1   r  r  empty_unused_memory_levelr   r   empty_cachestartreduce_model_gradsstopr   gather_model_paramsr  r   r   r   r   r   r   r   is_pipeline_last_stager_  r8  r`  )rk   
batch_datatimersbatch_data_iterator	partitionforward_backward_funclosses_reducedupdate_successful	grad_normnum_zeros_in_gradr  r   loss_reducedlosses_reduced_for_keyrT   rQ   r  r  rs   rU   
train_step  s   










zMegatronEngine.train_stepc                    s@  t   g  jdkr#td jD ] fdd| D  qn|gt| jdkr<fddtt| jD }nt}t }|| j	j
|| jdddd	} jdkrZtj    jt  j t  7  _tjdd
ri }|d D ]&fdd|D }t|d jdkrt|t| |< qut||< qu|S i S )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        r   r   c                    r  r  r  r  r  rT   rU   r   :  s   . z,MegatronEngine.eval_step.<locals>.<dictcomp>c                    r  rT   r  r  r  rT   rU   r  @  r  z,MegatronEngine.eval_step.<locals>.<listcomp>NT)r   r  r  r  c                    r  rT   rT   r  r  rT   rU   r  X  r  )r   r   r  r  rq   r   r  r   r1   r  r  r  r   r   r  r   r   r   r   r   r  r_  r8  r`  )rk   r  r  r  
loss_dictsr  r  rT   r  rU   	eval_step-  sF   

	
zMegatronEngine.eval_stepc                 K   s~  t  }| jd jrJ| jd
i |\}}}}|  jd7  _|jd urI| j  }d }|j	r2t
| j}t|| j| jjd d | j|| j||||
| _n?| jd
i |}|jd ur|D ]/}	| j|	tjdg||	  | j|	< | j|	d tjdgtjdg | j|	d < qYtjd|jd}
|D ]}	t||	 jdkr|
||	 7 }
qd }d|v r|d }| jjd ur| jj|
|d	S |
S )Nr   r   lr        
_num_iters      ?rs  rF  )rB  rF  rT   )r   r  trainingr  r   r  r   get_loss_scaler   log_params_normr;   rS   r9   r  param_groupsr  r  r  r  r   r   FloatTensortensorr  r   r_  r  r  )rk   r  rQ   	loss_dictr   r  r  
loss_scaleparams_normrs   rB  rF  rT   rT   rU   forwarda  sV   


zMegatronEngine.forwardc                 C   s  t  }|jd u s| jdkrd S t  }t }d| j d}| jD ]R}|dr'q| j| | j|d   }|| d| d7 }ttd|	 }|j
rT|| d| d7 }|rq|| d|	 | j |j
rq|| d	|| j qt|d
 }td|  t| td|  i | _d S )Nr   zvalidation loss at iteration z | r  z value:    z PPL: z validationz validation pplr   -)r   r  r   r   r  endswithmathexpminr   rJ   
add_scalarr   r   )rk   rQ   writerstringrs   rt   ppllengthrT   rT   rU   r    s0   


zMegatronEngine.log_eval_resultsc                 C   sB   |    t }||_tj  t| j| j| j	| j
 tj  d S rp   )r  r   saver   r   barrierr!   r   r  r   r  )rk   
output_dirrQ   rT   rT   rU   r!     s   
zMegatronEngine.save_checkpointc                 C   sj   t  }||_d|_d|_tj  t| j| j	| j
}tj  || _|jr1| jdkr3| j	  d S d S d S r  )r   r  r   r   r   r   r  r    r  r   r  r   fp16reload_model_params)rk   	input_dirrQ   r   rT   rT   rU   r      s   

zMegatronEngine.load_checkpointNc
                 K   sZ  t  }|jdkrtd|jdkrtd|jrtd|jdur%td|jdu r.td|du r:|du r:td	|du rAd
}nd|  k rNdksStd td|du rZd}nd|  krgdksltd td|du rsd}n|dkr|dkrtdd|  krd
kstd td|
dd}d|  krd
kstd td|
dd}d|  krd
kstd td|
dd}t	|t
std|}|durt	|tstd|dk rtd|jd dkrdS t }|
d|j}|dur	t	|ts	td|	du rd
}	d}d}d}tj dkr|du r3tj|jd g|jd  }n|jdd  }|du rG||jd  }|dkrPtd!|r||jd  d }d"t|d"  }||jd d  }tj|jg| g|jd  }tjtj|dddf dd | |gdd }n2||jd  }d"t|d"  }||jd  }tj|jg| g|jd  }tj| |gdd }|d|dg}td#|dd$}| }t|tj|dd%}t|d tj|dd%}|
d&d}tj| t | j!t"t#t$f}|durt%|||||d|	d'\}}|S t&|||d|||||d(d)
\}}}|S )*a  
        Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
        with sampling. Refer the Megatron-LM repo for more details

        Args:
            inputs (torch.Tensor): input ids
            attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
            max_length (int, optional): max length of the generated sequence. Defaults to None.
            Either this or max_new_tokens should be provided.
            max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
            Either this or max_length should be provided.
            num_beams (int, optional): number of beams to use for beam search. Defaults to None.
            temperature (float, optional): temperature for sampling. Defaults to 1.0.
            top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
            top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
            length_penalty (float, optional): length penalty for beam search. Defaults to None.
            kwargs: additional key-value arguments
        rE   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencer  r  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerr5  )axisz%max_new_tokens must be greater than 0   r   )int_listrK   )r  rK   random_seed)r  num_return_genlength_penaltyT)return_output_log_probstop_ktop_pr  r  temperature#use_eod_token_for_early_termination)'r   rM   NotImplementedErrorr   rP   sequence_parallelrecompute_granularityrS  r  r   boolr   r_  r   rT  r   r   r  r   r   r8  r  ceilr`  rl  sizer2   tolistr3   r  randommanual_seedr=   r  torchDDPLocalDDPr+   r4   r5   )rk   inputsr-  
max_lengthmax_new_tokens	num_beamsr  r  r  r  r   rQ   r  r  r  
beam_widthrX  r  
sizes_listprompts_tokens_tensorprompts_length_tensorra  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorr  unwrapped_modelr(  rx  rT   rT   rU   megatron_generate  s   !








 

 $ 

z MegatronEngine.megatron_generate)NNNNNNNN)r   r   r   r   ro   r  r  r  r  r  r  r!   r    r"  r   rT   rT   r   rU   r    s(    
e4=r  c                 C   s   t | S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r:   )rc  rT   rT   rU   %avg_losses_across_data_parallel_group  s   r#  c                 C   s   dd }t || ddS )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                    s^    j dkr  d    fddttjjt dD }tjj| t d tj	|ddS )Nr   c                    s   g | ]}t  qS rT   )r   
empty_liker  r  rT   rU   r    s    zOgather_across_data_parallel_groups.<locals>._gpu_gather_one.<locals>.<listcomp>r   r\  )
ndimr  r  r   r   r  r   get_data_parallel_group
all_gathercat)r  output_tensorsrT   r%  rU   _gpu_gather_one  s   

z;gather_across_data_parallel_groups.<locals>._gpu_gather_oneT)error_on_other_type)r   )r  r+  rT   rT   rU   "gather_across_data_parallel_groups  s   

r-  )TTTT)mre   r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalr;  torch.nnr   r   r   torch.nn.parallel.distributedr   r  r   r	   r  r
   importsr   r   
operationsr   r   transformers.modeling_outputsr   r   r   megatronr   r   r   r   r   r   r   r   megatron.argumentsr   r   r   r   megatron.checkpointingr   r    r!   megatron.data.data_samplersr"   r#   megatron.global_varsr$   megatron.initializer%   r&   r'   r(   r)   megatron.modelr*   r+   r,   r-   r.   r  megatron.model.classificationr/   megatron.optimizerr0   megatron.schedulesr1   &megatron.text_generation.communicationr2   r3   #megatron.text_generation.generationr4   r5   megatron.tokenizer.tokenizerr6   megatron.trainingr7   r8   r9   megatron.utilsr:   r;   r<   r=   rV   rb   rc   r   r   r   r  r  r  r
  r  rP  rj  r  Moduler  r#  r-  rT   rT   rT   rU   <module>   sl   (

	( E2 j S   l