o
    hB                     @   s   d dl mZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZmZ d
d Zd#ddZd$ddZdd Zdd Zdd Zd%ddZd%ddZdd Z dd Z!d d! Z"e#d"kr|e!  dS dS )&    )deepcopyN)AdamW)LambdaLR)
DataLoader)Accelerator)GradientState)RegressionDatasetRegressionModel)DistributedTypeis_torch_versionset_seedc              	   C   s   t |  | D ]?\}}|jsq	|s.t|j|jdu s-J d| d|j d|j dq	t|j|jdu sHJ d| d|j d|j dq	d S )	NF7Gradients in sync when they should not be at iteration z:
model_a grad (z) == model_b grad ()T7Gradients not in sync when they should be at iteration z) != model_b grad ()zip
parametersrequires_gradtorchallclosegrad)model_amodel_bdid_step	iterationparam
grad_param r   ]/var/www/html/ai/venv/lib/python3.10/site-packages/accelerate/test_utils/scripts/test_sync.pycheck_model_parameters   s   r   Tc                 C   sL   |    | |}t|||j}|s||j }|  d S || d S N)trainFmse_losstodevicegradient_accumulation_stepsbackward)modelinputtargetacceleratordo_backwardoutputlossr   r   r   
step_model-   s   
r.   Fc           	      C   s   t d t }t|}tdd}t|dd}|| j |r>t| dd}t| dd}t	|dd	 d
}t	|dd	 d
}|rM| 
||||\}}}}n| 
||\}}|r`|||||||fS |||fS )z3Returns everything needed to perform basic training*   P   length   
batch_sizegMbP?)paramslrc                 S      | d S Ng?r   epochr   r   r   <lambda>C       z$get_training_setup.<locals>.<lambda>)	lr_lambdac                 S   r8   r9   r   r:   r   r   r   r<   D   r=   )r   r	   r   r   r   r#   r$   r   r   r   prepare)	r*   schedr'   	ddp_modeldset
dataloaderoptddp_opt	ddp_schedr   r   r   get_training_setup8   s"   

rG   c              	   C   s>  t | \}}}tt| \}}tdD ]}| ||f\}}|| j|| j}}t||||  |d dkrW| 	| t||||  W d    n1 sQw   Y  nt||||  t
||d| t| | D ]\}	}
|	jsvqnt|	j|
jsJ d|	j d|
j dqntd|  |tt| }qd S )	N      r   T7Gradients not in sync when they should be:
Model grad () != DDP grad (r   9  )rG   nextitervaluesrangegatherr#   r$   r.   no_syncr   r   r   r   r   r   r   manual_seedrandpermlenr*   r'   rA   rC   	ddp_input
ddp_targetr   r(   r)   r   	ddp_paramr   r   r   test_noop_syncO   s0   rZ   c              	   C   sp  t | \}}}tt| \}}tdD ]}| ||f\}}|| j|| j}}t||||  |d dkrW| 	| t||||  W d    n1 sQw   Y  nt||||  t
| | D ]=\}	}
|	jsoqg|d dkrt|	j|
jdu sJ d|	j d|
j dqgt|	j|
jdu sJ d	|	j d
|
j dqgtd|  |tt| }qd S )NrH   rI   r   F7Gradients in sync when they should not be:
Model grad () == DDP grad (r   TrJ   rK   rL   )rG   rM   rN   rO   rP   rQ   r#   r$   r.   rR   r   r   r   r   r   r   rS   rT   rU   rV   r   r   r   test_distributed_syncq   s2   r]   c              	   C   s  t | \}}}g }d}t|D ]Q}tt| \}}| ||f\}	}
|	| j|
| j}	}
t||	|
|  | 	| ||}t
|||j}|| W d    n1 s[w   Y  qt|D ]~}|| }||d k r| | t| | D ]\}}|jsqt|j|jdu sJ d|j d|j dqqe| | | | W d    n1 sw   Y  t| | D ]\}}|jsqt|j|jdu sJ d|j d	|j dqqed S )
NrH      Fr[   r\   r   TrJ   rK   )rG   rP   rM   rN   rO   rQ   r#   r$   r.   rR   r!   r"   appendr&   r   r   r   r   r   r   trigger_sync_in_backward)r*   r'   rA   rC   lossesnum_iterationsr   rW   rX   r(   r)   
ddp_outputr-   r   rY   r   r   r   "test_distributed_sync_multiple_fwd   sH   

rd   c              
   C   s  t | |dd}t|\}}}t|D ]\}}| \}}	|||	f\}
}|
|j||j}
}t||
||d || t|||	| W d    n1 sQw   Y  t	|
 |
 D ]M\}}|jsgq_|d d dksw|t|d krt|j|jdu sJ d| d|j d	|j d
q_t|j|jdu sJ d| d|j d|j d
q_td|  |tt| }qt  d S )NrI   split_batchesdispatch_batchesr%   Fr^   r   Tr   z:
Model grad (rK   r   r   r\   rL   )r   rG   	enumeraterO   rQ   r#   r$   r.   
accumulater   r   r   rU   r   r   r   rS   rT   r   _reset_state)rf   rg   r*   r'   rA   rC   r   batchrW   rX   r(   r)   r   rY   r   r   r   test_gradient_accumulation   s2    rl   c              	   C   s  t | |dd}t|d\}}}}}}}	t|D ]\}
}| \}}|||f\}}||j||j}}|  |  t||||d |	  |
d d dks[|
d t
|krn| rb|	  nt|jD ]}|	  qg|  || t|||| |	  |		  |  W d    n1 sw   Y  |jd d |jd d ksJ d|jd d  d	|jd d  d
|
d d dkp|
d t
|k}|jdkrt||||
 td|
  qt  d S )NrI   re   TFr^   r   r7   z:Learning rates found in each optimizer did not align
opt: z

DDP opt: 
rL   )r   rG   rh   rO   rQ   r#   r$   r    r.   steprU   rP   num_processes	zero_gradri   param_groupsr   r   rS   r   rj   )rf   rg   r*   r'   rD   r@   rC   rA   rE   rF   r   rk   rW   rX   r(   r)   _r   r   r   r   1test_gradient_accumulation_with_opt_and_scheduler   s@    


$ 
rs   c                  C   s  t  } tdd}t|dd}tdd}t|dd}| ||\}}| jjd u s)J t|D ]S\}}t| jjt|ks=J |t|d k rz| jj	rKJ |dkryt|D ]%\}}t| jjt|kscJ |t|d k rr| jj	rqJ qS| jj	sxJ qSq-| jj	sJ q-| jjd u sJ d S )Nr0   r1   r3   r4   `   r^   )
r   r   r   r?   gradient_stateactive_dataloaderrh   idrU   end_of_dataloader)r*   
first_dsetfirst_dataloadersecond_dsetsecond_dataloaderr   rr   	batch_numr   r   r   test_dataloader_break   s(   

r~   c               	   C   s~  t  } | j}|jdkrtd t  |jtjkr%|jdkr!td t|  |jtj	tj
tjfv rJ|jdkr9td t|  |jdkrFtd t|  |jtj	tj
fv rsdD ]}dD ]}|jdkrltdd| d	| d
 t|| qYqUtdds~|jtjkr|jdkrtdd t  |jtj	tj
fv rdD ]&}dD ]}|s|sq|jdkrtdd| d	| d
 t|| qqd S d S d S )Nr   zA**Test `accumulate` gradient accumulation with dataloader break**z'**Test NOOP `no_sync` context manager**z.**Test Distributed `no_sync` context manager**zE**Test Distributed `no_sync` context manager with multiple forwards**)TFz+**Test `accumulate` gradient accumulation, z`split_batches=z` and `dispatch_batches=z`**<z2.0zH**Test `accumulate` gradient accumulation with optimizer and scheduler, z1`split_batches=False`, `dispatch_batches=False`**)r   statelocal_process_indexprintr~   distributed_typer
   NOrZ   	MULTI_GPU	MULTI_NPU	MULTI_CPUr]   rd   rl   r   rs   )r*   r   split_batchrg   r   r   r   main9  s`   




	

r   c                 C   s
   t   d S r   )r   )indexr   r   r   _mp_fni  s   
r   __main__)T)F)FF)$copyr   r   torch.nn.functionalnn
functionalr!   torch.optimr   torch.optim.lr_schedulerr   torch.utils.datar   accelerate.acceleratorr   accelerate.stater   accelerate.test_utilsr   r	   accelerate.utilsr
   r   r   r   r.   rG   rZ   r]   rd   rl   rs   r~   r   r   __name__r   r   r   r   <module>   s0   

"(
6
&+0
