from abc import ABC, abstractmethod
from contextlib import contextmanager, nullcontext
from copy import copy
from dataclasses import dataclass
from functools import partial, wraps
from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Union

from functorch import make_fx

import torch
import torch.distributed as dist

# Importing _functional_collectives is required to trigger op registration.
import torch.distributed._functional_collectives
import torch.nn as nn
import torch.utils._pytree as pytree

from torch import fx
from torch._decomp.decompositions import native_layer_norm_backward
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed._spmd.data_parallel import gradients_tagging
from torch.distributed._spmd.parallel_mode import (
    DataParallel,
    DTensorExpandMode,
    ParallelMode,
)
from torch.distributed._tensor import Placement
from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo, CodeGen
from torch.nn.utils import stateless
from torch.nn.utils._named_member_accessor import NamedMemberAccessor


class Override(ABC):
    """
    Override the tracing and transformation behavior of
    :meth:`~torch.distributed._spmd.compile`. This is useful when any part of
    the model is not traceable or if you prefer not to trace it for any
    reason. More specifically, users can implement
    :meth:`torch.distributed._spmd.Override.replacement` to replace an
    original submodule with a new submodule. The new submodule contains
    operations that users prefer to have traced, which may simply be a dummy
    placeholder operator. After tracing, users can implement
    :meth:`torch.distributed._spmd.Override.transform` to transform the
    traced graph, where the dummy placeholder operator serves as an anchor to
    insert new sub-graphs.
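
    Example (a minimal sketch, assuming hypothetical ``UntraceableBlock`` and
    ``DummyBlock`` modules; neither is part of this API)::

        class BlockOverride(Override):
            def replacement(
                self, fqn: str, orig_submodule: torch.nn.Module
            ) -> torch.nn.Module:
                # Swap every untraceable block for a traceable dummy
                # placeholder with the same input/output contract.
                if isinstance(orig_submodule, UntraceableBlock):
                    return DummyBlock()
                return orig_submodule

            def transform(
                self, gm: fx.GraphModule, flat_state: List[torch.Tensor]
            ) -> fx.GraphModule:
                # Find the nodes emitted by DummyBlock and splice the real
                # sub-graph in at those anchors, keeping ``flat_state`` and
                # the graph placeholders consistent.
                return gm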
    """

    @abstractmethod
    def replacement(
        self, fqn: str, orig_submodule: torch.nn.Module
    ) -> torch.nn.Module:
        """
        Implement this method to return a new :class:`nn.Module` instance to
        replace the ``orig_submodule`` argument in the model. This helps if
        ``orig_submodule`` is not traceable or should not be traced.

        Args:
            fqn (str): fully qualified name of the submodule.
            orig_submodule (:class:`nn.Module`): original submodule instance
                to replace.

        Returns:
            A new :class:`nn.Module` instance to replace the original one.
        """

    @abstractmethod
    def transform(
        self,
        gm: fx.GraphModule,
        flat_state: List[torch.Tensor],
    ) -> fx.GraphModule:
        """
        Given a DTensor-expanded graph and sharding schema for every node,
        conduct additional transformation for the sub-graph from the
        :class:`nn.Module` returned by
        :meth:`torch.distributed._spmd.Override.replacement` if necessary.

        Args:
            gm (:class:`fx.GraphModule`): a DTensor-expanded graph.
            flat_state (List[:class:`torch.Tensor`]): a reference to the list
                of flattened state. The elements in ``flat_state`` map to the
                first ``len(flat_state)`` placeholders in the graph. The
                transformation can add state to or remove state from
                ``flat_state`` as long as it keeps ``flat_state`` and the
                placeholders consistent.

        Returns:
            The :class:`fx.GraphModule` after transformation.
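
        Example (a minimal sketch; ``my_dummy_op`` stands for a hypothetical
        placeholder operator emitted by the replacement submodule)::

            def transform(self, gm, flat_state):
                for node in gm.graph.nodes:
                    if node.op == "call_function" and node.target is my_dummy_op:
                        # Use this node as the anchor and splice the real
                        # sub-graph in around it.
                        ...
                gm.recompile()
                return gm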
        """


class _PyTreeCodeGenOutputsOnly(_PyTreeCodeGen):
    # Pass inputs through untouched; only the outputs are pytree-flattened.
    def process_inputs(self, *args: Any) -> Any:
        return args

    # Use the default CodeGen function definition so that the generated
    # forward() accepts already-flattened arguments.
    def gen_fn_def(self, free_vars, maybe_return_annotation):
        return CodeGen.gen_fn_def(self, free_vars, maybe_return_annotation)


def _to_caller_flattened_graph_module(
    gm: torch.fx.GraphModule,
) -> torch.fx.GraphModule:
    """Move the responsibility of flattening the input arguments from the
    graph module to the caller.

    Example:

        output = gm(my_struct)

        gm = _to_caller_flattened_graph_module(gm)

        output = gm(*pytree.tree_flatten(my_struct)[0])
    """
    gm._graph._codegen = _PyTreeCodeGenOutputsOnly(
        pytree_info=_PyTreeInfo(
            orig_args=None,  # type: ignore[arg-type]
            in_spec=None,  # type: ignore[arg-type]
            out_spec=gm._graph._codegen.pytree_info.out_spec,  # type: ignore[attr-defined]
        )
    )
    gm.recompile()
    return gm


# Use a DTensor expand mode by default to preserve the existing behavior when
# no parallel mode is specified.
dtensor_expand_mode = DTensorExpandMode()


def _override_placements(t: torch.Tensor, placements: List[Placement]):
    dtensor_expand_mode._placements_override[id(t)] = placements


@contextmanager
def _rematerialize_optimizer(
    opt: torch.optim.Optimizer,
    named_states: Dict[str, Any],
    params: Dict[str, nn.Parameter],
):
    assert opt is not None

    # Update opt.state with proxy tensors.
    orig_states = copy(opt.state)
    for n in named_states:
        # opt.state's keys are Parameters, while named_states is keyed by FQN.
        opt.state[params[n]] = named_states[n]  # type: ignore[index]

    # FIXME: support multiple parameter groups
    param_group = opt.param_groups[0]
    orig_params = param_group["params"]
    param_group["params"] = params.values()

    try:
        yield
    finally:
        param_group["params"] = orig_params
        opt.state = orig_states


aten = torch.ops.aten


@contextmanager
def _enable_compile():
    # The return value of torch._utils.is_compiling changes optimizer behavior.
    # We need that function to return True to include the optimizer in the graph.
    def f_true():
        return True

    orig_is_compiling_code = torch._utils.is_compiling.__code__
    torch._utils.is_compiling.__code__ = f_true.__code__
    try:
        yield
    finally:
        torch._utils.is_compiling.__code__ = orig_is_compiling_code


def _foreach_add_decomp(self, other, alpha=1):
    self_updated = aten._foreach_add.List(self, other, alpha=alpha)
    for s, s_u in zip(self, self_updated):
        s.copy_(s_u)


def _foreach_unaop_decomp(op, self):
    self_updated = op(self)
    for s, s_u in zip(self, self_updated):
        s.copy_(s_u)


def _foreach_binop_list_decomp(op, self, other):
    self_updated = op(self, other)
    for s, s_u in zip(self, self_updated):
        s.copy_(s_u)


def _foreach_binop_scalar_decomp(op, self, scalar=1):
    self_updated = op(self, scalar)
    for s, s_u in zip(self, self_updated):
        s.copy_(s_u)


def _foreach_addcop_scalar_decomp(op, self, tensor1, tensor2, scalar=1):
    self_updated = op(self, tensor1, tensor2, scalar)
    for s, s_u in zip(self, self_updated):
        s.copy_(s_u)


def _fused_adam_decomp(
    self,
    grads,
    exp_avgs,
    exp_avg_sqs,
    max_exp_avg_sqs,
    state_steps,
    *,
    lr=1,
    beta1=1,
    beta2=1,
    weight_decay=1,
    eps=1,
    amsgrad=True,
    maximize=True,
    grad_scale=None,
    found_inf=None,
):
    orig_tuple = (self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs)
    updated_tuple = aten._fused_adam.default(
        self,
        grads,
        exp_avgs,
        exp_avg_sqs,
        max_exp_avg_sqs,
        state_steps,
        lr=lr,
        beta1=beta1,
        beta2=beta2,
        weight_decay=weight_decay,
        eps=eps,
        amsgrad=amsgrad,
        maximize=maximize,
        grad_scale=grad_scale,
        found_inf=found_inf,
    )

    for idx, (orig, updated) in enumerate(zip(orig_tuple, updated_tuple)):
        if idx == 1:
            # Skip the gradients; they don't need to be copied back.
            continue
        for o, u in zip(orig, updated):
            o.copy_(u)


# Functionalize in-place foreach and fused optimizer ops so that the traced
# graph only contains out-of-place variants followed by explicit copy_ calls.
SPMD_DECOMP_TABLE = {
    aten._foreach_add_.List: _foreach_add_decomp,
    aten._foreach_add_.Scalar: partial(
        _foreach_binop_scalar_decomp, aten._foreach_add.Scalar
    ),
    aten._foreach_addcdiv_.Scalar: partial(
        _foreach_addcop_scalar_decomp, aten._foreach_addcdiv.Scalar
    ),
    aten._foreach_addcmul_.Scalar: partial(
        _foreach_addcop_scalar_decomp, aten._foreach_addcmul.Scalar
    ),
    aten._foreach_div_.List: partial(
        _foreach_binop_list_decomp, aten._foreach_div.List
    ),
    aten._foreach_mul_.Scalar: partial(
        _foreach_binop_scalar_decomp, aten._foreach_mul.Scalar
    ),
    aten._foreach_div_.Scalar: partial(
        _foreach_binop_scalar_decomp, aten._foreach_div.Scalar
    ),
    aten._foreach_neg_.default: partial(
        _foreach_unaop_decomp, aten._foreach_neg.default
    ),
    aten._foreach_reciprocal_.default: partial(
        _foreach_unaop_decomp, aten._foreach_reciprocal.default
    ),
    aten._foreach_sqrt_.default: partial(
        _foreach_unaop_decomp, aten._foreach_sqrt.default
    ),
    aten._foreach_sub_.Scalar: partial(
        _foreach_binop_scalar_decomp, aten._foreach_sub.Scalar
    ),
    aten._fused_adam_.default: _fused_adam_decomp,
    aten.native_layer_norm_backward.default: native_layer_norm_backward,
}


DEDUP_TARGETS: Set[torch._ops.OpOverload] = {
    torch.ops.c10d_functional.all_reduce.default,
    torch.ops.c10d_functional.wait_tensor.default,
}


def _dedup_collectives(gm: fx.GraphModule) -> fx.GraphModule:
    args_to_node: Dict[Tuple[Any, ...], fx.Node] = {}

    for node in gm.graph.nodes:
        # Replace all duplicates with the result from the first equivalent
        # communication op.
        args, _ = pytree.tree_flatten(node.args)

        if node.target in DEDUP_TARGETS:
            args_key = (node.target, *args)
            unique_node = args_to_node.get(args_key, None)
            if unique_node is None:
                # First time seeing this combination; remember it.
                args_to_node[args_key] = node
            else:
                # The current node is a duplicate; replace and erase it.
                node.replace_all_uses_with(unique_node)
                gm.graph.erase_node(node)

    gm.recompile()

    return gm


@dataclass
class _CompiledResult:
    gm: fx.GraphModule
    mod: nn.Module
    opt: Optional[torch.optim.Optimizer]
    flat_state: List[torch.Tensor]


def _compile(
    func: Callable,
    module_override: Optional[List[Override]],
    parallel_mode: ParallelMode,
    *args: Any,
    **kwargs: Any,
) -> _CompiledResult:
    # 1. Extract nn.Module and Optimizer from args and kwargs.
    mod, opt = None, None
    for arg in pytree.tree_flatten(list(args) + list(kwargs.values()))[0]:
        if isinstance(arg, nn.Module):
            assert mod is None, "Only support single nn.Module for now"
            mod = arg
        if isinstance(arg, torch.optim.Optimizer):
            assert opt is None, "Only support single Optimizer for now"
            opt = arg

    assert mod is not None, "Couldn't find nn.Module instances from the arguments."

    # 2. Override target submodules with dummy, traceable replacements.
    if module_override:
        accessor = NamedMemberAccessor(mod)

        def swap(fqn_prefix: str, module: torch.nn.Module) -> None:
            for override in module_override:  # type: ignore[union-attr]
                for name, child in module.named_children():
                    if len(name) == 0:
                        continue
                    fqn = fqn_prefix + "." + name if fqn_prefix != "" else name
                    new_child = override.replacement(fqn, child)
                    if id(new_child) == id(child):
                        swap(fqn, new_child)
                    else:
                        accessor.swap_submodule(fqn, new_child)

        swap("", mod)

    # 3. Trace a stateless version of the train step.
    params = dict(mod.named_parameters(remove_duplicate=False))
    buffers = dict(mod.named_buffers(remove_duplicate=False))

    named_states = {}
    if opt is not None:
        # Pass named_states instead of opt.state to stateless_func, because
        # the latter uses nn.Parameter as keys. During tracing, we need to
        # make sure optimizers can find the states using proxy tensors.
        for n, p in params.items():
            if p in opt.state:
                # opt.state's keys are Parameters, while named_states is
                # keyed by FQN.
                named_states[n] = opt.state[p]  # type: ignore[index]

    is_data_parallel_mode = isinstance(parallel_mode, DataParallel)

    # Lift states and parameters as function arguments so that make_fx can
    # trace operations applied to them.
    def stateless_func(func, params, buffers, named_states, args, kwargs):
        with stateless._reparametrize_module(
            mod, {**params, **buffers}
        ), _rematerialize_optimizer(
            opt, named_states, params
        ) if opt else nullcontext():
            # For DataParallel mode, install hooks first to tag the gradients.
            with gradients_tagging(params) if is_data_parallel_mode else nullcontext():
                ret = func(*args, **kwargs)

            # Make sure updated parameters are returned.
            return ret, list(mod.parameters()), list(named_states.values())  # type: ignore[union-attr]

    # FIXME: Symbolic tracing is used to work around fake-tensor limitations
    # when the module has multiple forward invocations during training
    # (e.g., activation checkpointing).
    tracing_mode = "fake" if is_data_parallel_mode else "symbolic"

    if is_data_parallel_mode:
        fake_mode = FakeTensorMode()
        data_parallel_mode = cast(DataParallel, parallel_mode)

        def _get_full_batch_arg(arg: torch.Tensor) -> torch.Tensor:
            # Since compilation happens in the first iteration and we receive
            # mini-batch input, convert it to a full-batch fake tensor input
            # first for data-parallel sharding propagation.
            fake_arg = fake_mode.from_tensor(arg)
            arg_dims = [1] * arg.ndim
            # Expand the tensor to full batch size on its batch dim.
            arg_dims[data_parallel_mode.input_batch_dim] *= dist.get_world_size()
            return fake_arg.repeat(arg_dims)

        args = pytree.tree_map_only(torch.Tensor, _get_full_batch_arg, args)
        kwargs = pytree.tree_map_only(torch.Tensor, _get_full_batch_arg, kwargs)

    with _enable_compile(), torch.autograd.detect_anomaly(check_nan=False):
        gm = make_fx(
            partial(stateless_func, func),
            tracing_mode=tracing_mode,
            decomposition_table=SPMD_DECOMP_TABLE,
            _allow_non_fake_inputs=False,
        )(params, buffers, named_states, args, kwargs)

    params_and_buffers: Dict[str, Union[torch.Tensor, nn.Parameter]] = {
        **params,
        **buffers,
    }

    # 4. Use the parallel mode to expand the single-device graph into a
    # distributed graph.
    gm = parallel_mode.partition(
        gm,
        mod,
        opt,
        params_and_buffers,
        named_states,
        args,
        kwargs,
    )

    # 5. Move the responsibility of flattening the input arguments from the
    # graph module to the caller, so that transformations can manipulate the
    # flattened state directly and flattening only happens once upfront.
    flat_state, _ = pytree.tree_flatten([params_and_buffers, named_states])
    gm = _to_caller_flattened_graph_module(gm)

    # 6. Dedup comm operators. Duplication can come from DTensor args/kwargs
    # redistribution, since DTensor only has local information on individual
    # tensors/operators and cannot detect graph-level duplication.
    gm = _dedup_collectives(gm)

    # 7. Replace the previously inserted dummy submodules with real graphs.
    if module_override:
        for override in module_override:
            gm = override.transform(gm, flat_state)

    return _CompiledResult(gm, mod, opt, flat_state)


# Note that the Python convention of __dict__ requires the key to be str.
# TODO: ensure the key is unique.
COMPILED_OBJECT_KEY = "_compiled_obj"


def compile(
    module_override: Optional[List[Override]] = None,
    gm_transformation: Optional[Callable[[fx.GraphModule], fx.GraphModule]] = None,
    parallel_mode: Optional[ParallelMode] = None,
):
    """
    Compile and optimize a callable, which can be a train step within a training
    loop. This method will extract :class:`nn.Module` and
    :class:`torch.optim.Optimizer` instances from the input arguments and trace
    operations applied to their parameters and states.

    Args:
        module_override (Optional[List[Override]]): a list of Override instances
            that will be applied to the module in order. The :class:`Override`
            objects provide :class:`nn.Module` replacements during tracing and a
            graph transformation function after tracing. (Default: ``None``)
        gm_transformation (Optional[Callable[[fx.GraphModule], fx.GraphModule]]):
            a callback that will be called after the original callable is
            compiled and distributed (usually after the first iteration) to
            transform the compiled GraphModule into a new optimized one.
            (Default: ``None``)
        parallel_mode (Optional[ParallelMode]): a :class:`ParallelMode` object
            that specifies how to parallelize the callable. Each ParallelMode
            has its own strategy to partition the model and the captured
            graph. (Default: ``None``)
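
    Example (a minimal sketch; ``MyModel`` and ``data_loader`` are
    placeholders, and the module and optimizer must be passed as arguments to
    the train step so that they can be discovered and traced)::

        mod = MyModel().cuda()
        opt = torch.optim.Adam(mod.parameters(), lr=0.01, foreach=True)

        @compile()
        def train_step(mod, opt, inp):
            mod(inp).sum().backward()
            opt.step()
            opt.zero_grad()

        for inp in data_loader:
            train_step(mod, opt, inp)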
    """

    def inner(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_train_step = kwargs.pop("last_train_step", False) if kwargs else False
            first_iter = False
            # Put the COMPILED_OBJECT_KEY in ``wrapper`` instead of ``func`` as
            # ``wrapper`` is the one that users will get.
            compiled_obj = wrapper.__dict__.get(COMPILED_OBJECT_KEY, None)
            if compiled_obj is None:
                first_iter = True
                mode = dtensor_expand_mode if parallel_mode is None else parallel_mode
                compiled_obj = _compile(func, module_override, mode, *args, **kwargs)
                wrapper.__dict__[COMPILED_OBJECT_KEY] = compiled_obj

            flat_inps = compiled_obj.flat_state + pytree.tree_flatten([args, kwargs])[0]

            with torch.no_grad():
                # N.B.: autograd is not needed here, as backward has already
                # been captured in the graph.
                if first_iter and gm_transformation:
                    compiled_obj.gm = gm_transformation(compiled_obj.gm)
                if not last_train_step:
                    output = compiled_obj.gm(*flat_inps)[0]
                else:
                    # This is the last train step. Call the compiled graph with
                    # the ``last_iter`` argument and catch the exception in
                    # case the graph module does not accept it.
                    try:
                        output = compiled_obj.gm(*flat_inps, last_iter=last_train_step)[0]
                    except TypeError as e:
                        if "last_iter" not in str(e):
                            raise e
                        output = compiled_obj.gm(*flat_inps)[0]

            return output

        return wrapper

    return inner