o
    hMy                     @   s  U d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5 da6ee j7 e8d< ej9j:Z:G dd deZ;eG dd dZ<eG dd dZ=de
de>fddZ?		dVdej@jAdee
df deeeBe
f  d eeejCeejDeee' ee' f f  de
f
d!d"ZEd#d$ ZFd%ejjGd&e#ddfd'd(ZHd)eejGe
f d*e
de
fd+d,ZId-eee=eJf  d.edeeeJ ee' f fd/d0ZKd%ejGd1ee
df defd2d3ZLd%ejGd1ee
df defd4d5ZMd%ejGd1ee
df deeBe
f d6edef
d7d8ZNd%ejGd1ee
df deeBe
f d6edef
d9d:ZOd%ejGd1ee
df deeBe
f d6edef
d;d<ZPe:jQjReLe:jSjReLe:jTjReMe:jUjReLiZVeej@jAef e8d=< e:jWjReNe:jXjReOe:jXjYeOiZZeej@jAef e8d>< e:j[jRePe:jXjYePe:j\jRePiZ]eej@jAef e8d?< d@ddAd%ejGd)eejGe
f dBe>d6ee deej^ f
dCdDZ_dEed)eejGe
f deej^e
f fdFdGZ`dHej^d%ejGd)eejGe
f dejGfdIdJZadHej^dKeejjGejj^f ddfdLdMZbdNejcdeejGeejG f fdOdPZd		@dWdHej^dQeejC dRee< d6ee dSe>deej^eeBe<f f fdTdUZedS )X    N)	dataclass)autoEnum)partial)	AnyCallablecastDictListOptionalSequenceTupleUnion)_get_tracer)OP)
get_logger)
DeviceMeshDTensor)_operator_dispatch)OpSchema)_PartialDTensorSpec	Placement	ReplicateShard)redistribute_local_tensor)make_fx
proxy_slot)TensorMetadata)tree_flattentree_maptree_map_onlytree_unflattenloggerc                   @   s   e Zd Ze Ze ZdS )TrainingPhaseN)__name__
__module____qualname__r   FORWARDBACKWARD r*   r*   X/var/www/html/ai/venv/lib/python3.10/site-packages/torch/distributed/_spmd/distribute.pyr$   %   s    
r$   c                   @   s"   e Zd ZU eed< ee ed< dS )Schemamesh
placementsN)r%   r&   r'   r   __annotations__r
   r   r*   r*   r*   r+   r,   *   s   
 r,   c                   @   sT   e Zd ZU dZeed< eed< eed< defddZe	de
jd	edd fd
dZdS )DSymInta?  
    DSymInt represents a value retrieved by a SymInt op from a DTensor. DSymInt
    helps View and Factory ops to determine the placement and shape of the
    output tensor, as those operators either do not have an input DTensor or
    the input DTensor is insufficient to determine the output tensor's placement.
    global_valuelocal_valuer-   returnc                 C   s   | j | jkS N)r2   r1   )selfr*   r*   r+   is_shard=   s   zDSymInt.is_shardnodedtensorc                 C   s   d}|j tjkr tt|jd }| ||| ||jdS |j tj	kr4| |
 | 
 |jdS |j tjkrRtt|jd }| ||| ||jdS td|j  )Nr      )r1   r2   r-   zDSymInt does not support )targetatensym_sizer   intargssizeto_localdevice_mesh	sym_numelnumel
sym_stridestrideNotImplementedError)clsr7   r8   dimr*   r*   r+   	from_node@   s,   
zDSymInt.from_nodeN)r%   r&   r'   __doc__r=   r/   r   boolr6   classmethodfxNoder   rI   r*   r*   r*   r+   r0   0   s   
 r0   objr3   c                 C   s6   t | tsdS d}| jD ]}t |trd} |S q|S )zDcheck if object is 1) DTensor and  2) with any placement of _PartialFT)
isinstancer   r.   r   )rO   
is_partial	placementr*   r*   r+   _is_partial_dtensor[   s   


rS   op
local_args.kwargsspecsc                    sB   |d u ri } d u ri  dt dt f fdd}| t||i |S )Nargr3   c              	      sz    |  \}}}}t || j| j|  tj| ji d}t|t||d}t|t||d}t	| tj
r;|  v r;t| ||S | S )N)dtyperequires_gradrE   memory_formatis_quantizedqparams)tensor_meta)r   rY   rZ   rE   torchcontiguous_formatr\   r   tuplerP   Tensorr   )rX   tensor_shaper-   current_placementtarget_placementr^   current_spectarget_specrW   r*   r+   redistributey   s.   	



z2_dispatch_with_local_tensors.<locals>.redistribute)r   r    )rT   rU   rV   rW   ri   r*   rh   r+   _dispatch_with_local_tensorsi   s   rj   c                 C   s~   t | \}}t |j\}}i }t|D ]"\}}	t|	tr5|r0|	 || j|	j|| jf||	j< |	j||< qt	||}
||
fS r4   )
r   args_schema	enumeraterP   r   r?   r-   r.   _local_tensorr"   )r>   target_schemari   flatten_argsargs_tree_specflatten_args_schema_rW   irX   unflattened_argsr*   r*   r+   _update_specs_for_redistribute   s    
	



ru   r7   	op_schemac                 C   s   t | j\}}t |j\}}dtttjjf dtfdd}t	|t	|ks'J t
t||D ]\}\}}	||rAt|	trA|	||< q.t||}
t
|
D ]
\}}| || qKd S )NrX   r3   c                 S   s.   t | tjjr| jtjtjtjfv S t | t	S r4   )
rP   r_   rM   rN   r:   r;   r<   rB   rD   r=   )rX   r*   r*   r+   is_sym_int_or_int   s   
z6_update_node_from_op_schema.<locals>.is_sym_int_or_int)r   r>   rk   r   r=   r_   rM   rN   rK   lenrl   ziprP   r"   
update_arg)r7   rv   	flat_argsrp   flat_args_schemarr   rw   rs   rX   
arg_schemar>   idxr*   r*   r+   _update_node_from_op_schema   s   	
r   node_to_objrX   c                 C   s:   t |tjjr| | }t rttttf |jt	= |S |S r4   )
rP   r_   rM   rN   r   r   r	   r   __dict__r   )r   rX   rO   r*   r*   r+   
_remap_arg   s   r   sizesr-   c                 C   sZ   dd | D }dd t | D pt g}t||jks)J dt| d|j d||fS )Nc                 S       g | ]}t |tr|jn|qS r*   rP   r0   r2   .0sr*   r*   r+   
<listcomp>       z)unpack_sizes_and_dims.<locals>.<listcomp>c                 S   s*   g | ]\}}t |tr| rt|qS r*   )rP   r0   r6   r   )r   rs   ar*   r*   r+   r      s    z"The number of sharded dimensions (z2) must match number of dimensions in device mesh (z).)rl   r   rx   ndim)r   r-   local_sizesr.   r*   r*   r+   unpack_sizes_and_dims   s   r   r>   c                 C   s   t |dksJ d| j d| t|d ts!J d|d  t|d ts1J d|d  t|d |d j\}}| jd |f| _tt	j
j| j}tj||d j||d j|dd	S )
N   zExpect two args but got op z with args r   z*Expect 1st argument to be DTensor but got r9   $Expect 2nd argument as list but got Flocal_tensorrA   r.   	run_check)rx   r:   rP   r   listr   rA   r>   r   r_   _ops
OpOverload
from_localrm   )r7   r>   r   r.   rT   r*   r*   r+   binop_sym_int_consumer_rule   s    " r   c           
   	   C   s\   |\}}}}}}dd |D }t j||j|jd}	tjt |	| |||||j|j	ddS )Nc                 S   r   r*   r   r   r*   r*   r+   r     r   z7slice_backwad_sym_int_consumer_rule.<locals>.<listcomp>)devicerY   Fr   )
r_   zerosr   rY   r   r   slice_scatterr@   rA   r.   )
r7   r>   grad_outputinput_sizesrH   startendstepr   input_tensorr*   r*   r+   #slice_backwad_sym_int_consumer_rule  s   
r   default_meshc                 C   s   t |d }tdd |D rJ d| j d| dt|d ts+J d|d  t|d |\}}|g|dd  R | _ttj	j
| j}tj|| ji |||d	d
S )Nr   c                 s   s    | ]}t |tV  qd S r4   )rP   r   r   r   r*   r*   r+   	<genexpr>#  s    z*factory_with_sizes_rule.<locals>.<genexpr>z4Not expect DTensor argument for factory op, but got z with arguments .r   r9   Fr   )r   anyr:   rP   r   r   r>   r   r_   r   r   r   r   )r7   r>   rV   r   r{   r   r.   rT   r*   r*   r+   factory_with_sizes_rule  s    
 r   c                 C   sB   t dd || _ttjj| j}tj|| ji ||t	 gddS )Nc                 S      t | tr| jS | S r4   r   r   r*   r*   r+   <lambda>:      z%factory_arange_rule.<locals>.<lambda>Fr   )
r    r>   r   r_   r   r   r:   r   r   r   r7   r>   rV   r   rT   r*   r*   r+   factory_arange_rule4  s   r   c                 C   sB   ||| _ | _ttjj| j}tj|| j i | j|t	 gddS )NFr   )
r>   rV   r   r_   r   r   r:   r   r   r   r   r*   r*   r+   default_factory_op_ruleD  s   r   VIEW_SYM_INT_CONSUMERSFACTORY_SYM_INT_CONSUMERSFACTORY_OPSF)force_make_fxr   r   c                C   s(  t   ttt|| j}ttt|| j}tt jj	| j
}tdd t|d D r|tv rPt|dks>J d| t| | ||| < 	 W d    d S |tv rp|d us\J dt| | ||||| < 	 W d    d S tttjsxJ td|| | j
tjjkrtjj}tdd |}td	d |}|tv rt| | ||||| < 	 W d    d S t|||tj\}}}	||| < |	d usJ |	jst| | 	 W d    d S |	jd usJ |	jd }
t ||
|	j\}}tt!|||d
}t"|dd|}|j#$  |W  d    S 1 sw   Y  d S )Nc                 s   s"    | ]}t |tr| V  qd S r4   )rP   r0   r6   r   r*   r*   r+   r   |  s     z._get_dtensor_dispatch_graph.<locals>.<genexpr>r   zExpect empty kwargs, but got z%Requires default mesh for factory opszYAssuming using local_value from SymInt for %sis mathematically correct. Full args are %s.c                 S   r   r4   r   r   r*   r*   r+   r     r   z-_get_dtensor_dispatch_graph.<locals>.<lambda>c                 S   r   r4   r   r   r*   r*   r+   r     r   )rV   rW   F)_allow_non_fake_inputs)%r_   no_gradr    r   r   r>   rV   r   r   r   r:   r   r   r   rx   r   rP   r#   loggingLoggerwarningr;   viewdefaultreshaper   r   r   _propagatorneeds_redistributer   schema_suggestionsru   rj   r   grapheliminate_dead_code)r7   r   r   r   r>   rV   op_overloadoutrv   output_shardingrn   updated_args_specrt   dispatchgmr*   r*   r+   _get_dtensor_dispatch_graphn  s|   2

E

&r   dtc           	      C   s   dt jdt jdt jfdd}| j}t | j}t|||}dd |jjD }dd |jjD }t|d	ks8J t|d
ks@J | ||d < tj	|| j
t gdd||d
 < t|d |dd}|duseJ |||d  fS )a   
    Creates a graph for a dummy add function from a partial DTensor.
    This dummy add is used for triggering all_reduce on a Partial DTensor
    during the DTensor expansion of the traced graph.
    Also returns the actual DTensor after resharding.
    gradzeror3   c                 S   s   | | S r4   r*   )r   r   r*   r*   r+   	dummy_add  s   z)_build_dummy_add_graph.<locals>.dummy_addc                 S      g | ]
}|j tjkr|qS r*   )rT   r   PLACEHOLDERr   nr*   r*   r+   r         z*_build_dummy_add_graph.<locals>.<listcomp>c                 S   r   r*   )rT   r   CALL_FUNCTIONr   r*   r*   r+   r     r   r   r9   r   Fr   T)r   N)r_   rb   rm   
zeros_liker   r   nodesrx   r   r   rA   r   r   )	r   r   r   r   r   
traced_addplaceholderscall_functionstraced_dispatchr*   r*   r+   _build_dummy_add_graph  s"   

r   r   c              
      s  g }d}|j d D ]}t|tjs|| q	|| }t|s%|| q	d}tt|}t||\}}	dd |j	j
D }
dd |j	j
D }t|
dkrQt|dksSJ |d |
d  |j	  |	||
d < i  |j	j
D ]}|jtjkrz| |< qm|jtjkrt|j dkrt|j d dksJ d|j  d	t|j  | |j d d   ||j d d  | |j d d  < qm|jtjkrt| |jt||j | j	| | j	| fd
d |< W d    n1 sw   Y  qmq	|r| j	| | j	|S |S )NFr   Tc                 S   s$   g | ]}|j d ks|j dkr|qS )	wait_commwait_tensornamer   r*   r*   r+   r     s
    z#_convert_output.<locals>.<listcomp>c                 S   s   g | ]	}|j d kr|qS )addr   r   r*   r*   r+   r     s    r9   !Expecting single output, but got  c                        |  S r4   r*   r   value_remapr*   r+   r   A      z!_convert_output.<locals>.<lambda>)r>   rP   rM   rN   appendrS   r   r   r   r   r   rx   replace_all_uses_withr   rT   r   r   OUTPUTGET_ATTRsetattrr:   getattrinserting_before	node_copy
erase_nodeoutput)r   r7   r   new_argshas_partialargumentrO   r   r   
result_objwaitr   dtnr*   r   r+   _convert_output  s\   




"&
r   node_replacementsc              
      s  | j jD ]}||vrq|| }t|j\}}di } |j jD ]}|jtjkr1||  |< |d7 }q| j | |j jD ]}|jtjkrFq=|jtjkrt	|jdkscJ d|j dt	|jd  |jd }t	|dkrs|d }	nJd }
t
|D ];\}}|d u rqy|jdksJ |jjdksJ |jjdksJ |
d u s|
|jd ksJ |jd }
|jd |ksJ qy|
d usJ |
}	 |	 }|| q=| j | fdd	 |< td
d ||fD r| |   nq=| j | W d    n1 sw   Y  q| j   |   d S )Nr   r9   r   r   call_function	_operatorgetitemc                    r   r4   r*   r   r   r*   r+   r     r   z _rebuild_graph.<locals>.<lambda>c                 s   s0    | ]}t |jtjjo|jjjd V  qdS ))zaten::_foreachzaten::_fused_adamN)rP   r:   r_   r   r   _schemar   
startswithr   r*   r*   r+   r     s    

z!_rebuild_graph.<locals>.<genexpr>)r   r   r   r>   rT   r   r   r   r   rx   rl   r:   r&   r%   r   r   allr   r   	recompile)r   r   r7   r   ro   rr   rs   r   outputsr   sourcer   new_noder*   r   r+   _rebuild_graphI  s`   




=r  r   c                    sp   i i dt jdt jdd ffdd t| jD ]t jj fdd t jj fdd qS )	Narg_nodeconsumerr3   c                    s*   | vr|| <   |g |  d S d S r4   )
setdefaultr   )r  r  )last_consumer_to_nodesnode_to_last_consumerr*   r+   _register_final_consumer  s   z=_get_last_consumer_to_nodes.<locals>._register_final_consumerc                    
    | S r4   r*   )r  r  r7   r*   r+   r        
 z-_get_last_consumer_to_nodes.<locals>.<lambda>c                    r	  r4   r*   )
kwarg_noder
  r*   r+   r     r  )rM   rN   reversedr   r7   map_argr>   rV   )r   r*   )r  r  r7   r  r+   _get_last_consumer_to_nodes  s    r  inpsschemas_allow_partialc                    s  t dadd tjD }i }i }t| j}i }	t| jjD ]`\}
}tdus(J td|
|j	|j
 |j	tjkrd|
t|k sMJ d|
d  dt| d	tj||
  ||
 j||
 jd
d||< n
t|j
tjjrtt||jd  }t||||< nt|j
tjjrt|||d}|dur|||< n|j	tjkr|st| ||}|jd D ]}t|tj r|| }t|trt!|j"|j|	|j#< qn|j	tj$krft%t&t'||j}t%t&t'||j(}t)t*dd |t+|,   |j
|v rKt dkrKt- fdd D sJ dt.tdd |}t.tdd |}t.tdd |}t.tdd |}||_||_(t|j
|i ||j
|i | d jd||< n#t dks[J d|j
 d|j
|i |||< nt/d|j	 ||v r~|| D ]}||= qwqt0| | | |	fS )zc
    Returns:
        - transformed graph module
        - map from output name to DTensorSpec
    spmd_expc                 S   s   h | ]}t t|qS r*   )r   operator)r   r   r*   r*   r+   	<setcomp>  s    z*_convert_to_distributed.<locals>.<setcomp>Nznode%s: op=%s target=%szgot more placeholder nodes (r9   z) than inputs ()Fr   r   )r   c                 S   s
   t | tS r4   )rP   r0   r   r*   r*   r+   r     r  z)_convert_to_distributed.<locals>.<lambda>c                 3   s     | ]} d  j |j kV  qdS )r   N)r-   )r   ddsymintsr*   r+   r     s    
z*_convert_to_distributed.<locals>.<genexpr>z&all DSymInts must have the same mesh. c                 S      | j S r4   r2   r   r*   r*   r+   r   	      c                 S   r  r4   r  r   r*   r*   r+   r   
  r  c                 S   r  r4   r1   r   r*   r*   r+   r     r  c                 S   r  r4   r  r   r*   r*   r+   r     r  )r2   r1   r-   zBSPMD expansion does not support SymInt in non-operator nodes, got r   zUnrecognized node.op type )1r   r#   r  __all__r  r   rl   r   inforT   r:   r   r   rx   r   r   cloner-   r.   rP   r_   r   OpOverloadPacketr   r>   r0   rI   r   r   r   r   rM   rN   r,   rA   r   r   r    r   r   rV   r   filterra   valuesr   r!   
ValueErrorr  )r   r  r  r   r  	operatorsr   r   r  output_schemasrs   r7   r8   replacementinp_argrO   r>   rV   rU   local_kwargsglobal_argsglobal_kwargsr  r*   r  r+   _convert_to_distributed  s   






r,  )NN)NF)fr   r  dataclassesr   enumr   r   	functoolsr   typingr   r   r   r	   r
   r   r   r   r   r_   (torch.distributed._spmd.experimental_opstorch.fxrM   #torch.distributed._spmd.comm_tensorr   #torch.distributed._spmd.graph_utilsr   !torch.distributed._spmd.log_utilsr   torch.distributed._tensorr   r   "torch.distributed._tensor.dispatchr   #torch.distributed._tensor.op_schemar   )torch.distributed._tensor.placement_typesr   r   r   r   r   &torch.distributed._tensor.redistributer   "torch.fx.experimental.proxy_tensorr   r   torch.fx.passes.shape_propr   torch.utils._pytreer   r    r!   r"   r#   r   r/   opsr;   r$   r,   r0   rK   rS   r   r   strrb   Sizerj   ru   rN   r   r   r=   r   r   r   r   r   r   _unsafe_viewr   expandslice_backwardr   r   fullaranger   r   scalar_tensorr   r   GraphModuler   r   r   r  Graphr  r,  r*   r*   r*   r+   <module>   sN  
 ,*


/  
 












l
%
J
V
