o
    h                     @   s   U d dl Z d dlZddlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 G dd deZG d	d
 d
eZG dd deZG dd deZdae
e	e
ej   ed< dejfddZdS )    N   )comm)Function_get_device_index)ListOptionalc                   @   $   e Zd Zedd Zedd ZdS )	Broadcastc                 G   s   t dd |D sJ ddd |D }|| _t|dkr t S t|| _|d  | _t|| j}g }t	| j
dd  D ]\}}|sP|D ]	}|||  qFq>| j|  tdd |D S )	Nc                 s       | ]	}|j jd kV  qdS cpuNdevicetype.0i r   R/var/www/html/ai/venv/lib/python3.10/site-packages/torch/nn/parallel/_functions.py	<genexpr>       z$Broadcast.forward.<locals>.<genexpr>z2Broadcast function not implemented for CPU tensorsc                 S      g | ]}t |d qS Tr   r   xr   r   r   
<listcomp>       z%Broadcast.forward.<locals>.<listcomp>r   r   c                 S   s   g | ]	}|D ]}|qqS r   r   )r   tensorstr   r   r   r      s    )alltarget_gpuslentuple
num_inputs
get_deviceinput_devicer   broadcast_coalesced	enumerateneeds_input_gradappendmark_non_differentiable)ctxr!   inputsoutputsnon_differentiablesidxinput_requires_gradoutputr   r   r   forward   s$   

zBroadcast.forwardc                 G   s   dt j| j| jg|R   S )NN)ReduceAddCoalescedapplyr&   r$   r,   grad_outputsr   r   r   backward    s   zBroadcast.backwardN__name__
__module____qualname__staticmethodr3   r9   r   r   r   r   r
   
   s
    
r
   c                   @   r	   )r5   c                    sL    fddt dt D | _ fddt dt D }t||S )Nc                    s   g | ]} |   qS r   r%   r   )gradsr   r   r   )   s    z.ReduceAddCoalesced.forward.<locals>.<listcomp>r   c                    s   g | ]
} ||  qS r   r   r   r@   r$   r   r   r   +   s    )ranger"   r!   r   reduce_add_coalesced)r,   destinationr$   r@   grads_r   rA   r   r3   '   s
    zReduceAddCoalesced.forwardc                 G   s   dt j| jg|R   S )NNN)r
   r6   r!   r7   r   r   r   r9   /   s   zReduceAddCoalesced.backwardNr:   r   r   r   r   r5   %   s
    
r5   c                   @   r	   )Gatherc                    s   t dd |D sJ d|dkrd _nt|d}| _| _tdd |D  _t dd |D rI|dkrItd	d |D }td
 d _nd _t fdd|D  _	t
| j jS )Nc                 s   r   r   r   r   r   r   r   r   8   r   z!Gather.forward.<locals>.<genexpr>z/Gather function not implemented for CPU tensorsr   Tc                 s   s    | ]}|  V  qd S r4   r?   r   r   r   r   r   A       c                 s   s    | ]	}|  d kV  qdS r   N)dimr   r   r   r   r   r   B   r   r   c                 s   s    | ]}| d V  qdS )r   N)viewrK   r   r   r   r   C   s    zvWas asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.Fc                 3   s    | ]	}|  jV  qd S r4   )sizerJ   r   r,   r   r   r   J   r   )r    target_devicer   rJ   r#   
input_gpuswarningswarnunsqueezed_scalarinput_sizesr   gather)r,   rO   rJ   r-   r   rN   r   r3   6   s    

zGather.forwardc                 C   s6   t | j| j| j|}| jrtdd |D }d| S )Nc                 s   s    | ]}|d  V  qdS rI   r   )r   gr   r   r   r   Q   rH   z"Gather.backward.<locals>.<genexpr>rF   )Scatterr6   rP   rT   rJ   rS   r#   )r,   grad_outputscattered_gradsr   r   r   r9   M   s   zGather.backwardNr:   r   r   r   r   rG   4   s
    
rG   c                   @   r	   )rW   c           
   	   C   s   dd |D }|| _ |jjdkr| nd| _d }tj r*| jdkr*dd |D }t	|||| j |}|d urjt
|D ]-\}}tj||  tj }	|	||  ||	 W d    n1 sdw   Y  q<|S )Nc                 S   r   r   r   r   r   r   r   r   Y   r   z#Scatter.forward.<locals>.<listcomp>r   c                 S   s   g | ]
}t td |qS )cuda)_get_streamtorchr   )r   r   r   r   r   r   _   s    )rJ   r   r   r%   r&   r]   r[   is_availabler   scatterr(   current_streamwait_streamrecord_stream)
r,   r!   chunk_sizesrJ   inputstreamsr.   r   r2   main_streamr   r   r   r3   W   s    
zScatter.forwardc                 G   s    d d d t j| j| jg|R  fS r4   )rG   r6   r&   rJ   )r,   rX   r   r   r   r9   j   s    zScatter.backwardNr:   r   r   r   r   rW   U   s
    
rW   _streamsr   c                 C   sh   | j dkrdS tt| j d}|du rdS tdu rdg|  at| j du r/|| jt| j< t| j S )zBGets a background stream for copying between CPU and target devicer   N)r   getattrr]   rg   device_countindexStream)r   
device_modr   r   r   r\   r   s   

r\   )rQ   r]    r   torch.autogradr   torch._utilsr   typingr   r   r
   r5   rG   rW   rg   rk   __annotations__r   r\   r   r   r   r   <module>   s   
 !