import warnings

import torch
from torch.cuda import nccl
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors, _reorder_tensors_as,
                          _get_device_index, _handle_complex)
from typing import List


def broadcast(tensor, devices=None, *, out=None):
    """Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
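
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    it shows one copy of the input landing on each requested device)::

        >>> x = torch.randn(4, 4, device='cuda:0')
        >>> y0, y1 = broadcast(x, devices=[0, 1])
        >>> (y0.device, y1.device)
        (device(type='cuda', index=0), device(type='cuda', index=1))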
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            f"Exactly one of 'devices' and 'out' must be specified, "
            f"but got devices={devices} and out={out}")
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.
    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of :attr:`tensors`, placed on :attr:`devices`.
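
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    each device receives a copy of every tensor in the sequence)::

        >>> params = [torch.randn(8, device='cuda:0') for _ in range(3)]
        >>> per_device = broadcast_coalesced(params, devices=[0, 1])
        >>> len(per_device), len(per_device[0])
        (2, 3)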
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    """Sums tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
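
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    the destination must match the device of one of the inputs)::

        >>> a = torch.ones(3, device='cuda:0')
        >>> b = torch.ones(3, device='cuda:1')
        >>> total = reduce_add([a, b], destination=0)
        >>> total.device, float(total.sum())
        (device(type='cuda', index=0), 6.0)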
    """
    destination = _get_device_index(destination, optional=True)
    input_size = inputs[0].size()
    root_index = None  # index of an input that already lives on the destination device
    for i, inp in enumerate(inputs):
        assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs"
        if inp.get_device() == destination:
            root_index = i
        if inp.size() != input_size:
            got = 'x'.join(str(x) for x in inp.size())
            expected = 'x'.join(str(x) for x in input_size)
            raise ValueError(f"input {i} has invalid size: got {got}, but expected {expected}")
    if root_index is None:
        raise RuntimeError(
            "reduce_add expects destination to be on the same GPU with one of the tensors")

    if len(inputs) == 1:
        return inputs[0]

    if nccl.is_available(inputs):
        result = torch.empty_like(inputs[root_index])
        nccl.reduce(inputs, output=result, root=root_index)
    else:
        destination_device = torch.device(inputs[root_index].device.type, destination)
        nonroot = [t for i, t in enumerate(inputs) if i != root_index]
        # make a new tensor without cloning the root input
        result = inputs[root_index] + nonroot[0].to(device=destination_device, non_blocking=True)
        for other in nonroot[1:]:
            result.add_(other.to(device=destination_device, non_blocking=True))
    return result


def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
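
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    each inner list holds the tensors coming from one device)::

        >>> from_gpu0 = [torch.ones(4, device='cuda:0'), torch.ones(2, device='cuda:0')]
        >>> from_gpu1 = [torch.ones(4, device='cuda:1'), torch.ones(2, device='cuda:1')]
        >>> summed = reduce_add_coalesced([from_gpu0, from_gpu1], destination=0)
        >>> [float(t.sum()) for t in summed]
        [8.0, 4.0]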
    """
    # one list of dense tensors per input device; shape (num_gpus, num_tensors)
    dense_tensors: List[List] = [[] for _ in inputs]
    output = []
    ref_order = []
    # process sparse tensors first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # result is sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes across devices
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # the unflattened tensors do not share storage with the flat buffer,
            # so only their data is exposed in the output
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    """Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sum to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): A dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
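
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    without :attr:`chunk_sizes` the tensor is split into equal chunks)::

        >>> x = torch.arange(6.).reshape(6, 1)
        >>> chunks = scatter(x, devices=[0, 1], dim=0)
        >>> [tuple(c.shape) for c in chunks]
        [(3, 1), (3, 1)]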
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                f"'devices' must not be specified when 'out' is specified, "
                f"but got devices={devices}")
        if chunk_sizes is not None:
            raise RuntimeError(
                f"'chunk_sizes' must not be specified when 'out' is specified, "
                f"but got chunk_sizes={chunk_sizes}")
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    """Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
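
    Example (illustrative sketch, assuming at least two visible CUDA devices;
    only the :attr:`dim` sizes may differ between the inputs)::

        >>> a = torch.randn(2, 3, device='cuda:0')
        >>> b = torch.randn(4, 3, device='cuda:1')
        >>> merged = gather([a, b], dim=0, destination='cpu')
        >>> tuple(merged.shape)
        (6, 3)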
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                'Using -1 to represent CPU tensor is deprecated. Please use a '
                'device object or string instead, e.g., "cpu".')
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                f"'destination' must not be specified when 'out' is specified, "
                f"but got destination={destination}")
        return torch._C._gather_out(tensors, out, dim)