import inspect
import warnings
from collections import abc, defaultdict
from enum import Enum
from typing import Any, cast, Dict, List, Optional, Tuple

import torch

from .common import amp_definitely_not_available

__all__ = ["OptState", "GradScaler"]


class _MultiDeviceReplicator:
    """
    Lazily serves copies of a tensor to requested devices.  Copies are cached per-device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert master_tensor.is_cuda or master_tensor.device.type == "xla"
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}

    def get(self, device) -> torch.Tensor:
        retval = self._per_device_tensors.get(device, None)
        if retval is None:
            retval = self.master.to(device=device, non_blocking=True, copy=True)
            self._per_device_tensors[device] = retval
        return retval


# Defines the default_factory for GradScaler's _per_optimizer_states defaultdict, along with
# the associated "stage" values.  These live at module level so the defaultdict (and therefore
# the scaler itself) remains picklable.
class OptState(Enum):
    READY = 0
    UNSCALED = 1
    STEPPED = 2


def _refresh_per_optimizer_state():
    return {"stage": OptState.READY, "found_inf_per_device": {}}
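# A minimal illustration of how _MultiDeviceReplicator is used inside this module (a sketch
# only, assuming at least one CUDA device is available): GradScaler keeps a single master
# tensor, for example the inverse scale, and lazily copies it to whichever device a gradient
# lives on, caching one copy per device so repeated lookups stay cheap.
#
#     master = torch.full((1,), 2.0**16, dtype=torch.float32, device="cuda:0")
#     replicator = _MultiDeviceReplicator(master)
#     copy_a = replicator.get(torch.device("cuda:0"))  # first call copies master and caches it
#     copy_b = replicator.get(torch.device("cuda:0"))  # later calls return the cached copy (copy_a is copy_b)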
class GradScaler:
    """
    An instance of :class:`GradScaler` helps perform the steps of gradient scaling
    conveniently:

    * ``scaler.scale(loss)`` multiplies a given loss by the scaler's current scale factor.
    * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
    * ``scaler.update()`` updates the scale factor for the next iteration.

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN
            gradients that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
    """

    _scale: Optional[torch.Tensor]
    _growth_tracker: Optional[torch.Tensor]
    _per_optimizer_states: Dict[int, Dict[str, Any]]

    def __init__(
        self,
        init_scale=2.0**16,
        growth_factor=2.0,
        backoff_factor=0.5,
        growth_interval=2000,
        enabled=True,
    ):
        if enabled and amp_definitely_not_available():
            warnings.warn(
                "torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling."
            )
            self._enabled = False
        else:
            self._enabled = enabled

        if self._enabled:
            assert growth_factor > 1.0, "The growth factor must be > 1.0."
            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."

            self._init_scale = init_scale
            # self._scale is lazily initialized during the first call to scale()
            self._scale = None
            self._growth_factor = growth_factor
            self._backoff_factor = backoff_factor
            self._growth_interval = growth_interval
            self._init_growth_tracker = 0
            # self._growth_tracker is lazily initialized during the first call to scale()
            self._growth_tracker = None
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]:
        fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
        assert self._scale is not None, f"Attempted {funcname} but _scale is None.  {fix}"
        assert self._growth_tracker is not None, f"Attempted {funcname} but _growth_tracker is None.  {fix}"
        return (self._scale, self._growth_tracker)

    def _lazy_init_scale_growth_tracker(self, dev):
        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
        self._scale = torch.full((1,), self._init_scale, dtype=torch.float32, device=dev)
        self._growth_tracker = torch.full((1,), self._init_growth_tracker, dtype=torch.int32, device=dev)

    def scale(self, outputs):
        """
        Multiplies ('scales') a tensor or list of tensors by the scale factor.

        Returns scaled outputs.  If this instance of :class:`GradScaler` is not enabled, outputs are returned
        unmodified.

        Args:
            outputs (Tensor or iterable of Tensors):  Outputs to scale.
        """
        if not self._enabled:
            return outputs

        # Short-circuit for the common case: a single Tensor.
        if isinstance(outputs, torch.Tensor):
            assert outputs.is_cuda or outputs.device.type == "xla"
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            return outputs * self._scale.to(device=outputs.device, non_blocking=True)

        # For iterables of Tensors, lazily build a single _MultiDeviceReplicator for the scale
        # and reuse it for every output.
        stash: List[_MultiDeviceReplicator] = []

        def apply_scale(val):
            if isinstance(val, torch.Tensor):
                assert val.is_cuda or val.device.type == "xla"
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_MultiDeviceReplicator(self._scale))
                return val * stash[0].get(val.device)
            if isinstance(val, abc.Iterable):
                iterable = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterable)
                return iterable
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)
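    # Typical use of scale(), as an illustrative sketch only (it is not executed here; `model`,
    # `loss_fn`, `optimizer`, `input`, and `target` are assumed to exist in the caller's
    # training script, with CUDA tensors):
    #
    #     scaler = GradScaler()
    #     output = model(input)                 # forward pass, often under autocast
    #     loss = loss_fn(output, target)
    #     scaler.scale(loss).backward()         # backward() runs on the *scaled* loss
    #
    # scale() also accepts nested lists/tuples of Tensors, e.g. when a model produces
    # several losses:
    #
    #     scaled_losses = scaler.scale([loss1, loss2])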
    def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
        per_device_found_inf = _MultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
        # There could be hundreds of grads, so we'd like to iterate through them just once,
        # but we don't know their devices or dtypes in advance.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # For scaled fp16 values there's a good chance coalescing will overflow,
                        # so coalesce first and check the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            param.grad = param.grad.coalesce()
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer):
        """
        Divides ("unscales") the optimizer's gradient tensors by the scale factor.

        :meth:`unscale_` is optional, serving cases where you need to
        :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
        between the backward pass(es) and :meth:`step`.
        If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.

        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::

            ...
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            scaler.step(optimizer)
            scaler.update()

        Args:
            optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.

        .. note::
            :meth:`unscale_` does not incur a CPU-GPU sync.

        .. warning::
            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
            and only after all gradients for that optimizer's assigned parameters have been accumulated.
            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.

        .. warning::
            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
        """
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so carry out the
        # reciprocal in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, False
        )
        optimizer_state["stage"] = OptState.UNSCALED

    def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
        retval = None
        if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
            retval = optimizer.step(*args, **kwargs)
        return retval
    def step(self, optimizer, *args, **kwargs):
        """
        :meth:`step` carries out the following two operations:

        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.

        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.

        Returns the return value of ``optimizer.step(*args, **kwargs)``.

        Args:
            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
            args:  Any arguments.
            kwargs:  Any keyword arguments.

        .. warning::
            Closure use is not currently supported.
        """
        if not self._enabled:
            return optimizer.step(*args, **kwargs)

        if "closure" in kwargs:
            raise RuntimeError(
                "Closure use is not currently supported if GradScaler is enabled."
            )

        self._check_scale_growth_tracker("step")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("step() has already been called since the last update().")

        retval = None

        if getattr(optimizer, "_step_supports_amp_scaling", False):
            # The optimizer implements its own scale handling, so hand it the information it
            # needs (the scale and the combined inf/NaN flag) and call its step() directly.
            kwargs_ = kwargs
            has_grad_scaler_kwarg = (
                "grad_scaler" in inspect.signature(optimizer.step).parameters
            )
            if has_grad_scaler_kwarg:
                warnings.warn(
                    "GradScaler is going to stop passing itself as a keyword argument to the passed "
                    "optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
                    "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
                    FutureWarning,
                )
                kwargs_.update({"grad_scaler": self})
            else:
                if optimizer_state["stage"] is OptState.READY:
                    self._check_inf_per_device(optimizer)
                scaler = self._get_scale_async()
                found_inf = cast(
                    torch.Tensor,
                    sum(
                        t.to(scaler.device, non_blocking=True)
                        for t in optimizer_state["found_inf_per_device"].values()
                    ),
                )
                optimizer.grad_scale = (
                    None if optimizer_state["stage"] == OptState.UNSCALED else scaler
                )
                optimizer.found_inf = found_inf
            retval = optimizer.step(*args, **kwargs_)
            optimizer_state["stage"] = OptState.STEPPED
            if not has_grad_scaler_kwarg:
                del optimizer.grad_scale
                del optimizer.found_inf
            return retval

        if optimizer_state["stage"] is OptState.READY:
            self.unscale_(optimizer)

        assert (
            len(optimizer_state["found_inf_per_device"]) > 0
        ), "No inf checks were recorded for this optimizer."

        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)

        optimizer_state["stage"] = OptState.STEPPED

        return retval
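    # Because step() keeps per-optimizer state, a single scaler can drive several optimizers
    # in one iteration.  An illustrative sketch (not executed here; `loss0`, `loss1`,
    # `optimizer0`, and `optimizer1` are assumed to exist in the caller's training script,
    # and retain_graph is used because the two losses share parts of the graph in this sketch):
    #
    #     scaler.scale(loss0).backward(retain_graph=True)
    #     scaler.scale(loss1).backward()
    #     scaler.step(optimizer0)
    #     scaler.step(optimizer1)
    #     scaler.update()   # called once, after all optimizers have stepped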
    def update(self, new_scale=None):
        """
        Updates the scale factor.

        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.

        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)

        Args:
            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.

        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.

        .. warning::
            For performance reasons, we do not check the scale factor value to avoid synchronizations,
            so the scale factor is not guaranteed to be above 1. If the scale falls below 1 and/or
            you are seeing NaNs in your gradients or loss, something is likely wrong. For example,
            bf16-pretrained models are often incompatible with AMP/fp16 due to differing dynamic ranges.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = (
                    "new_scale should be a float or a 1-element torch.cuda.FloatTensor with "
                    "requires_grad=False."
                )
                assert isinstance(new_scale, torch.cuda.FloatTensor), reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)
        else:
            # Consume the shared inf/NaN data collected from the optimizers seen this iteration.
            # If all found_inf tensors live on the same device as self._scale, this is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            torch._amp_update_scale_(
                _scale,
                _growth_tracker,
                found_inf_combined,
                self._growth_factor,
                self._backoff_factor,
                self._growth_interval,
            )

        # To prepare for the next iteration, clear the data collected from optimizers this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
    def _get_scale_async(self):
        return self._scale

    def get_scale(self):
        """
        Returns a Python float containing the current scale, or 1.0 if scaling is disabled.

        .. warning::
            :meth:`get_scale` incurs a CPU-GPU sync.
        """
        if self._enabled:
            return self._init_scale if self._scale is None else self._get_scale_async().item()
        else:
            return 1.0

    def get_growth_factor(self):
        """
        Returns a Python float containing the scale growth factor.
        """
        return self._growth_factor

    def set_growth_factor(self, new_factor):
        """
        Args:
            new_factor (float):  Value to use as the new scale growth factor.
        """
        self._growth_factor = new_factor

    def get_backoff_factor(self):
        """
        Returns a Python float containing the scale backoff factor.
        """
        return self._backoff_factor

    def set_backoff_factor(self, new_factor):
        """
        Args:
            new_factor (float):  Value to use as the new scale backoff factor.
        """
        self._backoff_factor = new_factor

    def get_growth_interval(self):
        """
        Returns a Python int containing the growth interval.
        """
        return self._growth_interval

    def set_growth_interval(self, new_interval):
        """
        Args:
            new_interval (int):  Value to use as the new growth interval.
        """
        self._growth_interval = new_interval

    def _get_growth_tracker(self):
        if self._enabled:
            return self._init_growth_tracker if self._growth_tracker is None else self._growth_tracker.item()
        else:
            return 0

    def is_enabled(self):
        """
        Returns a bool indicating whether this instance is enabled.
        """
        return self._enabled
    def state_dict(self):
        """
        Returns the state of the scaler as a :class:`dict`.  It contains five entries:

        * ``"scale"`` - a Python float containing the current scale
        * ``"growth_factor"`` - a Python float containing the current growth factor
        * ``"backoff_factor"`` - a Python float containing the current backoff factor
        * ``"growth_interval"`` - a Python int containing the current growth interval
        * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.

        If this instance is not enabled, returns an empty dict.

        .. note::
           If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
           should be called after :meth:`update`.
        """
        return (
            {
                "scale": self.get_scale(),
                "growth_factor": self._growth_factor,
                "backoff_factor": self._backoff_factor,
                "growth_interval": self._growth_interval,
                "_growth_tracker": self._get_growth_tracker(),
            }
            if self._enabled
            else {}
        )

    def load_state_dict(self, state_dict):
        """
        Loads the scaler state.  If this instance is disabled, :meth:`load_state_dict` is a no-op.

        Args:
           state_dict(dict): scaler state.  Should be an object returned from a call to :meth:`state_dict`.
        """
        if not self._enabled:
            return

        if len(state_dict) == 0:
            raise RuntimeError(
                "The source state dict is empty, possibly because it was saved "
                "from a disabled instance of GradScaler."
            )

        self._init_scale = state_dict["scale"]
        if self._scale is not None:
            self._scale.fill_(state_dict["scale"])
        self._growth_factor = state_dict["growth_factor"]
        self._backoff_factor = state_dict["backoff_factor"]
        self._growth_interval = state_dict["growth_interval"]
        self._init_growth_tracker = state_dict["_growth_tracker"]
        if self._growth_tracker is not None:
            self._growth_tracker.fill_(state_dict["_growth_tracker"])
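    # state_dict()/load_state_dict() make the scaler checkpointable alongside the model and
    # optimizer.  A sketch only, with `PATH`, `model`, and `optimizer` assumed to exist in the
    # caller's script (as noted above, save after update() for a consistent snapshot):
    #
    #     torch.save({"model": model.state_dict(),
    #                 "optimizer": optimizer.state_dict(),
    #                 "scaler": scaler.state_dict()}, PATH)
    #
    #     checkpoint = torch.load(PATH)
    #     model.load_state_dict(checkpoint["model"])
    #     optimizer.load_state_dict(checkpoint["optimizer"])
    #     scaler.load_state_dict(checkpoint["scaler"])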
    def __getstate__(self):
        state = self.__dict__.copy()
        if self._enabled:
            assert len(self._per_optimizer_states) == 0, (
                "A GradScaler instance may only be pickled at the beginning of an iteration, "
                "or at the end after scaler.update()."
            )
            # Rather than pickling the _scale and _growth_tracker Tensors themselves, record
            # their current values and let the unpickled instance reinitialize them lazily.
            state["_init_scale"] = self.get_scale()
            state["_init_growth_tracker"] = self._get_growth_tracker()
            state["_scale"] = None
            state["_growth_tracker"] = None
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def _check_inf_per_device(self, optimizer):
        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")

        dummy_inv_scale = torch.full((1,), 1.0, dtype=torch.float32, device=_scale.device)
        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=_scale.device)

        self._per_optimizer_states[id(optimizer)][
            "found_inf_per_device"
        ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)

        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]

    def _found_inf_per_device(self, optimizer):
        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
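# A minimal end-to-end sketch of the intended workflow (illustrative only; `model`, `data`,
# `loss_fn`, and `optimizer` are assumed to be defined by the training script, and autocast
# is optional but typical):
#
#     scaler = GradScaler()
#     for input, target in data:
#         optimizer.zero_grad()
#         with torch.autocast(device_type="cuda", dtype=torch.float16):
#             output = model(input)
#             loss = loss_fn(output, target)
#         scaler.scale(loss).backward()   # backward on the scaled loss
#         scaler.step(optimizer)          # unscales grads, skips the step on inf/NaN
#         scaler.update()                 # adjusts the scale for the next iteration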