import copy
import logging
import os
import pickle
import random
from contextlib import contextmanager
from functools import partial
from typing import Callable, Union

import sympy

import torch
import torch.fx as fx
import torch.nn as nn
from torch import SymInt
from torch._decomp import get_decompositions
from torch.fx.experimental.symbolic_shapes import bind_symbols

from .aot_autograd import aot_function, aot_module, make_boxed_compiler
from .compile_utils import strip_overloads
from .partitioners import (
    default_partition,
    draw_graph,
    min_cut_rematerialization_partition,
)
import torch.utils._pytree as pytree


log = logging.getLogger(__name__)


def _canonicalize(fx_g):
    for node in fx_g.graph.nodes:
        if node.target == torch.ops.aten._to_copy:
            node.target = torch.ops.aten.to
    fx_g.recompile()
    return fx_g


@contextmanager
def _disable_jit_autocast():
    old_jit_autocast_flag = torch._C._jit_set_autocast_mode(False)
    try:
        yield
    finally:
        torch._C._jit_set_autocast_mode(old_jit_autocast_flag)


@make_boxed_compiler
def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
    """
    Compiles the :attr:`fx_g` with Torchscript compiler.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fx_g(fx.GraphModule): The input Fx graph module to be compiled.

    Returns:
        Torch scripted model.
    """
    with _disable_jit_autocast():
        strip_overloads(fx_g)

        # TorchScript cannot script aten._to_copy(dtype=...) or torch.device
        # kwargs, so canonicalize both before scripting.
        for node in fx_g.graph.nodes:
            if (
                node.target == torch.ops.aten._to_copy
                and len(node.args) == 1
                and len(node.kwargs) == 1
                and "dtype" in node.kwargs
            ):
                node.target = torch.ops.aten.to

        for node in fx_g.graph.nodes:
            new_kwargs = {}
            for k, v in node.kwargs.items():
                if isinstance(v, torch.device):
                    v = v.type
                new_kwargs[k] = v
            node.kwargs = new_kwargs

        fx_g.graph.lint()
        fx_g.recompile()

        f = torch.jit.script(fx_g)
        torch._C._jit_pass_remove_mutation(f.graph)
        f = torch.jit.freeze(f.eval())
        f = torch.jit.optimize_for_inference(f)
        # Warm the scripted graph up once, unless we were handed fake tensors.
        if not any(isinstance(t, torch._subclasses.FakeTensor) for t in inps):
            f(*inps)
    return f
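

# Hedged usage sketch (not part of the upstream module; the toy function and
# shapes are illustrative): ts_compile is normally handed to AOTAutograd as
# the forward/backward compiler rather than called directly.
def _example_ts_compile():
    def f(x):
        return torch.sin(x).cos()

    compiled = aot_function(f, fw_compiler=ts_compile, bw_compiler=ts_compile)
    out = compiled(torch.randn(8, requires_grad=True))
    out.sum().backward()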


def _draw_graph_compile(fx_g, _, name, clear_meta=True):
    print(fx_g.code)
    draw_graph(fx_g, name, clear_meta=clear_meta)
    return fx_g


def draw_graph_compile(name):
    return make_boxed_compiler(partial(_draw_graph_compile, name=name))


@make_boxed_compiler
def nop(fx_g: fx.GraphModule, _) -> Callable:
    """
    Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler
    and can be used to check accuracy.

    .. warning::
        This API is experimental and likely to change.

    """
    return fx_g
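

# Hedged usage sketch (illustrative function): `nop` leaves the traced graph
# untouched, so it is handy for checking AOTAutograd numerics in isolation
# from any backend compiler.
def _example_nop():
    def f(x):
        return x.sin() + 1

    compiled = aot_function(f, fw_compiler=nop)
    x = torch.randn(3)
    torch.testing.assert_close(compiled(x), f(x))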


class DebugInterpreter(fx.Interpreter):
    def run(self, *args):
        self.symbol_mapping = bind_symbols(self.module, *args)
        super().run(*args)

    def run_node(self, n):
        def subst_symint(ni):
            # Resolve a SymInt to a concrete int using the bound symbols.
            if not isinstance(ni, SymInt):
                return ni
            r = sympy.expand(ni.node.expr.xreplace(self.symbol_mapping))
            assert len(r.free_symbols) == 0, r
            return int(r)

        def subst_symint_tuple(nis):
            return tuple(subst_symint(ni) for ni in nis)

        def check_significant_strides(a, b):
            # Strides only matter on non-empty tensors and dims of size > 1.
            if subst_symint(a.numel()) > 0:
                for idx in range(a.ndim):
                    if (
                        subst_symint(a.stride(idx)) != b.stride(idx)
                        and subst_symint(a.size(idx)) > 1
                    ):
                        return False
            return True

        def check(nv, rv, desc):
            assert callable(desc)
            assert nv.dtype == rv.dtype, f"{desc()}: {nv.dtype} != {rv.dtype}"
            assert (
                subst_symint_tuple(nv.size()) == rv.size()
            ), f"{desc()}: {nv.size()} aka {subst_symint_tuple(nv.size())} != {rv.size()}"
            same_strides = check_significant_strides(nv, rv)
            assert (
                same_strides
            ), f"{desc()}: {nv.stride()} aka {subst_symint_tuple(nv.stride())} != {rv.stride()}"

        r = super().run_node(n)
        if "val" in n.meta:
            n_vals, n_spec = pytree.tree_flatten(n.meta["val"])
            r_vals, r_spec = pytree.tree_flatten(r)
            assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
            for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals):
                if not isinstance(rv, torch.Tensor):
                    continue
                check(nv, rv, lambda: f"output {i} where {self.symbol_mapping}")
        return r


@make_boxed_compiler
def debug_nop(fx_g: fx.GraphModule, _) -> Callable:
    """
    Returns a (slow) interpreter over the FX graph module that also checks
    various debugging properties (e.g., that tracing strides matched real
    strides.)
    """
    return DebugInterpreter(fx_g).run
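

# Hedged usage sketch (illustrative function and shapes): run a traced graph
# through DebugInterpreter so any fake-tensor metadata recorded on the nodes
# is checked against the values produced at runtime.
def _example_debug_interpreter():
    def f(x):
        return x * 2

    gm = fx.symbolic_trace(f)
    DebugInterpreter(gm).run(torch.randn(2, 3))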


@make_boxed_compiler
def simple_ts_compile(fx_g, _):
    strip_overloads(fx_g)
    f = torch.jit.script(fx_g)
    f = torch.jit.freeze(f.eval())
    return f


def nnc_jit(f):
    return aot_function(f, simple_ts_compile)


aten = torch.ops.aten
default_decompositions = {
    aten.detach,
    aten.gelu_backward,
    aten.leaky_relu_backward,
    aten.sigmoid_backward,
    aten.threshold_backward,
    aten.hardtanh_backward,
    aten.hardsigmoid_backward,
    aten.hardswish_backward,
    aten.tanh_backward,
    aten.silu_backward,
    aten.elu_backward,
    aten.cudnn_batch_norm,
    aten.cudnn_batch_norm_backward,
    aten.masked_fill.Scalar,
    aten.masked_fill.Tensor,
    aten.elu,
    aten.leaky_relu,
    aten.hardtanh,
    aten.hardswish,
    aten.hardsigmoid,
    aten.conj_physical,
    aten.is_same_size,
}
default_decompositions = get_decompositions(default_decompositions)


@make_boxed_compiler
def print_compile(fx_g, _):
    print(fx_g.code)
    return fx_g


def memory_efficient_fusion(
    fn: Union[Callable, nn.Module],
    **kwargs,
):
    """
    Wrapper function over :func:`aot_function` and :func:`aot_module` to perform
    memory efficient fusion. It uses the
    :func:`min_cut_rematerialization_partition` partitioner to perform efficient
    recomputation. It uses NVFuser to compile the generated forward and backward
    graphs.

    .. warning::
        This API is experimental and likely to change.

    Args:
        fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module``
            that takes one or more arguments. Must return one or more Tensors.
        **kwargs: Any other overrides you want to make to the settings

    Returns:
        Returns a ``Callable`` or ``nn.Module`` that retains the eager behavior
        of the original :attr:`fn`, but whose forward and backward graphs have
        gone through recomputation optimizations, and the graphs have been
        compiled with nvfuser.

    """
    config = {
        "fw_compiler": ts_compile,
        "bw_compiler": ts_compile,
        "partition_fn": min_cut_rematerialization_partition,
        "decompositions": default_decompositions,
    }
    config.update(kwargs)
    if isinstance(fn, torch.nn.Module):
        return aot_module(fn, **config)
    else:
        return aot_function(fn, **config)


def debug_compile(fx_g, inps):
    # Save the graph as an importable module under ./foo, then print a
    # ready-to-paste minifier repro script for it.
    fx_g.to_folder("foo")
    print(
        f"""
##############################################################
# To minimize FX graph, copy and paste the below and run it  #
##############################################################

import torch
import torch.fx as fx
from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess

inps = {[(i.shape, i.dtype) for i in inps]}
inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps]
from foo import FxModule
mod = FxModule().cuda()

with torch.jit.fuser("fuser2"):
  # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess
  minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess)
"""
    )
    from foo import FxModule

    FxModule().cuda()(*inps)
    return ts_compile(fx_g, inps)
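

# Hedged usage sketch (assumes a CUDA device and a writable working directory,
# since the dumped module is reloaded from ./foo): debug_compile is meant to
# be passed to aot_function as the forward compiler.
def _example_debug_compile():
    def f(x):
        return x.relu()

    compiled = aot_function(f, fw_compiler=debug_compile)
    compiled(torch.randn(8, device="cuda"))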


graph_index = 0


def get_inputs(input_data_path):
    """
    Return a random input for the given inputs meta generated from _save_fx_default.
    """
    inputs = []
    with open(input_data_path, "rb") as f:
        inputs_meta = pickle.load(f)
        for meta in inputs_meta:
            if len(meta) == 1:
                # Scalar input: the meta is just (type,); draw a random value.
                (typ,) = meta
                inp = typ(random.random())
            else:
                typ, shape, stride, dtype, device = meta
                if dtype in {
                    torch.int,
                    torch.int32,
                    torch.int64,
                    torch.bool,
                    torch.uint8,
                    int,
                    float,
                }:
                    inp = torch.randint(0, 1, shape, dtype=dtype, device=device)
                else:
                    inp = torch.rand(shape, dtype=dtype, device=device)
            inputs.append(inp)
    return inputs
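

# Hedged usage sketch (hypothetical dump path): rebuild random inputs for a
# graph previously saved by graph_dumper_aot / _save_fx_default.
def _example_get_inputs():
    inps = get_inputs("dumps/model/model_forward_0/model_forward_0.input")
    print([getattr(i, "shape", i) for i in inps])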


def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_inputs):
    """
    The forward, backward, and joint computation graph will be stored in
    {folder_name}/{current_name}/{current_name}_forward_{graph_index},
    {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and
    {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively.
    The input shape of the graphs will be stored in the .input files.
    These files can be loaded with pickle,
    and each is a list of tuples of the form (type, shape, stride, dtype, device).
    In the case of type = int or float, it is just (type,).
    For joint graph input, it is a nested list [[],[]]
    where the two inner lists have the same format.
    If dump_example_input is True, example_inputs will be stored in a .pt file.
    Since each function might produce multiple graphs,
    the graph_index is used to distinguish the different graphs.
    """
    from functorch.compile import aot_module_simplified

    def get_input_meta(args):
        input_meta = []
        # Joint-graph inputs arrive as a pair (forward args, backward args).
        if len(args) > 0 and isinstance(args[0], tuple):
            input_meta += get_input_meta(args[0])
            input_meta += get_input_meta(args[1])
            return input_meta
        for arg in args:
            if type(arg) == int or type(arg) == float:
                input_meta.append((type(arg),))
            else:
                input_meta.append(
                    (type(arg), arg.shape, arg.stride(), arg.dtype, arg.device)
                )
        return input_meta

    def graph_saver_helper(gm_to_save, args, type_name):
        if len(gm_to_save.graph.nodes) == 0:
            log.log(
                logging.WARNING,
                "No nodes in graph {%s}_{%s}_{%s}.",
                current_name,
                type_name,
                graph_index,
            )
            return

        gm = copy.deepcopy(gm_to_save)
        gm.graph.set_codegen(torch.fx.graph.CodeGen())  # remove codegen
        gm.recompile()

        input_meta = get_input_meta(args)

        base = f"{folder_name}/{current_name}"
        if not os.path.exists(base):
            os.makedirs(base)
        prefix = f"{base}/{current_name}_{type_name}_{graph_index}"
        gm.to_folder(prefix)
        pickle.dump(
            input_meta,
            open(f"{prefix}/{current_name}_{type_name}_{graph_index}.input", "wb"),
        )
        if dump_example_input:
            torch.save(args, f"{prefix}/{current_name}_{type_name}_{graph_index}.pt")

    def graph_saver_forward(gm, fw_args):
        graph_saver_helper(gm, fw_args, "forward")
        return gm

    def graph_saver_backward(gm, bw_args):
        graph_saver_helper(gm, bw_args, "backward")
        global graph_index
        graph_index += 1
        return gm

    def graph_saver_joint(gm, joint_args):
        graph_saver_helper(gm, joint_args, "joint")
        return default_partition(gm, joint_args)

    return aot_module_simplified(
        gm,
        example_inputs,
        fw_compiler=graph_saver_forward,
        bw_compiler=graph_saver_backward,
        partition_fn=graph_saver_joint,
        decompositions=default_decompositions,
    )


def graph_dumper_aot(current_name, folder_name, dump_example_input=False):
    """
    Dump the forward, backward, and joint computation graph.
    Example Usage:
    save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input=False)
    optimize_ctx = torchdynamo.optimize(
        save_fx_func
    )
    with torch.enable_grad():
        with optimize_ctx:
            result = forward_and_backward_pass(model, example_inputs)
    """
    global graph_index
    graph_index = 0
    return partial(_save_fx_default, current_name, folder_name, dump_example_input)
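

# Hedged end-to-end sketch (assumes a torch build that ships torch._dynamo;
# the model, dump path, and shapes are illustrative): dump every
# forward/backward/joint graph produced while running a training step once.
def _example_graph_dumper_aot():
    import torch._dynamo

    save_fx_func = graph_dumper_aot("model", "/tmp/fx_dumps", dump_example_input=False)
    model = nn.Linear(8, 8)

    @torch._dynamo.optimize(save_fx_func)
    def step(x):
        return model(x).sum()

    with torch.enable_grad():
        step(torch.randn(2, 8)).backward()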