o
    h                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZ d dlZ d dl!Z d dl"m#  m$Z% d d	l&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z: ddl6m;Z;m<Z< ddl#m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJmKZK eLeMZNejejOddZOe jJjPZP	 dd ZQdd ZRdd ZSdd ZTdd ZUg d ZVd!d" ZWd#eeX fd$d%ZYdd'd(ZZG d)d* d*Z[G d+d, d,e[Z\G d-d. d.e[Z]G d/d0 d0e[Z^G d1d2 d2e[Z_G d3d4 d4e[Z`d5e_iZad6d7 Zbd8d9 Zcd:d; Zdd<d= ZeejfG d>d? d?ZgejfG d@dA dAegZhdBdC ZiG dDdE dEehZjejfG dFdG dGejZkG dHdI dIeZlG dJdK dKeZmeRdLeRdMeRdNeRdOeRdPeRdQdRZndSdT ZoejfG dUdV dVehZpdWdX ZqG dYdZ dZepZrd[d\ Zsd]d^ Ztdd`daZuejeud&dbZvdcdd ZwejfG dedf dfegZxejfG dgdh dhexZyejfG didj djexZzG dkdl dlexZ{ejfG dmdn dnexZ|ejfG dodp dpe|Z}ejfG dqdr drexZ~G dsdt dte}ZG dudv dvegZejfG dwdx dxeZejfG dydz dzeZejfG d{d| d|egZG d}d~ d~eZG dd deZG dd deZG dd deZejfG dd degZG dd deZG dd deZG dd degZG dd degZejfG dd deZG dd deZejfG dd deZG dd deZG dd deZejfG dd deZejfG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd degZejfG dd deZejfG dd degZG dd deZ	_	ddddddddeeX deeX deeX deXdedeeX fddZ						dddZG dd deZG dd deZG ddĄ deZG ddƄ deZG ddȄ deZG ddʄ deZG dd̄ deZG dd΄ deZG ddЄ deZG dd҄ deZG ddԄ deZejfG ddք degZG dd deZG ddل deZG ddۄ de jjZG dd݄ d݃ZG dd߄ d߃ZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N)nullcontext)Enum)partial)	signature)
AnyCallableClassVarDictListOptionalSequenceSetTupleUnion)patch)ExprInteger)identity)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_formake_contiguous_strides_for)get_signature_for_torch_op)CleanDivFloorDivModularIndexing   )configdependencies)index_prevent_reordering)get_device_properties)extract_read_writesvar_builder)argsortcache_on_selfconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadatapad_listlike	sympy_dotsympy_product
sympy_subssympy_symboltry_find_schema)opsVz  prefixc                    s    fdd  |  d S )Nc              	      sh   t | ttfr| D ]} | q	d S t | tjjjttt	j
t	jjjttjjjfs2J dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupletorch	_inductorir
ExpandViewDynamicScalar	TensorBoxsympySymbollogicboolalgBooleanr   type)nodesnode_check_tensorbox H/var/www/html/ai/venv/lib/python3.10/site-packages/torch/_inductor/ir.pyrF   i   s"   
z%validate_ir.<locals>._check_tensorboxrG   )node_or_nodesrG   rE   rH   validate_irh   s   rJ   c                    s   t  tsJ  fdd}|S )Nc                     s   t t | i |S N)getattrr0   )argskwargsnamerG   rH   fn      zops_wrapper.<locals>.fn)r4   str)rP   rQ   rG   rO   rH   ops_wrapper   s   rT   c                    s&   t t| tt|   fdd}|S )Nc                    0   t  t ks
J  fddtt  D S )Nc                       g | ]} |  qS rG   rG   .0i)index	inv_orderrG   rH   
<listcomp>       z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenrangerZ   r[   ra   rH   reindex      z inverse_reorder.<locals>.reindex)dictzipr`   r_   orderrc   rG   rb   rH   inverse_reorder   s   ri   c                        fdd}|S )Nc                    rU   )Nc                    rV   rG   rG   rW   )rZ   rh   rG   rH   r\      r]   z1same_reorder.<locals>.reindex.<locals>.<listcomp>r^   ra   rh   ra   rH   rc      rd   zsame_reorder.<locals>.reindexrG   rg   rG   rk   rH   same_reorder      rl   c                    s    fdd}|S )Nc                        | S rK   rG   ra   reindex1reindex2rG   rH   rc         z fuse_reindexing.<locals>.reindexrG   )rp   rq   rc   rG   ro   rH   fuse_reindexing   s   rs   )   r      r   c                    s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,
    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S      i | ]\}}||qS rG   rG   rX   idxposrG   rG   rH   
<dictcomp>       z+stride_order2fill_order.<locals>.<dictcomp>c                       g | ]} | qS rG   rG   rW   lookuprG   rH   r\          z+stride_order2fill_order.<locals>.<listcomp>)	enumerater`   r_   )rh   
fill_orderrG   r}   rH   stride_order2fill_order   s   r   seqc                 C   s<   t | }dd tt| D }t|D ]\}}|||< q|S )z)
    Convert strides to stride order
    c                 S   s   g | ]}d qS rK   rG   rX   _rG   rG   rH   r\      s    z$get_stride_order.<locals>.<listcomp>)r$   r`   r_   r   )r   
sorted_idxoutrY   elemrG   rG   rH   get_stride_order   s
   
r   Tc                    s   | d u rd S |st jjj nt  fdd|  D }t| r, fdd|  jD }nt	|}| 
 }|  }t|}t|}tj||||d }|S )Nc                       g | ]} |qS rG   rG   rX   sshape_fnrG   rH   r\      r   z%ir_node_to_tensor.<locals>.<listcomp>c                    r   rG   rG   r   r   rG   rH   r\      r   )sizestridedtypedevice)r1   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   r   	get_dtype
get_devicer'   r7   empty_stridedzero_)xguard_shaper   r   r   r   trG   r   rH   ir_node_to_tensor   s&   r   c                   @   s   e Zd Zdd Zdd ZdS )OptionalAttrc                 C   
   d| _ d S )Noptional_attrrO   selfrG   rG   rH   __init__      
zOptionalAttr.__init__c                 C      | j S rK   rO   r   rG   rG   rH   __repr__      zOptionalAttr.__repr__N)__name__
__module____qualname__r   r   rG   rG   rG   rH   r      s    r   c                   @      e Zd Zdd ZdS )OptionalStringc                 C   r   )Noptional_stringrO   r   rG   rG   rH   r      r   zOptionalString.__init__Nr   r   r   r   rG   rG   rG   rH   r          r   c                   @   r   )OptionalListc                 C   r   )Noptional_listrO   r   rG   rG   rH   r      r   zOptionalList.__init__Nr   rG   rG   rG   rH   r      r   r   c                   @   r   )OptionalScalarc                 C   r   )Noptional_scalarrO   r   rG   rG   rH   r      r   zOptionalScalar.__init__Nr   rG   rG   rG   rH   r      r   r   c                   @   r   )OptionalLayoutc                 C   r   )Noptional_layoutrO   r   rG   rG   rH   r      r   zOptionalLayout.__init__Nr   rG   rG   rG   rH   r      r   r   c                   @   r   )OptionalTensorc                 C   r   )Noptional_tensorrO   r   rG   rG   rH   r      r   zOptionalTensor.__init__Nr   rG   rG   rG   rH   r      r   r   zOptional[Layout]c                 C   s   |st jjr| S |S rK   )r1   r   cpp_wrapper)optional_valuevaluerG   rG   rH   may_convert_to_optional   rR   r   c                 C   s.   t | dd rt|  S t| tjr| jS d S )Nr   )rL   get_device_typer   r4   r7   r   rB   r   rG   rG   rH   r      s
   r   c                 C      t | dkS )Ncudar   r   rG   rG   rH   	is_triton  rr   r   c                 C   r   )Ncpur   r   rG   rG   rH   is_cpu  rr   r   c                   @   s   e Zd ZU e Zeee  ed< e	e
jdeejj fddZdd Zdd Zd	d
 Zdd Zdd Zedd Zdd Zdd Zdd ZdS )IRNode_current_originsoriginsc                 c   s.    t j}|| B t _z	d V  W |t _d S |t _w rK   )r   r   )r   oldrG   rG   rH   current_origins  s   
zIRNode.current_originsc                 C   s*   t | j| _tjrt | _d S d | _d S rK   )setr   r   r   debug_ir_traceback	tracebackformat_stackr   rG   rG   rH   __post_init__  s   zIRNode.__post_init__c                 C   r   rK   )r   r   rG   rG   rH   get_traceback  r   zIRNode.get_tracebackc                 C   s6   dt | dd }t|dkr|d d  d}|gS )Nzorigins=r    @   =   z...)rL   r_   )r   r   rG   rG   rH   common_repr   s   zIRNode.common_reprc                 C   s6   ||    }tdtt|}t| j d| dS )Nz,
z(
z
))r   indentjoinmaprS   rB   r   r   linesrG   rG   rH   
str_helper'  s   zIRNode.str_helperc                 C   s   ||   v S rK   )get_read_namesr   rP   rG   rG   rH   
is_user_of,  rr   zIRNode.is_user_ofc                 C   s   dd |   D S )Nc                 S   s   h | ]}|j qS rG   rO   )rX   deprG   rG   rH   	<setcomp>1  s    z(IRNode.get_read_names.<locals>.<setcomp>)	get_readsr   rG   rG   rH   r   /     zIRNode.get_read_namesc                 C      t |  S rK   )r,   r   r   rG   rG   rH   	get_numel3  rr   zIRNode.get_numelc                 C      t jjt|  dS Nr   r1   r   r   is_expr_static_and_truer=   Eqr   r   rG   rG   rH   is_zero_elements6     zIRNode.is_zero_elementsc                 C   s   t dt|  )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on )NotImplementedErrorrB   r   rG   rG   rH   realize9  s   zIRNode.realizeN)r   r   r   r   r   r   r   r   __annotations__staticmethod
contextlibcontextmanagerr7   fxNoder   r   r   r   r   r   r%   r   r   r   r   rG   rG   rG   rH   r     s   
 
r   c                       s   e Zd ZU ejed< ejed< edef ed< e	e
 ed< d ddZ fd	d
ZeZdd Zdd Zdd Zdd Zdd Zedd Zed!ddZedd Zdd Zdd Z  ZS )"Loopsr   r   .inner_fnrangesr   c                    sF     d jj dt j  g fdd|D  d jg S )N'c                    s    g | ]}| d t  | qS =)rL   )rX   rP   r   rG   rH   r\   Z       z!Loops.__str__.<locals>.<listcomp>origin_node=)r   r   rB   rS   r   inner_fn_strorigin_node)r   namesrG   r   rH   __str__S  s   zLoops.__str__c                       t    d | _d S rK   superr   r   r   	__class__rG   rH   r   ^     

zLoops.__post_init__c                 C   r   rK   r   r   rG   rG   rH   r   d  r   zLoops.get_dtypec                 C   r   rK   r   r   rG   rG   rH   r   g  r   zLoops.get_devicec                 C   r   rK   r   r   rG   rG   rH   get_origin_nodej  r   zLoops.get_origin_nodec                 C   r   rK   r   r   rG   rG   rH   r   m  r   zLoops.get_sizec                 C      dS NFrG   r   rG   rG   rH   	is_externp     zLoops.is_externc                 O   sN   | dd }| dd }| |i |}||_tjr|pt nd |_t|S )Nr   r   )popr   r   r   r   r   r<   create)clsrM   rN   r   tbrrG   rG   rH   r  s  s    
zLoops.createrY   c                    s    fddt | D S )Nc                    s2   g | ]\}}|d krt dnt  | qS )r   r   )r=   r   r.   )rX   nr   r2   rG   rH   r\     s     z Loops._index.<locals>.<listcomp>r   )r   r3   rG   r2   rH   _index~  s   
zLoops._indexc                 C   r   rK   )r_   r   r   rG   rG   rH   inner_fn_str_len     zLoops.inner_fn_str_lenc                 C   s   |  | j}tj| j|S rK   )r  r   r1   KernelFormatterHandlerir_to_stringr   )r   rZ   rG   rG   rH   r        zLoops.inner_fn_strc                 C   sv   t tdd* |  r t|  |  |  jW  d    S t|  |  jW  d    S 1 s4w   Y  d S Nallow_indexingT)	r   objectFlexibleLayoutget_reduction_typer"   make_loaderr   get_reduction_sizereadsr   rG   rG   rH   r     s   $zLoops.get_reads)r   rY   )r   r   r   r7   r   r   r   r   r   r
   r   r   r   r   r   r   r
  r   r  classmethodr  r   r  r%   r  r   r   __classcell__rG   rG   r  rH   r   L  s*   
 





r   c                C   s"   |j rttd|S td|S )Nnanr   )is_floating_pointr0   constantfloat)rx   r   rG   rG   rH   nop_loader_fn  s   r+  c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )	Pointwisec                 C   s   |   rtt| jdS | jS )Nr  )r   r   r+  r   r   r   rG   rG   rH   r!    s   zPointwise.make_loaderc                 C   s   g S rK   rG   r   rG   rG   rH   r"    r  zPointwise.get_reduction_sizec                 C      d S rK   rG   r   rG   rG   rH   r     r  zPointwise.get_reduction_typec                 C   s   |   }t|||||S rK   )r!  r0   storer   output_nameindexervarsloaderrG   rG   rH   store_output  s   zPointwise.store_outputc                 C   s,   |   }ttd||}t|| j|| jS FMove this to a given device. Requires that all reads are to constants.override_device)r!  r   r  ConstantBufferr,  r   r   r   r   r3  rG   rG   rH   constant_to_device  s   zPointwise.constant_to_deviceN)r   r   r   r!  r"  r   r4  r:  rG   rG   rG   rH   r,    s    r,  c                   @   sD   e Zd ZU eee gef ed< dZee	 ed< dd Z
dd ZdS )Scatteroutput_indexerNscatter_modec                 C   s4   |   }ttd||}t|| j|| j| j| jS r5  )	r!  r   r  r8  r;  r   r   r<  r=  r9  rG   rG   rH   r:    s   zScatter.constant_to_devicec                 C   s*   |   }tj||| |||| jdS )N)mode)r!  r0   r.  r<  r=  r/  rG   rG   rH   r4    s   zScatter.store_output)r   r   r   r   r
   r   r   r=  r   rS   r:  r4  rG   rG   rG   rH   r;    s
   
 r;  c                   @   s   e Zd ZdZdZdZdZdS )ReductionHintr   r   ru   rt   N)r   r   r   INNEROUTER
OUTER_TINYDEFAULTrG   rG   rG   rH   r?    s
    r?  c                   @   s   e Zd ZdZdZdS )TileHintr   r   N)r   r   r   SQUARErC  rG   rG   rG   rH   rD    s    rD  
logical_ormaximumminimummuladdbitwise_xor)anymaxminprodsumxor_sumc                    sP   t v r
t  }|S dv r fdd}|S dkr!dd }|S td )N   argmaxargminc           
   
      s   | \}}|\}}dkrt ||}nt ||}t ||}t rCt ||}t ||}	t |t ||	}t |t ||	}t |t |t ||}t |||t |||fS )NrT  )	r0   ltgteqr   nerF  logical_andwhere)
aba_valuea_indexb_valueb_indexmaskequala_isnanb_isnanr   reduction_typerG   rH   
combine_fn  s"   z,get_reduction_combine_fn.<locals>.combine_fnwelford_combinec                 S   sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS rK   rG   )r[  r\  a_meana_m2a_weightb_meanb_m2b_weightdelta
new_weight	w2_over_wrG   rG   rH   rg    s   


zunknown reduction_type=)REDUCTION_COMBINE_FNr   )rf  r   rg  rG   re  rH   get_reduction_combine_fn  s   -*rs  c                   @   s`  e Zd ZU ee ed< eed< ejed< e	ed< dd Z
dd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zedd Zedd Zee	jfdejdejdejdedef dee dee dede	fddZed d! Zed"d# Zed$ed%ede	d&e	fd'd(Zed)d* Zedejdejdejdedef dee dee ded$ede	fd+d,Z d-S ).	Reductionreduction_rangesrf  	src_dtypereduction_hintc                 C   s   t j| ddS )N)r   ru  rf  )r   )r   r   r   rG   rG   rH   r   (  s   zReduction.__str__c                 C      |   S rK   )r   r   rG   rG   rH   r   -     zReduction.__repr__c                 C   r   rK   )ru  r   rG   rG   rH   r"  0  r   zReduction.get_reduction_sizec                 C   r   rK   rf  r   rG   rG   rH   r   3  r   zReduction.get_reduction_typec              	   C   s0   t | j| j| j| ||}t ||||S rK   )r0   	reductionr   rv  rf  r   store_reduction)r   r0  r1  r2  reduction_varsr   rG   rG   rH   r|  6  s   
zReduction.store_reductionc                 C   s   t | jt | j S rK   )r_   r   ru  r   rG   rG   rH   index_length?  rR   zReduction.index_lengthc                 C   s,   |  | j}|  | jd}tj| j||S )Nr  )r  r   ru  r1   r  r  r   )r   rZ   rindexrG   rG   rH   r   B  s   zReduction.inner_fn_strc              	   C   s<   |   }ttd||}t|| j|| j| j| j| j	t
jS r5  )r!  r   r  r8  rt  r   r   ru  rf  rv  r?  rC  r9  rG   rG   rH   r:  K  s   zReduction.constant_to_devicec              	      s  dd }t jj|}	t jjt|}
t| o&|dvo&tjo&||	o&||
}|s.tj	dfS t
| jddd      fdd	} fd
d}|
dkrgtj||	|
fS |	kss|
d d krxtj	dfS t| ||||||tj	}dd }||\}}|r||\}}t|dkrtj	dfS t| | \\}}}d}d}|D ])}t jj||}t jj||| }tdd |D }|r|d7 }q|d7 }q||krtj||	|
fS tj||	|
fS )Nc                 S   s   t | ttjfS rK   )r4   intr=   r   r   rG   rG   rH   
_is_statice     z(Reduction.num_splits.<locals>._is_staticrR  r          i   c           	         s  d}d| }|d krdS | dkrdS | | kr}n^| | k r_ d|  }|| d | }| ||  d ||   t | }t| fddd}t|  d	k r\t|}n }nt | }t|fd
dd}t| dk ry|}n}| ||  d ||  S )N   r  ru   r   i    c                       t |   S rK   absr   tmp_split_sizerG   rH   <lambda>      zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>key   c                    r  rK   r  r   max_elements_per_threadrG   rH   r    r  2   r=   divisorsrN  r  rM  )	reduction_numel_hint
numel_hint	num_warpsnum_threads
split_sizetarget_blocksblocks_per_outputr  closestmax_elements_per_devicer  min_elements_per_devicemin_elements_per_threadnum_smthreads_per_smr  rH   inner_reduction_splits  s6   

z4Reduction.num_splits.<locals>.inner_reduction_splitsc                    s  d}|d }d}d}|| d | }| | k r}n\| | k r[ | }|| d | }| ||  d ||   t | }	t|	 fddd}
t |
 d	k rXt|
}n }nt | }	t|	fd
dd}
t|
 dk ru|
}n}| ||  d ||  S )Nr  r        r   c                    r  rK   r  r   r  rG   rH   r    r  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>r     c                    r  rK   r  r   r  rG   rH   r    r  r  r  )r  r  r  r  rvals_per_threadxvals_per_blockxblocksr  r  r  r  r  r  rH   outer_reduction_splits  s4   

z4Reduction.num_splits.<locals>.outer_reduction_splitsru   c                    s   t d t|  |  |  d| d}| }dd |jD }g }d}t|jdd dD ]1 t	 fd	d
|D r\|
 j  jtjjv r\tjj j }|jj}|  |jj|kr\d}q+||fS )Nr   r   r   rP   layoutdatac                 S   s(   g | ]}t |tjrt |tjs|qS rG   )r4   r=   r   NumberrX   r  rG   rG   rH   r\     s    

zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S   r   rK   rO   r   rG   rG   rH   r    s    z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>r  c                 3   s    | ]	}| j jv V  qd S rK   )rZ   free_symbolsr  mdrG   rH   	<genexpr>      zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>T)ComputedBufferr  r   r   r   get_read_writes
range_varssortedr#  allappendrZ   rP   r1   r   name_to_bufferr  r   decide_layout)r  cbread_writesr  indiceschangedbuforiginal_striderG   r  rH   get_read_indices  s4   	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s   s    | ]}|d kV  qdS )r   NrG   r   rG   rG   rH   r        z'Reduction.num_splits.<locals>.<genexpr>)r1   r   r   symbolic_hintr,   r   r   split_reductionsr?  rC  r!   multi_processor_countr@  rt  r_   r   index_vars_squeezer   r"  simplify_with_rangesstride_hintskeysr  rA  )r   	dst_dtyperv  r   r   ru  rf  reduction_numelr  r  r  should_splitr  r  r  r  r  r  r   r}  	num_outer	num_innerrY   stridesouterrG   r  rH   
num_splitsZ  s   	

$$
 


zReduction.num_splitsc                    sj   dd D t ||  fdd|dv r1tddt fddfd	d
S S )z1Convert inner_fn from a reduction to an pointwisec                 S      g | ]	}t jj|qS rG   )r1   r   r   evaluate_static_shaperX   r   rG   rG   rH   r\     s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>c                    s,   t  fddtjdd D  D S )Nc                 3   s    | ]} |V  qd S rK   rG   )rX   r  )rZ   value_fnrG   rH   r  (  s
    
z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S   s   g | ]}t |qS rG   )r`   r  rG   rG   rH   r\   +  r   z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)	functoolsreduce	itertoolsproductra   )rg  ru  r  ra   rH   rQ   %  s   z*Reduction._unroll_reduction_fn.<locals>.fnrT  rS  Nc                    s*   dd |D }| |t  |tjfS )Nc                 S      g | ]}t |qS rG   )r=   expandrW   rG   rG   rH   r\   9  r{   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)r0   
index_exprr7   int64rZ   r  )flatten_indexr   rG   rH   r  8  s   z0Reduction._unroll_reduction_fn.<locals>.value_fnc                    s    | d S Nr   rG   ra   )rQ   rG   rH   r  ?  r  z0Reduction._unroll_reduction_fn.<locals>.<lambda>)rs  FixedLayoutr  contiguous_stridesmake_indexer)r   ru  rf  rv  rG   )rg  r  rQ   r   ru  r  rH   _unroll_reduction_fn  s$   
zReduction._unroll_reduction_fnr   r  r   .r   c	                    sp  t jjt}	|	dkrB fdd}
|
d|
d|
d|
dd v s/J  d fdd}tj|||t|d	S |	dkr`d
v rQ fdd}nfdd}t| ||S t	|	t
jrt jj|	tjk rt|dkrt| | ||S | | |||	\}}|tjkr|}|dkr| | ||||	S tt| |||S )Nr   c                    s(    t jkr	t| S  jrt| S t| S rK   )r7   boolr(  r*  r  valr  rG   rH   py_cnstV  s   
z!Reduction.create.<locals>.py_cnstr   )rP  rQ  rO  rL  z* not supported for zero-dimension tensors!c                    s   t   S rK   r0   r)  ra   )r  rf  rtypes_to_initsrG   rH   const_fnk  r  z"Reduction.create.<locals>.const_fnr   r   r   r   r  c                    s   t d S r   r  ra   r  rG   rH   rQ   y  rr   zReduction.create.<locals>.fnc                       dd D } | |S )Nc                 S      g | ]}t d qS r   r=   r   r   rG   rG   rH   r\     r{   z0Reduction.create.<locals>.fn.<locals>.<listcomp>rG   rZ   reduction_index)r   ru  rG   rH   rQ   ~     
)r1   r   r   simplifyr,   r  r,  r  r5   r4   r=   r   r   r   unroll_reductions_thresholdr  r  r?  rC  create_multilayerr<   rt  )r  r   r  rv  r   r   ru  rf  rw  r  r  r  rQ   hintsplitrG   )r  r   ru  rf  r  rH   r  D  s   
	


zReduction.createc                 C   sv   | dv rt |rtdS t|rdS t|jS | dv r0t |r$tdS t|r*dS t|jS ddddddd|  S )	N>   rM  rS  z-infr   >   rN  rT  infr   r   r   r   )rP  rO  rQ  rL  welford_reducerh  )r   r*  r   r7   iinforN  rM  rf  r   rG   rG   rH   default_accumulator  s*   zReduction.default_accumulatorc                 C   s   | dkrdS t | |S )Nr   r   )rt  r  r  rG   rG   rH   default_value  s   zReduction.default_valuer  r  returnc                 C   sD   | dkr|dkr|t jkrt jS | dkr |dkr |t jkr t jS |S )Nr  i      )r?  rA  rB  )r  r  rw  rG   rG   rH   _multilayer_second_step_hint  s   
z&Reduction._multilayer_second_step_hintc                    sD   t |gtjjt| d  fdd}|S )Nr   c                    sf   |\}| ^ }| |   fdd}r0t t  tjt tj}t ||S | S )Nc                      s    gS rK   rG   rG   )r  r3  	new_indexrc   rG   rH   body  r  zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)r0   rU  r  r7   int32masked)rZ   r  reduction_blockr	  ra  
block_sizedefaultr3  	need_maskr  rc   )r  r  rH   
wrapper_fn  s   
z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)Viewdynamic_reshape_indexerr1   r   r   r   r=   r   )r  r3  ru  r  r  r  r  r  rG   r  rH   _multilayer_wrap_loader  s   
z!Reduction._multilayer_wrap_loaderc
                    s   t |}
t|
|d  |}| ||}| |||
|||}|tjtjfvr'|ntj}t	||||g |||g||	}|
  |   fdd}tjjt |}| |||	}	t	t|||||g|||	S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r   c                    s    g | |S rK   rG   r  intermediate_loaderrG   rH   intermediate_fn=  r  z4Reduction.create_multilayer.<locals>.intermediate_fn)r,   r   r  r  r7   float16bfloat16r*  rt  r  r   r!  r1   r   r   r   r  r<   )r  r   r  rv  r   r   ru  rf  r  rw  r  r  r  r  intermediate_dtypeintermediater  r  rG   r  rH   r    sN   	

zReduction.create_multilayerN)!r   r   r   r
   r   r   rS   r7   r   r?  r   r   r"  r   r|  r~  r   r:  r   r  r  r%  rC  r   r   r   r  r  r  r  r  r  r  rG   rG   rG   rH   rt     s   
 
		
 B
'

	y


!
	
rt  c                 C   s   d| v rdS dS )Nwelfordrt   r   rG   rz  rG   rG   rH   num_reduction_outputsR  r  r  c                       s   e Zd ZU eed<  fddZdd Zeej	fde
jde
jdeed	ef  d
ee dee dedefddZedd Zede
jde
jdeed	ef  d
ee dee dededefddZ  ZS )WelfordReductionoutput_indexc	           
   
      sF   t  dkr d }	n fdd}	t |||	||||| || _d S )Nr   r   c                    s   t  fddD S )Nc                 3   s    | ]}| V  qd S rK   rG   rX   rQ   rx   reduction_idxrG   rH   r  i      z<WelfordReduction.__init__.<locals>.loader.<locals>.<genexpr>)r6   r"  	inner_fnsr"  rH   r3  h     z)WelfordReduction.__init__.<locals>.loader)r_   r  r   r   )
r   r   r   r&  r   ru  rf  rw  r   r3  r  r%  rH   r   Y  s   


zWelfordReduction.__init__c              	   C   s:   t | j| j| j| ||}|| j }t ||||S rK   )r0   r{  r   rv  rf  r   r   r|  )r   r0  r1  r2  r}  valuesr   rG   rG   rH   r|  w  s   

z WelfordReduction.store_reductionr   r   r&  .r   ru  rf  rw  c              
      s0  dv sJ t jjt}fdd}	|dkr,|	d}
|	d}|	d}|
||fS |dkrTfdd dkrI d |	d|	dfS t fd	d
D S tjd |d\}}tj	krl||dkr|| 
|S fddtdD }|D ]}|  q|S )N>   r   rh  c                    s"    fdd}t j|tdS )Nc                    s   t  tS rK   )r0   r)  r  rx   r  rG   rH   r     rr   z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr  )r,  r  r5   )rx   r  r   )r   r   r   r  rH   const  s   z&WelfordReduction.create.<locals>.constr   r   c                    s$    fdd}t jt|tdS )Nc                    r  )Nc                 S   r  r  r  r   rG   rG   rH   r\     r{   zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>rG   )rx   r  )r3  ru  rG   rH   r     r  z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr  )r,  r  r  r5   )r3  r   )r   r   ru  r3  rH   copy  s   z%WelfordReduction.create.<locals>.copyr   c                 3       | ]} |V  qd S rK   rG   r!  )r,  rG   rH   r    r  z*WelfordReduction.create.<locals>.<genexpr>)rf  r  c                    s(   g | ]}t t |qS rG   )r<   r  r  )rX   
output_idx)r   r   r&  r   rw  ru  rf  rG   rH   r\     s    z+WelfordReduction.create.<locals>.<listcomp>rt   )r1   r   r   r  r,   r6   rt  r  r?  rC  r  r`   r   )r  r   r   r&  r   ru  rf  rw  r  r*  meanm2weightr  r  resultsr   rG   )r,  r   r   r&  r   rw  ru  rf  rH   r    sT   



zWelfordReduction.createc                 C   r  )Nr  rG   r  rG   rG   rH   r    s   zWelfordReduction.default_valuer  c	              
      s.  t tjjt d }	|	r8|dkr8fdd}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd	|D g | g||}|D ]}|  q`d
d |D }dd tjjt |}||}t
|tfdd	|D |gd|S )r  r   rh  c                    s   t | S rK   r  )rx   r#  r   r  rG   rH   r)    rr   z4WelfordReduction.create_multilayer.<locals>.constant)r   r   )r   r   r&  r   ru  rf  r  rw  c              	   3   s&    | ]}j | d dV  qdS )r   )r  N)r  )rX   r3  )r  r  r  ru  r  rG   rH   r  '  s    	
z5WelfordReduction.create_multilayer.<locals>.<genexpr>c                 S      g | ]}|  qS rG   )r!  rW   rG   rG   rH   r\   :  r   z6WelfordReduction.create_multilayer.<locals>.<listcomp>c                 S   s   |g | |S rK   rG   )rZ   r  r3  rG   rG   rH   intermediate_loader_fn<  r  zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3   s     | ]}t  | d V  qdS )r+  N)r   r!  rW   )r4  rG   rH   r  F  s
    
)r,   r1   r   r   r   r=   r   r  r   r   r  r  r6   r   r   r  )r  r   r   r&  r   ru  rf  r  rw  r  r)  intermediatesrY   	i_loadersr  rG   )r  r  r   r4  r  ru  r  rH   r    sd   

	

z"WelfordReduction.create_multilayer)r   r   r   r  r   r   r|  r%  r?  rC  r7   r   r   r   r   r   r
   r   rS   r  r   r  r  r&  rG   rG   r  rH   r  V  sR   
 
	r
	r  c                 C   s(   z	t | dd W dS  ty   Y dS w )NFfreezeT)as_storage_and_layoutr   r   rG   rG   rH   r   R  s   r   c                 C   s0   zt | dd\}}| W S  ty   Y dS w NFr7  )r9  is_contiguousr   )r   bufferr  rG   rG   rH    is_contiguous_storage_and_layoutZ  s   
r=  Fc                 C   s   t | trt| j|||dS t | trAt | jtrA|r;|r+| j  | jj s*J n|dur6| j	| n| j
  | | jjfS t | trTt| j|d\}}|| jfS t)z0Try to simplify x into a StorageBox and a Layoutr8  want_contiguousstride_orderNr7  )r4   r<   r9  r  
StorageBoxBufferfreeze_layoutr  r;  freeze_layout_with_stride_orderr  ReinterpretViewr   )r   r8  r?  r@  r<  r   rG   rG   rH   r9  b  s.   





r9  )r?  c                 C   s2   zt | dd\}}||W S  ty   Y dS w r:  )r9  is_stride_orderedr   )r   r@  r<  r  rG   rG   rH   "is_stride_order_storage_and_layout  s   rG  c                   @   s   e Zd ZU eed< dd Zdd Zdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"S )#BaseViewr  c                 C   s   t d|  )Nzmake_reindexer NYI on r   r   rG   rG   rH   make_reindexer     zBaseView.make_reindexerc                    $   | j   |   fdd}|S )Nc                    rn   rK   rG   r)  innerrc   rG   rH   r1    rr   z&BaseView.make_indexer.<locals>.indexer)r  r  rJ  r   r1  rG   rM  rH   r       
zBaseView.make_indexerc                    rL  )Nc                    rn   rK   rG   r)  rM  rG   rH   r3    rr   z$BaseView.make_loader.<locals>.loader)r  r!  rJ  r   r3  rG   rM  rH   r!    rP  zBaseView.make_loaderc                 C   
   | j  S rK   )r  r   r   rG   rG   rH   r     r   zBaseView.get_dtypec                 C   rR  rK   )r  r   r   rG   rG   rH   r     r   zBaseView.get_devicec                 C   r-  rK   rG   r   rG   rG   rH   r
    r  zBaseView.get_origin_nodec                 C   rR  rK   r  get_namer   rG   rG   rH   rT    r   zBaseView.get_namec                 C      | j |S rK   )r  
mark_reuser   usersrG   rG   rH   rV    rr   zBaseView.mark_reusec                 C   rR  rK   )r  has_exceeded_max_readsr   rG   rG   rH   rY    r   zBaseView.has_exceeded_max_readsc                 C   rR  rK   r  r   r   rG   rG   rH   r     r   zBaseView.realizec                 C   rR  rK   )r  realize_hintr   rG   rG   rH   r[    r   zBaseView.realize_hintc                 C   rR  rK   )r  get_storage_numelr   rG   rG   rH   r\    r   zBaseView.get_storage_numelc                 C   rR  rK   )r  r  r   rG   rG   rH   r    r   zBaseView.is_externc                 C   sF   t tdd t|  |  jW  d    S 1 sw   Y  d S r  )r   r  r  r"   r!  r   r#  r   rG   rG   rH   r     s   $zBaseView.get_readsc                 C   s"   | }t |tr|j}t |ts|S rK   )r4   rH  r  r   r   rG   rG   rH   unwrap_view  s
   

zBaseView.unwrap_viewc                 C   s0   |   }ttd||}t||  ||  S r5  )r!  r   r  r8  r,  r   r   r9  rG   rG   rH   r:    s   zBaseView.constant_to_deviceN)r   r   r   r   r   rJ  r  r!  r   r   r
  rT  rV  rY  r   r[  r\  r  r   r^  r:  rG   rG   rG   rH   rH    s$   
 		rH  c                   @   sB   e Zd ZU ee ed< edd Zedd Z	dd Z
dd	 Zd
S )r:   r   c                 C   s   t ttj|}|  }dgt|t|  t | }t|t|ks%J tt|D ]}|| dkrA|| dus;J || ||< q+|S )zReplace `-1` with correct sizesN)r5   r   r=   r  r   r_   r`   )r   new_sizeold_sizerY   rG   rG   rH   _normalize_size  s   zExpandView._normalize_sizec           
      C   s   |  ||}t|rSt|\}}t|t|j }|dksJ tdg| }t|j|jD ]\}}|	|dkr:|ntd q.t
|j|jt|||j}	t||	S t||S Nr   r   )rb  r   r9  r_   r   r=   r   rf   r   r  r  r   r   r5   offsetrE  r:   )
r  r   r`  storage
old_layoutskip
new_strider   r   
new_layoutrG   rG   rH   r    s"   

zExpandView.createc                 C   r   rK   r   r   rG   rG   rH   r     r   zExpandView.get_sizec                    s4   |   }| j   t|t   fdd}|S )Nc                    sT   t | d  } t| t ksJ tt D ]} | dkr'td| |< q| S )Nr   r   )r5   r_   r`   r=   r   )rZ   rY   actualrg  rG   rH   rc     s   z*ExpandView.make_reindexer.<locals>.reindex)r   r  r_   )r   targetrc   rG   rk  rH   rJ    s
   
	zExpandView.make_reindexerN)r   r   r   r
   r   r   r   rb  r%  r  r   rJ  rG   rG   rG   rH   r:     s   
 

r:   c                   @   sB   e Zd ZU ee ed< edd Zedd Zdd Z	dd	 Z
d
S )PermuteViewdimsc                    s   |  |}t|ttt|ksJ t|r;t|\} t j j fdd|D  fdd|D  j	}t
||S t||S )Nc                       g | ]} j | qS rG   rj  rW   rf  rG   rH   r\   !  r{   z&PermuteView.create.<locals>.<listcomp>c                    rp  rG   )r   rW   rq  rG   rH   r\   "  r{   )_map_neg_dimsr   r`   r_   r   r9  r  r   r   rd  rE  rn  )r  r   ro  re  ri  rG   rq  rH   r    s   


zPermuteView.createc                    s    fdd D S )Nc                    s$   g | ]}|d kr
|nt  | qS r  )r_   )rX   dimro  rG   rH   r\   +  s   $ z-PermuteView._map_neg_dims.<locals>.<listcomp>rG   )r  ro  rG   rt  rH   rr  )  r   zPermuteView._map_neg_dimsc                    sD   t | | jt tt| jksJ | j   fdd| jD S )Nc                    r|   rG   rG   rW   rj  rG   rH   r\   0  r   z(PermuteView.get_size.<locals>.<listcomp>)r   rr  ro  r`   r_   r  r   r   rG   rj  rH   r   -  s   &
zPermuteView.get_sizec                    s^   dd t | jD   fddtt| jD  t ttt| jks'J  fdd}|S )Nc                 S   rv   rG   rG   )rX   rY   jrG   rG   rH   rz   3  r{   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                    r|   rG   rG   rW   invrG   rH   r\   4  r   z.PermuteView.make_reindexer.<locals>.<listcomp>c                    s    fddD S )Nc                    r|   rG   rG   rW   ra   rG   rH   r\   8  r   z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>rG   ra   rv  ra   rH   rc   7     z+PermuteView.make_reindexer.<locals>.reindex)r   ro  r`   r_   r   )r   rc   rG   rv  rH   rJ  2  s
   zPermuteView.make_reindexerN)r   r   r   r
   r   r   r%  r  rr  r   rJ  rG   rG   rG   rH   rn    s   
 

rn  c                   @   sB   e Zd ZeddddZedeejdf fddZ	d	d
 Z
dS )SqueezeViewNrs  c                   s<  t |rrt|\}}g }g } d ur(t tsJ dd kr& t|jk s(J tt|j|jD ]0\}\}}	 d u rJ|dkrI|	| |	|	 q1| krY|	| |	|	 q1|dksaJ dq1t
|j|j|||j}
t||
S  d u rt|dd | D S |   dksJ t| fddt| D S )Nzexpected integer dim argumentr   r   zexpected squeezed size to be 1c                 S      g | ]}|d kr|qS r   rG   r   rG   rG   rH   r\   _  r]   z&SqueezeView.create.<locals>.<listcomp>c                    s   g | ]
\}}| kr|qS rG   rG   rX   rY   r   rz  rG   rH   r\   b      )r   r9  r4   r  r_   r   r   rf   r   r  r  r   r   rd  rE  r  r  r   )r  r   rs  re  rf  r`  rh  rY   r   r   ri  rG   rz  rH   r  >  s:   



"zSqueezeView.creater   .c                    sX   dd | D }dd t | D t|  dttj dttjdf f fdd}||fS )	Nc                 S   r{  r|  rG   r   rG   rG   rH   r\   f  r]   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S   s   g | ]
\}}|d kr|qS r|  rG   r}  rG   rG   rH   r\   g  r~  rZ   r  .c                    sV   t | t ksJ |  d tdg  }t| D ]\}}|||< qt|S )N r   )r_   r=   r   rf   r6   )rZ   r  rx   r   lengthnot_onerG   rH   rc   j  s
   "
z%SqueezeView.squeezer.<locals>.reindex)r   r_   r
   r=   r   r   )r   r`  rc   rG   r  rH   squeezerd  s
   (zSqueezeView.squeezerc                 C      t d)Nzuse SqueezeView.create())AssertionError)r   r  rG   rG   rH   r   s  ry  zSqueezeView.__init__)r   r   r   r%  r  r   r   r=   r   r  r   rG   rG   rG   rH   ry  =  s    %ry  c                   @   sZ   e Zd ZU ee ed< edef ed< dd Zdd Z	dd	 Z
e
Zed
d Zdd ZdS )GenericViewr   .rc   c                 C   r   rK   )rc   r   rG   rG   rH   rJ  |  r   zGenericView.make_reindexerc                 C   sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S      g | ]	}t d | qS r$  r.   )rX   r  rG   rG   rH   r\         z+GenericView.reindex_str.<locals>.<listcomp>zlambda , z: )r`   r_   r   r5   rc   r   r   rS   )r   	index_old	index_newrG   rG   rH   reindex_str  s   zGenericView.reindex_strc                 C   s$   |  | jd| j d|   gS )Nsize=zreindex=)r   r  r   r  r   rG   rG   rH   r     s   zGenericView.__str__c                 C   s   | |t ||S rK   )r5   )r  r   r`  rc   rG   rG   rH   r       zGenericView.createc                 C   r   rK   rj  r   rG   rG   rH   r     r   zGenericView.get_sizeN)r   r   r   r
   r   r   r   r   rJ  r  r   r   r%  r  r   rG   rG   rG   rH   r  w  s   
 
r  c                   @   sH   e Zd Zedd Zedd Zedd Zedd Zed	d
 Z	dS )r  c                 C   s<   t | } t |}tjjjj}|t | dr| | } | S r   )r=   r  r1   r   r   	shape_envevaluate_exprLt)rx   r   r  rG   rG   rH   handle_negative_index  s   

zView.handle_negative_indexc                    s   t |ttfs	J | | |\ }tjj |r|S d|v r/ fdd}| |t||S t	|rRt |j
tsRt|\}}t|j|j|t||j}t||S |  |}| |t||S )Nr   c                    s   t dgt  S r   )r6   r_   ra   ra  rG   rH   fake_reindex  rx  z!View.create.<locals>.fake_reindex)r4   r6   r5   resolve_negative_sizer   r1   r   r   statically_known_list_equalsr=  r  ExternKernelAlloc as_contiguous_storage_and_layoutr  r   r   r  r  rd  rE  r  )r  r   r`  r  re  rf  ri  rc   rG   r  rH   r    s*   

zView.createc                 C   s   dd |D }dd | D } t |}tt|D ]}|| dkr4td||< tt| t|||<  nqtjj	
t| t| | |fS )Nc                 S   r  rG   r1   r   r   r  r  rG   rG   rH   r\     r  z.View.resolve_negative_size.<locals>.<listcomp>c                 S   r  rG   r  r  rG   rG   rH   r\     r  r_  r   )r5   r`   r_   r=   r   r   r,   r1   r   r   guard_equals)ra  r`  rY   rG   rG   rH   r    s   zView.resolve_negative_sizec              	   C   sX   z	|  ||}W |S  ttfy+   t|g}|  ||}|  ||}t||}Y |S w rK   )_dynamic_reshape_indexerr  
IndexErrorr,   rs   )r  ra  r`  rc   flatrp   rq   rG   rG   rH   r    s   
zView.dynamic_reshape_indexerc                    sX  t jjj}dd tt|D  tt |}t| }g |r|r| }| \}}|dkr?	t
d |	||f n|dkrI|	| n||||kr_	| t jj|| n||||k r||||k r| \}}	|| | }||	 }||||k so	| t jj|| nM||||krt
d}
|}	t||
| |
| }
||||kr| }	t||
| |
| }
|| }||||kst jj|| nt |r|s!|r| }t jj|d 	t
d |s|r| \}}t jj|d |stttt| ks#J  fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S   r  )viewr  rW   rG   rG   rH   r\     r  z1View._dynamic_reshape_indexer.<locals>.<listcomp>r   r   c                    sH   t | t ksJ t | t ftt|  t fddD S )Nc                 3   s    | ]}t | V  qd S rK   r-   r  replacementsrG   rH   r    r$  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)r_   re   rf   r6   ra   r2  	view_exprr  rH   rc     s   $z.View._dynamic_reshape_indexer.<locals>.reindex)r1   r   r   r   r`   r_   r5   rf   r  r  r=   r   r  r   r  reversed)ra  r`  r   	stack_new	stack_oldsize_oldvarsize_newvar2	size_new2divisormodulusrc   rG   r  rH   r    sf   



 zView._dynamic_reshape_indexerN)
r   r   r   r   r  r%  r  r  r  r  rG   rG   rG   rH   r    s    



r  c                       s   e Zd ZU dZded<  fddZdd ZeZdd	 Zd
d Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )rE  z*Pretend our storage has a different layoutLayoutr  c                    s*   t    t| jtr| j | _d S d S rK   )r  r   r4   r  rH  r^  r   r  rG   rH   r     s   
zReinterpretView.__post_init__c                 C   s   |  | j| jgS rK   )r   r  r  r   rG   rG   rH   r   $  s
   zReinterpretView.__str__c                 C   rR  rK   rS  r   rG   rG   rH   rT  .  r   zReinterpretView.get_namec                 C      | j jS rK   r  r   r   rG   rG   rH   r   1  ry  zReinterpretView.get_devicec                 C   r-  rK   rG   r   rG   rG   rH   r
  4  r  zReinterpretView.get_origin_nodec                 C   r  rK   )r  r   r   rG   rG   rH   r   7  ry  zReinterpretView.get_dtypec                 C      t | jjS rK   r5   r  r   r   rG   rG   rH   r   :  rr   zReinterpretView.get_sizec                 C   r  rK   r5   r  r   r   rG   rG   rH   
get_stride=  rr   zReinterpretView.get_stridec                    rj   )Nc                    s    j  }t  || S rK   )r  r  r0   loadrT  rZ   r1  r   rG   rH   r3  A  s   
z+ReinterpretView.make_loader.<locals>.loaderrG   rQ  rG   r   rH   r!  @  rm   zReinterpretView.make_loaderc                 C   rR  rK   r  r  r   rG   rG   rH   r  G  r   zReinterpretView.make_indexerc                 C   r   rK   r  r   rG   rG   rH   r   J  r   zReinterpretView.get_layoutc                 C   r-  rK   rG   r   rG   rG   rH   rC  M  r  zReinterpretView.freeze_layoutc              	   C   sX   t jj| jj}t jj| jj}t jj| jj}d| 	  d| d| d| d	S )Nzreinterpret_tensor(r  ))
r1   r   wrapper_codecodegen_shape_tupler  r   r   codegen_sizevarrd  rT  )r   r   r   rd  rG   rG   rH   codegen_referenceP  s   "z!ReinterpretView.codegen_reference)r   r   r   __doc__r   r   r   r   rT  r   r
  r   r   r  r!  r  r   rC  r  r&  rG   rG   r  rH   rE    s"   
 rE  c                   @   s   e Zd ZedddZdS )	SliceViewr   c                    sx  t dksJ zdkr|dkrdkr|W S W n	 ty%   Y nw tjj}t| |   | |  }|	|  }|	|	  |dkrq|
|   dkrqdkrq||   |S t| d   < t|rt|\}}t|j}	|	   |	 < t|j|j|	|j|j    }
t||
S  fdd}t||dS )Nr   l    r   c                    sD   t | t ksJ d|  d t| } |     |  < | S )Nzwrong ndim r  )r_   r5   ra   rs  r`  startsteprG   rH   rc     s   $z!SliceView.create.<locals>.reindex)r   rc   )r=   r  	TypeErrorr1   r   r   r5   r   r  evaluate_minr   r  r   r   r9  r   r  r   r   rd  rE  r  )r  r   rs  r  endr  r   re  rf  rh  ri  rc   rG   r  rH   r  [  sB   
&

zSliceView.createNr|  )r   r   r   r%  r  rG   rG   rG   rH   r  Z  s    r  c                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )BaseConstantc                 C   r  NrG   rG   r   rG   rG   rH   r     r  zBaseConstant.get_sizec                 C   r   rK   r  r   rG   rG   rH   r     r   zBaseConstant.get_dtypec                 C   r   rK   r  r   rG   rG   rH   r     r   zBaseConstant.get_devicec                 C   r-  rK   rG   r   rG   rG   rH   r
    r  zBaseConstant.get_origin_nodec                 C   r-  rK   rG   rW  rG   rG   rH   rV    r  zBaseConstant.mark_reusec                 C   r  r  rG   r   rG   rG   rH   rY    r  z#BaseConstant.has_exceeded_max_readsc                 C   r  r  rG   r   rG   rG   rH   r     r  zBaseConstant.get_readsc                 C   r  r  rG   r   rG   rG   rH   r    r  zBaseConstant.is_externN)r   r   r   r   r   r   r
  rV  rY  r   r  rG   rG   rG   rH   r    s    r  c                   @   sB   e Zd ZU eed< ejed< ejed< dd Zdd Z	dd	 Z
d
S )Constantr   r   r   c                    rj   )Nc                       t  j jS rK   )r0   r)  r   r   ra   r   rG   rH   r3    r  z$Constant.make_loader.<locals>.loaderrG   rQ  rG   r   rH   r!       zConstant.make_loaderc                 C   r-  rK   rG   r   rG   rG   rH   r     r  zConstant.realizec                 C      t | j| j|S rK   )r  r   r   r   r   rG   rG   rH   r:    r  zConstant.constant_to_deviceN)r   r   r   r   r   r7   r   r   r!  r   r:  rG   rG   rG   rH   r    s   
 

r  c                   @   s:   e Zd ZU eed< ejed< ejed< dd Zdd Z	dS )	IndexingConstantrZ   r   r   c                    rj   )Nc                    r  rK   )r0   r  rZ   r   ra   r   rG   rH   r3    r  z,IndexingConstant.make_loader.<locals>.loaderrG   rQ  rG   r   rH   r!    r  zIndexingConstant.make_loaderc                 C   r  rK   )r  rZ   r   r  rG   rG   rH   r:    r  z#IndexingConstant.constant_to_deviceN)
r   r   r   r   r   r7   r   r   r!  r:  rG   rG   rG   rH   r    s   
 

r  c                   @   s   e Zd Zedfdejdejdee dee def
ddZ	e
d	d
 Zdd ZeZdd Zdd Zdd Zdd Zdd Zdd Zdd ZdefddZdejfddZd S )!r  r   r   r   r   r   rd  c                 C   sd   |d u st |t |ksJ d| d| || _|| _tdd |D s'J || _|| _|| _d S )Nr  	, stride=c                 s   s    | ]
}t |ttfV  qd S rK   )r4   r   r  r   rG   rG   rH   r    s    z"Layout.__init__.<locals>.<genexpr>)r_   r   r   r  r   _striderd  r   r   r   r   r   rd  rG   rG   rH   r     s   
zLayout.__init__c                 C   r   rK   )r  r   rG   rG   rH   r     s   zLayout.stridec                 C   sP   d}| j dkrd| j  }t| j d| jj d| j d| j d| j | dS )	Nr   r   z	, offset=z('', z, size=r  r  )rd  rB   r   r   r   r   r   )r   rd  rG   rG   rH   r     s   
zLayout.__str__c                 C   s>   t | jt| j| jD ]\}}}|dkr||kr dS qdS Nr   FT)rf   r   r  r  r   r   leftrightr   rG   rG   rH   r;    s   zLayout.is_contiguousc                 C   sR   t | j}|dvrdS t| jt| j| jD ]\}}}|dkr&||kr& dS qdS )N)r     Fr   T)r_   r   rf   r   r   )r   ndimr  r  r   rG   rG   rH   is_channels_last_contiguous  s   
z"Layout.is_channels_last_contiguousc                 C   sB   t | jtt| j| jD ]\}}}|dkr||kr dS qdS r  )rf   r   r  r  r  r   r  rG   rG   rH   is_transposed  s   zLayout.is_transposedc                 C   s   t | jt |ksJ dgt | }tt |D ]}tjj| j| ||| < qtt |d D ]}|| ||d  kr@ dS q1dS )Nr_  r   FT)r_   r   r`   r1   r   r   r   )r   rh   stride_orderedrY   rG   rG   rH   rF    s   zLayout.is_stride_orderedc                 C   s:   dgt ttdt| jd  }t|g| }| |S rc  )r5   r  r`   r_   r   rF  r   rh   rG   rG   rH   is_channels_last_stride_ordered  s   "
z&Layout.is_channels_last_stride_orderedc                 C   s   t | j| j| j| j| jS rK   )r  r   r   r   r   rd  r   rG   rG   rH   as_fixed  s   zLayout.as_fixedc                 C   s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r  r  rB   r   r  r  r   rG   rG   rH   r  #  s
   zLayout.make_indexerr  c                 C   s<   | j |j ko| j|jko| j|jko| j|jko| j|jkS rK   r   r   r   r   rd  )r   otherrG   rG   rH   __eq__)  s   



zLayout.__eq__c                 C   s   t | j| j| jS rK   )r   r   r   rd  r   rG   rG   rH   storage_size2  rx  zLayout.storage_sizeN)r   r   r   r   r7   r   r   r
   r   r   propertyr   r   r   r;  r  r  rF  r  r  r  r  r  r=   r  rG   rG   rG   rH   r    s4    

	
		r  c                       sx   e Zd ZdZdedfdejdejdee	e
 e	e f deee	e
 e	e f  dee
ef f
 fd	d
Zdd Z  ZS )r  z A Tensor layout we cannot changeNr   r   r   r   r   rd  c                    s*   |d u r	t |}t ||||| d S rK   )r  r  r  r   r  r  rG   rH   r   9  s   
zFixedLayout.__init__c                    rj   )z1A closure containing math to read a given elementc                    sf   t | t  j  krt  jksJ  J  j}t|  j jD ]\}}}|dkr0|||  }q!|S r  )r_   r   r   rd  rf   )rZ   resultrx   r   szr   rG   rH   r1  N  s   ,z)FixedLayout.make_indexer.<locals>.indexerrG   rO  rG   r   rH   r  K  s   zFixedLayout.make_indexer)r   r   r   r  r   r7   r   r   r   r
   r   r  r   r   r  r&  rG   rG   r  rH   r  6  s     
r  c                       sn   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	dd Z
dd Zdd Zd fdd	Z  ZS )r  z(A Tensor layout we are allowed to changeFc                 C   sP   t | dkrg S tdg}t| dd  D ]}|||d   qtt|S )Nr   r   r_  )r_   r=   r   r  r  r5   )sizesreversed_stridesr   rG   rG   rH   r  ^  s   z!FlexibleLayout.contiguous_stridesc                 C   sV   t tt| t |ksJ td}dgt| }|D ]}|||< || |  }q|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        r   N)r   r`   r_   r=   r   )r  rh   next_strider  rY   rG   rG   rH   fill_orderedg  s   
zFlexibleLayout.fill_orderedc                 C   s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r   r`   r_   r   r  r  )r  rh   r   rG   rG   rH   r  x  s   zFlexibleLayout.stride_orderedc                 C   sD   t | t |ks
J dd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S   r  rG   )r1   r   r   r   r  rG   rG   rH   r\     r  z/FlexibleLayout.same_ordered.<locals>.<listcomp>r  )r_   r  r`   __getitem__r  r  )r  r   r   rG   rG   rH   same_ordered  s   zFlexibleLayout.same_orderedc                 C   "   t | j| j| j| | j|| jS rK   )r  r   r   r   r  rd  r  rG   rG   rH   as_stride_order     zFlexibleLayout.as_stride_orderc                 C   r  rK   )r  r   r   r   r  rd  r  rG   rG   rH   as_fill_order  r  zFlexibleLayout.as_fill_orderc                 C   r  rK   )r  r   r   r   r  rd  r   r   rG   rG   rH   as_same_order  r  zFlexibleLayout.as_same_orderNc                    s2   |r	t ||}nt |}t |||| d S rK   )r  r  r  r  r   )r   r   r   r   r@  r  r  rG   rH   r     s   
zFlexibleLayout.__init__rK   )r   r   r   r  r  r   r  r  r  r  r  r  r  r   r&  rG   rG   r  rH   r  Y  s    



			r  c                       s2   e Zd ZdZd
 fddZdd Zdd	 Z  ZS )AliasedLayoutz)Shares the same storage as another tensorr  rE  c                    s,   |  }t |j|j|j|j || _d S rK   )r   r  r   r   r   r   r   r  )r   r  r  r  rG   rH   r     s   
zAliasedLayout.__init__c                 C      |    S rK   )r  r  r   rG   rG   rH   r    rr   zAliasedLayout.make_indexerc                 C   s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr   )	ALIGNMENT)	r  r   rd  
compile_fxr  r1   r   r   statically_known_multiple_of)r   rd  r  rG   rG   rH   maybe_guard_aligned  s
   z!AliasedLayout.maybe_guard_aligned)r  rE  )r   r   r   r  r   r  r  r&  rG   rG   r  rH   r    s
    
r  c                       sp   e Zd Zdef fddZejjdd Zdej	fddZ
dd
dZdd Zedd Zdd Zdd Z  ZS )MutationLayoutrm  c                    s@   t  | | | d  || _|   }tj	
| d S rK   )r  r   r   r   r   rm  
get_bufferrT  r1   r   mark_buffer_mutated)r   rm  rP   r  rG   rH   r     s   zMutationLayout.__init__c                 C   
   |   jS rK   )real_layoutr   r   rG   rG   rH   r     s   
zMutationLayout.strider  c                 C   r  rK   )r  r  r   rG   rG   rH   r    rr   zMutationLayout.storage_sizerB  c                    s,    fdd  | j }t|tsJ d|S )Nc                    sB   t | tr
 | jS t | tr |  S t | tr | jS | S rK   )r4   r  rm  rH  r^  
MutableBoxr  )rm  unwrap_viewsrG   rH   r    s   




z/MutationLayout.get_buffer.<locals>.unwrap_viewsz%MutationLayout must refer to a buffer)rm  r4   rB  )r   r  rG   r  rH   r    s   
	zMutationLayout.get_bufferc                 C   r  rK   )r  r  r   rG   rG   rH   r    r   zMutationLayout.real_layoutc              	   C   s   |   t|tr|j}t|tr|| rd}n|   t|jjt }|rIt	j
| | | dd t| | D dj}|   t|jjtsRJ t||j_|jS )NTc                 S       g | ]\}}t jj||qS rG   r1   r   r   r  rX   r[  r\  rG   rG   rH   r\   	      z/MutationLayout.realize_into.<locals>.<listcomp>r  )r   r4   r<   r  rA  r   rT  r  r  r,  r  r   r   r!  rf   r   r  )r  srcdst	need_copyrG   rG   rH   realize_into  s,   
	zMutationLayout.realize_intoc                 C   s   | S rK   rG   r   rG   rG   rH   r  	  r  zMutationLayout.as_fixedc                 C   rR  rK   )rm  r  r   rG   rG   rH   r  	  r   zMutationLayout.make_indexer)r  rB  )r   r   r   r   r   r  r   getterr=   r   r  r  r  r%  r  r  r  r&  rG   rG   r  rH   r    s    


r  c                       s   e Zd ZU ee ed< eed<  fddZdd Zdd Z	d	d
 Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Z  Z S )5rB  rP   r  c                    r  rK   r  r   r  rG   rH   r   	  r  zBuffer.__post_init__c                 C   rR  rK   r  r   rG   rG   rH   r   	  r   zBuffer.make_indexerc                 C   s   | j sJ | j S rK   rO   r   rG   rG   rH   rT  #	  s   
zBuffer.get_namec                 C   r  rK   r  r   rG   rG   rH   r   '	  ry  zBuffer.get_devicec                 C   r   rK   r	  r   rG   rG   rH   r
  *	  r   zBuffer.get_origin_nodec                 C   s   t | jdd S )Nr   )rL   r  r   rG   rG   rH   r   -	  rK  zBuffer.get_dtypec                 C   r  rK   r  r   rG   rG   rH   r   0	  rr   zBuffer.get_sizec                 C   r  rK   r  r   rG   rG   rH   r  3	  rr   zBuffer.get_stridec                 C   r   rK   r  r   rG   rG   rH   r   6	  r   zBuffer.get_layoutc                 C   rx  rK   )r   r   rG   rG   rH   r\  9	  ry  zBuffer.get_storage_numelc                 C   r  r  rG   r   rG   rG   rH   r  <	  r  zBuffer.is_externc                 C   s$   t | jttfs| j | _d S d S rK   )r4   r  MultiOutputLayoutr  r  r   rG   rG   rH   rC  ?	  s   zBuffer.freeze_layoutc                 C   "   t | jtsJ | j|| _d S rK   )r4   r  r  r  r  rG   rG   rH   rD  C	     z&Buffer.freeze_layout_with_stride_orderc                 C   r
  rK   )r4   r  r  r  r  rG   rG   rH   freeze_layout_with_fill_orderG	  r  z$Buffer.freeze_layout_with_fill_orderc                 C   r
  rK   )r4   r  r  r  r  rG   rG   rH   freeze_layout_with_same_orderK	  r  z$Buffer.freeze_layout_with_same_orderc                 C   r   r   r   r   rG   rG   rH   r   O	  r   zBuffer.is_zero_elementsc                    s(      rtt  dS  fdd}|S )Nr  c                    s    j  }t j|| S rK   )r  r  r0   r  rP   r  r   rG   rH   r3  W	  s   
z"Buffer.make_loader.<locals>.loader)r   r   r+  r   rQ  rG   r   rH   r!  R	  s   zBuffer.make_loaderc                 C   r  r  rG   r   rG   rG   rH   is_no_op]	  r  zBuffer.is_no_opc                 C   rx  rK   )rT  r   rG   rG   rH   r  `	  ry  zBuffer.codegen_referencec                 C   r-  rK   rG   r   rG   rG   rH   r  c	  r  zBuffer.decide_layoutc                 C      t | jtr| jj gS dS r  )r4   r  r  r  rT  r   rG   rG   rH   get_alias_namesf	     zBuffer.get_alias_namesc                 C   r  r  r4   r  r  rm  rT  r   rG   rG   rH   get_mutation_namesk	  r  zBuffer.get_mutation_namesc                 C   sD   t tdd t|  |  W  d    S 1 sw   Y  d S r  )r   r  r  r"   r!  r   r   rG   rG   rH   r  p	  s   $zBuffer.get_read_writesc                 C   r  rK   )r  r#  r   rG   rG   rH   r   w	  r   zBuffer.get_readsc                 C   r-  rK   rG   r   rG   rG   rH   r   z	  r  zBuffer.realize)!r   r   r   r   rS   r   r  r   r  rT  r   r
  r   r   r  r   r\  r  rC  rD  r  r  r   r!  r  r  r  r  r  r  r   r   r&  rG   rG   r  rH   rB  	  s8   
 rB  c                   @   s   e Zd ZdS )InputBufferN)r   r   r   rG   rG   rG   rH   r  ~	  s    r  c                   @   s    e Zd ZdZdd Zdd ZdS )r8  Nc                    rj   )Nc                    s(    j  }ttj j j|| S rK   )	r  r  r0   r  r1   r   constant_namerP   r7  r  r   rG   rH   r3  	  s   
z*ConstantBuffer.make_loader.<locals>.loaderrG   rQ  rG   r   rH   r!  	  s   zConstantBuffer.make_loaderc                 C   s   t tj| j|| jS rK   )r8  r1   r   r  rP   r  r  rG   rG   rH   r:  	  r'  z!ConstantBuffer.constant_to_device)r   r   r   r7  r!  r:  rG   rG   rG   rH   r8  	  s    	r8  c                   @   r   )NoneAsConstantBufferc                 C   s
   t jjjS rK   )r1   r   r  none_strr   rG   rG   rH   r  	  r   z&NoneAsConstantBuffer.codegen_referenceN)r   r   r   r  rG   rG   rG   rH   r  	  r   r  c                       s$   e Zd Z fddZdd Z  ZS )ShapeAsConstantBufferc                    s   t    || _d S rK   )r  r   shape)r   r  r  rG   rH   r   	  r  zShapeAsConstantBuffer.__init__c                 C   s2   t jjt jj| j}t jjrd| dS |S )Nztorch::tensor(r  )r1   r   r  expr_printerr   r  r  r   )r   exprrG   rG   rH   r  	  s   z'ShapeAsConstantBuffer.codegen_reference)r   r   r   r   r  r&  rG   rG   r  rH   r  	  s    r  c                       s   e Zd ZU eed< edd Zdd Z fddZdd	 Z	d
d Z
dd Zdd Ze		dddZdd Zdd Zdd Zdd Zdd Z  ZS )r  r  c                 C   s   t |  jS rK   )r_   r  r#  r   rG   rG   rH   	num_reads	  s   zComputedBuffer.num_readsc                 C   sz   t tdd, | j r"t|  | j | j W  d    S t|  | j W  d    S 1 s6w   Y  d S r  )	r   r  r  r  r   r"   get_store_functionr   r"  r   rG   rG   rH   r  	  s   
$zComputedBuffer.get_read_writesc                    s>   t | jdo| jtjjvo|  dk}|r| j S t  S )Nr!  r   )	hasattrr  rP   r1   r   mutated_buffersr  r!  r  )r   
can_inliner  rG   rH   r!  	  s   


zComputedBuffer.make_loaderc                 C   s<   | j   }| j rt| jj| j|S t| jj| j|S rK   )	r  r  r  r  r   r   r|  rP   r4  rO  rG   rG   rH   r  	  s   
z!ComputedBuffer.get_store_functionc                    s   t | jtrNt| j | j \\ }|  j	}dd |D }t
dd |D s-J fdd|D }|rN fdd|D }ddlm} |||  S d	S )
al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 S   s0   g | ]}|j tjj v rtjj|j  nd qS rK   )rP   r1   r   r  r  r  rG   rG   rH   r\   	  s    z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                 s   s"    | ]}t |tjtjfV  qd S rK   )r4   r   StarDep	MemoryDepr  rG   rG   rH   r  	  s
    
z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                    s.   g | ]}t |tjrt|jd d  D qS )c                 S   s    i | ]}|d kr|t d qS r  r  rX   vrG   rG   rH   rz   	  r   z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)r4   r   r"  r-   rZ   r  )r}  rG   rH   r\   	  s    
c                    s   g | ]
}t jj| qS rG   r1   r   r   r  rX   r  
index_varsrG   rH   r\   	  s    r   pick_loop_orderN)r4   r  r  r   r  r  r   r"  r  r#  r  	schedulerr*  )r   r   r#  
reads_bufsstride_lengthsr*  rG   )r(  r}  rH   get_fill_order	  s*   


zComputedBuffer.get_fill_orderc                 C   s6   t | jtr|  }|r| | d S |   d S d S rK   )r4   r  r  r.  r  rC  r  rG   rG   rH   r  	  s   zComputedBuffer.decide_layoutc                    s  t jj j dd\}}ttd  t	
  r$|n|dd |}W d   n1 s6w   Y  g |j  dd |j D }g |j |j g }g }g }g }| D ]+\}	}
|	|d v r}|rrJ ||	 ||
 qd|	|d v sJ ||	 ||
 qdttt|gt }t|D ]\}}t|trt|d	r|j||< qd fd
d	}|| }|||||\}}}||||\}}}t|t|kr|_t j||dd\\}}}t	|||||g|}||f|fS )a  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders
        qr2   r7  Nr   c                 S   s,   g | ]}|t jj v rt jj| nd qS rK   )r1   r   r  r  )rX   
reads_namerG   rG   rH   r\   
  s    z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>r   iter_reordering_reindexc           	         s\    | |||\}}}|| } tjj| |t | |\}}}|| } t||}|||fS rK   )_apply_loop_reorderingr1   r   r   _simplify_loopsr    rs   )	x_varssupport_varsr  reordering_reindexreindex0rp   rq   prunerc   index_formulasmemory_addrsr   rG   rH   simplify_and_reorder4
  s   





zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderzrK   )r   r  r  r   r"  r   r  r8  r   LoopBodyr  r   indexing_exprsr(  reads_name2exprr  writes_name2expritemsr  rl   r`   r_   r   r4   r  r  r1  index_vars_no_squeeze)r   rM   
var_rangesr	  r,  r(  reduce_vars
index_sizereduce_sizer$  r   r6  rY   	reads_bufr<  r5  iter_rangesiter_reindexr1  reduce_rangesreduce_reindexr   	iter_varsrG   r9  rH   r<   
  sn   






z#ComputedBuffer.simplify_and_reorderNc           
   
      s  ddl m} |du rg }zL fdd|D }t|t|kr)t|d t ks+J |durMtt|D ]}z|| || ||< W q5 tyL   Y q5w tt|||}	W n  tyx   tj	rnt
dtt | ttt}	Y nw fdd|	D t|	t|	fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r   r)  Nc                    s   g | ]}t jj| qS rG   r%  r&  )r(  r5  rG   rH   r\   l
  s    z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                    r|   rG   rG   rW   )r  rG   rH   r\   
  r   )r+  r*  r_   r`   r  r5   r  	Exceptionr   debuglogwarningre   rf   rl   ri   )
r(  r5  r  r;  r6  priority_idxr*  r  rY   rh   rG   )r(  r  r5  rH   r2  Z
  s:   
z%ComputedBuffer._apply_loop_reorderingc                 C   rR  rK   )r  r"  r   rG   rG   rH   r"  
  r   z!ComputedBuffer.get_reduction_sizec                 C   rR  rK   )r  r   r   rG   rG   rH   r   
  r   z!ComputedBuffer.get_reduction_typec                 C   rR  rK   )r  r   r   rG   rG   rH   r  
  r   zComputedBuffer.is_no_opc                 C   r  NTrG   r   rG   rG   rH   should_allocate
  r  zComputedBuffer.should_allocatec                 C   rU  )r6  )r  r:  r  rG   rG   rH   r:  
  r  z!ComputedBuffer.constant_to_deviceNN)r   r   r   r   r   r%   r  r  r!  r  r.  r  r<  r   r2  r"  r   r  rT  r:  r&  rG   rG   r  rH   r  	  s&   
 
+Z,r  c                       sX   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Z  ZS )TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    c                    s4   t  jd |d t|| _|| _tj| | _	d S )N)rP   r  )
r  r   InputsKernelunwrap_storageinputsmake_kernel_renderr1   r   register_bufferrP   )r   r  rY  rZ  r  rG   rH   r   
  s   zTemplateBuffer.__init__c                 C   rx  rK   )normalized_read_writesr   rG   rG   rH   r  
  ry  zTemplateBuffer.get_read_writesc                    sL   |   | j   fdd}tj||  ddd}dd | jD |_|S )Nc                    s"   t |dksJ t | dS )Nr   fake)r_   r0   r.  r  r1  rP   rG   rH   dummy
  r  z4TemplateBuffer.normalized_read_writes.<locals>.dummyrG   T)	normalizec                 S   s   h | ]	}t | qS rG   r   r!  rT  r  rG   rG   rH   r   
  r  z8TemplateBuffer.normalized_read_writes.<locals>.<setcomp>)rT  r  r  r   r"   r   rY  r#  )r   r_  depsrG   r^  rH   r\  
  s   
z%TemplateBuffer.normalized_read_writesc                 C   r  r  rG   r   rG   rG   rH   r"  
  r  z!TemplateBuffer.get_reduction_sizec                 C   r-  rK   rG   r   rG   rG   rH   r   
  r  z!TemplateBuffer.get_reduction_typec                 C   r  r  rG   r   rG   rG   rH   r  
  r  zTemplateBuffer.is_no_opc                 C   r  rS  rG   r   rG   rG   rH   rT  
  r  zTemplateBuffer.should_allocatec                 C   s   |   dfd fS r  )r   r   rG   rG   rH   r<  
  s
   z#TemplateBuffer.simplify_and_reorder)r   r   r   r  r   r  r\  r"  r   r  rT  r<  r&  rG   rG   r  rH   rV  
  s    rV  c                   @   sJ   e Zd ZU ee ed< dd Zdd Zedd Z	edd	 Z
d
d ZdS )rW  rY  c                 C   s   t | S rK   ra  r]  rG   rG   rH   get_read_writes_input
  rK  z"InputsKernel.get_read_writes_inputc                    sp   g } j D ]}t|tr| fdd|D  q| | qtjt|t	 
 ht g d t dS )Nc                       g | ]}  |qS rG   )rc  r  r   rG   rH   r\   
  r{   z0InputsKernel.get_read_writes.<locals>.<listcomp>)	op_counts)rY  r4   r5   extendr  rc  r   
ReadWritesr   r!  rT  collectionsCounter)r   star_depinputrG   r   rH   r  
  s   

zInputsKernel.get_read_writesc                 C   sX   t | tr| j} t | tr| j} t | trt | tst| } t | ttfs*J | | S rK   )	r4   r<   r  rA  rH  rE  ExternKernelrealize_inputrB  r   rG   rG   rH   unwrap_storage_for_input
  s   


z%InputsKernel.unwrap_storage_for_inputc                 C   s@   g }| D ]}t |trdd |D }nt|}|| q|S )Nc                 S   r  rG   )rW  rn  rW   rG   rG   rH   r\   
  r{   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)r4   r5   rW  rn  r  )rY  
inputs_newr   rG   rG   rH   rX  
  s   

zInputsKernel.unwrap_storagec                 C   r  rS  rG   r   rG   rG   rH   r  
  r  zInputsKernel.is_externN)r   r   r   r
   rB  r   rc  r  r   rn  rX  r  rG   rG   rG   rH   rW  
  s   
 



rW  c                   @   r   )	NopKernelc                 C   r  rS  rG   r   rG   rG   rH   r  
  r  zNopKernel.is_no_opN)r   r   r   r  rG   rG   rG   rH   rp  
  r   rp  c                   @   s0   e Zd ZdZedd Zedd Zdd ZdS )	ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C   s  |d   }|d  }t|d  }dg}|| g}d|  kr)t|k s,J  J tdt|D ]Z}||  }	|||  t|	t|ksLJ ||  |ksVJ ||   |ks`J tt|D ]}
|
|krw||
 |	|
  ||
< qftjj	
||
 |	|
 ||
< qf|||  q3t|}tt|D ]}|| }t|r| }t|tr| rt|} nqtd t||||dg d}t|}tt|D ]}|jj| || t|||| ||  qtj|j|j_| |jj|j_|S )Nr   r   )r   r   r   r   rP   r  rY  )r   r   r5   r   r_   r`   r  r1   r   r   r  r  r  r   r   r4   r  r  r   rq  rA  r  rY  r  r  r  r[  rP   rX  )r  rY  rs  r   r   r`  offsets_startoffsets_endrY   
input_sizeru  output_strider   r  concat_kernelkernelrG   rG   rH   r    sh   
 


zConcatKernel.createc              	   C   s   t |tst|rt|\}}t||}t |tsJ |t |tr)| |j|S t |trH|  t |jj	t
rHt |jtsHt||j_	|jS tj| | | dd t| | D d}| ||S )Nc                 S   r   rG   r  r  rG   rG   rH   r\   ^  r  z-ConcatKernel.realize_into.<locals>.<listcomp>r  )r4   rE  r   r9  r<   r  r  rA  r   r  r  r  r  r,  r  r   r   r!  rf   r   )r  r  r  re  r  pwrG   rG   rH   r  D  s.   



	zConcatKernel.realize_intoc                 C   r  rS  rG   r   rG   rG   rH   rT  e  r  zConcatKernel.should_allocateN)r   r   r   r  r%  r  r  rT  rG   rG   rG   rH   rq    s    
;
 rq  c                   @   s  e Zd ZU dZeedf ed< eje	dZ
eeef ed< dZee ed< dd	 Zd
d Zdd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zdd Zd d! Zd"d# Zd$d% Z d&d' Z!d(d) Z"d*d+ Z#d,d- Z$d.d/ Z%e%Z&dS )0rl  rG   .constant_args)default_factoryrN   Noutput_viewc                 C   s$   t | jtr|   |   d S d S rK   )r4   r  r  apply_constraintrC  r   rG   rG   rH   r  o  s   zExternKernel.decide_layoutc                 C   s$   t | |\}}|r|| d S d S rK   )r)   	writeline)r   wrapper
origin_strdetailed_origin_strrG   rG   rH   codegen_commentt  s   zExternKernel.codegen_commentc                 C   s   t  rK   rI  r   r  rG   rG   rH   codegeny  r   zExternKernel.codegenc                 C   s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r   r   r   r   )	r,  r  r   r   r!  r   r
  r   r   )r   ry  rG   rG   rH   
copy_input|  s   zExternKernel.copy_inputc                    sr  t |j|i |j}t|dd\}}d }tjjr(|r(t|tj	j
r(t|||}t|\} g g }	g }
|D ])}t|t d rK|	| q7t|tjr[tjjjj|d d}|
| q7 fdd}fdd|	D }	|	D ]}t|rt|dd	 qsg }|	D ]}| tjjv r|tjj|   q|t|dd
 q|||
\}}||i |}||	|
||fS )NT)return_schemasr_  )r  c                    sd   g }t | }t |}D ]}|r|t| q|t| qt| }|dg |di fS )NrM   rN   )iterr  nextpytreetree_unflattenget)new_tensor_argsnew_non_tensor_argsr  
it_tensorsit_non_tensors	is_tensorr  )	args_specis_arg_tensorrG   rH   unflatten_args  s   z3ExternKernel.process_kernel.<locals>.unflatten_argsc                    rd  rG   rm  r  r  rG   rH   r\     r{   z/ExternKernel.process_kernel.<locals>.<listcomp>r7  r   )r   bind	argumentsr   r1   r   r   r4   r7   _opsOpOverloadPacketr/   r  tree_flattenr  r   r=   r   r   r  create_symintnoder   r9  rT  	constantsr   )r  rx  rM   rN   binded_argsr   schemasschema	args_flattensor_argsnon_tensor_argsargr  r   example_argsnew_args
new_kwargsexample_outputrG   )r  r  r  rH   process_kernel  sF   zExternKernel.process_kernelc           	   	   C   s   t |tsJ t |tr|S |   tj| dd\}}|d }| |}t	j
j||}t	j
j||}t	j
j||}t||| }||krWtd||| t t|jt| | | ||ddS )z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        r  r2   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr  r  r  )r4   rH  rE  r^  rC  r   r  r   r  r1   r   r   r  stride_vars
offset_varr+   rP  rO  r   r  r  r   r   )	r  r   
index_argsrD  r  rZ   r  rd  expectedrG   rG   rH   convert_to_reinterpret_view  s>   

z(ExternKernel.convert_to_reinterpret_viewc                 C   s   |d u rt  S t|tjtjjjtfrt|S t|t	r-t
jtj|j| | dS t|tr4|S t|tr?| |jS t|trF|S t|trm|  t| rmt| jtsmz| |W S  tyl   Y nw t|trx|  |S | |S )N)r   r   ) r  r4   r=   r   r?   r@   rA   r  r  r  r1   r   add_tensor_constantr7   tensorr   r   r   r8  r<   rm  r  rE  rH  r   r   r^  r  r  r   rA  r  r  r   rG   rG   rH   rm    s8   







zExternKernel.realize_inputc                 C   sD   t |rt| dkr|S | D ]
}|dkr|  S q| |S rc  )r   r_   r  r  )r  r   r   rG   rG   rH   require_stride1  s   
zExternKernel.require_stride1c                 C   s`  |  dkr|S t|rUt| trt|dd|d |S t| tr-| |r-|S t| trUt| 	 trAt
dt| 	 trU| 	 |rU|S t|trc| |rc|S t|trt|jtrt|jtst| rt| jtsz| |j|_| ||W S  ty   Y nw | |}t|dd|d t||sJ |S )Nr   TFr>  z<the MutationLayout's real layout shouldn't be FlexibleLayout)r   r   r4   r   r  r9  r  rF  r  r  r  r  r<   r  rH  rE  r^  r  r  require_stride_orderr   r  rG  )r  r   rh   rG   rG   rH   r  $  s^   



z!ExternKernel.require_stride_orderc                 C   s   |  |tS rK   )r  NHWC_STRIDE_ORDERr  rG   rG   rH   require_channels_lastS  r  z"ExternKernel.require_channels_lastc              	   C   s    |  |tttt| S rK   )r  r5   r  r`   r_   r   r  rG   rG   rH   require_contiguousW  s    zExternKernel.require_contiguousc                 C   r-  rK   rG   r   rG   rG   rH   r}  [  r  zExternKernel.apply_constraintc                 C   s   t tjjj| jS rK   )r   r1   r   r  val_to_arg_strrz  r   rG   rG   rH   codegen_const_args^  rx  zExternKernel.codegen_const_argsc                 C   sf   g }| j D ]$}t|tr"dd |D }dd| d}|| q||  q||   |S )Nc                 S   r3  rG   r  rW   rG   rG   rH   r\   e  r   z-ExternKernel.codegen_args.<locals>.<listcomp>[r  ])rY  r4   r5   r   r  r  rf  r  )r   rM   r   r   r  rG   rG   rH   codegen_argsa  s   

zExternKernel.codegen_argsc                 C   s   || j v r| j |S t| drC|| jv rC| j|d}|d u rA| j|d}t|tv s:J dt| tt|  S |S td| d)Nkwargs_default_valuer   rB   $unsupported default_value arg_type: zarg z6 not found in self.kwargs or self.kwargs_default_value)rN   r  r  r  rS   default_value_mapr  )r   arg_namer  arg_typerG   rG   rH   get_kwargs_valuem  s    


zExternKernel.get_kwargs_valuec                 C   sf   g }| j r1tjjr'| jsJ d| jD ]}| |}|tjj| q|S dd | j 	 D }|S )Nz0ordered_kwargs_for_cpp_kernel has to be providedc                 S   s(   g | ]\}}| d t jj| qS r   r1   r   r  r  rX   kr$  rG   rG   rH   r\     s    z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)
rN   r1   r   r   ordered_kwargs_for_cpp_kernelr  r  r  r  rB  )r   rN   r  r$  rG   rG   rH   codegen_kwargs  s   

zExternKernel.codegen_kwargsc              	   C   s`   t jr,tjjs.tjj|  }tjj|  }|	d| 
  d| d| d d S d S d S )Nzassert_size_stride(r  r  )r   size_assertsr1   r   r   r  r  r   r  r~  rT  )r   r  r   r   rG   rG   rH   codegen_size_asserts  s   z!ExternKernel.codegen_size_assertsc                 C   s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r  )r   _sizer  rG   rG   rH   get_group_stride  s   zExternKernel.get_group_stridec                    s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                    rd  rG   )r   r  )r   rG   rH   r\     r{   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S   r  )dr  rW   rG   rG   rH   r\     r  T)r  reversec                 S   rv   rG   rG   rw   rG   rG   rH   rz     r{   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                    r|   rG   rG   rW   r}   rG   rH   r\     r   c                    r|   rG   rG   rW   r'  rG   rH   r\     r   cc                    r   rG   rG   r  )add_varrG   rH   r\     r   )r1   r   r   r   r  r`   r_   r  r  r   r  r3  r#   re   rf   r-   r=   r  r6   )r   r  r  index_orderrh   r1  rZ   	new_sizesrc   r8  r   replacementrG   )r  r(  r~   r   rH   canonicalize  s$   
 zExternKernel.canonicalizec                    sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nrx  zkernel=c                    s$   g | ]}|j  d t |j  qS r   )rP   rL   )rX   fieldr   rG   rH   r\     s    z(ExternKernel.__str__.<locals>.<listcomp>r   )rL   dataclassesfieldsr  r   r   )r   kernel_namer   rG   r   rH   r     s   
zExternKernel.__str__)'r   r   r   rz  r   r   r   r  r  re   rN   r	   rS   r|  r   rE  r  r  r  r   r  r%  r  r  rm  r  r  r  r  r}  r  r  r  r  r  r  r  r   r   rG   rG   rG   rH   rl  i  sB   
 

D
,

	
.

	rl  c                       sL   e Zd ZU dZee ed< dd Z						d
 fdd	Zdd	 Z	  Z
S )ExternKernelOutNr|  c                 C   s:   |  | g |  |  }|| j|  || j d S rK   )r  r  r  generate_extern_kernel_outr|  r  rx  r   r  rM   rG   rG   rH   r    s   
zExternKernelOut.codegenrG   c	           	         sN   t  d || |||pi  || _tj| | _tjjr|n|| _	|| _
d S rK   )r  r   rX  r|  r1   r   r[  rP   r   rx  r  )	r   r  rY  rz  rN   r|  rx  
cpp_kernelr  r  rG   rH   r     s   
zExternKernelOut.__init__c                 C   r  rS  rG   r   rG   rG   rH   rT    r  zExternKernelOut.should_allocate)rG   NNNNrG   )r   r   r   r|  r   rE  r   r  r   rT  r&  rG   rG   r  rH   r    s   
 r  c                       s(   e Zd Zdedejf fddZ  ZS )RandomSeedscountr   c                    s@   t t j}t jt|t j|gdg |j|j|ggddd d S )Nr  zaten.randint.low_outzat::randint_out)r  rY  rz  rx  r  )r7   r  r  r  r   r  rN  rM  )r   r  r   limitsr  rG   rH   r     s   
zRandomSeeds.__init__)r   r   r   r  r7   r   r   r&  rG   rG   r  rH   r    s     r  c                       s@   e Zd Zdd Z					d fdd	Zdd Zd	d
 Z  ZS )r  c                 C   sZ   |  | g |  |  }tjj|  | j|| 	  t
| jtr+| | d S d S rK   )r  r  r  r1   r   r  generate_extern_kernel_allocrT  rx  r
  r4   r  r  r  r  rG   rG   rH   r    s   
zExternKernelAlloc.codegenrG   Nc                    sH   t  d || |||pi  tj| | _tjjr|n|| _|| _	d S rK   )
r  r   rX  r1   r   r[  rP   r   rx  r  )r   r  rY  rz  rN   rx  r  r  r  rG   rH   r     s   

zExternKernelAlloc.__init__c                 C   r  r  rG   r   rG   rG   rH   rT  "  r  z!ExternKernelAlloc.should_allocatec                 C   s   t rK   rI  r   rG   rG   rH   r}  %  r  z"ExternKernelAlloc.apply_constraint)rG   NNNrG   )r   r   r   r  r   rT  r}  r&  rG   rG   r  rH   r    s    r  c                       s<   e Zd ZdZdZdd Zdd Zdd Z fd	d
Z  Z	S )InplaceBernoulliFallbackzE
    This needs to be a custom class to handle mutation properly
    zaten.bernoulli_c                 C   sB   dd | j D \}|| j d| ddtt| j d d S )Nc                 s       | ]}|  V  qd S rK   r  rX   r   rG   rG   rH   r  1  r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>(r  r  )rY  r~  rx  r   r   reprrz  )r   r  r   rG   rG   rH   r  0  s   $z InplaceBernoulliFallback.codegenc                 C   r  r  rG   r   rG   rG   rH   rT  6  r  z(InplaceBernoulliFallback.should_allocatec                 C      t | jtsJ | jj fS rK   r  r   rG   rG   rH   r  9     z+InplaceBernoulliFallback.get_mutation_namesc                    s0   t  d t|| |g| tj| | _d S rK   )r  r   r  rX  r1   r   r[  rP   )r   r   rz  r  rG   rH   r   =  s   
z!InplaceBernoulliFallback.__init__)
r   r   r   r  rx  r  rT  r  r   r&  rG   rG   r  rH   r  )  s    r  c                       sR   e Zd ZdZdd Zdd Zdd Zdd	d
dedee	 de
f fddZ  ZS )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    c              
   C   st   | j rdd | jD \}}}ndd | jD \}}| jd }|||| jd ||g| j| j| j | jd |   d S )Nc                 s   r  rK   r  r  rG   rG   rH   r  P  r  z*ScatterFallback.codegen.<locals>.<genexpr>c                 s   r  rK   r  r  rG   rG   rH   r  R  r  r   r   r  )src_is_tensorrY  rz  generate_scatter_fallbackrx  rQ   rN   r  )r   r  r   rZ   r  rG   rG   rH   r  N  s   
zScatterFallback.codegenc                 C   r  r  rG   r   rG   rG   rH   rT  ^  r  zScatterFallback.should_allocatec                 C   sR   |dkr| j r|d u rdnd}|S |d u sJ dd}|S |d us%J dd}|S )Naten.scatter_zat::scatter_outzat::scatter_reduce_outz:Expect reduce to be None for aten.scatter_ with scalar srcz5Expect reduce to be not None for aten.scatter_reduce_)r  )r   rQ   r  rx  rG   rG   rH   get_cpp_kernela  s   

zScatterFallback.get_cpp_kernelNTr  include_selfrs  r  r  c             	      s   |dv sJ t |t _tjjr%ddd}||v r|| } || _n| _| _ jr> fdd|||fD }	|f}
n fdd||fD }	||f}
t	 
d t| |	|
||d d	d
g _tj  _d S )N>   aten.scatter_reduce_r  rP  rO  )rJ  multiplyc                    rd  rG   r  r  r   rG   rH   r\     r{   z,ScatterFallback.__init__.<locals>.<listcomp>c                    rd  rG   r  r  r   rG   rH   r\     r{   r  r  r  )r4   r<   r  r1   r   r   r  rx  rQ   r  r   r  rX  r  r[  rP   )r   rQ   r   rs  rZ   r  r  r  get_operator_enumtensorsrz  r  r   rH   r   s  s.   

zScatterFallback.__init__)r   r   r   r  r  rT  r  r  r   rS   r  r   r&  rG   rG   r  rH   r  G  s    	r  c                       s0   e Zd ZdZdd Zdd Z fddZ  ZS )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    c                 C   s   dd | j D ^}}}g }t|}t| jD ]\}}| j| d ur)|t| q|tjjj	 qtjjj
 d| tjjj }	||	|g|  }
||| j|
 d S )Nc                 s   r  rK   r  r  rG   rG   rH   r    r  z+IndexPutFallback.codegen.<locals>.<genexpr>r  )rY  r  r   r  r  r  r1   r   r  r  open_bracketr   closed_bracketr  r~  wrap_kernel_callrx  )r   r  r   r(  valid_indicesr  iter_valid_indicesrY   r   indices_strrM   rG   rG   rH   r    s   "zIndexPutFallback.codegenc                 C   r  r  rG   r   rG   rG   rH   rT    r  z IndexPutFallback.should_allocatec                    sv   | _ dd |D } fdd||g|D }t d t| ||f tj  _tjj	r6d _
d S d _
d S )Nc                 S   s   g | ]}|d ur|qS rK   rG   rW   rG   rG   rH   r\     r]   z-IndexPutFallback.__init__.<locals>.<listcomp>c                    rd  rG   r  r  r   rG   rH   r\     r{   zat::index_put_zaten.index_put_)r  r  r   r  rX  r1   r   r[  rP   r   rx  )r   r   r  r(  
accumulater  r  r  r   rH   r     s   zIndexPutFallback.__init__)r   r   r   r  r  rT  r   r&  rG   rG   r  rH   r    s
    r  c                   @   s    e Zd Zedd Zdd ZdS )
DeviceCopyc                 C   s   |  stdd | D r||S tjj|j tj	|j
 tjj| j tj	| j
 td tt|| | d| |gS )Nc                 s   s*    | ]}|j tjjv ot|tjV  qd S rK   )rP   r1   r   r  r4   r   r"  r  rG   rG   rH   r    s
    
z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr  )r  r  r   r:  r1   r   device_typesrJ  rB   add_device_idxrZ   r   r(   r  r  r   r   rm  )r  r   r   rG   rG   rH   r    s"   

zDeviceCopy.createc                 C   sx   |   }t|dksJ | jr&|| j  d|d  dtjjj  d S ||   d|d  dtjjj  d S )Nr   .copy_(r   r  )	r  r_   r|  r~  r  r1   r   r  endingr  rG   rG   rH   r    s   " zDeviceCopy.codegenN)r   r   r   r%  r  r  rG   rG   rG   rH   r    s    
r  c                   @   s   e Zd ZdZdd ZdS )r;   z
    The result of a call to aten._local_scalar_dense.

    This is not yet implemented.  The one model (so far) that calls this
    (fastNLP_Bert) does not actually use the result.  So we expect this
    node to get dead code eliminated.
    c                 C   r  r  rG   r   rG   rG   rH   r     r  zDynamicScalar.get_readsN)r   r   r   r  r   rG   rG   rG   rH   r;     s    r;   c                       sj   e Zd Z		d fdd	Zdd Zdd Zdd	 Zed
d Z fddZ	e
dd Z fddZ  ZS )FallbackKernelNc           	         sJ  t  |t|t| d| _t|tjjr|jn|}t	tj
j|jd |u rWtjjr0d|j nd|j | _|d urVdd |jD | _dd |jD | _dd |jD | _n9t|tjjrut	tjj|jd |u rqd	|j | _ntd
tjjrd| _| | n|jdd d|j | _|| _|d u ri n|| _tj| j d S )NFzat::zaten.c                 S   s    g | ]}|j s|j|jd qS )rB   r   )
kwarg_only	real_typer  r  rG   rG   rH   r\     s    z+FallbackKernel.__init__.<locals>.<listcomp>c                 S      g | ]}|j r|jqS rG   r  rP   r  rG   rG   rH   r\     
    c                 S   s$   i | ]}|j r|j|j|jd qS r  )r  rP   r  r  r  rG   rG   rH   rz     s    z+FallbackKernel.__init__.<locals>.<dictcomp>ztorch._prims.rng_prims.z.Unable to find HigherOrderOperator kernel nameTz._ops.z.ops..)r  r   r6   use_cpp_op_schemar4   r7   r  
OpOverload_overloadpacketrL   r0   atenr   r1   r   r   rx  r  args_default_valuer  r  HigherOrderOperator_prims	rng_primsr   set_cpp_kernelr   replacer  rN   warn_fallback)	r   r  rx  r  nontensor_argsr  rN   r  op_overload_packetr  rG   rH   r     sT   

zFallbackKernel.__init__c                    s   ddl m} |jjrJ d|j ddd  t fdd|jjD s,J |j d	t fd
d|jjD sAJ |j d|jj| _	|jj
| _| j	dd d| j | _||| _dd |jjD | _d S )Nr   )get_cpp_op_schemazmutable z" is not supported with cpp_wrapperc                 S   s   | j d u p	| j j S rK   )
alias_infois_write)r  rG   rG   rH   is_not_write@  rR   z3FallbackKernel.set_cpp_kernel.<locals>.is_not_writec                 3   r-  rK   rG   r  r  rG   rH   r  C      
z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>z< with alias_info arguments is not supported with cpp_wrapperc                 3   r-  rK   rG   r  r  rG   rH   r  F  r  z: with alias_info returns is not supported with cpp_wrapperz::r   c                 S   r  rG   r  r  rG   rG   rH   r\   Q  r  z1FallbackKernel.set_cpp_kernel.<locals>.<listcomp>)codegen.wrapperr  _schema
is_mutabler   r  r  returnsrP   rx  overload_namecpp_kernel_overlad_namer  cpp_kernel_keycpp_op_schemar  )r   rx  r  rG   r  rH   r  6  s0   






zFallbackKernel.set_cpp_kernelc                 C   s   t | ds	J d|t| jk sJ d| dt| j | j| d }|d u rE| j| d }t|tv s>J dt| tt|  S |S )Nr  z*self.args_default_value has to be providedzexpected the index z2 to be smaller than len(self.args_default_value): r   rB   r  )r  r_   r  rS   r  )r   ry   r$  r  rG   rG   rH   get_arg_default_valueU  s$   z$FallbackKernel.get_arg_default_valuec                    s   t jG dd d  fddjD }|j\}}dd |D }tjjrRtdrRt	|}t	j
}||k rRfddt||D }dd |D }|| j| |S )	Nc                   @   s   e Zd ZU eed< dd ZdS )z)FallbackKernel.codegen_args.<locals>.Shimrefc                 S   r   rK   )r$  r   rG   rG   rH   r   l  r   z2FallbackKernel.codegen_args.<locals>.Shim.__repr__N)r   r   r   r   r   r   rG   rG   rG   rH   Shimh  s   
 r%  c                    s   g | ]} |  qS rG   r  r  )r%  rG   rH   r\   o  r]   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S   r  rG   r  r  rG   rG   rH   r\   q  r  r  c                    rd  rG   )r#  rW   r   rG   rH   r\   w  s    
c                 S   r  rG   r  r  rG   rG   rH   r\   z  r  )r  	dataclassrY  r  rz  r1   r   r   r  r_   r  r`   rf  rN   update)r   r  rM   rN   n_args
n_pos_argspos_argsrG   )r%  r   rH   r  g  s    


zFallbackKernel.codegen_argsc                 C   s   | r| d   S t|tjr|jS t|ttfrBdd |D }dd |D }t|dkr0|d S |D ]}|jdkr=|  S q2|d S d S )Nr   c                 S   s   h | ]}t d |qS rK   )r   find_devicer  rG   rG   rH   r     r]   z-FallbackKernel.find_device.<locals>.<setcomp>c                 S   s   g | ]}|r|qS rG   rG   )rX   r   rG   rG   rH   r\     r   z.FallbackKernel.find_device.<locals>.<listcomp>r   r   )	r   r4   r7   Tensorr   r5   r6   r_   rB   )r  r  devicesr   rG   rG   rH   r+    s   
zFallbackKernel.find_devicec                    sN   | j rg |  |  }||  | j|| j| j| j d S t	 
| d S rK   )r  r  r  6generate_extern_kernel_alloc_and_find_schema_if_neededrT  rx  r"  r!  r   r  r  r  r  rG   rH   r    s   	zFallbackKernel.codegenc                    s   t jf}||vrtjjnt }| | j|g|R i |\}}}}	}
W d    n1 s-w   Y  t||}|s>J dtt	|||||	|
d fdd  |g S )Nz"Not sure where to find device info)r  c                    s   t ttfrt fddttD S t tjr5tt	j
jt t  S t tr<S d u sDJ dd S )Nc                 3   s,    | ]} | t |fg V  qd S rK   )rB   rW   )generate_outputr  outputrG   rH   r    s
    
zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>z+FallbackKernel output type is not supported)r4   r5   r6   rB   r`   r_   r7   r,  MultiOutputr  r   r   r&   r   r   r  )r0  r  r/  packed)r  r0  rH   r/    s&   




z.FallbackKernel.create.<locals>.generate_output)
r  *_fused_moving_avg_obs_fq_helper_functionalr1   r   	fake_moder   r  r   r+  r	  )r  rx  rM   rN   fake_incorrect_kernelscontextr  r  r  r  r  r   rG   r2  rH   r    s0   		
zFallbackKernel.createc                    s
   t   S rK   )r  r}  r   r  rG   rH   r}    r   zFallbackKernel.apply_constraintrU  )r   r   r   r   r  r#  r  r   r+  r  r%  r  r}  r&  rG   rG   r  rH   r     s    	A

2r   c                   @   s   e Zd ZU ejed< dS )r	  r   N)r   r   r   r7   r   r   rG   rG   rG   rH   r	    s   
 r	  c                       sF   e Zd Zdd Zdd Zdeeedf  f fddZd	d
 Z	  Z
S )r1  c                 C   s|   t |dkr<|d \}}|tkr | | d| d|dd  S |tkr8tjj|t|}| ||dd  S t	d|S )Nr   r  r  r   znon supported index type)
r_   r5   codegen_list_tuple_accessr6   r1   r   r  codegen_tuple_accessrS   r  )r   basenamer  ityperY   tuple_accessrG   rG   rH   r8    s    z%MultiOutput.codegen_list_tuple_accessc                 C   sb   t jjj}||   d| | jd  | j 7 }|t jjj7 }t jj	| | 
t jj d S N = r   )r1   r   r  declarerT  r8  rY  r  r  r~  r  )r   r  linerG   rG   rH   r    s
   
*zMultiOutput.codegenr  .c                    s,   t  d ||gd tj| | _|| _d S r  )r  r   r1   r   r[  rP   r  )r   r  rk  r  r  rG   rH   r     s   
zMultiOutput.__init__c                 C   r  r  rG   r   rG   rG   rH   rT    r  zMultiOutput.should_allocate)r   r   r   r8  r  r
   r   r   r   rT  r&  rG   rG   r  rH   r1    s
    r1  r   r<   r1  biaspaddingr   dilationgroups
transposedoutput_paddingc
                 C   s  dd }
dd }|   |   |dur|   tjj t|dd}t|dd}t| d }d	t|  k r>|ksAJ  J d	t|  k rN|ksQJ  J d	t|  k r^|ksaJ  J t||}t||}t||}|	du r{td	g|}	nd	t|	  k r|ksJ  J t|	|}	t|t	sJ |r|||}| }|
||||	|||}n|durt|ddn|}t
jj||||||||	|	}| }d	gtttd
t|d
  }t|g| }t|}W d   n1 sw   Y  | ||}| jdkr| jdksJ ||g}t| | t|t|}||||g}|r0|d
|	 |dur;|| n|d	| ||||fS )au  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU device since conv post-op fusion kernel is only
    supported on CPU right now.
    c                 S   s   t | t |ksJ dt | }|dksJ dd}d}	g }
|
| |  |
||	 |  td|D ]1}|| d ||d   d }| | d ||d   ||d  d  | ||d   }|
| q3ttt|
S )NzExpect input dim == weight dimru   zExpect input dim > 2r   r   )r_   r  r`   r5   r   r  )output_sizeweight_sizerB  rF  r   rC  rD  rs  	BATCH_DIMWEIGHT_INPUT_CHANNELS_DIMru  r  rx  input_size_drG   rG   rH   _conv_input_size  s(   
z<_prepare_convolution_fusion_create.<locals>._conv_input_sizec                 S   s   |   }t|}|dksJ d|dkr9g }||d |  ||d |  td|D ]	}|||  q-|S | dd  }|S )Nru   zExpect weight dim > 2r   r   )r   r_   r  r`   	transpose)prepacked_weightrD  prepacked_weight_sizers  rH  r  rG   rG   rH   _original_deconv_weight_size.  s   zH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeNTr  ru   r   r   r   )r   r1   r   r5  r   r_   r   r*   r4   r  r7   r0   r  convolutionr5   r  r`   r   r  r   rB   r  r   r&   insertr  )r  r   r1  rA  rB  r   rC  rD  rE  rF  rL  rP  x_fakeweight_fakero  rH  ru  rG  	bias_faker0  req_stride_orderrv  rY  kernel_layoutrz  rG   rG   rH   "_prepare_convolution_fusion_create  s   
   


 

 
4$
rX  c                 C   sB  |   |   |dur|   tjjD t|dd}t|dd}|dur+t|ddn|}|dur<tjjj	|||}n	tjjj
	||}| }ddg}	t|}
W d   n1 s[w   Y  | ||	}| jdkrt| jdksvJ ||g}t| | t|t|
}g }|dur|| n|d| ||||	fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    NTr  r   r   r   )r   r1   r   r5  r   r7   r0   r  addmmr  mmr   r   r  r   rB   r  r   r&   r  rR  )r  r   r1  rA  rS  rT  rU  r0  rG  rV  rv  rY  rW  rz  rG   rG   rH   _prepare_linear_fusion_create  sJ   



 r[  c                       s`   e Zd Z		d fdd	Zdd Zeddd	dd
ddee dee dee defddZ  Z	S )ConvolutionUnaryrG   'torch.ops.mkldnn._convolution_pointwisec                    (   t  j|||d ddd d| _d| _d S )Nr]  mkldnn::_convolution_pointwiserx  r  convolution_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r  r   r!  r"  r   r  rY  rz  rx  r  rG   rH   r     s   
zConvolutionUnary.__init__c                 C   s>   | |  | j|  | j| j t| jtr| 	| d S d S rK   )
r.  rT  rx  r  r"  r!  r4   r  r  r  r  rG   rG   rH   r    s   zConvolutionUnary.codegenr   r<   r1  rA  padding_stride_	dilation_rD  c              	   C   sR   t | |||||||\}}}}t }t }||t||	t||
g }t|||dS Nr  rY  rz  )rX  r   r   r   r\  )r  r   r1  rA  rd  re  rf  rD  attrscalars	algorithmrY  rz  rW  r   r   r   rG   rG   rH   r    s   zConvolutionUnary.create)rG   r]  
r   r   r   r   r  r%  r
   r  r  r&  rG   rG   r  rH   r\    s*    r\  c                       s   e Zd Z		d fdd	Zdd Zeddddd	dd
ddee dee dee dedede	e
 de	e de	ee  de	e fddZ  ZS )ConvolutionBinaryrG   c                    s4   t  j|||d ddd d| _d| _d| _|| _d S )Nz.torch.ops.mkldnn._convolution_pointwise.binaryr_  r`  binaryconvolution_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm))r  r   r   r!  r"  cpp_constant_args)r   r  rY  rz  rp  r  rG   rH   r     s   
zConvolutionBinary.__init__c                 C   sB   | |  | j|  | j| j| j t| jt	r| 
| d S d S rK   )r.  rT  rx  r  r"  r!  r   r4   r  r  r  r  rG   rG   rH   r  /  s   zConvolutionBinary.codegenr   r<   r  r1  rA  rd  re  rf  rD  binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc              	   C   s   t | |||||||\}}}}| ||}|d| t }t }t }||	t||
t||t||t||g }t|||dS )Nr   rh  )rX  r  rR  r   r   r   r   rm  )r  r   r  r1  rA  rd  re  rf  rD  rq  rr  rs  rt  ru  rY  rz  rW  rV  r   r   r   rG   rG   rH   r  ;  s2   zConvolutionBinary.create)rG   rG   )r   r   r   r   r  r%  r
   r  rS   r   r*  r   r  r&  rG   rG   r  rH   rm    sB    "	

rm  c                       s   e Zd Z	d fdd	Zdd Zdd Zedd	d
d	dd	dd	dee dee dee dede	de
e de
e	 de
ee  de
e	 fddZ  ZS )ConvolutionBinaryInplacerG   c                    sJ   |d |d g|dd   }t  j|||d ddd d| _d| _d	| _d S )
Nr   r   ru   z/torch.ops.mkldnn._convolution_pointwise_.binaryzmkldnn::_convolution_pointwise_r`  rn  convolution_pointwise_binary_a  
            at::Tensor&(
                at::Tensor& other_t,
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm)r  r   r   r!  r"  )r   rW  rY  rz  reordered_inputsr  rG   rH   r   h  s   
z!ConvolutionBinaryInplace.__init__c                 C   (   | |  | j|  | j| j| j d S rK   r.  rT  rx  r  r"  r!  r   r  rG   rG   rH   r       z ConvolutionBinaryInplace.codegenc                 C   r  rK   r  r   rG   rG   rH   r    r  z+ConvolutionBinaryInplace.get_mutation_namesr   r<   r  r1  rA  rd  re  rf  rD  rq  rr  rs  rt  ru  c              	   C   s   t | |||||||\}}}}| ||}|d| t }t }t }||	t||
t||t||t||g }tt|d ||dS )Nr   )rW  rY  rz  )	rX  r  rR  r   r   r   r   rv  r  )r  r   r  r1  rA  rd  re  rf  rD  rq  rr  rs  rt  ru  rY  rz  r   rV  r   r   r   rG   rG   rH   r    s2   
zConvolutionBinaryInplace.createrG   )r   r   r   r   r  r  r%  r
   r  rS   r   r*  r   r  r&  rG   rG   r  rH   rv  g  sB    $
	

rv  c                       s4   e Zd Z	d fdd	Zdd Zedd Z  ZS )	MKLPackedLinearrG   c                    r^  )Nztorch.ops.mkl._mkl_linearzmkl::_mkl_linearr`  
mkl_lineara  
            at::Tensor(
                const at::Tensor& self,
                const at::Tensor& mkl_weight_t,
                const at::Tensor& origin_weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                const int64_t prepack_batch_size)rb  r   r  rY  rz  r  rG   rH   r        
zMKLPackedLinear.__init__c                 C   $   | |  | j|  | j| j d S rK   r.  rT  rx  r  r"  r!  r  rG   rG   rH   r       zMKLPackedLinear.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}}t||g }t|}	|||g}
d |g}tt| | ||	|
|dS rg  )	r  rm  r   r5   r   r~  r  r   r   )r  r   packed_worig_w
batch_sizemr   ocrG  rv  rY  rz  rG   rG   rH   r    s   
zMKLPackedLinear.creater}  )r   r   r   r   r  r%  r  r&  rG   rG   r  rH   r~    s    	r~  c                       s<   e Zd Z	d
 fdd	Zdd Zedd Zdd	 Z  ZS )LinearUnaryrG   c                    r^  )Nz"torch.ops.mkldnn._linear_pointwisemkldnn::_linear_pointwiser`  linear_pointwiseaL  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rb  r  r  rG   rH   r     r  zLinearUnary.__init__c                 C   r  rK   r  r  rG   rG   rH   r    r  zLinearUnary.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}	}||g}
||r&|ndg|g}|d ur=|  | |}|
| n|dd  tt| | t	||	g d|
|dS )Nr_  r   r  rh  )
r  rm  r   r  rR  r  r  r   r   r5   )r  r   wr\  ri  rj  rk  r  icr  rY  rz  rG   rG   rH   r    s&   zLinearUnary.createc                 C   r-  rK   rG   r   rG   rG   rH   r}  7  r  zLinearUnary.apply_constraintr}  )	r   r   r   r   r  r%  r  r}  r&  rG   rG   r  rH   r    s    	
r  c                       s@   e Zd ZdZ	d fdd	Zdd Zedd Zd	d
 Z  Z	S )LinearBinary)torch.ops.mkldnn._linear_pointwise.binaryrG   c                    s.   t  j|||d ddd d| _d| _d| _d S )Nr  r  r`  rn  linear_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr)
        rx  r  r  rG   rH   r   >  s   
zLinearBinary.__init__c                 C   rz  rK   r{  r  rG   rG   rH   r  W  r|  zLinearBinary.codegenc                 C   s   |  | |}|  | |}|  | |}| ^ }}| \}}|||g}	|g}
|d ur?|  | |}|	| n|
d| tt| | t	||g d|	|
dS )Nr   r  rh  )
r  rm  r   r  rR  r  r  r   r   r5   )r  r   yr  r\  ri  r  r  r  rY  rz  rG   rG   rH   r  a  s(   
zLinearBinary.createc                 C   r-  rK   rG   r   rG   rG   rH   r}  |  r  zLinearBinary.apply_constraintr}  )
r   r   r   rx  r   r  r%  r  r}  r&  rG   rG   r  rH   r  ;  s    

r  c                       sf   e Zd Z	d fdd	Zdd Zeddddd	dd
ee dee dee dee defddZ  Z	S )ConvolutionTransposeUnaryrG   c                    r^  )Nz1torch.ops.mkldnn._convolution_transpose_pointwisez(mkldnn::_convolution_transpose_pointwiser`  convolution_transpose_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef output_padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rb  r  r  rG   rH   r     r  z"ConvolutionTransposeUnary.__init__c                 C   r  rK   r  r  rG   rG   rH   r    r  z!ConvolutionTransposeUnary.codegenr   r<   r1  rA  rd  output_padding_re  rf  groups_c                 C   sZ   d}t | |||||||||
\}}}}t }t }||	t||
t||g }t|||dS )NTrh  )rX  r   r   r   r  )r  r   r1  rA  rd  r  re  rf  r  ri  rj  rk  rE  rY  rz  rW  r   r   r   rG   rG   rH   r    s<   z ConvolutionTransposeUnary.creater}  rl  rG   rG   r  rH   r    s,    		r  c                !       sz   e Zd Z	d fdd	Zdd Zeddd	dd
ddddddddddedee dededededededef ddZ	  Z
S )MkldnnRnnLayerrG   aten.mkldnn_rnn_layerc                       t  ||| || _d S rK   )r  r   rx  rc  r  rG   rH   r     s   
zMkldnnRnnLayer.__init__c              
   C   s0   | |   d| j dd|   d d S )Nr>  r  r  r  )r~  rT  rx  r   r  r  rG   rG   rH   r    s   $zMkldnnRnnLayer.codegenr   r<   w0w1w2w3hxcxr  batch_sizesr>  hidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc              	      s\  |  |   |  | |}|  | |}|  | |}|  | |}|  | |}|  |  | |}|   }t|dksRJ d|\}}}|||g}| }| }g }||||||g}||	|
||||||g	}tt ||ddd }g  |||g}|||t|t|g} fddt	t
||D }|S )Nrt   zExpect lstm input to be 3D)rY  rz  c                 S   s   t | dks
J dt| S )Nrt   zExpect output_shape to be 3D)r_   r   )output_shaper  rG   rG   rH   get_strides_of_lstm_output&  s   z9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_outputc                    s<   g | ]\}\}}t t  || t|fg qS rG   )r1  r  r   r   r5   )rX   rY   rG  rv  r  r3  r   rG   rH   r\   1  s    
z)MkldnnRnnLayer.create.<locals>.<listcomp>)r  rm  rC  r   r_   r  r	  r   r   r   rf   )r  r   r  r  r  r  r  r  r  r  r>  r  r  r  r  r  r  ru  
seq_length
mini_batchr  hy_shapecy_shaperesrY  rz  r  output_sizesoutput_strides	output_irrG   r  rH   r    s\   



zMkldnnRnnLayer.create)rG   r  )r   r   r   r   r  r%  r  r
   r  r  r&  rG   rG   r  rH   r    sL    
	
r  c                       sv   e Zd Z	d fdd	Zdd Zeddded	ed
ddddddddee dee dee dededefddZ	  Z
S )QConvPointWisePT2ErG   c                    "   t |dk| _t ||| dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        r  Nr_   has_biasr  r   r  r  rG   rH   r   E     zQConvPointWisePT2E.__init__c                 C   sj  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d }}|dd  \}	}
}}}}}}}}}}d	| _| d
|  d
|  d
|  d
|  d
|  d
|  d
|	  d
|
  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  }||   d| j d| d t| jt	r| 
| d S d S )Nc                 S   r3  rG   r  r  rG   rG   rH   r\   Z  r   z.QConvPointWisePT2E.codegen.<locals>.<listcomp>r   r   ru   r_  iz"torch.ops.onednn.qconv2d_pointwiser  r>  r  r  rY  rf  r  r  rx  r~  rT  r4   r  r  r  )r   r  rM   
const_argsr   packed_weightrA  w_scalew_zpr   rB  rC  rD  x_scalex_zpo_inv_scaleo_zpfp32_outputrs  rt  ru  r  rG   rG   rH   r  X  sx   
	
"zQConvPointWisePT2E.codegenr   r<   r  r  r1  r  r  rA  re  rd  rf  rD  r  output_zero_pointc                 C   s   d}d }t | ||||	||
|||
\}}}}|d u r'|d |d |d< |d< n|d |d |d< |d< |  |  |||g }|||||||||g }|rTtj|_t|||dS NFru   r   r   rh  )rX  r   r7   float32r   r  )r  r   r  r  r1  r  r  rA  re  rd  rf  rD  r  r  r  rs  rt  ru  rE  rF  rY  rz  rW  r   rG   rG   rH   r    sJ   zQConvPointWisePT2E.creater}  )r   r   r   r   r  r%  r*  r  r
   r  r&  rG   rG   r  rH   r  D  s@    1	
r  c                       sj   e Zd Z	d fdd	Zdd Zeddddd	dd
ddee dee dee deddddfddZ  Z	S )QConvPointWiseBinaryPT2ErG   c                    r  )a~  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, w, b, accum, w_scale, w_zp]
            - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp,
            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, accum, w_scale, w_zp]
            - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale,
            accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
           Nr  r  r  rG   rH   r     s   z!QConvPointWiseBinaryPT2E.__init__c                 C   s  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d |d }}}	|d	d  \}
}}}}}}}}}}}}}}}d
| _| d|  d|  d|  d|  d|  d|  d|  d|	  d|  d|
  d|  d|  d|  d|  d|  d|  d|  d|  d|  d|  d|  }||   d| j d| d t| jt	r| 
| d S d S )Nc                 S   r3  rG   r  r  rG   rG   rH   r\     r   z4QConvPointWiseBinaryPT2E.codegen.<locals>.<listcomp>r   r   ru   r  r_  iz)torch.ops.onednn.qconv2d_pointwise.binaryr  r>  r  r  r  )r   r  rM   r  r   r  rA  accumr  r  r   rB  rC  rD  r  r  accum_scaleaccum_zpr  r  r  rq  alphars  rt  ru  	conv_argsrG   rG   rH   r    s   
	
"z QConvPointWiseBinaryPT2E.codegenr   r<   r  r1  rA  re  rd  rf  rD  r  r  c                 C   s   d}d }t | |||
||||||
\}}}}| ||}|| |
d u r2|d |d |d< |d< n|d |d |d< |d< |  |	  |||	g }|||||||||||||g }|rctj|_t|||dS r  )rX  r  r  r   r7   r  r   r  )r  r   r  r  r  r  r  r1  r  r  rA  re  rd  rf  rD  r  r  r  rq  r  rs  rt  ru  rE  rF  rY  rz  rW  rV  rG   rG   rH   r    s`   
zQConvPointWiseBinaryPT2E.creater}  rl  rG   rG   r  rH   r    s4    9r  c                       sZ   e Zd Z	d fdd	Zdd Zeddded	ed
ddddddddedefddZ  Z	S )QLinearPointwisePT2ErG   c                    r  )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        r  Nr  r  r  rG   rH   r   o  r  zQLinearPointwisePT2E.__init__c                 C   s:  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d }}|dd  \}	}
}}}}}}d	| _| d
|	  d
|
  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  d
|  }||   d| j d| d t| jt	r| 
| d S d S )Nc                 S   r3  rG   r  r  rG   rG   rH   r\     r   z0QLinearPointwisePT2E.codegen.<locals>.<listcomp>r   r   ru   r  r_  iz"torch.ops.onednn.qlinear_pointwiser  r>  r  r  r  )r   r  rM   r  r   r  rA  r  r  r  r  r  r  r  rs  rt  ru  r  rG   rG   rH   r    s`   

	
"zQLinearPointwisePT2E.codegenr   r<   r  r  r1  r  r  rA  r  r  c              	   C   sd   t | |||\}}}}|  |  |||g }|||||	|
|||g }|
r+tj|_t|||dS rg  )r[  r   r7   r  r   r  )r  r   r  r  r1  r  r  rA  r  r  r  rs  rt  ru  rY  rz  rW  r   rG   rG   rH   r    s4   zQLinearPointwisePT2E.creater}  )
r   r   r   r   r  r%  r*  r  r  r&  rG   rG   r  rH   r  n  s0    )	
r  c                   @   sB   e Zd ZU dZeed< dd Zdd Zedd Z	d	d
 Z
e
ZdS )r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    r  c                 C   s4   t | j|}t|r|S tt| jj d| d)Nr  z not callable)rL   r  callableAttributeErrorrB   r   )r   rP   rQ   rG   rG   rH   __getattr__  s   zMutableBox.__getattr__c                 C   rR  rK   rZ  r   rG   rG   rH   r     r   zMutableBox.realizec                 C   r  rK   r  r   rG   rG   rH   r       zMutableBox.layoutc                 C   sn   t | jtrt| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r  
)r4   r  r  rB   r   r   rS   r   )r   line0endlrN  r   rG   rG   rH   r     s   


zMutableBox.__str__N)r   r   r   r  r   r   r  r   r  r  r   r   rG   rG   rG   rH   r    s   
 
r  c                   @   s   e Zd Zedd ZdS )r<   c                 C   s   t t| S rK   )r<   rA  )r  rG   rG   rH   r    r  zTensorBox.createN)r   r   r   r   r  rG   rG   rG   rH   r<     s    c                   @   sL   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zedd Z	edd Z
dS )rA  c                 C   s&   t | jttfr| j tjjv S dS r  )r4   r  r  rE  rT  r1   r   graph_inputsr   rG   rG   rH   is_input_buffer  s   zStorageBox.is_input_bufferc                 C   s   t | jtttttfr| j S t | jtt	fsJ t
| j| j }| j }td t| j | j | j d| jd| _tj| j| j_| j| j_|| j_|| j_| jjS )Nr  r  )r4   r  r  rW  r  rE  rV  rT  r,  rt  rB   r
  r   r  r   r   r   r1   r   r[  rP   r   r   r   )r   r   r   rG   rG   rH   r     s6   



	
zStorageBox.realizec                 C   s<   t | jttfr|  dkr|  r|   dS dS dS dS )zL
        Called on buffers we expect to be forced to realize later.
        r   N)r4   r  r,  rt  r  8is_pointwise_non_scalar_tensor_num_reads_larger_than_oner   r   rG   rG   rH   r[  0  s   zStorageBox.realize_hintc                 C   s(   t | jto|  tjkp|  tjkS rK   )r4   r  r,  r  r   realize_acc_reads_thresholdr  realize_bytes_thresholdr   rG   rG   rH   rY  ;  s   z!StorageBox.has_exceeded_max_readsc                 C   s~   dt ttf fdd}|dkr7t| jttfr9|  tjks1t| 	 tj
ks1t| jr;|| jr=|   dS dS dS dS dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        loopsc                    s$   dg}|    t fdd|D S )zW
            The heuristic for realizing reused result of heavy ops on cpu
            expc                 3   s    | ]	}|d   v V  qdS )r  NrG   )rX   opfn_strrG   rH   r  M  r  zGStorageBox.mark_reuse.<locals>.should_realize_on_cpu.<locals>.<genexpr>)r   rL  )r  	heavy_opsrG   r  rH   should_realize_on_cpuG  s   z4StorageBox.mark_reuse.<locals>.should_realize_on_cpur   N)r   r,  rt  r4   r  r  r   realize_reads_thresholdr_   r   r  r   r   )r   rX  r  rG   rG   rH   rV  A  s   	zStorageBox.mark_reusec                 C   sz   | j }t|tttfrdS t|tr| }n!t|ttfs$J t	|td t
| | | d|d }t|jS )Nr   r  r  )r  r4   rW  r  rE  r  r  r,  rt  rB   r  r   r   r   r_   r#  )r   r  r  rG   rG   rH   r  Z  s$   


	zStorageBox.num_readsc                 C   s,   t | jtrtdd | j D dkS dS )Nc                 s   s    | ]}|j d kV  qdS )r   Nra   )rX   readrG   rG   rH   r  r  r$  zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>r   T)r4   r  r,  rP  r   r   rG   rG   rH   r  n  s
   
zCStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_oneN)r   r   r   r  r   r[  rY  rV  r%   r  r  rG   rG   rG   rH   rA    s    
rA  c                       sX   e Zd Zeeddd Z fddZdej	j
def fdd	Z fd
dZ  ZS )InterpreterShimNc                   C   s   t jtS rK   )r7   r   symbolic_tracer   rG   rG   rG   rH   	_dummy_gmy  s   zInterpreterShim._dummy_gmc                    s>   t  j|  dd | | _|| _|| _d| _|j| _d | _	d S )NF)garbage_collect_values)
r  r   r  moduler   
submodulesextra_tracebackr  
fetch_attrcurrent_noder   r   r  r  rG   rH   r   ~  s   
zInterpreterShim.__init__r  r  c                    s   || _ t |S rK   )r  r  run_node)r   r  r  rG   rH   r    s   zInterpreterShim.run_nodec                    s@   t |  t j|i |W  d    S 1 sw   Y  d S rK   )r1   set_interpreter_handlerr  run)r   rM   rN   r  rG   rH   r    s   $zInterpreterShim.run)r   r   r   r   r  	lru_cacher  r   r7   r   r   r   r  r  r&  rG   rG   r  rH   r  x  s    r  c                       sx   e Zd ZdZ fddZedd Zedd Zdd	 Zd
e	j
fddZdd Zdd Zdd Zdd Zdd Z  ZS )r>  z
    Captures the body of a Loops subclass into an FX graph.  Persists any
    indexing simplifications and makes it easier to analyze loop bodies.
    c                    sj   t    || _i | _i | _g | _g | _i | _i | _g | _	d| j
i| _i | _g | _t| ||| _d | _d S )N	get_index)r  r   rD  r?  indexing_exprs_namer#  writesr@  rA  r  r  r  	subblocksindirect_varsLoopBodyBlock
root_blockindexing)r   rQ   rM   rD  r  rG   rH   r     s   

zLoopBody.__init__c                 C   s0   t | jjfdd | j D }dd |D S )Nc                 s       | ]}|j V  qd S rK   )r   )rX   blockrG   rG   rH   r        z%LoopBody.get_nodes.<locals>.<genexpr>c                 S   s   g | ]
}|j D ]}|qqS rG   )rC   )rX   r   rD   rG   rG   rH   r\     r~  z&LoopBody.get_nodes.<locals>.<listcomp>)r  chainr  r   r  r(  )r   
all_graphsrG   rG   rH   	get_nodes  s
   zLoopBody.get_nodesc                 C   s   ddl m} || S )Nr   )	BoundVars)boundsr  )r   r  rG   rG   rH   r     s   zLoopBody.boundsc                 C   s`   dt | j g}|dd | j D  |dd td| jfg| j D  d	|S )Nzvar_ranges = c                 S   s   g | ]\}}| d | qS )r>  rG   )rX   rP   r  rG   rG   rH   r\     s    z&LoopBody.debug_str.<locals>.<listcomp>c                 S   s   g | ]	\}}| |qS rG   )	debug_str)rX   rP   r  rG   rG   rH   r\     s    r	  r  )
re   rD  rf  r?  rB  r  r  r  r  r   r   rG   rG   rH   r    s   
zLoopBody.debug_strr  c                 C   sd   t | || |d ur|t | | d|< || jvr-dt| j }|| j|< || j|< | j| S )N
_name2exprrZ   )rL   r  r  r_   r?  )r   r  categorybuf_namerP   rG   rG   rH   add_index_expr  s   



zLoopBody.add_index_exprc                 C   s<   |d   r|| jvr|}n	| t| j }|| j|< |S )zaNot actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodesr_  )	isnumericr  r_   )r   r  r3   rP   rG   rG   rH   add_submodule  s
   
zLoopBody.add_submodulec                 C   s(   dt | j }t|}| j| |S )Nindirect)r_   r  r.   r  )r   r   rP   r  rG   rG   rH   add_indirect  s   zLoopBody.add_indirectc                    sB   t t  kr
dS | jdusJ  fdd| j D | _dS )z,Swap in a variable used in indirect indexingNc                    s    i | ]\}}|t | iqS rG   r  r  newr   rG   rH   rz     r   z-LoopBody.replace_indirect.<locals>.<dictcomp>)rS   r  rB  )r   r   r  rG   r
  rH   replace_indirect  s    zLoopBody.replace_indirectc                 C   s   | j d usJ | j | S rK   )r  r   rG   rG   rH   r    r  zLoopBody.get_indexc                    s   t tj| }t|tjksJ |jftfdd|D s$J ttj |  fddj	
 D _ }d _|S )Nc                 3   s    | ]}| j vV  qd S rK   )rD  r#  r   rG   rH   r    r$  z$LoopBody.__call__.<locals>.<genexpr>c                    s   i | ]
\}}|t | qS rG   r  )rX   rP   r  r  rG   rH   rz     s    
z%LoopBody.__call__.<locals>.<dictcomp>)r5   r  r  r_   rD  r  re   rf   r  r?  rB  r  r  )r   r  rZ   r  rG   )r  r   rH   __call__  s    
zLoopBody.__call__)r   r   r   r  r   r%   r  r   r  r=   r   r  r  r	  r  r  r  r&  rG   rG   r  rH   r>    s    


	r>  c                   @   sD   e Zd ZdZdededef dee fddZdd	 Z	dddZ
dS )r  a  
    Captures the body of a Loops subclass into an FX graph.
    In normal cases there will be a 1:1 mapping between LoopBody and
    LoopBodyBlock, hower in the case of ops.masked() the masked out
    operations will manifest as an extra LoopBodyBlock.
    r	  rQ   .rM   c           	         s   |_ dfdd	 G  fdddtj}tj tjjjd_	dddi }d	d
l
m} d	dlm} |||j j}tjrJ||}t| t||  W d    n1 saw   Y  j_d S )Nc              	      s    dd j| ||fi S )Ncall_moduler  )create_proxyr	  r  )r  r  r  r   tracerrG   rH   	add_index  s   z)LoopBodyBlock.__init__.<locals>.add_indexc                       s   e Zd Zd_dedejf fddZd fdd	Z fd	d
Z	dd Z
 fddZdedejdejdef fddZededef ffddZedfdd	ZefddZdS ) z/LoopBodyBlock.__init__.<locals>.CaptureIndexingCaptureIndexingrP   rZ   c                    s    |d|}| j ||S )Nr#  )_innerr  )r   rP   rZ   r  rG   rH   r    s   z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.loadNc                    s    |d|}| j ||||S Nr  )r  r.  )r   rP   rZ   r   r>  r  rG   rH   r.    s   z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.storec                    s    |d|}| j |||S r  )r  r|  )r   rP   rZ   r   r  rG   rH   r|    r  z?LoopBodyBlock.__init__.<locals>.CaptureIndexing.store_reductionc                    s8   | j |||| d|v rt fddtdD S  S )Nr  c                 3   s    | ]} | V  qd S rK   rG   rW   r  rG   rH   r    r  zLLoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction.<locals>.<genexpr>rt   )r  r{  r6   r`   )r   r   rv  rf  r   rG   r  rH   r{    s   z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.reductionc                    s:   t |ttjfr| jt||S  |d}| j||S Nr  )r4   r  r=   r   r  r)  r  )r   rZ   r   r  rG   rH   r    s   
z:LoopBodyBlock.__init__.<locals>.CaptureIndexing.index_exproffsets_nameoffsets_sizeindexing_dtyper  c                    s    |d}| j |||||S r  )r  	bucketize)r   r(  r  r  r  r  r  rG   rH   r  $  s   

z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.bucketizemasked_body.c                    sH    fdd}j |d}tj |g   j j|< d|| |fi S )zb
                Recursively capture the masked out body in another LoopBodyBlock
                c                    s   t j|  |S rK   )r1   r0   r  )ra  r  subblockrG   rH   shim7  r  zDLoopBodyBlock.__init__.<locals>.CaptureIndexing.masked.<locals>.shimmasked_subblockr  )r	  r  r  r  r  )
mask_proxyr  other_proxyr   rP   r  r  rH   r  1  s   z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.maskedTc                    sD    fdd}j dj |d | fi  S )z
                Flow data from tensors into indexing formulas.
                Introduce a call_module to update the indexing.
                c                    s   j tj|   d S rK   )r	  r  r1   r0   indirect_indexing)new_var)checkr   r   r  rG   rH   set_indirectH  s   zWLoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexing.<locals>.set_indirectr  set_)r	  r	  r  r  )index_proxyr   r&  r'  r  )r&  r   r  rH   r$  A  s   zALoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexingc                    s     dd| fi  d S )Nr0  )r  r  )r  rG   rH   r0  V  s   z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.outputrK   T)r   r   r   rP   rS   r=   r   r  r.  r|  r{  r  r7   r   r  r  r   r   r   r  r$  r0  rG   r  r   r  rG   rH   r  	  s,    r  )
tracer_clsplaceholderr0   rG   r   )IndexPropagation)SimplifyIndexingrK   )r	  r1   WrapperHandlerr7   r   TracerGraphr  r   r  index_propagationr.  r   r/  rD  r   constant_and_index_propagationset_ops_handlerr0   r0  )	r   r	  rQ   rM   r  	proxy_opsr.  r/  handlerrG   r+  rH   r     s   
QzLoopBodyBlock.__init__c                 C   s"   | j }| jj}t||t S rK   )r   r	  r  r  r  r1   get_ops_handlerr  rG   rG   rH   r  k  s   zLoopBodyBlock.__call__r  c              
   C   s8   t j| jj| jj}tdd|	 
dd| dS )Nz;[^\n]*r   zdef forward(zdef r  )r7   r   GraphModuler	  r  r   coderesubstripr  )r   rP   r:  rG   rG   rH   r  q  s   zLoopBodyBlock.debug_strN)r  )r   r   r   r  r>  r   r   r
   r   r  r  rG   rG   rG   rH   r    s
    "mr  c                       sJ   e Zd ZdZ	d fdd	Zdd Zdd ZedddZdd Z	  Z
S )Waitz
    Wait should not be used by itself.  It should always be constructed in tandem
    with a collective op that produces a work to wait on.
    rG   c                       t  ||| d S rK   r  r   r  r  rG   rH   r     s   zWait.__init__c                 C   r  r  rG   r   rG   rG   rH   rT    r  zWait.should_allocatec                 C   sN   | d dd | jD \}|| d| d ||   d|  d S )NzGfrom torch.distributed._functional_collectives_impl import _wait_tensorc                 s   r  rK   r  r  rG   rG   rH   r    r  zWait.codegen.<locals>.<genexpr>z = _wait_tensor(r  r>  )add_import_oncerY  r~  rT  )r   r  input_collectiverG   rG   rH   r    s   zWait.codegencollective_opr<   c                 C   s   |   t| |gdS )N)r  rY  )r  r>  r   )r  rC  rG   rG   rH   r    s
   zWait.createc                 C   s   | j d  gS r   )rY  r  r   rG   rG   rH   r    r  zWait.get_alias_namesr}  )rC  r<   )r   r   r   r  r   rT  r  r%  r  r  r&  rG   rG   r  rH   r>  {  s    	r>  c                       sD   e Zd ZdZ fddZdd Zdd Zedd	 Zd
d Z	  Z
S )CollectiveKernela  
    Each collective should follow the pattern:
    - extend InPlaceCollectiveKernel or OutOfPlaceCollectiveKernel.
    - the kernel delegates into c10d processgroup, which returns a 'work' obj
    - the work obj is registered via _register_tensor_work so it can be waited on later
    c                    s$   t  d ||| tj| | _d S rK   r  r   r1   r   r[  rP   r  r  rG   rH   r        zCollectiveKernel.__init__c                 C   r  NzMust implementrI  r   r  r0  input_namesrG   rG   rH   codegen_collective  r  z#CollectiveKernel.codegen_collectivec                 C   r  rG  rI  rH  rG   rG   rH   codegen_output  r  zCollectiveKernel.codegen_outputc                 C   s   dd }t t||S )Nc                 S   s(   t t|  |  |  | }t|S rK   )InPlaceHintr  r   r   r   r<   r  )r  r  rG   rG   rH   
wrap_input  s   
z;CollectiveKernel.wrap_inputs_as_inplace.<locals>.wrap_input)r5   r   )r  rY  rM  rG   rG   rH   wrap_inputs_as_inplace  s   z'CollectiveKernel.wrap_inputs_as_inplacec              
   C   s   | d | d | d dd | jD }|  }| j\}}}|| d| d| d| d	 | ||| | ||| |d
| d| d d S )Nz import torch.distributed as distz1import torch.distributed.distributed_c10d as c10dzEimport torch.distributed._functional_collectives_impl as fun_col_implc                 S   r3  rG   r  r  rG   rG   rH   r\     r   z,CollectiveKernel.codegen.<locals>.<listcomp>z0_pg = c10d._find_or_create_pg_by_ranks_and_tag('r  r  r  z#fun_col_impl._register_tensor_work(z_work))rA  rY  rT  rz  r~  rK  rJ  )r   r  rI  r0  tagranks
group_sizerG   rG   rH   r    s    

zCollectiveKernel.codegen)r   r   r   r  r   rJ  rK  r%  rN  r  r&  rG   rG   r  rH   rD    s    
	rD  c                       s8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
InPlaceCollectiveKernelz
    InPlaceCollectiveKernel are those with in-out arguments such as all_reduce.
    Extend this kernel if your collective needs to modify its inputs in-place.
    c                    r?  rK   r@  r  r  rG   rH   r     rR   z InPlaceCollectiveKernel.__init__c                 C   r  r  rG   r   rG   rG   rH   rT    r  z'InPlaceCollectiveKernel.should_allocatec                 C   r  rS  rG   r   rG   rG   rH   has_side_effects  r  z(InPlaceCollectiveKernel.has_side_effectsc                 C   sH   t |dkr|| dd| d d S || d|d   d S )Nr    = [,z] r>  r   )r_   r~  r   rH  rG   rG   rH   rK    s    z&InPlaceCollectiveKernel.codegen_output)	r   r   r   r  r   rT  rS  rK  r&  rG   rG   r  rH   rR    s    rR  c                       sR   e Zd ZdZ fddZdd Zdd Zdd	 ZedddZ	edd Z
  ZS )OutOfPlaceCollectiveKernelz
    OutOfPlaceCollectiveKernel are those that allocate their
    outputs and leave their inputs inplace, such as all_gather.
    c                    s$   t  ||| | || _|| _d S rK   )r  r   outputsoriginal_inputsr   r  rY  rW  rz  r  rG   rH   r     s   
z#OutOfPlaceCollectiveKernel.__init__c                 C   r  r  rG   r   rG   rG   rH   rT    r  z*OutOfPlaceCollectiveKernel.should_allocatec                 C   r  rS  rG   r   rG   rG   rH   rS     r  z+OutOfPlaceCollectiveKernel.has_side_effectsc                 C   sX   dd | j D }|| dd| d || dddd | jD  d d S )	Nc                 S   r3  rG   r  r  rG   rG   rH   r\     r   z=OutOfPlaceCollectiveKernel.codegen_output.<locals>.<listcomp>z_inputs = [rU  r  rT  c                 s   r  rK   rO   r  rG   rG   rH   r    r  z<OutOfPlaceCollectiveKernel.codegen_output.<locals>.<genexpr>)rX  r~  r   rW  rH  rG   rG   rH   rK    s   ,z)OutOfPlaceCollectiveKernel.codegen_outputNc                 C   sP   g }|D ]!}|  }|d ur|| tt| | |dd}|| q|S )Nr  r  )r   OutputBufferr  r   r   r  )r  rY  size_cbrW  rk  r`  buffrG   rG   rH   create_output_buffers  s   z0OutOfPlaceCollectiveKernel.create_output_buffersc                    s    fddt |D S )Nc                    s&   g | ]\}}t |j d | dqS )r  r  )MultiOutputNoSizeAssertr  )rX   rY   out_tcollrG   rH   r\     s    
zBOutOfPlaceCollectiveKernel.create_output_nodes.<locals>.<listcomp>r  )r  ra  output_buffersrG   r`  rH   create_output_nodes  s   
z.OutOfPlaceCollectiveKernel.create_output_nodesrK   )r   r   r   r  r   rT  rS  rK  r%  r]  rc  r&  rG   rG   r  rH   rV    s    rV  c                       s0   e Zd ZdZdd Z fddZdd Z  ZS )rL  a  
    Helper OP to encode an in/out argument that tries to make it inplace whenever possible.
    Wrap the input of your inplace op to enable this behavior.

    The design is based on two key decisions:
    - this node is resposible for allocating the in/out buffer used by the collective.
        This is controlled by the ``should_allocate`` method that returns True here and
        False for the collective node
    - The scheduler special-case this node and enable it to reuse its input.
    c                 C   sF   | j d  }|  }|| | j d s!|| d| d d S d S )Nr   r  z) #no reuse)rY  r  rT  	did_reuser~  )r   r  
input_namer0  rG   rG   rH   r  3  s
   zInPlaceHint.codegenc                    s6   |  |}t d || |gd tj| | _d S r  )rm  r  r   rX  r1   r   r[  rP   )r   r  rk  r  rG   rH   r   9  s   
zInPlaceHint.__init__c                 C   r  rS  rG   r   rG   rG   rH   rT  >  r  zInPlaceHint.should_allocate)r   r   r   r  r  r   rT  r&  rG   rG   r  rH   rL  '  s
    rL  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )rZ  zO
    Represent the output buffer used by ops that require multiple of them
    c                    s$   t  jd |g d tj| | _d S )Nrr  rE  )r   r  r  rG   rH   r   G  rF  zOutputBuffer.__init__c                 C   r  rS  rG   r   rG   rG   rH   rT  K  r  zOutputBuffer.should_allocatec                 C   s   | d| j  d S )Nz# collective out buffer )r~  rP   r  rG   rG   rH   r  N     zOutputBuffer.codegen)r   r   r   r  r   rT  r  r&  rG   rG   r  rH   rZ  B  s
    rZ  c                       s(   e Zd ZdZ fddZdd Z  ZS )r^  z
    Extract partial output from a multi-output OP.
    Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emiting this.
    c                    s   t  ||g  || _d S rK   )r  r   rZ   )r   r  rk  rZ   r  rG   rH   r   X     
z MultiOutputNoSizeAssert.__init__c                 C   s,   | |   d| jd   | j  d S r=  )r~  rT  rY  rZ   r  rG   rG   rH   r  \  s    zMultiOutputNoSizeAssert.codegen)r   r   r   r  r   r  r&  rG   rG   r  rH   r^  R  s    r^  c                       sV   e Zd Z fddZdd Zeded deded	ee d
ef
ddZ	dd Z
  ZS )AllReduceCoalescedc                    r  rK   r  r   	reduce_opr   r  rY  rz  rj  r  rG   rH   r   c  rg  zAllReduceCoalesced.__init__c                 C   r  r  rG   r   rG   rG   rH   rT  g  r  z"AllReduceCoalesced.should_allocaterY  r<   rj  rO  rP  rQ  c           	      C   s4   |  |}t|d  }t|||||g|d}|S Nr   )r  rY  rz  rj  )rN  r	  r   rh  )	r  rY  rj  rO  rP  rQ  inplace_inputsr  r   rG   rG   rH   r  j  s   
	zAllReduceCoalesced.createc              
   C   s,   | | d| dt| j d| d d S )Nz"_work = dist.all_reduce_coalesced(z%, op=fun_col_impl._str_to_reduce_op('
'), group=_pg, async_op=True)r~  rS   rj  rH  rG   rG   rH   rJ  ~  s   z%AllReduceCoalesced.codegen_collective)r   r   r   r   rT  r%  r
   rS   r  r  rJ  r&  rG   rG   r  rH   rh  b  s     rh  c                       J   e Zd Z fddZedddededee def
d	d
Zdd Z	  Z
S )	AllReducec                    r  rK   ri  rk  r  rG   rH   r     rg  zAllReduce.__init__r   r<   rj  rO  rP  rQ  c           	      C   s:   |  |g}t|d  }t|||||g|d}|d S rl  )rN  r	  r   rr  )	r  r   rj  rO  rP  rQ  rm  r  r   rG   rG   rH   r    s   zAllReduce.createc              
   C   s,   | | d| d| dt| j d d S )Nz_work = dist.all_reduce(z, async_op=True, group=(_pg, op=fun_col_impl._str_to_reduce_op(''))rp  rH  rG   rG   rH   rJ    s   zAllReduce.codegen_collectiver   r   r   r   r%  rS   r
   r  r  rJ  r&  rG   rG   r  rH   rr    s    rr  c                	       sF   e Zd Z fddZedddedee defdd	Zd
d Z	  Z
S )AllGatherIntoTensorc                       t  |||| d S rK   r@  rY  r  rG   rH   r     rf  zAllGatherIntoTensor.__init__r   r<   rO  rP  rQ  c           
         sZ   |  |g} fdd}| ||}t|d  }t||||| gd}	| |	|d S )Nc                       | d   9  < d S r   rG   r`  rQ  rG   rH   compute_size  rR   z0AllGatherIntoTensor.create.<locals>.compute_sizer   r  rY  rW  rz  )rm  r]  r	  r   rv  rc  )
r  r   rO  rP  rQ  rY  r{  rW  r  r3  rG   rz  rH   r    s   zAllGatherIntoTensor.createc              
   C   &   | | d| d| d| d d S )Nz$_work = dist.all_gather_into_tensor([0], !_inputs[0], async_op=True, group=z_pg)r~  rH  rG   rG   rH   rJ    s   z&AllGatherIntoTensor.codegen_collectiveru  rG   rG   r  rH   rv    s
     rv  c                       rq  )ReduceScatterTensorc                       t  |||| || _d S rK   ri  r   r  rY  rW  rz  rj  r  rG   rH   r        
zReduceScatterTensor.__init__r   r<   rj  rO  rP  rQ  c                    s\   |  |g} fdd}| ||}t|d  }	t|	|||| g|d}
| |
|d S )Nc                       | d     < d S r   rG   ry  rz  rG   rH   r{    rR   z0ReduceScatterTensor.create.<locals>.compute_sizer   r  rY  rW  rz  rj  )rm  r]  r	  r   r  rc  )r  r   rj  rO  rP  rQ  rY  r{  rW  r  r3  rG   rz  rH   r    s   	zReduceScatterTensor.createc                 C   s2   | | d| d| d| dt| j d
 d S )Nz#_work = dist.reduce_scatter_tensor(r~  r  rs  rt  rp  rH  rG   rG   rH   rJ    s   z&ReduceScatterTensor.codegen_collectiveru  rG   rG   r  rH   r    s    r  c                	       sJ   e Zd Z fddZeded dedee defdd	Zd
d Z	  Z
S )AllGatherIntoTensorCoalescedc                    rw  rK   r@  rY  r  rG   rH   r     rf  z%AllGatherIntoTensorCoalesced.__init__rY  r<   rO  rP  rQ  c           	         sT    fdd|D }fdd}  ||}t|d  }t|||||gd}|S )Nc                    rd  rG   r  r  r  rG   rH   r\     r{   z7AllGatherIntoTensorCoalesced.create.<locals>.<listcomp>c                    rx  r   rG   ry  rz  rG   rH   r{    rR   z9AllGatherIntoTensorCoalesced.create.<locals>.compute_sizer   r|  )r]  r	  r   r  )	r  rY  rO  rP  rQ  r{  rW  r  r3  rG   r  rQ  rH   r    s   z#AllGatherIntoTensorCoalesced.createc              
   C   r}  )NzO_work = fun_col_impl._all_gather_into_tensor_coalesced_fallback(output_tensors=, input_tensors=z_inputs, group=ro  r  rH  rG   rG   rH   rJ    s   z/AllGatherIntoTensorCoalesced.codegen_collectiver   r   r   r   r%  r
   rS   r  r  rJ  r&  rG   rG   r  rH   r    s    r  c                       sN   e Zd Z fddZeded dededee def
d	d
Zdd Z	  Z
S )ReduceScatterTensorCoalescedc                    r  rK   ri  r  r  rG   rH   r     r  z%ReduceScatterTensorCoalesced.__init__rY  r<   rj  rO  rP  rQ  c           
         sV    fdd|D }fdd}  ||}t|d  }t|||||g|d}	|S )Nc                    rd  rG   r  r  r  rG   rH   r\      r{   z7ReduceScatterTensorCoalesced.create.<locals>.<listcomp>c                    r  r   rG   ry  rz  rG   rH   r{  "  rR   z9ReduceScatterTensorCoalesced.create.<locals>.compute_sizer   r  )r]  r	  r   r  )
r  rY  rj  rO  rP  rQ  r{  rW  r  r   rG   r  rH   r    s   	z#ReduceScatterTensorCoalesced.createc                 C   s2   | | d| d| dt| j d| d
 d S )NzN_work = fun_col_impl._reduce_scatter_tensor_coalesced_fallback(output_tensors=r  z,_inputs, op=fun_col_impl._str_to_reduce_op('rn  ro  rp  rH  rG   rG   rH   rJ  3  s   z/ReduceScatterTensorCoalesced.codegen_collectiver  rG   rG   r  rH   r    s    r  r*  )TFN)FN)r   r<   r1  r<   rA  r<   )rh  r   r  r  r  loggingr;  textwrapr   r   enumr   r   inspectr   typingr   r   r   r	   r
   r   r   r   r   r   unittest.mockr   r=   r   r   torch._loggingr7   torch.fxtorch.utils._pytreeutils_pytreer  torch._dynamo.utilsr   torch._prims_commonr   r   r   r   r   torch.fx.operator_schemasr   torch.utils._sympy.functionsr   r   r   r   r   r   codegen.commonr    cuda_propertiesr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   virtualizedr0   r1   	getLoggerr   rP  r   r  rJ   rT   ri   rl   rs   r  r   r  r   r   r   r   r   r   r   r   r  r   r   r   r   r&  r   r   r+  r,  r;  r?  rD  rr  rs  rt  r  r  r   r=  r9  r  rG  rH  r:   rn  ry  r  r  rE  r  r  r  r  r  r  r  r  r  rB  r  r8  r  r  r  rV  rW  rp  rq  rl  r  r  r  r  r  r  r  r;   r   r	  r1  r  rX  r[  r\  rm  rv  r~  r  r  r  r  r  r  r  r  r<   rA  r   Interpreterr  r>  r  r>  rD  rR  rV  rL  rZ  r^  rh  rr  rv  r  r  r  rG   rG   rG   rH   <module>   s   08
"	

@O
2    5 }
J:): @1m#[Hh s31g  j$"V$% e*	

 
8H[_5@EVn 	 #o(ld ,55&()