"""Auto-generated stub file"""
from enum import Enum
import nki.language as nl
import ml_dtypes
class NKIObject:
    """Base class for NKI stub objects; ``tile_size`` in this module derives from it."""

    # Stub only: the class body is a placeholder.
    ...
# Data type identifiers. Each constant is a plain string whose value equals its
# own name. The bare string literal following each assignment is that
# constant's attribute docstring; keep every assignment/docstring pair adjacent.

# --- boolean and integer types ---
bool_ = 'bool'
"""Boolean (True or False) stored as a byte"""
int8 = 'int8'
"""8-bit signed integer number"""
int16 = 'int16'
"""16-bit signed integer number"""
int32 = 'int32'
"""32-bit signed integer number"""
uint8 = 'uint8'
"""8-bit unsigned integer number"""
uint16 = 'uint16'
"""16-bit unsigned integer number"""
uint32 = 'uint32'
"""32-bit unsigned integer number"""
# --- floating-point types; (xS,yE,zM) lists the bit fields given in each docstring ---
float16 = 'float16'
"""16-bit floating-point number"""
float32 = 'float32'
"""32-bit floating-point number"""
bfloat16 = 'bfloat16'
"""16-bit floating-point number (1S,8E,7M)"""
tfloat32 = 'tfloat32'
"""32-bit floating-point number (1S,8E,10M)"""
float8_e4m3 = 'float8_e4m3'
"""8-bit floating-point number (1S,4E,3M)"""
float8_e5m2 = 'float8_e5m2'
"""8-bit floating-point number (1S,5E,2M)"""
# --- 4x packed types, documented for nki.isa.nc_matmul_mx on NeuronCore-v4 ---
float8_e5m2_x4 = 'float8_e5m2_x4'
"""4x packed float8_e5m2 elements, custom data type for nki.isa.nc_matmul_mx on NeuronCore-v4"""
float8_e4m3fn_x4 = 'float8_e4m3fn_x4'
"""4x packed float8_e4m3fn elements, custom data type for nki.isa.nc_matmul_mx on NeuronCore-v4"""
float4_e2m1fn_x4 = 'float4_e2m1fn_x4'
"""4x packed float4_e2m1fn elements, custom data type for nki.isa.nc_matmul_mx on NeuronCore-v4"""
def ndarray(shape, dtype, buffer=None, name=""):
    r"""
    Create a new tensor of given shape and dtype on the specified buffer.

    ((Similar to `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_))

    :param shape: the shape of the tensor.
    :param dtype: the data type of the tensor (see :ref:`nki-dtype` for more information).
    :param buffer: the specific buffer (i.e., :doc:`sbuf<nki.language.sbuf>`, :doc:`psum<nki.language.psum>`, :doc:`hbm<nki.language.hbm>`), defaults to :doc:`sbuf<nki.language.sbuf>`.
    :param name: the name of the tensor.
    :return: a new tensor allocated on the buffer.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def zeros(shape, dtype, buffer=None, name=""):
    r"""
    Create a new tensor of given shape and dtype on the specified buffer, filled with zeros.

    ((Similar to `numpy.zeros <https://numpy.org/doc/stable/reference/generated/numpy.zeros.html>`_))

    :param shape: the shape of the tensor.
    :param dtype: the data type of the tensor (see :ref:`nki-dtype` for more information).
    :param buffer: the specific buffer (i.e., :doc:`sbuf<nki.language.sbuf>`, :doc:`psum<nki.language.psum>`, :doc:`hbm<nki.language.hbm>`), defaults to :doc:`sbuf<nki.language.sbuf>`.
    :param name: the name of the tensor.
    :return: a new zero-filled tensor allocated on the buffer.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def shared_constant(constant, dtype=None):
    r"""
    Create a new tensor filled with the data specified by the ``constant`` data array.

    :param constant: the constant data to be filled into a tensor
    :param dtype: (optional) the data type of the tensor (see :ref:`nki-dtype` for more information); defaults to ``None``.
    :return: a tensor which contains the constant data
    """
    # NOTE(review): the behavior when ``dtype`` is None is not stated here;
    # presumably the dtype of ``constant`` is kept -- confirm against the NKI reference.
    ...
def shared_identity_matrix(n, dtype="uint8"):
    r"""
    Create a new identity tensor with specified data type.

    This function has the same behavior as :doc:`nki.language.shared_constant <nki.language.shared_constant>` but
    is preferred if the constant matrix is an identity matrix. The
    compiler will reuse all the identity matrices of the same
    dtype in the graph to save space.

    :param n: the number of rows (and columns) of the returned identity matrix
    :param dtype: the data type of the tensor, default to be ``nl.uint8`` (see :ref:`nki-dtype` for more information).
    :return: a tensor which contains the identity tensor
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def affine_range(start, stop=None, step=1):
    r"""
    Create a sequence of numbers for use as **parallel** loop iterators in NKI. ``affine_range`` should be the default
    loop iterator choice, when there is **no** loop carried dependency. Note, associative reductions are **not** considered
    loop carried dependencies in this context. A concrete example of associative reduction
    is multiple :doc:`nl.matmul <nki.language.matmul>`
    or :doc:`nisa.nc_matmul <nki.isa.nc_matmul>` calls accumulating into the same
    output buffer defined outside of this loop level (see code example #2 below).

    When the above conditions are not met, we recommend using :doc:`sequential_range <nki.language.sequential_range>`
    instead.

    Notes:

    - Using ``affine_range`` prevents Neuron compiler from unrolling the loops until entering compiler backend,
      which typically results in better compilation time compared to the fully unrolled iterator
      :doc:`static_range <nki.language.static_range>`.
    - Using ``affine_range`` also allows Neuron compiler to perform additional loop-level optimizations, such as
      loop vectorization in current release. The exact type of loop-level optimizations applied is subject
      to changes in future releases.
    - Since each kernel instance only runs on a single NeuronCore, ``affine_range`` does **not** parallelize
      different loop iterations across multiple NeuronCores. However, different iterations could be parallelized/pipelined
      on different compute engines within a NeuronCore depending on the invoked instructions (engines) and data dependency
      in the loop body.

    .. code-block::
        :linenos:

        import nki.language as nl

        #######################################################################
        # Example 1: No loop carried dependency
        # Input/Output tensor shape: [128, 2048]
        # Load one tile ([128, 512]) at a time, square the tensor element-wise,
        # and store it into output tile
        #######################################################################

        # Every loop instance works on an independent input/output tile.
        # No data dependency between loop instances.
        for i_input in nl.affine_range(input.shape[1] // 512):
            offset = i_input * 512
            input_sb = nl.load(input[0:input.shape[0], offset:offset+512])
            result = nl.multiply(input_sb, input_sb)
            nl.store(output[0:input.shape[0], offset:offset+512], result)

        #######################################################################
        # Example 2: Matmul output buffer accumulation, a type of associative reduction
        # Input tensor shapes for nl.matmul: xT[K=2048, M=128] and y[K=2048, N=128]
        # Load one tile ([128, 128]) from both xT and y at a time, matmul and
        # accumulate into the same output buffer
        #######################################################################

        result_psum = nl.zeros((128, 128), dtype=nl.float32, buffer=nl.psum)
        for i_K in nl.affine_range(xT.shape[0] // 128):
            offset = i_K * 128
            xT_sbuf = nl.load(xT[offset:offset+128, 0:xT.shape[1]])
            y_sbuf = nl.load(y[offset:offset+128, 0:y.shape[1]])

            result_psum += nl.matmul(xT_sbuf, y_sbuf, transpose_x=True)
    """
    # NOTE(review): arguments presumably follow Python ``range(start, stop, step)``
    # semantics (the examples call it with a single count) -- confirm against the NKI reference.
    ...
def ds(start, size):
    r"""
    Construct a dynamic slice for simple tensor indexing.

    .. nki_example:: ../../test/test_nki_nl_dslice.py
        :language: python
        :marker: NKI_EXAMPLE_1
    """
    # NOTE(review): presumably describes the index range beginning at ``start``
    # and spanning ``size`` elements -- confirm against the referenced example.
    ...
def sequential_range(start, stop=None, step=1):
    r"""
    Create a sequence of numbers for use as **sequential** loop iterators in NKI. ``sequential_range``
    should be used when there is a loop carried dependency. Note, associative reductions are **not** considered
    loop carried dependencies in this context. See :doc:`affine_range <nki.language.affine_range>` for
    an example of such associative reduction.

    Notes:

    - Inside a NKI kernel, any use of Python ``range(...)`` will be replaced with ``sequential_range(...)``
      by Neuron compiler.
    - Using ``sequential_range`` prevents Neuron compiler from unrolling the loops until entering compiler backend,
      which typically results in better compilation time compared to the fully unrolled iterator
      :doc:`static_range <nki.language.static_range>`.
    - Using ``sequential_range`` informs Neuron compiler to respect inter-loop dependency and perform
      much more conservative loop-level optimizations compared to ``affine_range``.
    - Using ``affine_range`` instead of ``sequential_range`` in case of loop carried dependency
      incorrectly is considered unsafe and could lead to numerical errors.

    .. code-block::
        :linenos:

        import nki.isa as nisa
        import nki.language as nl

        #######################################################################
        # Example 1: Loop carried dependency from tiling tensor_tensor_scan
        # Both sbuf tensor input0 and input1 shapes: [128, 2048]
        # Perform a scan operation between the two inputs using a tile size of [128, 512]
        # Store the scan output to another [128, 2048] tensor
        #######################################################################

        # Loop iterations communicate through this init tensor
        init = nl.zeros((128, 1), dtype=input0.dtype)

        # This loop will only produce correct results if the iterations are performed in order
        for i_input in nl.sequential_range(input0.shape[1] // 512):
            offset = i_input * 512

            # Depends on scan result from the previous loop iteration
            result = nisa.tensor_tensor_scan(input0[:, offset:offset+512],
                                             input1[:, offset:offset+512],
                                             initial=init,
                                             op0=nl.multiply, op1=nl.add)

            nl.store(output[0:input0.shape[0], offset:offset+512], result)

            # Prepare initial result for scan in the next loop iteration
            init[:, :] = result[:, 511]
    """
    # ``stop`` and ``step`` carry the same defaults as ``affine_range`` and
    # ``static_range`` so the one-argument form used in the example above (and
    # by any Python ``range(n)`` call that gets replaced) is accepted.
    ...
def static_range(start, stop=None, step=1):
    r"""
    Create a sequence of numbers for use as loop iterators in NKI, resulting in a fully unrolled loop.
    Unlike :doc:`affine_range <nki.language.affine_range>` or :doc:`sequential_range <nki.language.sequential_range>`,
    Neuron compiler will fully unroll the loop during NKI kernel tracing.

    Notes:

    - Due to loop unrolling, compilation time may go up significantly compared to
      :doc:`affine_range <nki.language.affine_range>` or :doc:`sequential_range <nki.language.sequential_range>`.
    - On-chip memory (SBUF) usage may also go up significantly compared to
      :doc:`affine_range <nki.language.affine_range>` or :doc:`sequential_range <nki.language.sequential_range>`.
    - No loop-level optimizations will be performed in the compiler.
    - ``static_range`` should only be used as a fall-back option for debugging purposes when
      :doc:`affine_range <nki.language.affine_range>` or :doc:`sequential_range <nki.language.sequential_range>`
      is giving functionally incorrect results or undesirable performance characteristics.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
class tile_size(NKIObject):
    r"""Tile size constants."""

    # Every value below is the ``...`` placeholder: this stub declares only the
    # attribute names and their ``int`` annotations, not the actual numbers.
    # The string literal after each attribute is its attribute docstring.

    pmax: int = ...
    r"""Maximum partition dimension of a tile"""

    psum_fmax: int = ...
    r"""Maximum free dimension of a tile on PSUM buffer"""

    gemm_stationary_fmax: int = ...
    r"""Maximum free dimension of the stationary operand of General Matrix Multiplication on Tensor Engine"""

    gemm_moving_fmax: int = ...
    r"""Maximum free dimension of the moving operand of General Matrix Multiplication on Tensor Engine"""

    bn_stats_fmax: int = ...
    r"""Maximum free dimension of BN_STATS"""

    psum_min_align: int = ...
    r"""Minimum byte alignment requirement for PSUM free dimension address"""

    sbuf_min_align: int = ...
    r"""Minimum byte alignment requirement for SBUF free dimension address"""

    total_available_sbuf_size: int = ...
    r"""Total SBUF available size"""
def abs(x, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # NOTE: this name shadows the builtin ``abs`` inside this module.
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def add(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def bitwise_and(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def bitwise_or(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def bitwise_xor(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def divide(x, y, dtype=None):
    r"""
    Divide the inputs, element-wise.

    ((Similar to `numpy.divide <https://numpy.org/doc/stable/reference/generated/numpy.divide.html>`_))

    :param x: a tile or a scalar value.
    :param y: a tile or a scalar value. ``x.shape`` and ``y.shape`` must be `broadcastable <https://numpy.org/doc/stable/user/basics.broadcasting.html>`__ to a common shape, that will become the shape of the output.
    :param dtype: (optional) data type to cast the output type to (see :ref:`nki-dtype` for more information); if not specified, it will default to be the same as the data type of the input tiles, or whichever input type has the highest precision (see :ref:`nki-type-promotion` for more information).
    :return: a tile that has ``x / y``, element-wise.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def equal(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def gelu_apprx_sigmoid(x, dtype=None):
    r"""
    Gaussian Error Linear Unit activation function on the input, element-wise, with sigmoid approximation.

    :param x: a tile.
    :param dtype: (optional) data type to cast the output type to (see :ref:`nki-dtype` for more information); if not specified, it will default to be the same as the data type of the input tile.
    :return: a tile that has gelu of ``x``.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def greater(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def greater_equal(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def invert(x, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def left_shift(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def less(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def less_equal(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def logical_and(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def logical_not(x, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def logical_or(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def logical_xor(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def maximum(x, y, dtype=None):
    r"""
    Maximum of the inputs, element-wise.

    ((Similar to `numpy.maximum <https://numpy.org/doc/stable/reference/generated/numpy.maximum.html>`_))

    :param x: a tile or a scalar value.
    :param y: a tile or a scalar value. ``x.shape`` and ``y.shape`` must be `broadcastable <https://numpy.org/doc/stable/user/basics.broadcasting.html>`__ to a common shape, that will become the shape of the output.
    :param dtype: (optional) data type to cast the output type to (see :ref:`nki-dtype` for more information); if not specified, it will default to be the same as the data type of the input tiles, or whichever input type has the highest precision (see :ref:`nki-type-promotion` for more information).
    :return: a tile that has the maximum of each element from x and y.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def minimum(x, y, dtype=None):
    r"""
    Minimum of the inputs, element-wise.

    ((Similar to `numpy.minimum <https://numpy.org/doc/stable/reference/generated/numpy.minimum.html>`_))

    :param x: a tile or a scalar value.
    :param y: a tile or a scalar value. ``x.shape`` and ``y.shape`` must be `broadcastable <https://numpy.org/doc/stable/user/basics.broadcasting.html>`__ to a common shape, that will become the shape of the output.
    :param dtype: (optional) data type to cast the output type to (see :ref:`nki-dtype` for more information); if not specified, it will default to be the same as the data type of the input tiles, or whichever input type has the highest precision (see :ref:`nki-type-promotion` for more information).
    :return: a tile that has the minimum of each element from x and y.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def multiply(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def not_equal(x, y, dtype=bool):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def power(x, y, dtype=None):
    r"""
    Elements of x raised to powers of y, element-wise.

    ((Similar to `numpy.power <https://numpy.org/doc/stable/reference/generated/numpy.power.html>`_))

    :param x: a tile or a scalar value.
    :param y: a tile or a scalar value. ``x.shape`` and ``y.shape`` must be `broadcastable <https://numpy.org/doc/stable/user/basics.broadcasting.html>`__ to a common shape, that will become the shape of the output.
    :param dtype: (optional) data type to cast the output type to (see :ref:`nki-dtype` for more information); if not specified, it will default to be the same as the data type of the input tiles, or whichever input type has the highest precision (see :ref:`nki-type-promotion` for more information).
    :return: a tile that has values ``x`` to the power of ``y``.
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def reciprocal(x, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def right_shift(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def rsqrt(x, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def subtract(x, y, dtype=None):
    r"""This operation is not supported in the current release of NKI."""
    # Stub only: the body is a placeholder and carries no implementation.
    ...
# Buffer (memory space) identifiers. Each constant is a plain string equal to
# its own name; the bare string literal after each assignment is its attribute
# docstring, so keep every assignment/docstring pair adjacent.
sbuf = 'sbuf'
"""State Buffer - Only visible to each individual kernel instance in the SPMD grid"""
psum = 'psum'
"""PSUM - Only visible to each individual kernel instance in the SPMD grid"""
# NOTE: documented as an alias of private_hbm, but the stored string is 'hbm', not 'private_hbm'.
hbm = 'hbm'
"""HBM - Alias of private_hbm"""
shared_hbm = 'shared_hbm'
"""Shared HBM - Visible to all kernel instances in the SPMD grid"""
private_hbm = 'private_hbm'
"""HBM - Only visible to each individual kernel instance in the SPMD grid"""
def device_print(print_prefix, tensor):
    r"""
    Print a message with a string ``print_prefix`` followed by the value of a tile ``tensor``.

    By default, using this function will not result in your tensors being printed out. When running your kernel,
    you need to define the environment variable ``NEURON_RT_DEBUG_OUTPUT_DIR`` and point it to a directory that will
    store the tensor data grouped by prefix each time the device_print instruction is executed.
    The structure of the directory will be ``<print_prefix>/core_<logical core id>/<iteration>/...``.

    .. code-block:: python
        :caption: Example usage
        :emphasize-lines: 7

        import nki.isa as nisa
        import nki.language as nl

        def my_nki_kernel(input_tensor):
            a_tile = sbuf.view(input_tensor.dtype, input_tensor.shape)
            nisa.dma_copy(a_tile, input_tensor)
            nl.device_print("a_tile", a_tile)
            ...

    .. warning::
        This feature is only available when using the NxD Inference library.

    :param print_prefix: prefix of the print message. This string is evaluated at trace time and must be a constant expression.
    :type print_prefix: str
    :param tensor: tensor to print out. Can be in SBUF or HBM.
    :return: None
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def num_programs(axes=None):
    r"""
    Number of SPMD programs along the given axes in the launch grid. If ``axes`` is not provided,
    returns the total number of programs.

    :param axes: The axes of the ND launch grid. If not provided, returns the total number of programs along the entire launch grid.
    :return: The number of SPMD (single program, multiple data) programs along ``axes`` in the launch grid
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def program_id(axis):
    r"""
    Index of the current SPMD program along the given axis in the launch grid.

    :param axis: The axis of the ND launch grid.
    :return: The program id along ``axis`` in the launch grid
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...
def program_ndim():
    r"""
    Number of dimensions in the SPMD launch grid.

    :return: The number of dimensions in the launch grid, i.e. the number of axes
    """
    # Stub only: the body is a placeholder and carries no implementation.
    ...