This document is relevant for: Inf1, Inf2, Trn1, Trn2, Trn3
nec.h#
Neuron Elastic Collectives (NEC) API - Collective operations for distributed computing on Neuron devices.
Source: src/libnrt/include/nrt/nec.h
Overview#
This is the main component for Neuron Elastic Collectives in Neuron Runtime (NRT). It provides applications with collective operations offloaded by the device, including collective comm init, receiving (post) operations, building resources for the operation, triggering the operation, and polling for its completion.
Constants#
NEC_MAX_CHANNELS#
/* Maximum number of channels; matches MAXCHANNELS in NCCL. */
#define NEC_MAX_CHANNELS 32
Maximum channels (matches MAXCHANNELS in NCCL).
Source: nec.h:18
NEC_MAX_COMM_N#
/* Maximum number of replica groups supported in a NEFF. */
#define NEC_MAX_COMM_N 12
Max supported replica-groups in NEFF.
Source: nec.h:26
NEC_MAX_STREAM_N#
/* Maximum number of cc executions that may run concurrently. */
#define NEC_MAX_STREAM_N 4
The maximum number of concurrent cc executions.
Source: nec.h:56
Enumerations#
nec_pod_type_t#
/**
 * Pod type, translated from the value KaenaDriver returns.
 *
 * No enumerator has an explicit initializer, so the values are the
 * consecutive integers 0..3 in declaration order.
 */
typedef enum nec_pod_type {
    NEC_POD_TYPE_NONE,
    NEC_POD_TYPE_P2P,
    NEC_POD_TYPE_SWITCH,
    NEC_POD_TYPE_INVALID
} nec_pod_type_t;
Pod type enumeration (translated from what KaenaDriver returns).
Source: nec.h:103
enc_pattern_t#
/**
 * Communication pattern types.
 *
 * No enumerator has an explicit initializer, so the values are the
 * consecutive integers 0..2 in declaration order.
 */
typedef enum enc_pattern {
    ENC_PATTERN_RING,
    ENC_PATTERN_MESH,
    ENC_PATTERN_INVALID,
} enc_pattern_t;
Communication pattern types.
Source: nec.h:244
Structures#
nccl_comm_info_t#
/**
 * Communicator information queried from NCCL.
 *
 * NOTE(review): per-field semantics are not described on this page; the
 * inline notes below state only what the declarations themselves show.
 */
typedef struct nccl_comm_info {
    uint64_t cluster_id;
    time_t   epoch;
    int      neuron_dev;

    int      rank;
    int      rank_n;
    int      local_rank_n;
    int      local_rack_rank_n;

    int      node;
    int      node_n;

    bool     enable_pod;
    bool     use_net;

    int      pod;
    int      pod_n;
    int      pod_node;
    int      pod_node_n;

    /* NOTE(review): element count and ownership of this pointer are not
     * visible here; confirm against nec.h. */
    struct enc_peer_info *peers;

    /* Fixed-capacity table with NEC_MAX_CHANNELS entries; channel_n is
     * presumably the number of entries in use (TODO confirm). */
    int             channel_n;
    struct enc_ring rings[NEC_MAX_CHANNELS];

    /* One int pointer per slot, NEC_MAX_CHANNELS slots each; the length of
     * each pointed-to array is not visible here. */
    int  kangaring_channel_n;
    int *kangaring_paths[NEC_MAX_CHANNELS];
    int  mla_cycle_n;
    int *mla_cycles[NEC_MAX_CHANNELS];
} nccl_comm_info_t;
Comm info to query from NCCL.
Source: nec.h:732
enc_neuron_device_info_t#
/**
 * Neuron device information.
 *
 * Used to send the device information from KaenaRuntime to KaenaNCCL for
 * nccl communicator building.
 */
typedef struct enc_neuron_device_info {
    int nec_dev_id;
    int mla_idx;
    int tpb_idx;
    int host_device_id;
    int routing_id;

    uint64_t       pod_id;
    nec_pod_type_t pod_type;
    uint32_t       pod_node_id;
    uint32_t       virtual_server_id;

    enc_proxy_histogram_config_t histogram_config;
} enc_neuron_device_info_t;
Neuron Device information. This data structure is used to send the device information from KaenaRuntime to KaenaNCCL for nccl communicator building.
Source: nec.h:787
nec_version_info_t#
/**
 * NEC version information.
 */
typedef struct nec_version_info {
    uint64_t major;
    uint64_t minor;
    uint64_t patch;
    uint64_t maintenance;

    /* Fixed 16-byte buffer. NOTE(review): whether it is always
     * NUL-terminated is not visible here. */
    char git_hash[16];

    uint64_t compatibility_version;

    /* Flexible array member: contributes no storage to sizeof.
     * Presumably reserved for fields added later (TODO confirm). */
    uint8_t future_fields[];
} nec_version_info_t;
NEC version information.
Source: nec.h:920
Functions#
nec_get_device_count#
/**
 * Query device information: get the device count.
 *
 * @param[out] available_devices_array  Array to store available device IDs.
 * @param[in]  array_size               Size of the array.
 * @return Number of available devices.
 */
int nec_get_device_count(int *available_devices_array, uint32_t array_size);
Query device information - get device count.
Parameters:
available_devices_array[out] - Array to store available device IDs
array_size[in] - Size of the array
Returns: Number of available devices
Source: nec.h:917
nec_get_virtual_core_size#
/**
 * Query the vcore size.
 *
 * @param[out] virtual_core_size  Virtual core size.
 * @return NRT_STATUS_SUCCESS on success.
 */
NRT_STATUS nec_get_virtual_core_size(uint32_t *virtual_core_size);
Query vcore size.
Parameters:
virtual_core_size[out] - Virtual core size
Returns: NRT_STATUS_SUCCESS on success
Source: nec.h:923
nec_get_version_info#
/**
 * Get NEC version information.
 *
 * @param[out] version_info  Version information structure.
 * @return NRT_STATUS_SUCCESS on success.
 */
NRT_STATUS nec_get_version_info(nec_version_info_t *version_info);
Get NEC version information.
Parameters:
version_info[out] - Version information structure
Returns: NRT_STATUS_SUCCESS on success
Source: nec.h:932
This document is relevant for: Inf1, Inf2, Trn1, Trn2, Trn3