Merge remote-tracking branch 'nccl/master' into develop

Dieser Commit ist enthalten in:
Marzieh Berenjkoub
2026-01-20 13:01:49 -06:00
Commit 858b4e76eb
240 geänderte Dateien mit 16266 neuen und 3578 gelöschten Zeilen
+2 -2
Datei anzeigen
@@ -3,6 +3,6 @@
/coverage/
build/
ext/
src/transport/net_ib_rocm.cc
# Visual Studio Code
.vscode
.vscode
+15 -1
Datei anzeigen
@@ -2,16 +2,30 @@
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## Unreleased - RCCL 2.28.3 for ROCm 7.11
### Known issues
* AllGather regression for small message sizes (less than 1 MB) due to the Direct algorithm.
* ROCTx feature needs to be verified.
* Profiler plugin needs to be verified.
### Changed
* Compatibility with NCCL 2.28.3.
* The MSCCL feature is now disabled by default. The `--disable-msccl-kernel` build flag is replaced with `--enable-msccl-kernel` in the `rccl/install.sh` script.
* MSCCL and NPKIT are deprecated and will be removed in a future release of RCCL.
## Unreleased - RCCL 2.27.7 for ROCm 7.2.0
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
* Experimental support for traffic shaping using warp specialization (also known as WarpSpeed) is now available for the Ring algorithm.
* Enabling WarpSpeed in auto mode using RCCL_WARP_SPEED_AUTO optimizes performance and reduces the CU count by 50% on a single node for AllReduce, AllGather from 64MB, and ReduceScatter from 256MB.
* The following configuration knobs control WarpSpeed behavior for debugging purposes: `RCCL_WARP_SPEED_ENABLE`, `RCCL_UNROLL_FACTOR`, `RCCL_WARP_SPEED_CU_COUNT`, and `RCCL_THREADS_PER_BLOCK`. Note that the effective unroll factor is calculated as 2 raised to the value of `RCCL_UNROLL_FACTOR`.
### Known issues
* AllToAllv/AlltoAll for single GPU is hanging.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
### Changed
+39 -3
Datei anzeigen
@@ -26,7 +26,7 @@ option(BUILD_TESTS "Build unit test programs"
option(COLLTRACE "Collective Trace Option" ON)
option(DUMP_ASM "Disassemble and dump" OFF)
option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" OFF)
option(ENABLE_MSCCLPP "Enable MSCCL++" OFF)
option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" OFF)
option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF)
@@ -463,10 +463,12 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used b
set(SRC_FILES
src/allocator.cc
src/bootstrap.cc
src/ce_coll.cc
src/channel.cc
src/collectives.cc
src/commDump.cc
src/debug.cc
src/dev_runtime.cc
src/enqueue.cc
src/group.cc
src/init.cc
@@ -475,7 +477,7 @@ set(SRC_FILES
src/msccl.cc
src/proxy.cc
src/rccl_wrap.cc
src/symmetric.cc
src/sym_kernels.cc
src/transport.cc
src/device/all_gather.h
src/device/all_reduce.h
@@ -526,6 +528,7 @@ set(SRC_FILES
src/include/BfdBacktrace.hpp
src/include/bitops.h
src/include/bootstrap.h
src/include/ce_coll.h
src/include/channel.h
src/include/checks.h
src/include/collectives.h
@@ -535,6 +538,7 @@ set(SRC_FILES
src/include/cpuset.h
# src/include/cudawrap.h
src/include/debug.h
src/include/dev_runtime.h
src/include/device.h
src/include/enqueue.h
src/include/gdrwrap.h
@@ -549,6 +553,7 @@ set(SRC_FILES
src/include/ipcsocket.h
src/include/mnnvl.h
src/include/nccl_common.h
src/include/nccl_device.h
src/include/net_device.h
src/include/net.h
src/include/nvmlwrap.h
@@ -569,12 +574,13 @@ set(SRC_FILES
src/include/rocmwrap.h
src/include/roctx.h
src/include/recorder.h
src/include/scheduler.h
src/include/shm.h
src/include/shmutils.h
src/include/signals.h
src/include/socket.h
src/include/strongstream.h
src/include/symmetric.h
src/include/sym_kernels.h
src/include/timer.h
src/include/transport.h
src/include/trees.h
@@ -592,6 +598,23 @@ set(SRC_FILES
src/include/msccl/msccl_setup.h
src/include/msccl/msccl_status.h
src/include/msccl/msccl_struct.h
src/include/nccl_device/comm.h
src/include/nccl_device/coop.h
src/include/nccl_device/core.h
src/include/nccl_device/ll_a2a.h
src/include/nccl_device/mem_barrier.h
src/include/nccl_device/ptr.h
src/include/nccl_device/utility.h
src/include/nccl_device/impl/comm__funcs.h
src/include/nccl_device/impl/comm__types.h
src/include/nccl_device/impl/core__funcs.h
src/include/nccl_device/impl/core__types.h
src/include/nccl_device/impl/ll_a2a__funcs.h
src/include/nccl_device/impl/ll_a2a__types.h
src/include/nccl_device/impl/mem_barrier__funcs.h
src/include/nccl_device/impl/mem_barrier__types.h
src/include/nccl_device/impl/ptr__funcs.h
src/include/nccl_device/impl/ptr__types.h
src/include/npkit/npkit.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit_struct.h
@@ -639,6 +662,7 @@ set(SRC_FILES
src/include/plugin/net/net_v8.h
src/include/plugin/net/net_v9.h
src/include/plugin/net/net_v10.h
src/include/plugin/net/net_v11.h
src/include/plugin/profiler/net_ib_v1.h
src/include/plugin/profiler/net_ib.h
src/include/plugin/profiler/net_socket_v1.h
@@ -647,9 +671,11 @@ set(SRC_FILES
src/include/plugin/profiler/profiler_v2.h
src/include/plugin/profiler/profiler_v3.h
src/include/plugin/profiler/profiler_v4.h
src/include/plugin/profiler/profiler_v5.h
src/include/plugin/tuner/tuner_v2.h
src/include/plugin/tuner/tuner_v3.h
src/include/plugin/tuner/tuner_v4.h
src/include/plugin/tuner/tuner_v5.h
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
@@ -682,6 +708,9 @@ set(SRC_FILES
src/misc/msccl/msccl_setup.cc
src/misc/msccl/msccl_status.cc
src/misc/proxy_trace/proxy_trace.cc
src/nccl_device/core.cc
src/nccl_device/ll_a2a.cc
src/nccl_device/mem_barrier.cc
src/plugin/net.cc
src/plugin/plugin_open.cc
src/plugin/profiler.cc
@@ -691,13 +720,16 @@ set(SRC_FILES
src/plugin/net/net_v8.cc
src/plugin/net/net_v9.cc
src/plugin/net/net_v10.cc
src/plugin/net/net_v11.cc
src/plugin/profiler/profiler_v1.cc
src/plugin/profiler/profiler_v2.cc
src/plugin/profiler/profiler_v3.cc
src/plugin/profiler/profiler_v4.cc
src/plugin/profiler/profiler_v5.cc
src/plugin/tuner/tuner_v2.cc
src/plugin/tuner/tuner_v3.cc
src/plugin/tuner/tuner_v4.cc
src/plugin/tuner/tuner_v5.cc
src/ras/client.cc
src/ras/client_support.cc
src/ras/collectives.cc
@@ -708,6 +740,7 @@ set(SRC_FILES
src/register/coll_reg.cc
src/register/register.cc
src/register/sendrecv_reg.cc
src/scheduler/symmetric_sched.cc
src/transport/coll_net.cc
src/transport/generic.cc
src/transport/net.cc
@@ -880,6 +913,7 @@ target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/nccl_device)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/ionic)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
@@ -899,6 +933,7 @@ if(COLLTRACE)
target_compile_definitions(rccl PRIVATE ENABLE_COLLTRACE)
endif()
if(ENABLE_MSCCL_KERNEL)
message(WARNING "MSCCL is deprecated and will be removed in a future version of RCCL.")
target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
endif()
if(ENABLE_MSCCLPP)
@@ -939,6 +974,7 @@ endif()
# NPKit flags
## May be better to move these to a separate file
if(ENABLE_NPKIT)
message(WARNING "NPKit is deprecated and will be removed in a future version of RCCL. Please consider using alternative profiling tools.")
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT)
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
target_compile_definitions(rccl PRIVATE ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
+1 -1
Datei anzeigen
@@ -42,7 +42,7 @@ RCCL build & installation helper script
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
--enable-msccl-kernel Build with MSCCL kernels
--enable-mscclpp Build with MSCCL++ support
--enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines
--disable-roctx Build without ROCTX logging
+8
Datei anzeigen
@@ -246,4 +246,12 @@ execute_process(
COMMAND bash -c "sed -i 's/ncclNetIb/rocmNetIb/g' ${ROCM_NETIB_FILE}"
WORKING_DIRECTORY ${RCCL_SRC_DIR}
)
execute_process(
COMMAND bash -c "sed -i 's/ncclIbFinalize/rocmNetIbFinalize/g' ${ROCM_NETIB_FILE}"
WORKING_DIRECTORY ${RCCL_SRC_DIR}
)
execute_process(
COMMAND bash -c "sed -i 's/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g' ${ROCM_NETIB_FILE}"
WORKING_DIRECTORY ${RCCL_SRC_DIR}
)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+16 -9
Datei anzeigen
@@ -60,36 +60,36 @@ of newer ones.
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v10)
# API (v11)
Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
Below is the main `ncclNet_v11` struct. Each function is explained in later sections.
```
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
@@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us
`init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on
internal ones.
Every call to `init` returns an opaque context that the plugin uses internally to allocate resources
and manage state. Such context is passed to other net plugin calls that create further resources,
such as `listen` and `connect`. Every context is uniquely associated with a communicator
through its `commId`. The network can also be initialized with a per-communicator configuration using
the `config` argument.
To allow the plugin logs to integrate into the NCCL logs seamlessly, NCCL provides a logging
function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within
the plugin code adding the following definitions:
@@ -282,7 +288,7 @@ side.
`listen`
To create a connection, NCCL will start by calling `listen` on the receiver side. This function
takes a device number as input argument, and should return a local `listenComm` object, and a
takes the opaque plugin context returned by `init` and a device number as input arguments, and should return a local `listenComm` object, and a
`handle` to pass to the other side, so that the sender side can connect to the receiver.
The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL.
@@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
succeeds.
The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field.
The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference
the `ncclNetCommConfig_t` passed to the `init` function, which contains a `trafficClass` field.
This field can be used by the network plugin to specify the QoS level of the connection. By default,
`trafficClass` is set to -1 but can be configured by the application during communicator initialization
to select a plugin-supported QoS level.
+19
Datei anzeigen
@@ -0,0 +1,19 @@
# Build script for the example NCCL network plugin (single source: plugin.c).
# Sources compiled into the plugin.
set(SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
)
# Create shared library
add_library(nccl-net-example SHARED ${SRC_FILES})
# Set include directories: the plugin API headers live in the local nccl/ directory
# and are only needed to build this target (PRIVATE).
target_include_directories(nccl-net-example PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/nccl
)
# Set output name to match Makefile: the resulting file is lib + nccl-net-example,
# built as position-independent code and written to test/unit/plugins under the
# top-level build directory.
set_target_properties(nccl-net-example PROPERTIES
OUTPUT_NAME "nccl-net-example"
PREFIX "lib"
POSITION_INDEPENDENT_CODE ON
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins
)
+6 -4
Datei anzeigen
@@ -22,7 +22,9 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32
#define NCCL_NET_MAX_DEVS_PER_NIC 4
#include "net_v11.h"
#include "net_v10.h"
#include "net_v9.h"
#include "net_v8.h"
@@ -33,9 +35,9 @@
#include "net_v3.h"
#include "net_v2.h"
typedef ncclNet_v10_t ncclNet_t;
typedef ncclNetProperties_v10_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
typedef ncclNet_v11_t ncclNet_t;
typedef ncclNetProperties_v11_t ncclNetProperties_t;
typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t;
typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t;
#endif // end include guard
+3 -2
Datei anzeigen
@@ -12,7 +12,7 @@
// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
@@ -27,6 +27,7 @@ typedef struct {
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
#endif
+1 -2
Datei anzeigen
@@ -5,10 +5,9 @@
#ifndef NET_V10_H_
#define NET_V10_H_
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
typedef struct {
int ndevs;
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v10_t;
+120
Datei anzeigen
@@ -0,0 +1,120 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NET_V11_H_
#define NET_V11_H_
// Description of a virtual NIC (v11): the set of device indices grouped under it.
// Used as the `props` argument of makeVDevice and embedded in ncclNetProperties_v11_t.
typedef struct {
// Number of entries used in devs[] -- presumably at most NCCL_NET_MAX_DEVS_PER_NIC; confirm.
int ndevs;
// Indices of the devices that make up this virtual device.
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v11_t;
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
// Per-communicator network configuration (v11), passed to the plugin through
// the `config` argument of init().
typedef struct {
// Plugin-specific TC value
// NOTE(review): presumably NCCL_NET_TRAFFIC_CLASS_UNDEF (-1) when the application
// did not select a traffic class -- confirm against the caller.
int trafficClass;
} ncclNetCommConfig_v11_t;
// Properties of a network device (v11), filled in by the plugin's getProperties().
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int regIsGlobal; // regMr is not tied to a particular comm
int forceFlush; // Force a flush on receives
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
ncclNetDeviceType netDeviceType; // Network offload type
int netDeviceVersion; // Version number for network offload
// Devices grouped under this device when it is a virtual NIC (see makeVDevice).
ncclNetVDeviceProps_v11_t vProps;
size_t maxP2pBytes; // Max transfer size for point-to-point operations
size_t maxCollBytes; // Max transfer size for collective operations
int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request.
} ncclNetProperties_v11_t;
// Attributes for one direction (send or recv) of communication; used as the
// sendCommAttr / recvCommAttr members of ncclNetAttr_v11_t.
// NOTE(review): field semantics are not documented in this header. Judging by the
// names they look like min/max bounds on concurrent peers and on flows per peer --
// confirm against the NCCL core code that fills them.
typedef struct {
int32_t maxConcurrentPeers;
int32_t minConcurrentPeers;
int32_t maxFlowsPerPeer;
int32_t minFlowsPerPeer;
} ncclNetCommAttr_v11_t;
// Argument type of the v11 setNetAttr() entry point.
// NOTE(review): presumably describes the upcoming operation (op / algo / proto
// identifiers) together with per-direction attributes; the encoding of the three
// uint32_t fields is not visible here -- confirm against the caller.
typedef struct {
ncclNetCommAttr_v11_t sendCommAttr;
ncclNetCommAttr_v11_t recvCommAttr;
uint32_t op;
uint32_t algo;
uint32_t proto;
} ncclNetAttr_v11_t;
// Network plugin interface, version 11: the table of entry points a plugin exports.
// Compared with the per-device-only calls, init() produces an opaque context (ctx)
// that is handed back to listen(), connect(), finalize() and setNetAttr().
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
// *ctx receives an opaque plugin context; commId and config identify and configure
// the communicator this context is created for.
ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
// NOTE(review): sizes is int* here while irecv takes size_t* -- confirm this matches
// the upstream v11 definition.
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
// Notify the plugin that a recv has completed by the device
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
// what index this new vNIC exists at
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props);
// Finalize the network.
// ctx is the opaque context produced by init().
ncclResult_t (*finalize)(void* ctx);
// Pass a ncclNetAttr_v11_t to the plugin for the given context.
// NOTE(review): undocumented in this header; presumably a hint about upcoming
// traffic (op/algo/proto and per-direction attributes) -- confirm against NCCL core.
ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr);
} ncclNet_v11_t;
#endif // end include guard
+1 -2
Datei anzeigen
@@ -5,10 +5,9 @@
#ifndef NET_V9_H_
#define NET_V9_H_
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
typedef struct {
int ndevs;
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
int devs[NCCL_NET_MAX_DEVS_PER_NIC];
} ncclNetVDeviceProps_v9_t;
typedef struct {
+84 -17
Datei anzeigen
@@ -11,7 +11,7 @@
int max_requests = NCCL_NET_MAX_REQUESTS;
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
@@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
return ncclSuccess;
}
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
@@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE
__hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; }
__hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; }
__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; }
__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; }
#define PLUGIN_NAME "Plugin"
const ncclNet_v10_t ncclNetPlugin_v10 = {
const ncclNet_v11_t ncclNetPlugin_v11 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.devices = pluginDevices,
@@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = {
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice,
.finalize = pluginFinalize,
};
__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
/*
 * v10 variant of getProperties for the example plugin.
 *
 * Fills *props with the plugin's default values for device `dev` and
 * always returns ncclSuccess. The values are defaults: leave them alone
 * unless you know the plugin needs something else.
 */
__hidden ncclResult_t pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) {
  /* --- Identification ---------------------------------------------------- */
  props->name = "Example";
  /* Set to the sysfs PCI path for proper topology detection,
   * e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0 */
  props->pciPath = NULL;
  /* Only used to detect NICs with multiple PCI attachments. */
  props->guid = 0;

  /* --- Memory registration / data path ----------------------------------- */
  /* Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr accepts CUDA pointers. */
  props->ptrSupport = NCCL_PTR_HOST;
  /* Set to 1 if your regMr has a fast registration cache; with 0, user
   * buffer registration may be disabled. */
  props->regIsGlobal = 0;
  /* Force a flush after receive; needed when control path and data path
   * take different routes to the GPU. */
  props->forceFlush = 0;

  /* --- Link characteristics ---------------------------------------------- */
  /* Speed is expressed in Mbps: 100000 means 100G. */
  props->speed = 100000;
  /* Port number, used together with guid. */
  props->port = 0;
  /* Custom latency (helps tuning when latency is high); 0 selects the
   * default NCCL values. */
  props->latency = 0.0f;

  /* --- Limits ------------------------------------------------------------ */
  /* Maximum number of comm objects that can be created. */
  props->maxComms = 1 << 20;
  /* Maximum number of receive operations taken by irecv(). */
  props->maxRecvs = NCCL_PLUGIN_MAX_RECVS;

  /* --- Device offload ---------------------------------------------------- */
  /* Coupling with NCCL network device-side code. */
  props->netDeviceType = NCCL_NET_DEVICE_HOST;
  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;

  /* --- Virtual device ---------------------------------------------------- */
  /* Tells NCCL core whether this is a virtual device fusing several
   * physical devices; here it is exactly the one physical device. */
  props->vProps.ndevs = 1;
  props->vProps.devs[0] = dev;

  /* --- Transfer sizes ---------------------------------------------------- */
  /* Largest transfers the plugin can handle. */
  props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES;
  props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES;

  return ncclSuccess;
}
__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; }
__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; }
__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; }
const ncclNet_v10_t ncclNetPlugin_v10 = {
.name = PLUGIN_NAME,
.init = pluginInit_v10,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v10,
.listen = pluginListen_v10,
.connect = pluginConnect_v10,
.accept = pluginAccept,
.regMr = pluginRegMr,
.regMrDmaBuf = pluginRegMrDmaBuf,
.deregMr = pluginDeregMr,
.isend = pluginIsend,
.irecv = pluginIrecv,
.iflush = pluginIflush,
.test = pluginTest,
.closeSend = pluginCloseSend,
.closeRecv = pluginCloseRecv,
.closeListen = pluginCloseListen,
.getDeviceMr = pluginGetDeviceMr,
.irecvConsumed = pluginIrecvConsumed,
.makeVDevice = pluginMakeVDevice_v10,
};
__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
return pluginInit(logFunction, NULL);
return pluginInit_v10(logFunction, NULL);
}
__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
return pluginGetProperties(dev, (ncclNetProperties_t*)props);
return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props);
}
__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm);
}
__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
@@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v9,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
@@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v8,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr,
@@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v7,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v9,
.accept = pluginAccept,
.regMr = pluginRegMr_v7,
@@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
@@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v6,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v6,
.accept = pluginAccept_v6,
.regMr = pluginRegMr_v7,
@@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
ncclResult_t ret;
do {
ncclNetDeviceHandle_v7_t* handle = NULL;
ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle);
} while (ret == ncclSuccess && *sendComm == NULL);
return ret;
}
@@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = {
.init = pluginInit_v9,
.devices = pluginDevices,
.getProperties = pluginGetProperties_v4,
.listen = pluginListen,
.listen = pluginListen_v10,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
@@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
}
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
max_requests = NCCL_NET_MAX_REQUESTS_V3;
return pluginInit(logFunction, NULL);
return pluginInit_v10(logFunction, NULL);
}
#include <string.h>
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
char pluginHandle[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm);
ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm);
memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4);
return ret;
}
@@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = {
.devices = pluginDevices,
.pciPath = pluginPciPath,
.ptrSupport = pluginPtrSupport,
.listen = pluginListen,
.listen = pluginListen_v3,
.connect = pluginConnect_v4,
.accept = pluginAccept_v4,
.regMr = pluginRegMr_v7,
+84 -37
Datei anzeigen
@@ -49,9 +49,9 @@ of newer ones.
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
from old API versions. It also provides error codes in `err.h`.
# API (v4)
# API (v5)
Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections.
```
typedef struct {
@@ -60,15 +60,15 @@ typedef struct {
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
@@ -76,7 +76,7 @@ typedef struct {
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
@@ -88,13 +88,13 @@ typedef struct {
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
} ncclProfiler_v5_t;
```
## Error codes
@@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct.
```
typedef struct {
uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
uint64_t type; // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ...
void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler
int rank; // rank that generated the event
union {
struct { // GroupAPI event metadata
bool graphCaptured; // Set to true if the Group API event is emitted inside a CUDA graph capture
int groupDepth; // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL)
// and not called by the user. Any depth greater than 1 means that the user made the Group API call.
} groupApi;
struct { // Collective API call metadata
const char* func; // string containing name of the collective operation
size_t count; // data count
const char* datatype; // string containing the name of the datatype
int root; // root rank
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
bool graphCaptured; // Set to true if the Collective API event is emitted inside a CUDA graph capture
} collApi;
struct { // Point-to-point API call metadata
const char* func; // string containing name of the p2p operation
size_t count; // data count
const char* datatype; // string containing the name of the datatype
void* stream; // Opaque handle that points to a CUDA stream object
bool graphCaptured; // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture
} p2pApi;
struct { // Kernel Launch event metadata
void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in
} kernelLaunch;
struct { // collective events metadata
uint64_t seqNumber; // sequence number of this collective operation in the communicator
const char* func; // string containing name of the collective
@@ -164,6 +191,7 @@ typedef struct {
uint8_t nWarps; // number of GPU warps for this collective
const char* algo; // string containing name of the algorithm for this collective
const char* proto; // string containing name of the protocol for this collective
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
} coll;
struct { // point-to-point events metadata
@@ -173,6 +201,7 @@ typedef struct {
size_t count;
int peer; // peer rank for this point-to-point
uint8_t nChannels; // number of channels for this p2p
void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent
} p2p;
struct { // proxyOp events metadata
@@ -198,12 +227,12 @@ typedef struct {
void* data; // pointer to network plugin defined event
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
} ncclProfilerEventDescr_v5_t;
```
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
`ncclProfileNetPlugin`.
NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`,
`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`,
`ncclProfileKernelCh` and `ncclProfileNetPlugin`.
#### stopEvent
@@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior.
#### recordEventState
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`,
`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`.
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
The state of these events can be updated, along with event attributes, using `recordEventState`.
@@ -258,9 +287,21 @@ typedef enum {
// ncclProfileKernelCh event states
ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update
} ncclProfilerEventState_v4_t;
// Group API States
ncclProfilerGroupStartApiStop = 23,// state marks the end of a ncclGroupStart() API call
ncclProfilerEndGroupApiStart = 24 // state marks the start of a ncclGroupEnd() API call
} ncclProfilerEventState_v5_t;
```
NCCL profile API events are generated when the API calls are made, right after NCCL checks
for graph capture information. They parent collective, point-to-point and kernel launch events
and persist across multiple operations in a group.
`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the
case of graph capture, the event start indicates that the kernel launch operation has been recorded,
not launched.
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
provide a summary of the activity of the proxy progress thread for that channel. Most of the
@@ -379,7 +420,7 @@ typedef union {
struct { // attribute to update for ncclProfileKernelCh events
uint64_t pTimer; // timestamp provided by the NCCL kernel
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
} ncclProfilerEventStateArgs_v5_t;
```
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
@@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur
NCCL core events (reported above) are organized into a hierarchy as reported below:
```
Group event
Group API event
|
+- Collective event
+- Collective API event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
| +- Collective event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Point-to-point event
|
+- ProxyOp event
| |
| +- ProxyStep event
| |
| +- NetPlugin event
|
+- KernelCh event
+- Point-to-point API event
| |
| +- Point-to-point event
| |
| +- ProxyOp event
| | |
| | +- ProxyStep event
| | |
| | +- NetPlugin event
| |
| +- KernelCh event
|
+- Kernel Launch event
ProxyCtrl event
```
+34
Datei anzeigen
@@ -0,0 +1,34 @@
# Sources of the example profiler plugin. The files are listed explicitly (no globbing)
# and are C++ translation units.
set(SRC_FILES
${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc
${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc
)
# Create shared library
add_library(nccl-profiler-example SHARED ${SRC_FILES})
# Set include directories
target_include_directories(nccl-profiler-example PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/nccl
${CUDAToolkit_INCLUDE_DIRS}
)
# Set output name to match Makefile
set_target_properties(nccl-profiler-example PROPERTIES
OUTPUT_NAME "nccl-profiler-example"
PREFIX "lib"
POSITION_INDEPENDENT_CODE ON
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
)
# After each build, copy the library into test/unit/plugins inside the build tree.
# $<TARGET_FILE:...> resolves to the file the target actually produced, so the copy keeps
# working if the output name, prefix or output directory set above is changed.
add_custom_command(TARGET nccl-profiler-example POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nccl-profiler-example> ${CMAKE_BINARY_DIR}/test/unit/plugins
)
# Add custom target for clean (equivalent to Makefile clean target).
# The paths are spelled out here on purpose: referencing $<TARGET_FILE:...> from a custom
# target would make it depend on (and therefore build) the library before removing it.
add_custom_target(clean-profiler-lib
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so
COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so
COMMENT "Cleaning libnccl-profiler-example.so"
)
+15 -8
Datei anzeigen
@@ -4,19 +4,26 @@
# See LICENSE.txt for license information
#
# Make `build` the goal used when make is invoked without arguments. .DEFAULT_GOAL is a
# special variable and has to be assigned; spelled as a rule (`.DEFAULT_GOAL: build`) it
# only declares a target with that name and leaves the default goal to the first real rule.
.DEFAULT_GOAL := build
include ../../makefiles/common.mk
# Absolute path of the directory two levels above this one (overridable).
SRCDIR ?= $(abspath ../..)
# ROCm installation prefix: expands to /opt/rocm when that path exists, otherwise to nothing.
ROCM_PATH ?= $(wildcard /opt/rocm)
# C++ compiler: amdclang++ located under ROCM_PATH.
CXX = $(ROCM_PATH)/lib/llvm/bin/amdclang++
BUILDDIR ?= .
NCCLDIR := $(BUILDDIR)
# Directory that receives copies of the sources and headers before they are hipified.
HIPIFY_DIR := hipify-profiler
SRC_FILES := $(wildcard *.c)
SRC_FILES := $(wildcard *.cc)
# The same source file names, prefixed with $(HIPIFY_DIR)/.
HIPIFY_SRC := $(addprefix $(HIPIFY_DIR)/,$(SRC_FILES))
build: ${BUILDDIR}/librccl-profiler.so
build: ${BUILDDIR}/librccl-profiler-example.so
# Build the shared library in one compiler invocation from all prerequisites ($^).
${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
${BUILDDIR}/librccl-profiler-example.so: $(HIPIFY_SRC)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${BUILDDIR}
$(CC) -Inccl -fPIC -shared -o $@ $^
$(CXX) -D__HIP_PLATFORM_AMD__ -I$(HIPIFY_DIR) -I$(HIPIFY_DIR)/nccl -I$(ROCM_PATH)/include -fPIC -shared -o $@ $^
# Produce a hipified source: every *.cc / *.h (and nccl/*.h) is copied into $(HIPIFY_DIR)
# and the copies are rewritten in place by hipify-perl. Each invocation of this rule
# re-copies and re-hipifies all files, not just the one named by the target.
$(HIPIFY_DIR)/%.cc: %.cc
@mkdir -p $(HIPIFY_DIR)/nccl
@cp *.cc *.h $(HIPIFY_DIR)/
@cp nccl/*.h $(HIPIFY_DIR)/nccl/
@hipify-perl -inplace -quiet-warnings $(HIPIFY_DIR)/*.cc $(HIPIFY_DIR)/*.h
# Remove the built library (and, in the second command, the hipify scratch directory).
clean:
rm -f ${BUILDDIR}/librccl-profiler.so
rm -rf ${BUILDDIR}/librccl-profiler-example.so $(HIPIFY_DIR)
+59 -121
Datei anzeigen
@@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of.
## Building the profiler plugin
To use the example plugin, just type `make`. You will need a NCCL build's include directory present.
You can override `NCCL_HOME` to where the NCCL installation is on your system.
To build the example plugin shipped as part of NCCL, just type `make`.
## Using the profiler plugin
@@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system.
As an example, setting:
`NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
`NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
enables the profiling of the group, the collective and the proxy op events. The same events can be
enables the profiling of the group API, the collective and the proxy op events. The same events can be
expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage
is that the profiler can easily correlate events that belong to the same NCCL operation and present
them accordingly.
them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler.
3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome
@@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events
contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
generated by remote proxies. A list of pools and their size is reported below:
- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256)
- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256)
- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256)
- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256)
- `NCCL_PROFILE_COLL_POOL_SIZE` (256)
- `NCCL_PROFILE_P2P_POOL_SIZE` (256)
- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256)
Remote proxy operations are generated when PXN is in use. Refer to this article for more information
about PXN and how it works:
@@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace
```
[
{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}},
{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008},
{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}},
{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000},
{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633},
{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}},
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}},
{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990},
{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}},
{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}},
{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}},
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995},
{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995},
{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}},
{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977},
{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}},
{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996},
{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}},
{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997},
{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995},
{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979},
{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}},
{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}},
{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993},
{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}},
{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992},
... [ trace truncated for brevity ]
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980},
{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981},
{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001},
{}]
```
Details about the fields used in the trace can be found at this link:
https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through
the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
one collective and this is what is presented in the traces above).
@@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation.
- datatype : NCCL datatype
- algorithm : algorithm used to process the ncclAllReduce
- protocol : protocol used to process the ncclAllReduce
- nMaxChannels: max number of channels used to process the ncclAllReduce
- nChannels : Number of channels used to process the ncclAllReduce
If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time
consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
of collective and p2p operations`.
### Proxy Send
The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to send data to the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes to the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post
- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send
- DONE : struct containing the number of network sends completed and the time stamp of the last send completed
In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps,
which could help identify at which point the network problem occurred.
The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy SendBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
#### Proxy SendGPUWait
#### Proxy SendGpuWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
buffer.
@@ -201,31 +164,6 @@ buffer.
Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete
### Proxy Recv
The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
info in the args field:
- Channel : id of the channel used by this proxy operation to recv data from the peer
- Peer : peer rank
- Steps : number of network steps required to transfer transSize bytes from the peer
- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread
- transSize : bytes transferred across the channel by this proxy operation
- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted
- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed
- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
- DONE : struct containing the number of flush completed and the time stamp for the last flush completed
The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace
entries below are also reported by the profiler.
#### Proxy RecvBufferWait
Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to
become available.
#### Proxy RecvWait
Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete
@@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po
Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU
#### Proxy RecvGPUWait
#### Proxy RecvGpuWait
Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data
-30
Datei anzeigen
@@ -1,30 +0,0 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "event.h"
// Reports whether the group's task-event queue is empty.
// Returns 1 when the queue has no head element, 0 otherwise.
int taskEventQueueEmpty(struct group* g) {
  if (g->eventHead != NULL) {
    return 0;
  }
  return 1;
}
// Appends `event` at the tail of the group's singly linked task-event queue.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  // the new element is always the last one in the list
  event->next = NULL;
  if (g->eventHead == NULL) {
    // empty queue: the new element is also the head
    g->eventHead = event;
  } else {
    // non-empty queue: link the new element behind the current tail
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
// Returns the first element of the group's task-event queue without removing
// it; the result is NULL when the queue is empty.
struct taskEventBase* taskEventQueueHead(struct group* g) {
  struct taskEventBase* first = g->eventHead;
  return first;
}
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
struct taskEventBase* tmp = g->eventHead;
g->eventHead = g->eventHead->next;
if (g->eventHead == NULL) g->eventTail = NULL;
return tmp;
}
+142 -13
Datei anzeigen
@@ -10,10 +10,14 @@
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include <cstring>
#include "err.h"
#include "profiler.h"
#include "queue.h"
#include <cuda_runtime.h>
#define MAX_CHANNELS 128 // Match RCCL's MAXCHANNELS
#define MAX_STEPS 16
#define MAX_STEPS 1024
#define MAX_OPS 16 // Up to 64K ranks for PAT
#define MAX_EVENTS_PER_REQ (8)
@@ -21,7 +25,7 @@ struct proxyOp;
struct proxyStep;
struct netPlugin {
uint8_t type;
uint64_t type;
int pluginType;
int pluginVer;
uint8_t pluginEvent;
@@ -63,7 +67,7 @@ struct kernelCh {
#define PROXY_STEP_MAX_STATES 3
struct proxyStep {
uint8_t type; // type of event: network transfer
uint64_t type; // type of event: network transfer
int state;
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
@@ -76,7 +80,7 @@ struct proxyStep {
};
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint64_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
pid_t pid;
int rank;
@@ -97,7 +101,7 @@ struct group;
struct context;
struct proxyCtrl {
uint8_t type;
uint64_t type;
struct context* ctx; // profiler context
double startTs;
double stopTs;
@@ -107,12 +111,12 @@ struct proxyCtrl {
// task level event base structure
struct taskEventBase {
uint8_t type; // event type: collective/p2p
uint64_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
void* parent; // parent API event
struct taskEventBase* next; // next top level event
double startTs;
double stopTs;
};
@@ -147,7 +151,7 @@ struct p2p {
};
struct group {
uint8_t type;
uint64_t type;
struct context* ctx; // profiler context
int groupId;
int refCount;
@@ -158,6 +162,70 @@ struct group {
struct group* next; // next group event in queue
};
// Collective API event (ncclProfileCollApi). The plugin enqueues it on its
// parent groupApi event; task events started with this event as their parent
// are linked through eventHead/eventTail.
struct collApi {
uint64_t type; // event type tag; handlers read it through the start of the handle, so it must stay the first member
struct groupApi* parent; // Group API event this call was enqueued on
struct context* ctx; // profiler context
int collApiId; // value of the context's collApiPoolIndex taken when the event was allocated
int refCount; // reference counter: incremented when a child task event starts, decremented by updateEvent
cudaStream_t stream; // stream copied from the event descriptor
const char* func; // function name copied from the event descriptor (used as the trace event name)
size_t count; // count copied from the event descriptor
const char* datatype; // datatype name copied from the event descriptor
int root; // root copied from the event descriptor
bool graphCaptured; // graphCaptured flag copied from the event descriptor
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // printed as the "ts" of the trace begin record
double stopTs; // written as gettime() - startTime by the stop/update handlers
struct collApi* next; // link used by the parent's profilerQueue of collApi events
};
// Point-to-point API event (ncclProfileP2pApi). The plugin enqueues it on its
// parent groupApi event; task events started with this event as their parent
// are linked through eventHead/eventTail.
struct p2pApi {
uint64_t type; // event type tag; handlers read it through the start of the handle, so it must stay the first member
struct groupApi* parent; // Group API event this call was enqueued on
struct context* ctx; // profiler context
int p2pApiId; // value of the context's p2pApiPoolIndex taken when the event was allocated
int refCount; // reference counter decremented by updateEvent
const char* func; // function name copied from the event descriptor (used as the trace event name)
cudaStream_t stream; // stream copied from the event descriptor
size_t count; // count copied from the event descriptor
const char* datatype; // datatype name copied from the event descriptor
bool graphCaptured; // graphCaptured flag copied from the event descriptor
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // printed as the "ts" of the trace begin record
double stopTs; // written as gettime() - startTime by the stop/update handlers
struct p2pApi* next; // link used by the parent's profilerQueue of p2pApi events
};
// Kernel launch event (ncclProfileKernelLaunch), enqueued on its parent
// groupApi event when it is started.
struct kernelLaunch {
uint64_t type; // event type tag; handlers read it through the start of the handle, so it must stay the first member
struct groupApi* parent; // Group API event this launch was enqueued on
cudaStream_t stream; // stream copied from the event descriptor
int kernelLaunchId; // identifier printed in the trace "args" of the begin record
double startTs; // printed as the "ts" of the trace begin record
double stopTs; // written as gettime() - startTime by the stop/update handlers
struct kernelLaunch* next; // link used by the parent's profilerQueue of kernelLaunch events
};
// Group API event (ncclProfileGroupApi): top of the v5 event hierarchy in this
// example plugin. It holds queues of the P2P API, Collective API and kernel
// launch events that were started with this event as their parent.
struct groupApi {
uint64_t type; // event type tag; handlers read it through the start of the handle, so it must stay the first member
struct context* ctx; // profiler context
int groupApiId; // value of the context's groupApiPoolIndex taken when the event was allocated
int refCount; // reference counter: incremented when a child API/kernel-launch event starts, decremented by updateEvent
bool graphCaptured; // graphCaptured flag copied from the event descriptor
int groupDepth; // groupDepth copied from the event descriptor
struct profilerQueue<struct p2pApi, &p2pApi::next> p2pApiEvents; // child P2P API events
struct profilerQueue<struct collApi, &collApi::next> collApiEvents; // child Collective API events
struct profilerQueue<struct kernelLaunch, &kernelLaunch::next> kernelLaunchEvents; // child kernel launch events
double endOfncclGroupStartTs; // recorded on the ncclProfilerEndGroupApiStart state
double startOfncclGroupEndTs; // recorded on the ncclProfilerBeginGroupApiEnd state
double startTs; // gettime() - startTime taken when the event is started
double stopTs; // written as gettime() - startTime by the stop/update handlers
struct groupApi* next; // NOTE(review): presumably a queue link like the other events' next — confirm against users
};
// arrays for different event objects
struct context {
const char* commName;
@@ -165,6 +233,26 @@ struct context {
int nranks;
int rank;
int groupApiPoolSize;
int groupApiPoolBase;
int groupApiPoolIndex;
struct groupApi* groupApiPool;
int collApiPoolSize;
int collApiPoolBase;
int collApiPoolIndex;
struct collApi* collApiPool;
int p2pApiPoolSize;
int p2pApiPoolBase;
int p2pApiPoolIndex;
struct p2pApi* p2pApiPool;
int kernelLaunchPoolSize;
int kernelLaunchPoolBase;
int kernelLaunchPoolIndex;
struct kernelLaunch* kernelLaunchPool;
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
@@ -186,9 +274,50 @@ struct context {
struct proxyCtrl* proxyCtrlPool;
};
int taskEventQueueEmpty(struct group* g);
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
struct taskEventBase* taskEventQueueHead(struct group* g);
struct taskEventBase* taskEventQueueDequeue(struct group* g);
// Reports whether the task-event queue embedded in `obj` (any type exposing
// an `eventHead` member) is empty: 1 when there is no head element, else 0.
template <typename T>
inline int taskEventQueueEmpty(T *obj) {
  if (obj->eventHead != NULL) {
    return 0;
  }
  return 1;
}
// Appends `event` at the tail of the singly linked task-event queue embedded
// in `obj` (any type exposing `eventHead` and `eventTail` members).
template <typename T>
inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) {
  // the new element is always the last one in the list
  event->next = NULL;
  if (obj->eventHead == NULL) {
    // empty queue: the new element is also the head
    obj->eventHead = event;
  } else {
    // non-empty queue: link the new element behind the current tail
    obj->eventTail->next = event;
  }
  obj->eventTail = event;
}
// Returns the first element of the task-event queue embedded in `obj`
// without removing it; the result is NULL when the queue is empty.
template <typename T>
inline struct taskEventBase* taskEventQueueHead(T *obj) {
  struct taskEventBase* first = obj->eventHead;
  return first;
}
// Removes and returns the first element of the task-event queue embedded in
// `obj` (any type exposing `eventHead` and `eventTail` members).
// Returns NULL when the queue is empty instead of dereferencing a NULL head.
template <typename T>
inline struct taskEventBase* taskEventQueueDequeue(T* obj) {
  struct taskEventBase* tmp = obj->eventHead;
  if (tmp == NULL) {
    // nothing queued: leave head/tail untouched
    return NULL;
  }
  obj->eventHead = tmp->next;
  // removing the last element also invalidates the tail pointer
  if (obj->eventHead == NULL) obj->eventTail = NULL;
  return tmp;
}
// Drains the task-event queue embedded in `obj`. Each dequeued collective or
// p2p task event has its proxy bookkeeping zeroed and is handed back to its
// pool by advancing the matching pool base counter in `ctx`. Task events of
// any other type are dequeued and otherwise left untouched.
template <typename T>
inline void resetTaskEvents(T *obj, struct context* ctx) {
  for (struct taskEventBase* task = NULL; !taskEventQueueEmpty(obj); ) {
    task = taskEventQueueDequeue(obj);

    if (task->type == ncclProfileP2p) {
      struct p2p* p2pEvent = (struct p2p *)task;
      // clear the event's proxyOp and proxySteps
      memset(&p2pEvent->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
      // give the p2p event back to the p2p pool
      __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
      continue;
    }

    if (task->type == ncclProfileColl) {
      struct collective* collEvent = (struct collective *)task;
      // clear the event's proxyOps & proxySteps counters
      memset(collEvent->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
      // give the collective event back to the collective pool
      __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
    }
  }
}
#endif
+21 -12
Datei anzeigen
@@ -11,17 +11,20 @@
#include <stdlib.h>
#include "common.h"
#include "err.h"
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
ncclProfileGroupApi = (1 << 8), // Group API events
ncclProfileCollApi = (1 << 9), // Collective API events
ncclProfileP2pApi = (1 << 10), // Point-to-Point API events
ncclProfileKernelLaunch = (1 << 11), // Kernel launch events
};
typedef enum {
@@ -56,21 +59,27 @@ typedef enum {
/* Kernel event states */
ncclProfilerKernelChStop = 22,
/* Group API States */
ncclProfilerEndGroupApiStart = 23,
ncclProfilerBeginGroupApiEnd = 24
} ncclProfilerEventState_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
#include "profiler_v5.h"
#include "profiler_v4.h"
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
typedef ncclProfiler_v4_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v5_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
#endif // end include guard
+152
Datei anzeigen
@@ -0,0 +1,152 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V5_H_
#define PROFILER_V5_H_
#include <stdbool.h>
// Version 5 event descriptor passed to the plugin's startEvent callback.
// The `type` field identifies the event being started; the anonymous union
// carries the attributes for that event type, one member per event type.
typedef struct {
uint64_t type; // event type descriptor: ncclProfileGroupApi, ...
void* parentObj; // pointer to the profiler parent object
int rank; // originating rank
union {
// attributes of a Group API event
struct {
int graphCaptured;
int groupDepth;
} groupApi;
// attributes of a Collective API event
struct {
const char* func;
size_t count;
const char* datatype;
int root;
void* stream;
bool graphCaptured;
} collApi;
// attributes of a Point-to-Point API event
struct {
const char* func;
size_t count;
const char* datatype;
void* stream;
bool graphCaptured;
} p2pApi;
// attributes of a kernel launch event
struct {
void* stream;
} kernelLaunch;
// attributes of a collective task event
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
void* parentGroup; // for backward compatibility with v4
} coll;
// attributes of a point-to-point task event
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
void* parentGroup; // for backward compatibility with v4
} p2p;
// attributes of a proxy operation event
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// attributes of a proxy step event
struct {
int step;
} proxyStep;
// attributes of a kernel channel event
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
// attributes of a network plugin-defined event
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v5_t;
// Version 5 optional arguments passed to the plugin's recordEventState
// callback; each union member carries the attribute update for one event type.
typedef union {
// proxy step event update
struct {
size_t transSize;
} proxyStep;
// proxy control event update
struct {
int appendedProxyOps;
} proxyCtrl;
// network plugin-defined event update
struct {
void* data;
} netPlugin;
// kernel channel event update
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v5_t;
// Version 5 profiler plugin interface: a plugin name followed by the table of
// callbacks (init, startEvent, stopEvent, recordEventState, finalize).
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v5_t;
#endif
@@ -6,7 +6,7 @@
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <cstring>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
@@ -22,12 +22,20 @@ static int initialized; // initialization counter for profiler
static double startTime; // profiler start time
static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p;
static const int defaultGroupPoolSize = 16;
static const int defaultCollPoolSize = 16;
static const int defaultP2pPoolSize = 1024;
static const int defaultGroupApiPoolSize = 256;
static const int defaultCollApiPoolSize = 256;
static const int defaultP2pApiPoolSize = 256;
static const int defaultKernelLaunchPoolSize = 256;
static const int defaultGroupPoolSize = 256;
static const int defaultCollPoolSize = 256;
static const int defaultP2pPoolSize = 256;
static const int defaultProxyCtrlPoolSize = 16;
static const int defaultDetachPoolSize = 128;
static const int defaultDetachPoolSize = 256;
static int groupApiPoolSize;
static int collApiPoolSize;
static int p2pApiPoolSize;
static int kernelLaunchPoolSize;
static int groupPoolSize;
static int collPoolSize;
static int p2pPoolSize;
@@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pid_t pid;
static int* eActivationMaskPtr;
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
pthread_mutex_lock(&lock);
if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
// first thread initializes event mask, environment and detach pool
@@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
str = getenv("NCCL_PROFILE_EVENT_MASK");
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE");
groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize;
str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE");
collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize;
str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE");
p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize;
str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE");
kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize;
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
@@ -96,11 +116,23 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask,
// pre-allocate memory for event object pools in dedicated profiler context
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
ctx->commName = commName;
ctx->commHash = commHash;
ctx->commHash = commId;
ctx->nranks = nranks;
ctx->rank = rank;
logFn = logfn;
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commId, nranks, rank);
ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool));
if (ctx->groupApiPool == NULL) goto fail;
ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool));
if (ctx->collApiPool == NULL) goto fail;
ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool));
if (ctx->p2pApiPool == NULL) goto fail;
ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool));
if (ctx->kernelLaunchPool == NULL) goto fail;
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
if (ctx->groupPool == NULL) goto fail;
@@ -130,16 +162,22 @@ fail:
if (ctx->p2pPool) free(ctx->p2pPool);
if (ctx->collPool) free(ctx->collPool);
if (ctx->groupPool) free(ctx->groupPool);
if (ctx->collApiPool) free(ctx->collApiPool);
if (ctx->p2pApiPool) free(ctx->p2pApiPool);
if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool);
if (ctx->groupApiPool) free(ctx->groupApiPool);
free(ctx);
if (detachPool) free(detachPool);
return ncclSystemError;
}
static const char* profilerDumpFile;
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
FILE* fh = NULL;
char filename[PATH_MAX] = { 0 };
struct context* ctx = (struct context *)context;
const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE");
if (dump) {
sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
fh = fopen(filename, "w");
@@ -148,10 +186,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);
// print last N groups/collectives/p2ps
int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
int end = ctx->groupPoolIndex;
// Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy.
// Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example.
int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0;
int end = ctx->groupApiPoolIndex;
for (int i = start; i < end; i++) {
printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]);
}
start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
@@ -161,6 +201,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) {
}
free(ctx->groupPool);
free(ctx->collApiPool);
free(ctx->p2pApiPool);
free(ctx->kernelLaunchPool);
free(ctx->groupApiPool);
free(ctx->collPool);
free(ctx->p2pPool);
free(ctx->proxyCtrlPool);
@@ -187,7 +231,113 @@ __hidden void updateEvent(void* handle);
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
*eHandle = NULL;
struct context* ctx = (struct context *)context;
if (eDescr->type == ncclProfileGroup) {
if (eDescr->type == ncclProfileGroupApi) {
struct groupApi* event;
int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) {
// if there are available group API events grab one
event = &ctx->groupApiPool[groupApiId%groupApiPoolSize];
// Make sure all child events of the picked group API event are cleared
while (!profilerQueueEmpty(&event->collApiEvents)) {
struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents);
resetTaskEvents(collApiEvent, ctx);
__atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
}
while (!profilerQueueEmpty(&event->p2pApiEvents)) {
struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents);
resetTaskEvents(p2pApiEvent, ctx);
__atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
}
while (!profilerQueueEmpty(&event->kernelLaunchEvents)) {
profilerQueueDequeue(&event->kernelLaunchEvents);
__atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED);
}
} else {
// else drop this event
__atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileGroupApi;
event->ctx = ctx;
event->groupApiId = groupApiId;
event->graphCaptured = eDescr->groupApi.graphCaptured;
event->groupDepth = eDescr->groupApi.groupDepth;
event->startTs = gettime() - startTime;
*eHandle = event;
} else if (eDescr->type == ncclProfileCollApi) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct collApi* event;
int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) {
// if there are available Coll API events grab one
event = &ctx->collApiPool[collApiId%collApiPoolSize];
resetTaskEvents(event, ctx);
} else {
// else drop this event
__atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileCollApi;
event->collApiId = collApiId;
event->ctx = ctx;
event->func = eDescr->collApi.func;
event->stream = (cudaStream_t) eDescr->collApi.stream;
event->count = eDescr->collApi.count;
event->datatype = eDescr->collApi.datatype;
event->root = eDescr->collApi.root;
event->graphCaptured = eDescr->collApi.graphCaptured;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->collApiEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileP2pApi) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct p2pApi* event;
int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, __ATOMIC_RELAXED)) < p2pApiPoolSize) {
// if there are available p2p API events grab one
event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize];
resetTaskEvents(event, ctx);
} else {
// else drop this event
__atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileP2pApi;
event->p2pApiId = p2pApiId;
event->ctx = ctx;
event->func = eDescr->p2pApi.func;
event->stream = (cudaStream_t) eDescr->p2pApi.stream;
event->count = eDescr->p2pApi.count;
event->datatype = eDescr->p2pApi.datatype;
event->graphCaptured = eDescr->p2pApi.graphCaptured;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->p2pApiEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileKernelLaunch) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct kernelLaunch* event;
int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) {
// if there are available kernel API events grab one
event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileKernelLaunch;
event->stream = (cudaStream_t) eDescr->kernelLaunch.stream;
struct groupApi* parent = (struct groupApi *) eDescr->parentObj;
event->parent = parent;
profilerQueueEnqueue(&parent->kernelLaunchEvents, event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
*eHandle = event;
} else if (eDescr->type == ncclProfileGroup) {
if (eDescr->parentObj == NULL) return ncclSuccess;
struct group* event;
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
@@ -222,7 +372,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
debugEvent(event, "GroupStart");
} else if (eDescr->type == ncclProfileColl) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
struct collApi* parent = (struct collApi *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct collective* event;
@@ -253,12 +403,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
event->proto = eDescr->coll.proto;
*eHandle = event;
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
// increment the group ref counter so the event will staty open
// increment the group ref counter so the event will stay open
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "CollStart");
} else if (eDescr->type == ncclProfileP2p) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct p2p* event;
@@ -458,8 +608,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
}
void updateEvent(void* handle) {
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
uint64_t type = *(uint64_t *)handle;
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED);
}
} else if (type == ncclProfileCollApi) {
struct collApi* event = (struct collApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED);
}
updateEvent(event->parent);
return;
} else if (type == ncclProfileP2pApi) {
struct p2pApi* event = (struct p2pApi*) handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
__atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED);
}
updateEvent(event->parent);
event->stopTs = gettime() - startTime;
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* event = (struct kernelLaunch*) handle;
event->stopTs = gettime() - startTime;
updateEvent(event->parent);
} else if (type == ncclProfileGroup) {
struct group* event = (struct group *)handle;
if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) {
event->stopTs = gettime() - startTime;
@@ -527,25 +703,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
// the event handle might be null if we run out of events
if (eHandle == NULL) return ncclSuccess;
uint8_t type = *(uint8_t *)eHandle;
if (type == ncclProfileGroup) {
// stopping the group event in NCCL core does not
// mean the group has completed. It means the group
// was submitted/enqueued so we need to keep the event open
uint64_t type = *(uint64_t *)eHandle;
// Stopping API events, Kernel Launch events, collective/p2p task events
// in NCCL core do not mean that they are complete. It means that the
// operation was enqueued so we need to keep the events open
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileCollApi) {
struct collApi* event = (struct collApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileP2pApi) {
struct p2pApi* event = (struct p2pApi*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* event = (struct kernelLaunch*) eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileGroup) {
struct group* event = (struct group *)eHandle;
event->stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileColl) {
// stopping the collective event in NCCL core does not
// mean the collective has completed. It means the collective
// was submitted/enqueued so we need to keep the event open
struct collective* event = (struct collective *)eHandle;
event->base.stopTs = gettime() - startTime;
return ncclSuccess;
} else if (type == ncclProfileP2p) {
// stopping the p2p event in NCCL core does not
// mean the p2p has completed. It means the p2p
// was submitted/enqueued so we need to keep the event open
struct p2p* event = (struct p2p *)eHandle;
event->base.stopTs = gettime() - startTime;
return ncclSuccess;
@@ -559,8 +745,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
// the event handle might be null if we run out of events
if (eHandle == NULL) return ncclSuccess;
uint8_t type = *(uint8_t *)eHandle;
if (type == ncclProfileProxyOp) {
uint64_t type = *(uint64_t *)eHandle;
if (type == ncclProfileGroupApi) {
struct groupApi* event = (struct groupApi*) eHandle;
if (eState == ncclProfilerEndGroupApiStart) {
event->endOfncclGroupStartTs = gettime() - startTime;
} else if (eState == ncclProfilerBeginGroupApiEnd) {
event->startOfncclGroupEndTs = gettime() - startTime;
}
} else if (type == ncclProfileProxyOp) {
struct proxyOp* event = (struct proxyOp *)eHandle;
if (eState == ncclProfilerProxyOpInProgress_v4) {
event->progrTs = gettime() - startTime;
@@ -592,6 +785,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
case ncclProfilerProxyStepRecvGPUWait:
event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
break;
default:
break;
}
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
@@ -609,7 +804,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
return ncclSuccess;
}
ncclProfiler_t ncclProfiler_v4 = {
ncclProfiler_t ncclProfiler_v5 = {
"Example-profiler",
exampleProfilerInit,
exampleProfilerStartEvent,
@@ -618,14 +813,15 @@ ncclProfiler_t ncclProfiler_v4 = {
exampleProfilerFinalize,
};
int exampleProfilerStart(int eActivationMask) {
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) {
profilerDumpFile = name;
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
}
return ncclSuccess;
}
int exampleProfilerStop(void) {
__attribute__((visibility("default"))) int exampleProfilerStop(void) {
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
}
+3 -2
Datei anzeigen
@@ -7,7 +7,8 @@
#ifndef PLUGIN_H_
#define PLUGIN_H_
int exampleProfilerStart(int eActivationMask);
int exampleProfilerStop(void);
__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name);
__attribute__((visibility("default"))) int exampleProfilerStop(void);
#endif
@@ -5,15 +5,59 @@
************************************************************************/
#include <stdio.h>
#include "err.h"
#include "profiler.h"
#include "event.h"
#include "print_event.h"
#include <cuda_runtime.h>
#define __hidden __attribute__ ((visibility("hidden")))
// FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET)
static __thread int groupApiId;
// Writes the chrome-tracing async "begin" record ("ph": "b") of a Group API
// event to `fh`, tagged with the current per-thread groupApiId trace id.
__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) {
  const pid_t pid = getpid();
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n",
          "Group API", groupApiId, pid, 1, event->startTs, event->groupApiId, event->groupDepth);
}
// Writes the chrome-tracing async "end" record ("ph": "e") of a Group API
// event to `fh`, then advances the per-thread groupApiId trace id so the next
// Group API event gets a fresh id.
__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) {
  const int traceId = groupApiId++;
  const pid_t pid = getpid();
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, "
          "\"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          "Group API", traceId, pid, 1, event->stopTs);
}
static __thread int p2pApiId;
// Writes the opening ("ph": "b") chrome-tracing record of a P2P API event to fh.
// The record is named after event->func and uses the current value of the
// thread-local p2pApiId counter as its id (the counter is not modified here).
//
// "datatype" (printed with %s) and "Stream" (printed with %p) are wrapped in
// double quotes: a bare word or a bare pointer value is not a valid JSON value
// and would make the whole trace file unparsable.
__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
          event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream);
}
// Writes the closing ("ph": "e") chrome-tracing record of a P2P API event to fh.
// The thread-local p2pApiId counter supplies the record id and is incremented
// afterwards.
__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) {
  const int traceId = p2pApiId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          event->func, traceId, getpid(), 1, event->stopTs);
}
static __thread int collApiId;
// Writes the opening ("ph": "b") chrome-tracing record of a collective API event
// to fh. The record is named after event->func and uses the current value of the
// thread-local collApiId counter as its id (the counter is not modified here).
//
// "datatype" (printed with %s) and "Stream" (printed with %p) are wrapped in
// double quotes: a bare word or a bare pointer value is not a valid JSON value
// and would make the whole trace file unparsable.
__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": \"%s\", \"root\": %d, \"GraphCaptured\":%d, \"Stream\": \"%p\"}},\n",
          event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream);
}
// Writes the closing ("ph": "e") chrome-tracing record of a collective API event
// to fh. The thread-local collApiId counter supplies the record id and is
// incremented afterwards.
__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) {
  const int traceId = collApiId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          event->func, traceId, getpid(), 1, event->stopTs);
}
static __thread int kernelLaunchId;
// Writes the opening ("ph": "b") chrome-tracing record of a kernel-launch event
// to fh, using the current value of the thread-local kernelLaunchId counter as
// the record id (the counter is not modified here).
//
// "Stream" (printed with %p) is wrapped in double quotes because a bare pointer
// value is not a valid JSON value.
// NOTE(review): the args key is labelled "groupId" but the value passed is
// event->kernelLaunchId - confirm the label is intended.
__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": \"%p\"}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream);
}
// Writes the closing ("ph": "e") chrome-tracing record of a kernel-launch event
// to fh. The thread-local kernelLaunchId counter supplies the record id and is
// incremented afterwards.
__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) {
  const int traceId = kernelLaunchId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          "KernelLaunch", traceId, getpid(), 1, event->stopTs);
}
static __thread int groupId;
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
@@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
static __thread int collId;
// Writes the opening ("ph": "b") chrome-tracing record of a collective event to
// fh. The record is named after event->base.func, uses the current value of the
// thread-local collId counter as its id, and reports sequence number, communicator
// hash, rank, count, datatype, algorithm, protocol and channel count under "args".
// NOTE(review): two consecutive argument lines follow the format string. They
// differ only in how commHash is reached (parent->ctx directly vs. parent cast to
// struct collApi*); this looks like the pre- and post-change versions of the same
// line. Only one of them can remain for the call to be well-formed - confirm
// that the cast version is the one to keep.
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
}
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
@@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
static __thread int p2pId;
// Writes the opening ("ph": "b") chrome-tracing record of a point-to-point event
// to fh. The record is named after event->base.func, uses the current value of
// the thread-local p2pId counter as its id, and reports communicator hash, rank,
// peer, count, datatype and channel count under "args".
// NOTE(review): two consecutive argument lines follow the format string. They
// differ only in how commHash is reached (parent->ctx directly vs. parent cast to
// struct p2pApi*); this looks like the pre- and post-change versions of the same
// line. Only one of them can remain for the call to be well-formed - confirm
// that the cast version is the one to keep.
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
}
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
@@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) {
char filename[64] = { 0 };
sprintf(filename, "EventDebug-%d", getpid());
FILE* fh = fopen(filename, "a+");
uint8_t type = *(uint8_t *)eHandle;
uint64_t type = *(uint64_t *)eHandle;
if (type == ncclProfileGroup) {
struct group* event = (struct group *)eHandle;
fprintf(fh, "Group event %p tag = %s {\n", event, tag);
@@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) {
void printEvent(FILE* fh, void* handle) {
if (handle == NULL || fh == NULL) return;
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
uint64_t type = *(uint64_t *)handle;
if (type == ncclProfileGroupApi) {
struct groupApi* g = (struct groupApi*) handle;
printGroupApiEventHeader(fh, g);
struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents);
while (kernelLaunchHead != NULL) {
printEvent(fh, kernelLaunchHead);
kernelLaunchHead = kernelLaunchHead->next;
}
struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents);
while (collApiHead != NULL) {
printEvent(fh, collApiHead);
collApiHead = collApiHead->next;
}
struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents);
while (p2pApiHead != NULL) {
printEvent(fh, p2pApiHead);
p2pApiHead = p2pApiHead->next;
}
printGroupApiEventTrailer(fh, g);
} else if (type == ncclProfileCollApi) {
struct collApi* collApiEvent = (struct collApi *) handle;
printCollApiEventHeader(fh, collApiEvent);
struct taskEventBase* base = taskEventQueueHead(collApiEvent);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printCollApiEventTrailer(fh, collApiEvent);
} else if (type == ncclProfileP2pApi) {
struct p2pApi* p2pApiEvent = (struct p2pApi *) handle;
printP2pApiEventHeader(fh, p2pApiEvent);
struct taskEventBase* base = taskEventQueueHead(p2pApiEvent);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printP2pApiEventTrailer(fh, p2pApiEvent);
} else if (type == ncclProfileKernelLaunch) {
struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle;
printKernelLaunchEventHeader(fh, kernelLaunchEvent);
printKernelLaunchEventTrailer(fh, kernelLaunchEvent);
} else if (type == ncclProfileGroup) {
struct group* g = (struct group *)handle;
printGroupEventHeader(fh, g);
struct taskEventBase* base = taskEventQueueHead(g);
+50
Datei anzeigen
@@ -0,0 +1,50 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef QUEUE_H
#define QUEUE_H
// Intrusive singly linked FIFO of T objects.
// The link between elements is the T* member selected by the `next`
// pointer-to-member template argument, so the queue itself only stores the
// first and last element.
template<typename T, T *T::*next>
struct profilerQueue {
  T *head;  // first element
  T *tail;  // last element
};
// Resets the queue to the empty state: both head and tail become null.
template<typename T, T *T::*next>
inline void profilerQueueConstruct(profilerQueue<T,next> *me) {
  me->head = me->tail = nullptr;
}
// Returns true when the queue has no head element.
template<typename T, T *T::*next>
inline bool profilerQueueEmpty(profilerQueue<T,next> *me) {
  const bool hasHead = (me->head != nullptr);
  return !hasHead;
}
// Returns the first element without removing it (null when head is unset).
template<typename T, T *T::*next>
inline T* profilerQueueHead(profilerQueue<T,next> *me) {
  T *first = me->head;
  return first;
}
// Returns the last element without removing it (null when tail is unset).
template<typename T, T *T::*next>
inline T* profilerQueueTail(profilerQueue<T,next> *me) {
  T *last = me->tail;
  return last;
}
// Appends x at the end of the queue.
// x's link member is cleared first; x then becomes the head if the queue has no
// head yet, otherwise it is linked behind the current tail. In both cases x is
// the new tail.
template<typename T, T *T::*next>
inline void profilerQueueEnqueue(profilerQueue<T,next> *me, T *x) {
  x->*next = nullptr;
  if (me->head == nullptr) {
    me->head = x;
  } else {
    me->tail->*next = x;
  }
  me->tail = x;
}
// Removes and returns the first element of the queue.
// Returns nullptr and leaves the queue untouched when it is empty; previously an
// empty queue caused a null-pointer dereference. When the removed element was the
// only one, tail is reset to null as well so the queue is fully empty again.
template<typename T, T *T::*next>
inline T* profilerQueueDequeue(profilerQueue<T,next> *me) {
  T *ans = me->head;
  if (ans == nullptr) return nullptr;  // nothing to dequeue
  me->head = ans->*next;
  if (me->head == nullptr) me->tail = nullptr;
  return ans;
}
#endif
+22
Datei anzeigen
@@ -0,0 +1,22 @@
# Fetches the CoMMA sources on demand and builds them with cargo.
# None of the targets below produce a file named after themselves, so all of
# them are declared phony.
.PHONY: all build-CoMMA clone-CoMMA clean delete

all: build-CoMMA

build-CoMMA: clone-CoMMA
	cd CoMMA && cargo build

# Clone only when the checkout is missing. $(CURDIR) is set by make itself
# (unlike $(PWD), which is inherited from the environment and is stale under
# `make -C`). The symlink is created only if the clone succeeded.
clone-CoMMA:
	@if [ ! -d CoMMA ] ; then \
	  git clone https://github.com/google/CoMMA.git && \
	  ln -s $(CURDIR)/.. CoMMA/third_party/nccl/ext-profiler; \
	fi

# Remove cargo build artifacts but keep the checkout.
clean:
	@if [ -d CoMMA ] ; then \
	  cd CoMMA && cargo clean; \
	fi

# Remove the checkout entirely.
delete:
	@if [ -d CoMMA ] ; then \
	  rm -rf CoMMA; \
	fi
+62
Datei anzeigen
@@ -0,0 +1,62 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Variables
NCCL_HOME := ../../build
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO := libnccl-profiler-inspector.so
VERSION_FILE := version.cc

# Compiler and flags
CXX := g++
CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra
ifeq ($(DEBUG), 1)
CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer
endif
ifeq ($(ASAN), 1)
CXXFLAGS += -fsanitize=address
LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif
ifeq ($(UBSAN), 1)
CXXFLAGS += -fsanitize=undefined
LDFLAGS += -fsanitize=undefined -static-libubsan
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
endif

# Source files
SOURCES := inspector_plugin.cc inspector.cc json.cc

# Default target
all: $(PLUGIN_SO)

# Rule to build the plugin.
# Compilation and linking happen in one step, so LDFLAGS (which carries the
# sanitizer link options) must be passed here as well.
$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES)
	@echo "Compiling to create $@ from $^"
	$(CXX) $(INC) $(CXXFLAGS) $(LDFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^

# Rule to generate version.cc.
# FORCE makes the recipe run on every build so the git information is
# re-checked; the file is only replaced when its content changes, so the plugin
# is not rebuilt needlessly. `cmp -s` stays silent when version.cc does not
# exist yet (first build).
$(VERSION_FILE): FORCE
	@GIT_INFO=$$(./utils/extract_git_version.sh); \
	echo '#include "version.h"' > $(VERSION_FILE).tmp; \
	echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \
	if ! cmp -s $(VERSION_FILE).tmp $(VERSION_FILE); then \
	  echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \
	  mv $(VERSION_FILE).tmp $(VERSION_FILE); \
	else \
	  echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \
	  rm $(VERSION_FILE).tmp; \
	fi

# Empty rule used as an always-out-of-date prerequisite
FORCE:

# Clean target
clean:
	rm -f $(VERSION_FILE) $(PLUGIN_SO)

# Phony targets
.PHONY: all clean FORCE
+216
Datei anzeigen
@@ -0,0 +1,216 @@
# NCCL Inspector Plugin
The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation.
## Related Documentation
- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs
## Folder Location
The Inspector plugin source is located in:
```
ext-profiler/inspector/
```
## Building the Inspector Plugin
To build the Inspector plugin, run:
```bash
make
```
By default the Makefile looks for NCCL in `../../build` (relative to the plugin directory) and for CUDA under `$CUDA_HOME`. To use other locations, pass them as make arguments, for example `make NCCL_HOME=/path/to/nccl/build CUDA_HOME=/usr/local/cuda`. `CUDA_HOME` can also be supplied through the environment.
### Build Options
The Makefile supports several build options:
- **DEBUG=1**: Enable debug build with additional debugging information
- **ASAN=1**: Enable Address Sanitizer for memory error detection
- **UBSAN=1**: Enable Undefined Behavior Sanitizer
Example debug build:
```bash
make DEBUG=1
```
### Build Output
The build process creates:
- `libnccl-profiler-inspector.so`: The main inspector plugin library
- `version.cc`: Auto-generated version information from git
## Using NCCL Inspector
### Key Differences from Normal NCCL Usage
The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging:
**Normal NCCL Run:**
```bash
# Standard NCCL execution
./your_nccl_application
```
**NCCL Inspector Run:**
```bash
# NCCL Inspector enabled execution
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
./your_nccl_application
```
### Required Environment Variables
- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so`
Loads the Inspector plugin into NCCL.
- `NCCL_INSPECTOR_ENABLE=1`
Enables the Inspector plugin.
- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=<interval>`
Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`.
- `NCCL_INSPECTOR_DUMP_DIR=<output_dir>` (optional)
Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-<slurm_job_id>` if running under SLURM.
- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional)
Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default).
### Example Usage
**Single Node:**
```bash
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8
```
**Multi-Node (SLURM):**
```bash
# Add these environment variables to your SLURM script
export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so
export NCCL_INSPECTOR_ENABLE=1
export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500
export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/
# Then run your normal NCCL application
srun your_nccl_application
```
## Example Scripts
For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory:
- **Single Node Example**: Basic NCCL performance testing with inspector
- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations
- **Training Workload Example**: Integration with distributed training workloads
## Output Example
Each output file contains JSON objects with the following structure:
```json
{
"header": {
"id": "0x7f8c496ae9f661",
"rank": 2,
"n_ranks": 8,
"nnodes": 1
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "",
"rec_mechanism": "profiler_plugin",
"dump_timestamp_us": 1748030377748202,
"hostname": "example-hostname",
"pid": 1639453
},
"coll_perf": {
"coll": "AllReduce",
"coll_sn": 1407,
"coll_msg_size_bytes": 17179869184,
"coll_exec_time_us": 61974,
"coll_algobw_gbs": 277.210914,
"coll_busbw_gbs": 485.119099
}
}
```
## Output Example Verbose
To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable:
```bash
export NCCL_INSPECTOR_DUMP_VERBOSE=1
```
This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event.
```json
{
"header": {
"id": "0xe62dedaa97644a",
"rank": 4,
"n_ranks": 8,
"nnodes": 1
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "9019a1912-dirty",
"rec_mechanism": "nccl_profiler_interface",
"dump_timestamp_us": 1752867229276385,
"hostname": "example-hostname",
"pid": 438776
},
"coll_perf": {
"coll": "ReduceScatter",
"coll_sn": 1231,
"coll_msg_size_bytes": 2147483648,
"coll_exec_time_us": 41057,
"coll_timing_source": "kernel_gpu",
"coll_algobw_gbs": 418.439467,
"coll_busbw_gbs": 366.134533,
"event_trace_sn": {
"coll_start_sn": 1,
"coll_stop_sn": 2,
"kernel_events": [
{
"channel_id": 0,
"kernel_start_sn": 3,
"kernel_stop_sn": 48,
"kernel_record_sn": 47
}
]
},
"event_trace_ts": {
"coll_start_ts": 1752867229235059,
"coll_stop_ts": 1752867229235064,
"kernel_events": [
{
"channel_id": 0,
"kernel_start_ts": 1752867229235181,
"kernel_stop_ts": 1752867229275811,
"kernel_record_ts": 1752867229275811
}
]
}
}
}
```
Multiple such JSON objects are written, one per collective operation per communicator.
## Output Directory
- By default, output files are written to:
- `nccl-inspector-unknown-jobid` (if no SLURM job ID is present)
- `nccl-inspector-<slurm_job_id>` (if running under SLURM)
- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable.
## Additional Notes
- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments.
- For more details, see the source code and comments in `ext-profiler/inspector/`.
@@ -0,0 +1,151 @@
# NCCL Inspector Performance Summary Exporter
This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries.
Similar exporters can be built to integrate with observability systems such as Elastic, Prometheus, or other custom metric systems.
## Features
- **Performance Analysis**: Generates statistical summaries for collective operations
- **Communication Type Classification**: Automatically categorizes communication patterns
- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics
- **Data Export**: Converts logs to Parquet format for efficient processing
- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files
- **Parallel Processing**: Utilizes multi-core processing for faster analysis
## Requirements
- Python 3.7+
- Access to NCCL Inspector log files
## Installation
### Clone the Repository
```bash
git clone https://github.com/NVIDIA/nccl.git
cd nccl/ext-profiler/inspector/exporter/example
```
Install the required dependencies using the provided `requirements.txt` file:
```bash
pip install -r requirements.txt
```
## Usage
The script processes NCCL Inspector log files from a specified directory.
**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files will be output to a directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../README.md).
### Basic Usage
```bash
python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs
```
This mode processes all log files in the specified directory and its subdirectories recursively.
### Command Line Arguments
- `--input_dir <path>`: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories)
- `--output_dir <name>`: **Optional**. Custom output directory name (default: `<input_directory_name>-analysis`)
## Output
The tool generates:
1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory)
2. **Summary Directory**: Contains comprehensive analysis results
3. **Visualizations**: Scatter plots, histograms, and box plots for each message size
4. **CSV Files**: Detailed summaries for each message size and collective type
5. **Log File**: Processing log with detailed information
## Example Output Structure
```
<output_dir_name>/
├── output.log
├── parquet_files/
│ ├── <filename1>.parquet
│ ├── <filename2>.parquet
│ └── ...
└── summary/
├── scatter_plot_<comm_type>_<coll_type>.png
├── combined_scatter_plot_<comm_type>_<coll_type>.png
└── msg_size_<human_readable_size>/
├── histograms/
│ └── histogram_<comm_type>_<coll_type>_<size>.png
├── boxplots/
│ └── boxplot_<comm_type>_<coll_type>_<size>.png
└── summary_<comm_type>_<coll_type>_<size>.csv
```
## Supported Communicator Types
- `single-rank`
- `nvlink-only`
- `hca-only`
- `mixed`
## Supported Collective Types
- `AllReduce`
- `AllGather`
- `ReduceScatter`
- `Broadcast`
## Log File Formats
### Supported Formats
- `.log` - Plain text JSON lines
- `.log.gz` - Compressed JSON lines
- `.jsonl` - JSON lines format
- `.jsonl.gz` - Compressed JSON lines
### Expected JSON Structure
```json
{
"header": {
"id": "0x9e7a479f95a66c",
"rank": 31,
"n_ranks": 32,
"nnodes": 4
},
"metadata": {
"inspector_output_format_version": "v4.0",
"git_rev": "75e61acda-dirty",
"rec_mechanism": "nccl_profiler_interface",
"dump_timestamp_us": 1749490229087081,
"hostname": "example-hostname",
"pid": 468528
},
"coll_perf": {
"coll": "ReduceScatter",
"coll_sn": 129,
"coll_msg_size_bytes": 65536,
"coll_exec_time_us": 110,
"coll_timing_source": "kernel_gpu",
"coll_algobw_gbs": 19.065018,
"coll_busbw_gbs": 18.469236
}
}
```
## Troubleshooting
### Common Issues
1. **No log files found**: Ensure the log directory path is correct and contains valid log files
2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment
3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings.
### Log Files
The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages.
## Support
Please refer to the GitHub issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and mention the "inspector plugin" in the title.
@@ -0,0 +1,548 @@
from pathlib import Path
import argparse
import glob
import gzip
import sys
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
import json
from tqdm.auto import tqdm
import duckdb
import math
import matplotlib.pyplot as plt
import matplotlib.dates
from matplotlib.gridspec import GridSpec
import os
import logging
import contextlib
from datetime import datetime
import numpy as np
def setup_logging(output_dir):
    """Configure the root logger to write INFO-and-above records to ``output.log``.

    Args:
        output_dir: Directory (a ``Path``-like supporting ``/``) in which the
            log file is created.
    """
    logging.basicConfig(
        filename=output_dir / "output.log",
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
@contextlib.contextmanager
def smart_open(filename, mode="r"):
    """Open a plain or gzip-compressed file as a context manager.

    Files whose name ends in ``.gz`` are opened with ``gzip.open``, everything
    else with the built-in ``open``.

    Args:
        filename: Path of the file to open (``str`` or ``os.PathLike``).
        mode: File mode. For ``.gz`` files a mode without an explicit ``"b"`` or
            ``"t"`` is opened in text mode, so both kinds of file yield ``str``
            for the same mode (``gzip.open`` alone would default to binary).

    Yields:
        The opened file object; it is closed when the ``with`` block exits.
    """
    filename = os.fspath(filename)
    if filename.endswith(".gz"):
        opener = gzip.open
        if "b" not in mode and "t" not in mode:
            mode += "t"
    else:
        opener = open
    with opener(filename, mode) as f:
        yield f
def get_log_files_and_output_dir():
    """Parse the command line and collect the inspector log files to process.

    ``--input_dir`` is mandatory (argparse reports a usage error and exits when
    it is missing); ``--output_dir`` is optional.

    Returns:
        A ``(files, output_dir_name)`` tuple: the list of log file paths found
        recursively under the input directory, and the output directory name
        (``--output_dir`` if given, otherwise ``<input_dir_name>-analysis``).

    Raises:
        FileNotFoundError: If the input directory does not exist.
        SystemExit: If more than one of the supported log formats is present,
            or if no log file is found at all.
    """
    parser = argparse.ArgumentParser(description="Process log files in a directory.")
    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="The directory containing NCCL Inspector log files to process.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Custom output directory name (default: auto-generated from input directory)."
    )
    args = parser.parse_args()

    root_dir = Path(args.input_dir)
    if not root_dir.exists():
        raise FileNotFoundError(f"Input directory not found: {root_dir}")

    logfiles = list(glob.iglob(str(Path(root_dir) / "**" / "*.log"), recursive=True))
    gzlogfiles = list(
        glob.iglob(str(Path(root_dir) / "**" / "*.log.gz"), recursive=True)
    )
    jsonlfiles = list(
        glob.iglob(str(Path(root_dir) / "**" / "*.jsonl"), recursive=True)
    )
    gzjsonlfiles = list(
        glob.iglob(str(Path(root_dir) / "**" / "*.jsonl.gz"), recursive=True)
    )

    if (
        sum((1 for x in [logfiles, gzlogfiles, jsonlfiles, gzjsonlfiles] if len(x) > 0))
        > 1
    ):
        ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail
        logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!")
        sys.exit(1)

    files = logfiles + gzlogfiles + jsonlfiles + gzjsonlfiles
    if not files:
        print("No inspector logs found")
        sys.exit(1)

    # Generate output directory name from input directory
    if args.output_dir:
        output_dir_name = args.output_dir
    else:
        output_dir_name = f"{root_dir.name}-analysis"

    return files, output_dir_name
def bytes_to_human_readable(size_bytes):
    """Format a byte count with decimal (SI, power-of-1000) unit suffixes.

    1 KB is 1,000 bytes, 1 MB is 1,000,000 bytes, and so on; binary units
    (KiB/MiB/GiB, powers of 1024) are not used.

    Args:
        size_bytes: Number of bytes to convert.

    Returns:
        The value with two decimals followed by its unit, e.g. ``"1.50MB"``;
        ``"0B"`` for zero.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    if size_bytes == 0:
        return "0B"
    # Every three decimal digits move one step up the unit table.
    unit_index = int(math.log10(int(size_bytes)) / 3)
    scaled = round(size_bytes * math.pow(10, -3 * unit_index), 2)
    return "{:.2f}{}".format(scaled, units[unit_index])
def timestamp_to_datetime(timestamp_us):
    """Render a microsecond timestamp as ``YYYY-MM-DD HH:MM:SS.mmm``.

    The timestamp is interpreted by ``datetime.fromtimestamp`` without an
    explicit timezone; the fractional part is cut to milliseconds.
    """
    moment = datetime.fromtimestamp(timestamp_us / 1000000)
    with_micros = moment.strftime('%Y-%m-%d %H:%M:%S.%f')
    # Drop the last three of the six %f digits to keep milliseconds only.
    return with_micros[:-3]
def microseconds_to_human_readable(microseconds):
    """Format a duration given in microseconds with one decimal and a unit.

    Values below 1,000 stay in microseconds, values below 1,000,000 are shown
    in milliseconds, and anything larger in seconds.
    """
    if microseconds >= 1000000:
        return f"{microseconds/1000000:.1f}s"
    if microseconds >= 1000:
        return f"{microseconds/1000:.1f}ms"
    return f"{microseconds:.1f}μs"
def get_comm_type(row) -> str:
    """Classify a communicator from its ``n_ranks`` and ``nnodes`` values.

    Args:
        row: Mapping providing ``"n_ranks"`` and ``"nnodes"``.

    Returns:
        ``"single-rank"`` for one rank, ``"nvlink-only"`` for one node,
        ``"hca-only"`` when the rank count equals the node count, and
        ``"mixed"`` otherwise. The checks are applied in that order.
    """
    if row["n_ranks"] == 1:
        return "single-rank"
    if row["nnodes"] == 1:
        return "nvlink-only"
    if row["n_ranks"] == row["nnodes"]:
        return "hca-only"
    return "mixed"
def parse_file(filepath: Path, output_dir):
    """Convert one inspector log file into ``<stem>.parquet`` inside *output_dir*.

    The conversion is skipped when an existing parquet file is at least as new
    as the source, when the source is empty, when reading fails, or when no
    line yields a valid record. Each valid line must be a JSON object holding
    ``header``, ``metadata`` and ``coll_perf``; these are flattened into one
    row together with the derived ``comm_type``.

    Args:
        filepath: Path of the log file (plain or ``.gz``).
        output_dir: Directory (``Path``) receiving the parquet file.
    """
    source = Path(filepath)
    parquet_file = output_dir / f"{source.stem}.parquet"

    # Reuse an existing parquet file unless the source has changed since.
    if parquet_file.exists():
        source_mtime = source.stat().st_mtime
        parquet_mtime = parquet_file.stat().st_mtime
        if parquet_mtime >= source_mtime:
            logging.info(f"Parquet file {parquet_file} is up to date. Skipping...")
            return
        logging.info(f"Source file {filepath} is newer than parquet. Regenerating...")

    if source.stat().st_size == 0:
        logging.warning(f"Skipping empty file: {filepath}")
        return

    required_keys = ["header", "metadata", "coll_perf"]
    recs = []
    try:
        with smart_open(filepath, "r") as infile:
            for lineno, line in enumerate(infile):
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    logging.error(f"Failed to parse line {filepath}:{lineno}")
                    continue

                if any(key not in entry for key in required_keys):
                    logging.error(f"Missing required fields in {filepath}:{lineno}")
                    continue

                header = entry["header"]
                metadata = entry["metadata"]
                comm_type = get_comm_type(header)
                coll_perf = entry["coll_perf"]
                row = dict(
                    **header,
                    comm_type=comm_type,
                    **coll_perf,
                    **metadata,
                )
                recs.append(row)
    except Exception as e:
        # Any other failure while reading abandons the whole file.
        logging.error(f"Error reading file {filepath}: {e}")
        return

    if not recs:
        logging.warning(f"No valid records found in file: {filepath}. Skipping...")
        return

    pd.DataFrame(recs).to_parquet(parquet_file)
    logging.info(f"Created parquet file {parquet_file} with {len(recs)} records")
def create_per_node_parquet_files(files, output_dir):
    """Convert every log file to parquet in parallel and return the parquet directory.

    The files are handed to ``parse_file`` through a process pool of at most 64
    workers (never more than the number of files or CPUs), with a progress bar.

    Args:
        files: Paths of the log files to convert.
        output_dir: Root output directory; parquet files go to its
            ``parquet_files`` subdirectory, which is created if needed.

    Returns:
        The ``parquet_files`` directory as a ``Path``.
    """
    parquet_dir = Path(output_dir) / "parquet_files"
    parquet_dir.mkdir(parents=True, exist_ok=True)

    worker_count = min(64, len(files), os.cpu_count() or 1)
    target_dirs = [parquet_dir for _ in files]
    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        results = pool.map(parse_file, files, target_dirs)
        progress = tqdm(
            results,
            total=len(files),
            desc="Processing files",
            unit="file",
        )
        # Drain the iterator so every task finishes and errors surface here.
        for _ in progress:
            pass

    return parquet_dir
def generate_scatter_plot(df, comm_type, coll_type, output_file):
    """Save a scatter plot of mean bus bandwidth against operation sequence number.

    One series is drawn per distinct ``coll_msg_size_bytes`` value, labelled
    with the human-readable size and that series' mean bandwidth.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the image.
    """
    plt.figure(figsize=(10, 6), dpi=100)

    for msg_size in df["coll_msg_size_bytes"].unique():
        subset = df[df["coll_msg_size_bytes"] == msg_size]
        bandwidth = subset["mean_coll_busbw_gbs"]
        mean_busbw = bandwidth.mean()
        series_label = f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)"
        plt.scatter(subset["coll_sn"], bandwidth, label=series_label, alpha=0.5)

    plt.xlabel("Operation Sequence Number")
    plt.ylabel("Mean Collective Bus BW (GB/s)")
    plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}")
    plt.legend(title="Message Size", loc="upper right")
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Scatter plot saved to {output_file}")
def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3):
    """Save a grid of scatter plots, one subplot per distinct message size.

    Each subplot shows mean bus bandwidth against operation sequence number
    for one ``coll_msg_size_bytes`` value.

    Args:
        df: DataFrame with ``coll_msg_size_bytes``, ``coll_sn`` and
            ``mean_coll_busbw_gbs`` columns.
        comm_type: Communicator type shown in the figure title.
        coll_type: Collective type shown in the figure title.
        output_file: Destination of the image.
        max_cols: Maximum number of subplot columns.
    """
    msg_sizes = df["coll_msg_size_bytes"].unique()
    plot_count = len(msg_sizes)

    # Grid layout: at most max_cols columns, as many rows as needed (ceiling division).
    num_cols = min(max_cols, plot_count)
    num_rows = (plot_count + num_cols - 1) // num_cols

    fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100)
    grid = GridSpec(num_rows, num_cols, figure=fig)

    for index, msg_size in enumerate(msg_sizes):
        grid_row, grid_col = divmod(index, num_cols)
        ax = fig.add_subplot(grid[grid_row, grid_col])

        subset = df[df["coll_msg_size_bytes"] == msg_size]
        bandwidth = subset["mean_coll_busbw_gbs"]
        mean_busbw = bandwidth.mean()
        ax.scatter(
            subset["coll_sn"],
            bandwidth,
            label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)",
            alpha=0.5,
        )

        ax.set_xlabel("Op Seq No")
        ax.set_ylabel("Mean Collective Bus BW (GB/s)")
        ax.set_title(f"Message Size: {bytes_to_human_readable(msg_size)}({msg_size})")
        ax.legend(loc="upper right")

    fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98)
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Combined scatter plot saved to {output_file}")
def generate_histogram(df, comm_type, coll_type, output_file, message_size):
    """Save a histogram of the ``mean_coll_busbw_gbs`` column.

    The bin count is the integer width of the data range plus one, capped at 50.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the image.
        message_size: Message size text shown in the title.
    """
    plt.figure(figsize=(10, 6), dpi=100)

    bandwidth = df["mean_coll_busbw_gbs"]
    data_range = bandwidth.max() - bandwidth.min()
    num_bins = min(50, int(data_range) + 1)

    bar_style = dict(alpha=0.7, color="b", edgecolor="black", linewidth=1.2)
    plt.hist(bandwidth, bins=num_bins, **bar_style)

    plt.xlabel("Mean Collective Bus BW (GB/s)")
    plt.ylabel("Frequency")
    plt.title(
        f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}"
    )

    # Whole numbers on the frequency axis, two decimals plus unit on the bandwidth axis.
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}"))
    plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"{x:.2f} GB/s"))
    plt.gca().xaxis.get_offset_text().set_visible(False)

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Histogram saved to {output_file}")
def generate_boxplot(df, comm_type, coll_type, output_file, message_size):
    """Save a horizontal box plot of the ``mean_coll_busbw_gbs`` column.

    The minimum, median and maximum are additionally marked with arrow
    annotations.

    Args:
        df: DataFrame with a ``mean_coll_busbw_gbs`` column.
        comm_type: Communicator type shown in the title.
        coll_type: Collective type shown in the title.
        output_file: Destination of the image.
        message_size: Message size text shown in the title.
    """
    plt.figure(figsize=(10, 6))
    bandwidth = df["mean_coll_busbw_gbs"]

    plt.boxplot(
        bandwidth,
        vert=False,
        patch_artist=True,
        boxprops=dict(linestyle="-", linewidth=2, color="blue"),
        flierprops=dict(marker="o", color="red", alpha=0.5),
        medianprops=dict(linestyle="-", linewidth=2.5, color="orange"),
        whiskerprops=dict(linestyle="--", linewidth=2, color="green"),
        capprops=dict(linestyle="-", linewidth=2, color="black"),
    )

    plt.xlabel("Mean Coll Bus BW (GB/s)")
    plt.title(
        f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})"
    )

    # Label min, median and max; the text sits slightly above the box at y=1.
    stats = bandwidth.describe(percentiles=[0.5])
    for caption, stat_key in (("Min", "min"), ("Median", "50%"), ("Max", "max")):
        value = stats[stat_key]
        plt.annotate(
            f"{caption}: {value:.2f}",
            xy=(value, 1),
            xytext=(value, 1.1),
            arrowprops=dict(facecolor="black", shrink=0.05),
        )

    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
    logging.info(f"Box plot saved to {output_file}")
def _quarantine_parquet_file(pf, invalid_dir):
    """Move ``pf`` into ``invalid_dir`` without ever raising.

    The summary step runs in several worker processes that all scan the same
    parquet directory, so another worker may already have moved the file.
    A missing source is therefore treated as "already handled".
    """
    try:
        pf.rename(invalid_dir / pf.name)
    except FileNotFoundError:
        logging.debug(f"Parquet file {pf} was already moved by another worker")
    except OSError as e:
        logging.error(f"Could not move parquet file {pf} to {invalid_dir}: {e}")


def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name):
    """Summarize parquet data per communication and collective type using DuckDB.

    Args:
        output_root: Path whose ``parquet_files`` sub-directory holds the inputs.
        comm_type: value matched against the ``comm_type`` column.
        coll_type: value matched against the ``coll`` column.
        output_dir_name: only used in log messages.

    Returns:
        A DataFrame with one row per (id, coll_sn, coll_msg_size_bytes) group,
        or ``None`` when there is no usable input, the query fails, or the
        query yields no rows.
    """
    logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}")
    # Check if there are any parquet files
    parquet_dir = output_root / "parquet_files"
    parquet_files = list(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        logging.warning(f"No parquet files found for {comm_type} and {coll_type}")
        return None

    # Clean up invalid/empty parquet files by moving them to a separate directory
    invalid_dir = parquet_dir / "invalid"
    invalid_dir.mkdir(exist_ok=True)
    invalid_count = 0
    for pf in parquet_files:
        try:
            # Check file size first
            if pf.stat().st_size == 0:
                logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory")
                _quarantine_parquet_file(pf, invalid_dir)
                invalid_count += 1
                continue
            # Use pyarrow to check parquet metadata without reading data.
            # Imported lazily so the zero-byte fast path does not need pyarrow.
            import pyarrow.parquet as pq
            parquet_file = pq.ParquetFile(pf)
            if parquet_file.metadata.num_rows == 0:
                logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory")
                _quarantine_parquet_file(pf, invalid_dir)
                invalid_count += 1
        except Exception as e:
            # The move itself must not raise here: an exception escaping this
            # handler would abort the whole worker process.
            logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}")
            _quarantine_parquet_file(pf, invalid_dir)
            invalid_count += 1

    # Check if any valid files remain
    remaining_files = list(parquet_dir.glob("*.parquet"))
    if not remaining_files:
        logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)")
        return None
    logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)")

    try:
        duckdb.execute(
            f"CREATE OR REPLACE VIEW logs AS SELECT * FROM read_parquet('{parquet_dir}/*.parquet')"
        )
        df = duckdb.execute(f"""
            SELECT
                id,
                coll_sn,
                coll_msg_size_bytes,
                AVG(coll_busbw_gbs) as mean_coll_busbw_gbs,
                COUNT(*) as log_count,
                ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks,
                ARRAY_DISTINCT(LIST(nnodes)) as nnodes,
                MIN(dump_timestamp_us) as coll_start_timestamp_us,
                MAX(dump_timestamp_us) as coll_end_timestamp_us,
                (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us
            FROM logs
            WHERE coll = '{coll_type}' and comm_type = '{comm_type}'
            GROUP BY id, coll_sn, coll_msg_size_bytes
            ORDER BY coll_sn
        """).df()
    except Exception as e:
        logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}")
        return None

    if df.empty:
        logging.info(f"No data for {comm_type} and {coll_type}")
        return None

    # Add human-readable formatting
    df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply(
        bytes_to_human_readable
    )

    # Log the time range of the first row as an example (df is non-empty here)
    sample_row = df.iloc[0]
    start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us'])
    end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us'])
    duration = microseconds_to_human_readable(sample_row['coll_duration_us'])
    logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, "
                 f"Start: {start_time}, End: {end_time}, Duration: {duration}")
    return df
def generate_visualizations(df, output_root, comm_type, coll_type):
    """Write every plot and per-message-size CSV for one (comm_type, coll_type) pair.

    Outputs go under ``output_root / "summary"``: two scatter plots for the
    whole frame, then one sub-directory per distinct ``coll_msg_size_bytes``
    value holding a histogram, a box plot and a CSV of the matching rows.
    """
    logging.info(f"Generating visualizations for {comm_type} and {coll_type}")
    summary_dir = output_root / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Plots covering all message sizes at once
    tag = f"{comm_type}_{coll_type}"
    generate_scatter_plot(
        df, comm_type, coll_type, summary_dir / f"scatter_plot_{tag}.png"
    )
    generate_combined_scatter_plot(
        df, comm_type, coll_type, summary_dir / f"combined_scatter_plot_{tag}.png"
    )

    for msg_size in df["coll_msg_size_bytes"].unique():
        hr_msg_size = bytes_to_human_readable(msg_size)
        msg_size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}"
        hist_dir = msg_size_dir / "histograms"
        boxplot_dir = msg_size_dir / "boxplots"
        for directory in (msg_size_dir, hist_dir, boxplot_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Work on a copy so the extra columns do not leak into the caller's frame
        per_size = df[df["coll_msg_size_bytes"] == msg_size].copy()
        per_size["coll_start_datetime"] = per_size["coll_start_timestamp_us"].apply(timestamp_to_datetime)
        per_size["coll_end_datetime"] = per_size["coll_end_timestamp_us"].apply(timestamp_to_datetime)
        per_size["coll_duration_human"] = per_size["coll_duration_us"].apply(microseconds_to_human_readable)

        generate_histogram(
            per_size,
            comm_type,
            coll_type,
            hist_dir / f"histogram_{tag}_{msg_size}.png",
            bytes_to_human_readable(msg_size),
        )
        generate_boxplot(
            per_size,
            comm_type,
            coll_type,
            boxplot_dir / f"boxplot_{tag}_{msg_size}.png",
            bytes_to_human_readable(msg_size),
        )

        csv_path = msg_size_dir / f"summary_{tag}_{msg_size}.csv"
        per_size.to_csv(csv_path, index=False)
        logging.info(
            f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {csv_path}"
        )
def generate_summary(output_root, comm_type, coll_type, output_dir_name):
    """Summarize one (comm_type, coll_type) pair and, if rows exist, plot it."""
    logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}")

    summary_df = summarize_data_per_comm_coll_type(
        output_root, comm_type, coll_type, output_dir_name
    )
    if summary_df is None:
        # Nothing to visualize for this combination
        logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation")
        return

    generate_visualizations(summary_df, output_root, comm_type, coll_type)
def generate_summary_wrapper(args):
    """Adapter for ``executor.map``: unpack one argument tuple into generate_summary."""
    packed = tuple(args)
    return generate_summary(*packed)
if __name__ == "__main__":
    # Discover inputs and prepare the output location
    files, output_dir_name = get_log_files_and_output_dir()
    print(f"Number of log files found: {len(files)}")
    print(f"Output directory: {output_dir_name}")
    output_dir = Path(output_dir_name)
    output_dir.mkdir(parents=True, exist_ok=True)
    setup_logging(output_dir)

    # Stage 1: convert the raw logs into parquet files
    create_per_node_parquet_files(files, output_dir)

    # Stage 2: one summary task per (communication type, collective type) pair
    comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"]
    coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"]
    summary_args = [
        (output_dir, comm_type, coll_type, output_dir_name)
        for comm_type in comm_types
        for coll_type in coll_types
    ]

    # Never start more workers than tasks or CPUs, capped at 64
    cpu_budget = os.cpu_count() or 1
    max_workers = min(64, len(summary_args), cpu_budget)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        pending = executor.map(generate_summary_wrapper, summary_args)
        progress = tqdm(pending, total=len(summary_args), desc="Generating summaries")
        # Drain the iterator so every task runs and worker exceptions surface
        list(progress)
    print("Done!")
@@ -0,0 +1,6 @@
pandas>=1.3.0
tqdm>=4.60.0
duckdb>=0.8.0
matplotlib>=3.3.0
pyarrow>=5.0.0
numpy>=1.21.0
Datei-Diff unterdrückt, da er zu groß ist Diff laden
+198
Datei anzeigen
@@ -0,0 +1,198 @@
#pragma once
#include <pthread.h>
#include "json.h"
#include "common.h"
#include "version.h"
#define MAX_CHANNELS 64
/*
 * Evaluate `call` once, store its result in `res`, and on anything other than
 * inspectorSuccess log the failure and jump to `label`.
 *
 * The expansion deliberately has no trailing semicolon: the caller's own `;`
 * completes the statement, so the macro is safe inside unbraced if/else.
 */
#define INS_CHK_GOTO(call, res, label)                                        \
  do {                                                                        \
    (res) = (call);                                                           \
    if (inspectorSuccess != (res)) {                                          \
      INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, (res), \
           inspectorErrorString(res));                                        \
      goto label;                                                             \
    }                                                                         \
  } while (0)
// Identifiers for the operation kinds the inspector distinguishes.
// NOTE(review): the numeric values presumably have to match NCCL's own
// function numbering — confirm against the NCCL headers before reordering.
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncSendRecv = 5,
ncclFuncSend = 6,
ncclFuncRecv = 7,
ncclNumFuncs = 8 // number of entries above; not an operation itself
} ncclFunc_t;
// Result codes returned by inspector functions. inspectorSuccess (0) is the
// only value INS_CHK_GOTO treats as success.
typedef enum {
inspectorSuccess = 0,
inspectorUninitializedError,
inspectorMemoryError,
inspectorFileOpenError,
inspectorDisabledError,
inspectorLockError,
inspectorPthreadError,
inspectorJsonError,
inspectorCudaError,
inspectorBadHash,
inspectorDeleteUnknownCommError,
inspectorAddDuplicateCommError,
inspectorNop,
inspectorNullTally,
inspectorGlobalInitError,
// Not an error: inspectorPluginCollInfoDeRef returns this after it has freed
// the object, telling the caller not to touch (or unlock) it any more.
inspectorReturn,
} inspectorResult_t;
// Records which clock a collective's execution time was derived from
// (stored in inspectorCompletedCollInfo::timingSource).
// NOTE(review): the selection logic lives outside this header; the names
// suggest GPU kernel clock, CPU-side kernel timestamps, or CPU-side
// collective timestamps — confirm in inspectorUpdateCollPerf.
typedef enum {
inspectorTimingSourceKernelGpu = 0,
inspectorTimingSourceKernelCpu = 1,
inspectorTimingSourceCollectiveCpu = 2,
} inspectorTimingSource_t;
// One recorded trace point.
struct inspectorEventTraceInfo {
uint64_t ts; // timestamp taken from inspectorGetTime() when the event was recorded
uint64_t sn; // order of this event within its collective (per-collective counter)
};
// Indices into inspectorEventTrkCollInfo::evntTrace (collective-level events).
typedef enum {
NCCL_INSP_EVT_TRK_COLL_START = 0,
NCCL_INSP_EVT_TRK_COLL_STOP = 1,
NCCL_INSP_EVT_TRK_COLL_NEVT = 2, // array size, not an event
} inspectorEventTrkColl_t;
// Indices into inspectorEventTrkKernelInfo::evntTrace (per-channel kernel events).
typedef enum {
NCCL_INSP_EVT_TRK_KERNEL_START = 0,
NCCL_INSP_EVT_TRK_KERNEL_STOP = 1,
NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2, // recorded from the record-event-state callback
NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3, // array size, not an event
} inspectorEventTrkKernel_t;
// Trace points of one kernel channel, indexed by inspectorEventTrkKernel_t.
struct inspectorEventTrkKernelInfo {
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT];
};
// All trace points of one collective: its own start/stop plus one slot per channel.
struct inspectorEventTrkCollInfo {
int sn; // running event counter; pre-incremented for every recorded trace point
uint32_t nChannels; // channel count copied from the collective's event descriptor
struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT]; // indexed by inspectorEventTrkColl_t
struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS]; // indexed by channel id
};
// Snapshot of the most recently completed collective. It is filled by
// inspectorUpdateCollPerf / inspectorComputeCollBw and then copied into the
// owning inspectorCommInfo.
struct inspectorCompletedCollInfo {
ncclFunc_t func;
uint64_t sn;
size_t msgSizeBytes;
uint64_t execTimeUsecs;
inspectorTimingSource_t timingSource; // which clock execTimeUsecs came from
double algoBwGbs;
double busBwGbs;
// Event trace information
struct inspectorEventTrkCollInfo collEvtTrk;
};
// Size in bytes of inspectorCommInfo::commHashStr.
// NOTE(review): presumably 16 hex digits of the 64-bit hash plus a NUL — confirm where the string is formatted.
enum {
NCCL_COMM_HASH_LENGTH = 17
};
// Per-communicator inspector state; this is the plugin "context" handed back
// to NCCL from inspectorPluginInit.
struct inspectorCommInfo {
struct inspectorCommInfo* next; // NOTE(review): presumably links communicators in a list kept outside this header — confirm
const char* commName;
uint64_t commHash;
char commHashStr[NCCL_COMM_HASH_LENGTH];
int rank;
int nranks;
int nnodes;
bool dump; // set to true by the stop-event path once completedCollInfo holds fresh data
struct inspectorCompletedCollInfo completedCollInfo; // last completed collective
pthread_rwlock_t guard; // write-locked by the stop-event path while completedCollInfo/dump are updated
};
// State of one kernel channel of a collective. Instances live inside
// inspectorCollInfo::kernelCh and are handed to NCCL as event handles.
struct inspectorKernelChInfo {
uint64_t type; // must stay the first member: event handles are identified by reading a leading uint64_t
int refCount; /*unused*/
struct inspectorCollInfo *collInfo; // owning collective
uint8_t channelId;
uint64_t tsStartUsec; // inspectorGetTime() at kernel-channel start
uint64_t tsCompletedUsec; // inspectorGetTime() at kernel-channel stop
uint64_t startGpuClk; // pTimer value from the start event descriptor
uint64_t stopGpuClk; // pTimer value from the kernel-channel-stop state record
};
// Reference-counted state of one in-flight collective; handed to NCCL as the
// event handle for ncclProfileColl events.
struct inspectorCollInfo {
uint64_t type; // must stay the first member: event handles are identified by reading a leading uint64_t
int refCount; // object is freed when this drops to zero (see inspectorPluginCollInfoDeRef)
struct inspectorCommInfo *commInfo; // communicator the collective belongs to
const char* func; // copied from the event descriptor (pointer is not owned here)
uint64_t sn; // sequence number from the event descriptor
size_t msgSizeBytes; // datatype size multiplied by element count
uint64_t tsStartUsec;
uint64_t tsCompletedUsec;
uint32_t nChannels; // channel count announced by the event descriptor
uint32_t nKernelChStarted; // kernel-channel start events seen so far
uint32_t nKernelChCompleted; // kernel-channel stop events seen so far
pthread_rwlock_t guard; // protects every field of this struct
struct inspectorKernelChInfo kernelCh[MAX_CHANNELS]; // indexed by channel id
struct inspectorEventTrkCollInfo collEvtTrk;
};
// Logger callback supplied by NCCL; inspectorPluginInit assigns it before any
// of the macros below are used.
extern ncclDebugLogger_t logFn;
// Logging helpers that forward to logFn. Note that INFO passes __func__ where
// VERSION and WARN pass __FILE__.
#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
// Width in bytes of one element of `type`, or -1 when the datatype is not one
// of those listed below. Callers must check for the negative result before
// using it in size arithmetic.
inline int ncclTypeSize(ncclDataType_t type) {
  int bytes = -1;
  switch (type) {
    case ncclInt8:
    case ncclUint8:
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
      bytes = 1;
      break;
    case ncclFloat16:
    case ncclBfloat16:
      bytes = 2;
      break;
    case ncclInt32:
    case ncclUint32:
    case ncclFloat32:
      bytes = 4;
      break;
    case ncclInt64:
    case ncclUint64:
    case ncclFloat64:
      bytes = 8;
      break;
    default:
      break;
  }
  return bytes;
}
// Functions implemented outside this header.

// Human-readable text for a result code (used by INS_CHK_GOTO when logging).
const char* inspectorErrorString(inspectorResult_t result);
// Read/write-lock helpers. NOTE(review): presumably thin wrappers around the
// pthread_rwlock_* calls that translate failures into inspectorResult_t — confirm.
inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef);
inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef);
// Process-wide setup/teardown; the plugin calls these for the first
// initialized and the last finalized communicator respectively.
inspectorResult_t inspectorGlobalInit(int rank);
inspectorResult_t inspectorGlobalFinalize();
// Current time; the plugin stores the result in *Usec fields, so the unit is
// presumably microseconds (confirm in the implementation).
uint64_t inspectorGetTime();
// Register / unregister a communicator. inspectorAddComm returns the new
// per-communicator state through *commInfo.
inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo,
const char* commName, uint64_t commHash,
int nNodes, int nranks, int rank);
inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo);
// Fill *completedColl from a finished collective's bookkeeping.
void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl,
struct inspectorCollInfo *collInfo);
// Parse a datatype name as found in the profiler event descriptor.
ncclDataType_t inspectorStringToDatatype(const char* str);
// Compute the bandwidth figures of *completedColl for the given communicator.
void inspectorComputeCollBw(struct inspectorCommInfo *commInfo,
struct inspectorCompletedCollInfo *completedColl,
ncclFunc_t collType);
@@ -0,0 +1,493 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "profiler.h"
#include "inspector.h"
// Keeps the plugin entry points out of the shared object's exported symbols.
#define __hidden __attribute__ ((visibility("hidden")))
// Number of communicators that have gone through inspectorPluginInit and not
// yet through inspectorPluginFinalize; only modified while gLock is held.
static int gInitialized;
// Serializes global init/finalize and the gInitialized counter.
static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER;
/*
* Description:
* Records an event trace with timestamp and sequence number
*
* Thread Safety:
* Not thread-safe - must be called with proper locking. This function
* is designed to be called from within locked sections where the
* collective info structure is already protected.
*
* Input:
* struct inspectorEventTraceInfo* evtTrace - event trace array
* int eventIndex - index in the event trace array (must be valid)
* struct inspectorCollInfo* collInfo - collective info structure (must not be NULL)
*
* Output:
* Event trace is updated with current timestamp and next sequence
* number from collective
*
* Return:
* uint64_t - the sequence number assigned to this event
*
* Preconditions:
* - collInfo must not be NULL
* - eventIndex must be within valid bounds for evtTrace array
* - Function must be called from within a locked section
*/
static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace,
                                          int eventIndex,
                                          struct inspectorCollInfo* collInfo) {
  struct inspectorEventTraceInfo* slot = &evtTrace[eventIndex];
  // Timestamp first, then hand out the next per-collective sequence number.
  slot->ts = inspectorGetTime();
  collInfo->collEvtTrk.sn += 1;
  slot->sn = collInfo->collEvtTrk.sn;
  return slot->sn;
}
/*
* Description:
*
* Initializes the NCCL Inspector plugin and global state for a
* communicator.
*
* Thread Safety:
* Thread-safe (uses mutex for initialization).
*
* Input:
* void** context - pointer to plugin context.
* int* eActivationMask - pointer to activation mask output.
* const char* commName - communicator name.
* uint64_t commHash - communicator hash.
* int nNodes - number of nodes.
* int nranks - number of ranks.
* int rank - rank.
* ncclDebugLogger_t logfn - logger function pointer.
*
* Output:
* context is set to plugin context; eActivationMask is set.
*
* Return:
* ncclResult_t - success or error code.
*
*/
__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash,
                                          int* eActivationMask,
                                          const char* commName,
                                          int nNodes, int nranks, int rank,
                                          ncclDebugLogger_t logfn) {
  inspectorResult_t res = inspectorSuccess;
  *context = nullptr;
  logFn = logfn;

  // The first communicator to arrive performs the process-wide setup.
  pthread_mutex_lock(&gLock);
  gInitialized += 1;
  if (gInitialized == 1) {
    res = inspectorGlobalInit(rank);
    if (res != inspectorSuccess) {
      WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res,
           inspectorErrorString(res));
      // Roll the counter back so a later init attempt retries the global setup.
      gInitialized = 0;
      pthread_mutex_unlock(&gLock);
      return ncclInternalError;
    }
  }
  pthread_mutex_unlock(&gLock);

  // Register this communicator; its state becomes the plugin context.
  INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context,
                                commName, commHash,
                                nNodes, nranks, rank), res, done);
  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
  INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d",
       commName ? commName : "", commHash, nranks, rank);

done:
  return (res == inspectorSuccess) ? ncclSuccess : ncclInternalError;
}
/*
* Description:
*
* Finalizes the NCCL Inspector plugin and global state for a
* communicator.
*
* Thread Safety:
* Thread-safe (uses mutex for finalization).
*
* Input:
* void* context - plugin context.
*
* Output:
* Plugin context is finalized and cleaned up.
*
* Return:
* ncclResult_t - success or error code.
*
*/
__hidden ncclResult_t inspectorPluginFinalize(void* context) {
  // Unregister this communicator first (result intentionally not propagated).
  inspectorDelComm((struct inspectorCommInfo *)context);

  // The last communicator to leave tears down the process-wide state.
  pthread_mutex_lock(&gLock);
  gInitialized -= 1;
  if (gInitialized == 0) {
    inspectorGlobalFinalize();
  }
  pthread_mutex_unlock(&gLock);

  return ncclSuccess;
}
// Take one reference on collInfo. Performs no locking: the caller must either
// hold collInfo->guard for writing or be the only thread that can see the object.
inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) {
  ++collInfo->refCount;
  return inspectorSuccess;
}
// Locked variant of inspectorPluginCollInfoRef: takes the write lock around
// the increment. Always reports success; lock results are not checked.
inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) {
  pthread_rwlock_t *lock = &collInfo->guard;
  inspectorLockWr(lock);
  inspectorPluginCollInfoRef(collInfo);
  inspectorUnlockRWLock(lock);
  return inspectorSuccess;
}
// Drop one reference on collInfo without taking any lock.
//
// Returns inspectorSuccess while references remain. When the last reference
// is dropped the lock is destroyed, the struct is zeroed and freed, and
// inspectorReturn is returned: the caller must not touch collInfo (including
// unlocking its guard) afterwards.
//
// NOTE(review): the callers in this file invoke this while holding
// collInfo->guard for writing, so the final release destroys a held lock —
// confirm that inspectorLockDestroy tolerates this.
inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) {
  collInfo->refCount -= 1;
  if (collInfo->refCount != 0) {
    return inspectorSuccess;
  }

  inspectorLockDestroy(&collInfo->guard);
  // Wipe the struct (including the leading type tag) before releasing it.
  memset(collInfo, 0, sizeof(*collInfo));
  free(collInfo);
  return inspectorReturn;
}
// Locked variant of inspectorPluginCollInfoDeRef.
//
// Returns whatever the unlocked variant returns. When that is inspectorReturn
// the object (and the lock inside it) has already been destroyed and freed,
// so the guard must not be unlocked — doing so would read freed memory.
inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) {
  inspectorLockWr(&collInfo->guard);
  const inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo);
  if (res != inspectorReturn) {
    inspectorUnlockRWLock(&collInfo->guard);
  }
  return res;
}
/*
* Description:
* Initializes a new inspectorCollInfo structure for a collective
* event.
*
* Thread Safety:
* Not thread-safe (allocates and initializes a new collective info
* structure).
*
* Input:
*
* struct inspectorCollInfo **collInfo - pointer to output
* collective info struct.
* ncclProfilerEventDescr_t *eDescr - event descriptor.
*
* Output:
* collInfo is set to the new collective info struct.
*
* Return:
* None.
*/
static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo,
ncclProfilerEventDescr_t *eDescr,
struct inspectorCommInfo *commInfo) {
struct inspectorCollInfo *collInfoPtr
= (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo));
if (collInfoPtr == nullptr) {
WARN("Inspector: Failed to allocate memory for collective info structure");
*collInfo = nullptr;
return;
}
collInfoPtr->type = ncclProfileColl;
collInfoPtr->refCount = 0;
inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed
collInfoPtr->func = eDescr->coll.func;
collInfoPtr->sn = eDescr->coll.seqNumber;
collInfoPtr->nChannels = eDescr->coll.nChannels;
if (collInfoPtr->nChannels > 0) {
inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion
}
collInfoPtr->tsStartUsec = inspectorGetTime();
collInfoPtr->msgSizeBytes =
ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count;
collInfoPtr->commInfo = commInfo;
collInfoPtr->collEvtTrk.sn = 0;
collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels;
inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace,
NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr);
inspectorLockInit(&collInfoPtr->guard);
*collInfo = collInfoPtr;
}
/*
* Description:
*
* Initializes a new inspectorKernelChInfo structure for a kernel
* channel event.
*
* Thread Safety:
* Not thread-safe (initializes kernel channel info within a
* collective info structure).
*
* Input:
* struct inspectorKernelChInfo **kernelChInfo - pointer to output
* kernel channel info struct.
* ncclProfilerEventDescr_t *eDescr - event descriptor.
*
* Output:
*
* kernelChInfo is set to the new kernel channel info struct.
*
* Return:
* None.
*/
static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo,
ncclProfilerEventDescr_t *eDescr) {
if (eDescr->parentObj) {
uint64_t parentType=*(uint64_t*)eDescr->parentObj;
if (parentType == ncclProfileColl) {
struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj;
if (collInfo && collInfo->type == ncclProfileColl) {
inspectorLockWr(&collInfo->guard);
struct inspectorEventTraceInfo *krnlEvtTrk =
collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace;
inspectorRecordEventTrace(krnlEvtTrk,
NCCL_INSP_EVT_TRK_KERNEL_START,
collInfo);
struct inspectorKernelChInfo *kernelChInfoPtr
= &collInfo->kernelCh[eDescr->kernelCh.channelId];
kernelChInfoPtr->type = ncclProfileKernelCh;
kernelChInfoPtr->channelId = eDescr->kernelCh.channelId;
kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer;
if (kernelChInfoPtr->stopGpuClk == 0) {
inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event
}
kernelChInfoPtr->tsStartUsec = inspectorGetTime();
if (collInfo->nKernelChStarted == 0) {
collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec;
}
collInfo->nKernelChStarted += 1;
inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event
kernelChInfoPtr->collInfo = collInfo;
*kernelChInfo = kernelChInfoPtr;
inspectorUnlockRWLock(&collInfo->guard);
}
}
}
}
/*
* Description:
*
* Starts a profiling event for the NCCL Inspector plugin.
*
* Thread Safety:
* Thread-safe (allocates and initializes event structures).
*
* Input:
* void* context - plugin context.
* void** eHandle - pointer to event handle output.
* ncclProfilerEventDescr_t* eDescr - event descriptor.
*
* Output:
* eHandle is set to the new event structure.
*
* Return:
* ncclResult_t - success or error code.
*
*/
__hidden ncclResult_t inspectorPluginStartEvent(void* context,
                                                void** eHandle,
                                                ncclProfilerEventDescr_t* eDescr) {
  // Without somewhere to return a handle there is nothing to do.
  if (eHandle == nullptr) {
    return ncclSuccess;
  }
  // Always give the caller a defined handle, including on the early exits
  // below; a null handle means "event not tracked".
  *eHandle = nullptr;

  if (context == nullptr || eDescr == nullptr) {
    INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__);
    return ncclSuccess;
  }

  if (eDescr->type == ncclProfileColl) {
    struct inspectorCollInfo *collEvent = nullptr;
    struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context;
    inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx);
    *eHandle = collEvent;
  } else if (eDescr->type == ncclProfileKernelCh) {
    struct inspectorKernelChInfo *kernelChEvent = nullptr;
    inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr);
    *eHandle = kernelChEvent;
  }
  // Every other event type is ignored; the handle stays null.
  return ncclSuccess;
}
/*
* Description:
*
* Stops a profiling event for the NCCL Inspector plugin.
*
* Thread Safety:
*
* Thread-safe (updates event state and performance info).
*
* Input:
*
* void *eHandle - event handle.
*
* Output:
*
* Event is stopped and performance info may be updated.
*
* Return:
* ncclResult_t - success or error code.
*
*/
__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) {
  if (eHandle == nullptr) {
    INFO(NCCL_INIT,
         "Profiler/Plugin: Event Handle NULL for stop event %s", __func__);
    return ncclSuccess;
  }

  // Both handle structs start with a uint64_t type tag.
  uint64_t type = *(uint64_t *)eHandle;
  inspectorResult_t res = inspectorSuccess;

  if (type == ncclProfileColl) {
    struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle;
    // Record collective stop event
    inspectorLockWr(&collInfo->guard);
    inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
                              NCCL_INSP_EVT_TRK_COLL_STOP,
                              collInfo);
    res = inspectorPluginCollInfoDeRef(collInfo);
    if (res == inspectorReturn) {
      // collInfo (and its lock) no longer exists; nothing to unlock.
      return ncclSuccess;
    }
    inspectorUnlockRWLock(&collInfo->guard);
    return ncclSuccess;
  }

  if (type == ncclProfileKernelCh) {
    struct inspectorKernelChInfo *kernelChInfo
      = (struct inspectorKernelChInfo *)eHandle;
    struct inspectorCollInfo *collInfo = kernelChInfo->collInfo;
    if (collInfo && collInfo->type == ncclProfileColl) {
      inspectorLockWr(&collInfo->guard);
      struct inspectorEventTraceInfo *krnlEvtTrk =
        collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace;
      inspectorRecordEventTrace(krnlEvtTrk,
                                NCCL_INSP_EVT_TRK_KERNEL_STOP,
                                collInfo);
      kernelChInfo->tsCompletedUsec = inspectorGetTime();
      collInfo->nKernelChCompleted += 1;

      // Drop the reference taken at kernel-channel start.
      res = inspectorPluginCollInfoDeRef(collInfo);
      if (res == inspectorReturn) {
        WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh");
        return ncclSuccess;
      }

      if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted)
          && (collInfo->nKernelChCompleted == collInfo->nChannels)) {
        // Last channel finished: snapshot the results while collInfo is still
        // valid, because the DeRef below may free it.
        struct inspectorCompletedCollInfo completedColl;
        struct inspectorCommInfo *commInfo = collInfo->commInfo;
        collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec;
        inspectorUpdateCollPerf(&completedColl, collInfo);

        // Drop the "kernel completion" reference taken at collective start.
        res = inspectorPluginCollInfoDeRef(collInfo);
        if (res != inspectorReturn) {
          inspectorUnlockRWLock(&collInfo->guard);
        }

        // Publish the snapshot on the communicator under its own lock.
        if (commInfo != nullptr) {
          inspectorLockWr(&commInfo->guard);
          inspectorComputeCollBw(commInfo,
                                 &completedColl,
                                 completedColl.func);
          memcpy(&commInfo->completedCollInfo,
                 &completedColl,
                 sizeof(struct inspectorCompletedCollInfo));
          commInfo->dump = true;
          inspectorUnlockRWLock(&commInfo->guard);
        }
        return ncclSuccess;
      }
      inspectorUnlockRWLock(&collInfo->guard);
    }
    return ncclSuccess;
  }

  return ncclSuccess;
}
/*
* Description:
*
* Records the state of a profiling event for the NCCL Inspector
* plugin.
*
* Thread Safety:
*
* Thread-safe (updates event state as needed).
*
* Input:
* void* eHandle - event handle.
* ncclProfilerEventState_t eState - event state.
* ncclProfilerEventStateArgs_t* eStateArgs - event state arguments.
*
* Output:
* Event state is updated as needed.
*
* Return:
* ncclResult_t - success or error code.
*
*/
__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle,
                                                      ncclProfilerEventState_t eState,
                                                      ncclProfilerEventStateArgs_t* eStateArgs) {
  if (eHandle == nullptr || eStateArgs == nullptr) {
    return ncclSuccess;
  }

  // Only the "kernel channel stopped" state of kernel-channel handles matters here.
  const uint64_t handleType = *(uint64_t *)eHandle;
  if (handleType != ncclProfileKernelCh || eState != ncclProfilerKernelChStop) {
    return ncclSuccess;
  }

  struct inspectorKernelChInfo *chInfo = (struct inspectorKernelChInfo *)eHandle;
  struct inspectorCollInfo *parent = chInfo->collInfo;
  if (parent == nullptr || parent->type != ncclProfileColl) {
    return ncclSuccess;
  }

  inspectorLockWr(&parent->guard);
  inspectorRecordEventTrace(parent->collEvtTrk.kernelCh[chInfo->channelId].evntTrace,
                            NCCL_INSP_EVT_TRK_KERNEL_RECORD,
                            parent);
  chInfo->stopGpuClk = eStateArgs->kernelCh.pTimer;

  // Release the reference that the kernel-channel start took for this record
  // (only taken when a start clock was seen).
  if (chInfo->startGpuClk != 0) {
    const inspectorResult_t res = inspectorPluginCollInfoDeRef(parent);
    if (res == inspectorReturn) {
      // parent was freed together with its lock; do not unlock.
      WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState");
      return ncclSuccess;
    }
  }
  inspectorUnlockRWLock(&parent->guard);
  return ncclSuccess;
}
// Plugin descriptor exported to NCCL. The initializer is positional, so the
// entries must stay in the member order of ncclProfiler_t (declared in profiler.h).
ncclProfiler_t ncclProfiler_v5 = {
"Inspector", // plugin name
inspectorPluginInit,
inspectorPluginStartEvent,
inspectorPluginStopEvent,
inspectorPluginRecordEventState,
inspectorPluginFinalize,
};
+496
Datei anzeigen
@@ -0,0 +1,496 @@
#include "json.h"
#include <assert.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Map a jsonResult_t to a static, human-readable name.
// Unrecognized values yield "unknown json error".
const char* jsonErrorString(jsonResult_t res) {
  const char* text = "unknown json error";
  switch (res) {
  case jsonSuccess:                   text = "jsonSuccess"; break;
  case jsonFileError:                 text = "jsonFileError"; break;
  case jsonUnknownStateError:         text = "jsonUnknownStateError"; break;
  case jsonEmptyStateError:           text = "jsonEmptyStateError"; break;
  case jsonExpectedNonNoneStateError: text = "jsonExpectedNonNoneStateError"; break;
  case jsonMemoryError:               text = "jsonMemoryError"; break;
  case jsonStringOverflowError:       text = "jsonStringOverflowError"; break;
  case jsonStringBadChar:             text = "jsonStringBadChar"; break;
  case jsonLockError:                 text = "jsonLockError"; break;
  default:                            break;
  }
  return text;
}
// State of one JSON output stream. A stack of writer states records which
// construct (object, list, key) is currently being written.
typedef struct jsonFileOutput {
jsonState_t* states; // heap-allocated state stack; NULL until the first push
size_t state_cap; // Allocated stack capacity
size_t state_n; // # of items in the stack.
FILE* fp; // destination file, opened for writing by jsonInitFileOutput
pthread_mutex_t mutex; // taken/released via jsonLockOutput / jsonUnlockOutput
} jsonFileOutput;
// Create a JSON writer that writes to 'outfile' (truncating it).
//
// On success *jfo receives the new writer, which must be released with
// jsonFinalizeFileOutput. On any failure *jfo is set to NULL and nothing is
// left allocated.
//
// Returns jsonSuccess, jsonMemoryError, jsonLockError or jsonFileError.
jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) {
  *jfo = NULL;

  jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(*new_jfo));
  if (new_jfo == NULL) {
    return jsonMemoryError;
  }

  if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) {
    free(new_jfo);
    return jsonLockError;
  }

  new_jfo->states = NULL;
  new_jfo->state_cap = 0;
  new_jfo->state_n = 0;
  new_jfo->fp = fopen(outfile, "w");
  if (new_jfo->fp == NULL) {
    // The mutex was initialized above and must be destroyed before its storage goes away.
    pthread_mutex_destroy(&new_jfo->mutex);
    free(new_jfo);
    return jsonFileError;
  }

  *jfo = new_jfo;
  return jsonSuccess;
}
// Write a single line feed to the output. Always reports success; the
// write result is not checked.
jsonResult_t jsonNewline(jsonFileOutput* jfo) {
  fputs("\n", jfo->fp);
  return jsonSuccess;
}
// Flush buffered output to the underlying file. Always reports success; the
// flush result is not checked.
jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) {
  FILE* stream = jfo->fp;
  fflush(stream);
  return jsonSuccess;
}
// Acquire the writer's mutex. Returns jsonLockError if the lock call fails.
jsonResult_t jsonLockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_lock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
// Release the writer's mutex. Returns jsonLockError if the unlock call fails.
jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) {
  const int rc = pthread_mutex_unlock(&jfo->mutex);
  return (rc == 0) ? jsonSuccess : jsonLockError;
}
// Destroy a writer created by jsonInitFileOutput: closes the file and frees
// every allocation. 'jfo' is invalid after this call regardless of the result.
//
// All resources are released even when a step fails; the first failure is
// reported (jsonLockError for the mutex, jsonFileError for the final
// flush/close of the file).
// Note: no check is made that the document is in a complete state.
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) {
  jsonResult_t res = jsonSuccess;

  if (pthread_mutex_destroy(&jfo->mutex) != 0) {
    res = jsonLockError;
  }

  free(jfo->states);
  jfo->states = NULL;
  jfo->state_cap = 0;
  jfo->state_n = 0;

  if (jfo->fp) {
    // fclose flushes buffered data, so a failure here means lost output.
    if (fclose(jfo->fp) != 0 && res == jsonSuccess) {
      res = jsonFileError;
    }
    jfo->fp = NULL;
  }

  free(jfo);
  return res;
}
// Copy one multi-byte UTF-8 sequence (2 to 4 bytes) from 'in' to 'out'.
//
// The length is taken from the lead byte and every following byte must be a
// continuation byte (10xxxxxx). Returns the number of bytes copied, or 0 when
// the lead byte does not start a multi-byte sequence, a continuation byte is
// malformed, or fewer than that many bytes fit in 'out_lim'.
static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) {
  const unsigned char lead = in[0];
  int seq_len;

  if ((lead & 0xE0) == 0xC0) {
    seq_len = 2;
  } else if ((lead & 0xF0) == 0xE0) {
    seq_len = 3;
  } else if ((lead & 0xF8) == 0xF0) {
    seq_len = 4;
  } else {
    return 0;  // not a multi-byte lead byte
  }

  // Stop at the first bad byte so nothing past a terminating NUL is read.
  for (int k = 1; k < seq_len; ++k) {
    if ((in[k] & 0xC0) != 0x80) {
      return 0;
    }
  }

  if (out_lim < seq_len) {
    return 0;
  }

  int copied = 0;
  while (copied < seq_len) {
    out[copied] = in[copied];
    ++copied;
  }
  return copied;
}
// Sanitize/escape the NUL-terminated string 'in' into 'out', which has room
// for 'lim' bytes including the terminating NUL.
//
// '"', '\\' and '/' are prefixed with a backslash, tab and newline become
// "\t" and "\n", other ASCII control characters are rejected, printable ASCII
// is copied as is, and multi-byte UTF-8 sequences are copied after a
// structural check (see utf8copy).
//
// Returns jsonSuccess when all of 'in' was copied, jsonStringOverflowError
// when 'out' is too small, and jsonStringBadChar for a rejected byte. For
// lim > 0, 'out' is NUL-terminated in every case and holds the part that
// was accepted.
static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) {
  if (lim <= 0) {
    // No room even for the terminator; nothing may be written.
    return jsonStringOverflowError;
  }

  int c = 0;
  while (*in) {
    // At least one output byte plus the terminator must still fit.
    if (c + 1 >= lim) {
      out[c] = 0;
      return jsonStringOverflowError;
    }
    switch (*in) {
    case '"':
    case '\\':
    case '/':
    case '\t':
    case '\n':
      // An escape needs two bytes plus the terminator: indices c, c+1 and
      // c+2 must all be inside the buffer.
      if (c + 2 >= lim) {
        out[c] = 0;
        return jsonStringOverflowError;
      }
      out[c++] = '\\';
      if (*in == '\n') {
        out[c++] = 'n';
      } else if (*in == '\t') {
        out[c++] = 't';
      } else {
        out[c++] = *in;
      }
      ++in;
      break;
    default:
      if (*in <= 0x1F) {
        out[c] = 0;
        return jsonStringBadChar;
      } else if (*in <= 0x7F) {
        out[c++] = *in;
        ++in;
      } else {
        // Reserve one byte for the terminator when copying the sequence.
        const int utf8len = utf8copy(out + c, lim - c - 1, in);
        if (utf8len == 0) {
          out[c] = 0;
          return jsonStringBadChar;
        }
        c += utf8len;
        in += utf8len;
      }
      break;
    }
  }
  out[c] = 0;
  return jsonSuccess;
}
// Larger of two sizes; returns 'a' when both are equal.
static size_t max(size_t a, size_t b) {
  return (a < b) ? b : a;
}
// Push 'state' onto the state stack, growing the storage when needed.
// JSON_NONE is a pseudo-state (it means "stack empty") and cannot be pushed.
//
// Returns jsonExpectedNonNoneStateError for JSON_NONE and jsonMemoryError
// when the stack cannot grow; in the latter case the existing stack is left
// intact and still owned by 'jfo'.
static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) {
  if (state == JSON_NONE) {
    return jsonExpectedNonNoneStateError;
  }
  if (jfo->state_cap <= (jfo->state_n + 1)) {
    const size_t new_cap = max((size_t)16, jfo->state_cap * 2);
    // Keep the old pointer until realloc succeeds: on failure realloc leaves
    // the original block allocated, and overwriting the pointer would leak it.
    jsonState_t* grown = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * new_cap);
    if (grown == NULL) {
      return jsonMemoryError;
    }
    jfo->states = grown;
    jfo->state_cap = new_cap;
  }
  jfo->states[jfo->state_n++] = state;
  return jsonSuccess;
}
// Peek at the top of the state stack; an empty stack reads as JSON_NONE.
static jsonState_t jsonCurrState(const jsonFileOutput* jfo) {
  const size_t depth = jfo->state_n;
  return (depth == 0) ? JSON_NONE : jfo->states[depth - 1];
}
// Overwrite the top of the state stack with 'state' (a pop followed by a push).
// Fails with jsonExpectedNonNoneStateError for JSON_NONE and with
// jsonEmptyStateError when there is no top entry to replace.
static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) {
  if (state == JSON_NONE) {
    return jsonExpectedNonNoneStateError;
  }
  const size_t depth = jfo->state_n;
  if (depth == 0) {
    return jsonEmptyStateError;
  }
  jfo->states[depth - 1] = state;
  return jsonSuccess;
}
// Remove and return the top of the state stack; an empty stack yields
// JSON_NONE and is left unchanged.
static jsonState_t jsonPopState(jsonFileOutput* jfo) {
  if (jfo->state_n == 0) {
    return JSON_NONE;
  }
  jfo->state_n -= 1;
  return jfo->states[jfo->state_n];
}
// Emit an object key followed by ':'. The key is sanitized first.
// Only valid while the top state is an object; a ',' separator is emitted
// when this is not the object's first key.
//
// Nothing is written and the writer state is left untouched when the call
// fails because of a wrong state (jsonUnknownStateError) or a key that
// cannot be sanitized. A failure to record the key state is reported too.
jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) {
  const jsonState_t state = jsonCurrState(jfo);
  if (state != JSON_OBJECT_EMPTY && state != JSON_OBJECT_SOME) {
    return jsonUnknownStateError;
  }

  // Validate the key before committing anything to the output, so that a
  // rejected key cannot leave a dangling separator behind.
  unsigned char tmp[2048];
  const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name);
  if (res != jsonSuccess) {
    return res;
  }

  if (state == JSON_OBJECT_EMPTY) {
    jsonReplaceState(jfo, JSON_OBJECT_SOME);
  } else {
    fprintf(jfo->fp, ",");
  }
  fprintf(jfo->fp, "\"%s\":", tmp);
  return jsonPushState(jfo, JSON_KEY);
}
// Prepare the writer for a value.
// Values are only acceptable after a key, at top level, or inside a list.
// Inside a list a preceding ',' is emitted for every item but the first;
// after a key the key state is consumed.
static jsonResult_t jsonValHelper(jsonFileOutput* jfo) {
  const jsonState_t top = jsonCurrState(jfo);

  if (top == JSON_LIST_EMPTY) {
    jsonReplaceState(jfo, JSON_LIST_SOME);
    return jsonSuccess;
  }
  if (top == JSON_LIST_SOME) {
    fprintf(jfo->fp, ",");
    return jsonSuccess;
  }
  if (top == JSON_KEY) {
    jsonPopState(jfo);
    return jsonSuccess;
  }
  if (top == JSON_NONE) {
    return jsonSuccess;
  }
  return jsonUnknownStateError;
}
// Open an object: emits '{' as a value and pushes the "empty object" state.
jsonResult_t jsonStartObject(jsonFileOutput* jfo) {
  const jsonResult_t status = jsonValHelper(jfo);
  if (status == jsonSuccess) {
    fprintf(jfo->fp, "{");
    return jsonPushState(jfo, JSON_OBJECT_EMPTY);
  }
  return status;
}
// Close the innermost object by writing '}' and popping its state.
// Returns jsonUnknownStateError when the innermost state is not an object.
//
// The state is inspected before it is popped so that a mismatched call
// (e.g. while inside a list) does not destroy the enclosing state.
jsonResult_t jsonFinishObject(jsonFileOutput* jfo) {
  const jsonState_t top = jsonCurrState(jfo);
  if (top != JSON_OBJECT_EMPTY && top != JSON_OBJECT_SOME) {
    return jsonUnknownStateError;
  }
  jsonPopState(jfo);
  fprintf(jfo->fp, "}");
  return jsonSuccess;
}
// Open a list: run the value prologue, write '[', and push JSON_LIST_EMPTY.
// Returns the prologue error or the push result.
jsonResult_t jsonStartList(jsonFileOutput* jfo) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fputc('[', jfo->fp);
    return jsonPushState(jfo, JSON_LIST_EMPTY);
  }
  return prep;
}
// Close the innermost list by writing ']' and popping its state.
// Returns jsonUnknownStateError when the innermost state is not a list.
//
// The state is inspected before it is popped so that a mismatched call
// (e.g. while inside an object) does not destroy the enclosing state.
jsonResult_t jsonFinishList(jsonFileOutput* jfo) {
  const jsonState_t top = jsonCurrState(jfo);
  if (top != JSON_LIST_EMPTY && top != JSON_LIST_SOME) {
    return jsonUnknownStateError;
  }
  jsonPopState(jfo);
  fprintf(jfo->fp, "]");
  return jsonSuccess;
}
// Emit the JSON literal null as a value.
jsonResult_t jsonNull(jsonFileOutput* jfo) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fputs("null", jfo->fp);
  }
  return prep;
}
// Write a (sanitized) string value. A NULL `str` is written as JSON null.
//
// Returns the sanitizer's error when the string cannot be sanitized, or the
// value-prologue error when a value is not allowed in the current state.
//
// The string is sanitized BEFORE the value prologue runs so that a sanitize
// failure does not leave a dangling ',' in the file or a consumed key on the
// state stack.
jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) {
  if (str == NULL) {
    // Propagate the result instead of unconditionally reporting success.
    return jsonNull(jfo);
  }

  unsigned char tmp[2048];
  const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str);
  if (san_res != jsonSuccess) {
    return san_res;
  }

  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }

  fprintf(jfo->fp, "\"%s\"", tmp);
  return jsonSuccess;
}
// Write a bool as the quoted strings "true" or "false" (not as JSON boolean
// literals); the work is delegated to jsonStr.
jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) {
  const char* text = "false";
  if (val) {
    text = "true";
  }
  return jsonStr(jfo, text);
}
// Write a signed int value in decimal.
jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fprintf(jfo->fp, "%d", val);
  }
  return prep;
}
// Write an unsigned 32-bit integer value in decimal.
jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fprintf(jfo->fp, "%u", val);
  }
  return prep;
}
// Write an unsigned 64-bit integer value in decimal.
// Returns the value-prologue error when a value is not allowed here.
jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) {
  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }
  // "%lu" only matches uint64_t where it is `unsigned long`. Converting to
  // unsigned long long (at least 64 bits) and using "%llu" is correct on
  // every platform and needs no extra header.
  fprintf(jfo->fp, "%llu", (unsigned long long)val);
  return jsonSuccess;
}
// Write a size_t value in decimal.
jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) {
  const jsonResult_t prep = jsonValHelper(jfo);
  if (prep == jsonSuccess) {
    fprintf(jfo->fp, "%zu", val);
  }
  return prep;
}
// Write a double value.
// Finite values are printed with "%lf". JSON has no representation for
// non-finite numbers, so NaN is written as the string "nan" and, following
// the same convention, +/-infinity as the strings "inf" / "-inf" (a bare
// `inf` token would make the document unparseable).
// Returns the value-prologue error when a value is not allowed here.
jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) {
  const jsonResult_t res = jsonValHelper(jfo);
  if (res != jsonSuccess) {
    return res;
  }
  if (val != val) {
    // Only NaN compares unequal to itself.
    fprintf(jfo->fp, "\"nan\"");
  } else if ((val - val) != 0.0) {
    // For a non-NaN value, val - val is 0 unless val is infinite
    // (inf - inf is NaN).
    fprintf(jfo->fp, (val > 0.0) ? "\"inf\"" : "\"-inf\"");
  } else {
    fprintf(jfo->fp, "%lf", val);
  }
  return jsonSuccess;
}
#ifdef DO_JSON_TEST
// compile with
// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test
// run with:
// ./json_test
// if something fails, it will print out the error
// if it all works, print out "output matches reference"
// Test-only helper: evaluate `expr` once; if it does not yield jsonSuccess,
// print the matching error string to stderr and terminate with exit code 1.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define JSONCHECK(expr) \
do { \
const jsonResult_t res = (expr); \
if (res != jsonSuccess) { \
fprintf(stderr, "jsonError: %s\n", jsonErrorString(res)); \
exit(1); \
} \
} while (0)
// Self-test entry point (only built with -DDO_JSON_TEST).
// Writes a small document to test.json through the public API, reads the
// file back and compares it with the expected serialization.
// Returns 0 when the output matches the reference, 1 otherwise.
int main() {
  // Expected serialization. The UTF-8 payload must be identical to the string
  // passed to jsonStr() below, including the logical-or sign in "¬(¬α ∨ β)".
  const char refstr[] =
      "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ "
      "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}";

  jsonFileOutput* jfo;
  JSONCHECK(jsonInitFileOutput(&jfo, "test.json"));
  JSONCHECK(jsonStartObject(jfo));
  JSONCHECK(jsonKey(jfo, "number"));
  JSONCHECK(jsonInt(jfo, 123));
  JSONCHECK(jsonKey(jfo, "utfstring"));
  JSONCHECK(
      jsonStr(jfo, "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),"));
  JSONCHECK(jsonKey(jfo, "list"));
  JSONCHECK(jsonStartList(jfo));
  JSONCHECK(jsonBool(jfo, true));
  JSONCHECK(jsonNull(jfo));
  JSONCHECK(jsonUint64(jfo, 9423812381231ULL));
  JSONCHECK(jsonSize_t(jfo, 3123111));
  JSONCHECK(jsonDouble(jfo, 0.69423413));
  JSONCHECK(jsonFinishList(jfo));
  JSONCHECK(jsonFinishObject(jfo));
  JSONCHECK(jsonFinalizeFileOutput(jfo));

  FILE* fp = fopen("test.json", "r");
  if (fp == NULL) {
    fprintf(stderr, "failed to open test.json for reading\n");
    return 1;
  }

  // Length of the reference WITHOUT its terminating NUL: the file on disk
  // does not contain one, so it must not take part in the comparison.
  const size_t reflen = sizeof(refstr) - 1;
  // Room for the reference, two surplus bytes (so a longer file is detected)
  // and a terminating NUL for printing.
  char buffer[sizeof(refstr) + 2];
  size_t nread = fread(buffer, sizeof(char), sizeof(buffer) - 1, fp);
  fclose(fp);

  // Tolerate a trailing line break in the file.
  while (nread > 0 && (buffer[nread - 1] == '\n' || buffer[nread - 1] == '\r')) {
    nread--;
  }
  buffer[nread] = '\0';

  if (nread == reflen && memcmp(buffer, refstr, reflen) == 0) {
    printf("output matches reference\n");
    return 0;
  }
  printf("output %s\nreference %s\n", buffer, refstr);
  return 1;
}
#endif
+83
Datei anzeigen
@@ -0,0 +1,83 @@
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
// States kept on the writer's state stack. The innermost (top) state decides
// which operations are legal next and whether a ',' separator is needed.
typedef enum {
JSON_NONE, // A pseudo-state meaning that the document is empty
// A key has been written and its value is expected next.
JSON_KEY,
// Inside an object that has no members yet.
JSON_OBJECT_EMPTY,
// Inside an object that already has at least one member.
JSON_OBJECT_SOME,
// Inside a list that has no items yet.
JSON_LIST_EMPTY,
// Inside a list that already has at least one item.
JSON_LIST_SOME,
} jsonState_t;
// Result codes returned by the JSON writer API; jsonErrorString() maps a code
// to a printable message.
typedef enum {
jsonSuccess,
jsonFileError,
// The requested operation is not allowed in the writer's current state.
jsonUnknownStateError,
// A state replacement was requested while the state stack was empty.
jsonEmptyStateError,
// JSON_NONE was supplied where a real state is required.
jsonExpectedNonNoneStateError,
// NOTE(review): presumably reported by the string sanitizer when the output
// buffer is too small / the input contains an unsupported byte - confirm.
jsonStringOverflowError,
jsonStringBadChar,
jsonMemoryError,
jsonLockError,
} jsonResult_t;
const char *jsonErrorString(jsonResult_t res);
typedef struct jsonFileOutput jsonFileOutput;
jsonResult_t jsonLockOutput(jsonFileOutput *jfo);
jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo);
jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo,
const char *outfile);
jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo);
jsonResult_t jsonNewline(jsonFileOutput *jfo);
jsonResult_t jsonFlushOutput(jsonFileOutput *jfo);
// Emit a key and separator. Sanitize the key.
// This is only acceptable if the top state is an object.
// Emit a ',' separator if we aren't the first item.
jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name);
// Start an object
jsonResult_t jsonStartObject(jsonFileOutput *jfo);
// Close an object
jsonResult_t jsonFinishObject(jsonFileOutput *jfo);
// Start a list
jsonResult_t jsonStartList(jsonFileOutput *jfo);
// Close a list
jsonResult_t jsonFinishList(jsonFileOutput *jfo);
// Emit a null value
jsonResult_t jsonNull(jsonFileOutput *jfo);
// Write a (sanitized) string
jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str);
// Write a bool as "true" or "false" strings.
jsonResult_t jsonBool(jsonFileOutput *jfo, bool val);
// Write an integer value
jsonResult_t jsonInt(jsonFileOutput *jfo, const int val);
//Write an unsigned int value
jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val);
// Write an unsigned 64-bit integer value
jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val);
// Write a size_t value
jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val);
// Write a double value
jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val);
+73
Datei anzeigen
@@ -0,0 +1,73 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */
/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */
/* Data types */
// Element type identifiers. Several names are aliases sharing one value
// (e.g. ncclInt8/ncclChar, ncclFloat16/ncclHalf). ncclNumTypes is the number
// of distinct values and is not itself a data type.
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclBfloat16 = 9,
ncclFloat8e4m3 = 10,
ncclFloat8e5m2 = 11,
ncclNumTypes = 12
} ncclDataType_t;
// Log levels accepted by the logger callback (see ncclDebugLogger_t).
typedef enum {
NCCL_LOG_NONE = 0,
NCCL_LOG_VERSION = 1,
NCCL_LOG_WARN = 2,
NCCL_LOG_INFO = 3,
NCCL_LOG_ABORT = 4,
NCCL_LOG_TRACE = 5
} ncclDebugLogLevel;
// Result codes returned by the plugin callbacks. ncclSuccess (0) is the only
// non-error value listed; ncclNumResults is the number of codes, not a code.
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6,
ncclInProgress = 7,
ncclNumResults = 8 } ncclResult_t;
// Logging subsystem flags. Every entry except NCCL_ALL is a distinct single
// bit, so values can be OR-ed into a mask; NCCL_ALL (~0) has every bit set.
// Bits 0x10000-0x80000 are currently unassigned.
typedef enum {
NCCL_INIT = 0x1,
NCCL_COLL = 0x2,
NCCL_P2P = 0x4,
NCCL_SHM = 0x8,
NCCL_NET = 0x10,
NCCL_GRAPH = 0x20,
NCCL_TUNING = 0x40,
NCCL_ENV = 0x80,
NCCL_ALLOC = 0x100,
NCCL_CALL = 0x200,
NCCL_PROXY = 0x400,
NCCL_NVLS = 0x800,
NCCL_BOOTSTRAP = 0x1000,
NCCL_REG = 0x2000,
NCCL_PROFILE = 0x4000,
NCCL_RAS = 0x8000,
NCCL_INSPECTOR = 0x100000, // big number to avoid short-term conflicts
NCCL_ALL = ~0
} ncclDebugLogSubSys;
// printf-style logger callback type: log level, flags, source file and line
// of the call site, then a format string with variadic arguments.
// NOTE(review): `flags` is presumably a mask of ncclDebugLogSubSys bits - confirm.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
+85
Datei anzeigen
@@ -0,0 +1,85 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_H_
#define PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
// Profiler event type flags. Each value is a distinct single bit so that
// several types can be combined into one mask.
// Note that the flags from ncclProfileGroupApi (1 << 8) upward do not fit
// into an 8-bit field.
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileKernelCh = (1 << 6), // kernel channel event type
ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events
ncclProfileGroupApi = (1 << 8), // Group API events
ncclProfileCollApi = (1 << 9), // Collective API events
ncclProfileP2pApi = (1 << 10), // Point-to-Point API events
ncclProfileKernelLaunch = (1 << 11), // Kernel launch events
};
// Event state identifiers passed to recordEventState.
// Every enumerator has an explicit value and the declaration order is NOT
// numeric order (19 and 20 are declared among the lower values), so never
// rely on position in this list - only on the assigned numbers.
typedef enum {
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
ncclProfilerProxyOpInProgress_v4 = 19,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait = 8,
ncclProfilerProxyStepSendPeerWait_v4 = 20,
ncclProfilerProxyStepSendWait = 9,
ncclProfilerProxyStepRecvWait = 10,
ncclProfilerProxyStepRecvFlushWait = 11,
ncclProfilerProxyStepRecvGPUWait = 12,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle = 13,
ncclProfilerProxyCtrlActive = 14,
ncclProfilerProxyCtrlSleep = 15,
ncclProfilerProxyCtrlWakeup = 16,
ncclProfilerProxyCtrlAppend = 17,
ncclProfilerProxyCtrlAppendEnd = 18,
/* Network defined events states */
ncclProfilerNetPluginUpdate = 21,
/* Kernel event states */
ncclProfilerKernelChStop = 22,
/* Group API States */
ncclProfilerEndGroupApiStart = 23,
ncclProfilerBeginGroupApiEnd = 24
} ncclProfilerEventState_t;
// All interface versions share the same state enumeration; the versioned
// names exist so each profiler_vN.h can refer to its own type name.
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t;
#include "profiler_v5.h"
#include "profiler_v4.h"
#include "profiler_v3.h"
#include "profiler_v2.h"
#include "profiler_v1.h"
#include "profiler_net.h"
// Unversioned names refer to the newest interface version (v5).
typedef ncclProfiler_v5_t ncclProfiler_t;
typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t;
#endif // end include guard
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_NET_H_
#define PROFILER_NET_H_
// A network profiler id is split at bit NCCL_PROFILER_NET_VER_BITS:
// VER_MASK selects the bits below the split, TYPE_MASK the bits at and above
// it (0x0000FFFF and 0xFFFF0000 when unsigned int is 32 bits wide).
#define NCCL_PROFILER_NET_VER_BITS (16)
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)
// Network plugin type identifiers, stored in the bits covered by
// NCCL_PROFILER_NET_TYPE_MASK (value shifted left by NCCL_PROFILER_NET_VER_BITS).
typedef enum {
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS),
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
#endif
+112
Datei anzeigen
@@ -0,0 +1,112 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V1_H_
#define PROFILER_V1_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor handed to startEvent in version 1 of the profiler plugin
// interface. The anonymous union carries the per-event payload.
// NOTE(review): which union member is valid is presumably selected by
// `type` - confirm against the caller.
// In this version func/datatype/algo/proto are numeric codes (uint8_t).
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// payload for collective events
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
// payload for point-to-point events
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
// payload for proxy operation events
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// payload for proxy step events
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
// Optional arguments passed to recordEventState (v1 interface); one member
// per event kind that reports attribute updates.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 of the profiler plugin interface: a plugin name plus the table of
// callbacks a plugin provides. Every callback returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
#endif
+108
Datei anzeigen
@@ -0,0 +1,108 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V2_H_
#define PROFILER_V2_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor handed to startEvent in version 2 of the profiler plugin
// interface. The anonymous union carries the per-event payload.
// NOTE(review): which union member is valid is presumably selected by
// `type` - confirm against the caller.
// In this version func/datatype/algo/proto are passed as strings.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// payload for collective events
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
// payload for point-to-point events
struct {
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
} p2p;
// payload for proxy operation events
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// payload for proxy step events
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v2_t;
// Optional arguments passed to recordEventState (v2 interface); one member
// per event kind that reports attribute updates.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v2_t;
// Version 2 of the profiler plugin interface: a plugin name plus the table of
// callbacks a plugin provides. Every callback returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v2_t;
#endif
+116
Datei anzeigen
@@ -0,0 +1,116 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V3_H_
#define PROFILER_V3_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor handed to startEvent in version 3 of the profiler plugin
// interface. The anonymous union carries the per-event payload, including
// kernel channel and network plugin events.
// NOTE(review): which union member is valid is presumably selected by
// `type` - confirm against the caller.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// payload for collective events
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nMaxChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
// payload for point-to-point events
struct {
const char* name;
uint64_t commHash;
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
} p2p;
// payload for proxy operation events
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// payload for proxy step events
struct {
int step;
} proxyStep;
// payload for kernel channel events
struct {
uint8_t channelId;
} kernelCh;
// payload for network plugin events
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v3_t;
// Optional arguments passed to recordEventState (v3 interface); one member
// per event kind that reports attribute updates.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v3_t;
// Version 3 of the profiler plugin interface: a plugin name plus the table of
// callbacks a plugin provides. Every callback returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v3_t;
#endif
+127
Datei anzeigen
@@ -0,0 +1,127 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V4_H_
#define PROFILER_V4_H_
#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>
// Event descriptor handed to startEvent in version 4 of the profiler plugin
// interface. The anonymous union carries the per-event payload.
// NOTE(review): which union member is valid is presumably selected by
// `type` - confirm against the caller.
// The coll/p2p payloads carry no communicator name or hash in this version;
// the v4 init callback receives commName and commHash instead.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// payload for collective events
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
} coll;
// payload for point-to-point events
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
} p2p;
// payload for proxy operation events
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// payload for proxy step events
struct {
int step;
} proxyStep;
// payload for kernel channel events
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
// payload for network plugin events
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v4_t;
// Optional arguments passed to recordEventState (v4 interface); one member
// per event kind that reports attribute updates.
typedef union {
struct {
size_t transSize;
} proxyStep;
struct {
int appendedProxyOps;
} proxyCtrl;
struct {
void* data;
} netPlugin;
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v4_t;
// Version 4 of the profiler plugin interface: a plugin name plus the table of
// callbacks a plugin provides. Every callback returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commName : user assigned communicator name
// - commHash : communicator id
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v4_t;
#endif
+151
Datei anzeigen
@@ -0,0 +1,151 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PROFILER_V5_H_
#define PROFILER_V5_H_
// Event descriptor handed to startEvent in version 5 of the profiler plugin
// interface. The anonymous union carries the per-event payload, including the
// API-level events (groupApi, collApi, p2pApi, kernelLaunch).
// NOTE(review): which union member is valid is presumably selected by
// `type` - confirm against the caller.
// NOTE(review): this header uses bool, uint64_t, size_t and pid_t but includes
// nothing itself, so it relies on the including file having pulled in
// <stdbool.h>, <stdint.h>, <stddef.h> and <sys/types.h> first - confirm.
typedef struct {
uint64_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// payload for group API events
struct {
bool graphCaptured;
int groupDepth;
} groupApi;
// payload for collective API events
struct {
const char* func;
size_t count;
const char* datatype;
int root;
void* stream;
bool graphCaptured;
} collApi;
// payload for point-to-point API events
struct {
const char* func;
size_t count;
const char* datatype;
void* stream;
bool graphCaptured;
} p2pApi;
// payload for kernel launch events
struct {
void* stream;
} kernelLaunch;
// payload for collective events
struct {
uint64_t seqNumber;
const char* func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
const char* datatype;
uint8_t nChannels;
uint8_t nWarps;
const char* algo;
const char* proto;
void* parentGroup; // for backward compatibility with v4
} coll;
// payload for point-to-point events
struct {
const char* func;
void* buff;
const char* datatype;
size_t count;
int peer;
uint8_t nChannels;
void* parentGroup; // for backward compatibility with v4
} p2p;
// payload for proxy operation events
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// payload for proxy step events
struct {
int step;
} proxyStep;
// payload for kernel channel events
struct {
uint8_t channelId;
uint64_t pTimer; // start timestamp from GPU globaltimer
} kernelCh;
// payload for network plugin events
struct {
int64_t id;
void* data;
} netPlugin;
};
} ncclProfilerEventDescr_v5_t;
// Optional arguments passed to recordEventState (v5 interface); one member
// per event kind that reports attribute updates.
typedef union {
struct {
size_t transSize;
} proxyStep;
struct {
int appendedProxyOps;
} proxyCtrl;
struct {
void* data;
} netPlugin;
struct {
uint64_t pTimer;
} kernelCh;
} ncclProfilerEventStateArgs_v5_t;
// Version 5 of the profiler plugin interface: a plugin name plus the table of
// callbacks a plugin provides. Every callback returns an ncclResult_t.
// Note that init takes commId as its second argument, ahead of eActivationMask.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// - commId : communicator id
// - commName : user assigned communicator name
// - nNodes : number of nodes in communicator
// - nranks : number of ranks in communicator
// - rank : rank identifier in communicator
// - logfn : logger function
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v5_t;
#endif
+21
Datei anzeigen
@@ -0,0 +1,21 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types */
// Element type identifiers. Several names are aliases sharing one value
// (e.g. ncclInt8/ncclChar, ncclFloat16/ncclHalf).
// NOTE(review): this list stops at ncclBfloat16 and has no float8 entries or
// ncclNumTypes; including it together with another header that also defines
// ncclDataType_t would be a redefinition - confirm it is never combined.
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclBfloat16 = 9,
} ncclDataType_t;
#endif
+12
Datei anzeigen
@@ -0,0 +1,12 @@
#ifndef VERSION_H
#define VERSION_H
#ifdef __cplusplus
extern "C" {
#endif
/* Returns a string describing the git version of the build.
 * Declared with (void): in C an empty parameter list is not a prototype and
 * disables argument checking; (void) means "no parameters" in both C and C++. */
const char* get_git_version_info(void);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif // VERSION_H
+60 -54
Datei anzeigen
@@ -1,8 +1,8 @@
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 7af56a6c..5c3e3d46 100644
index 9bfd8dcf..4d3f0a08 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -28,6 +28,7 @@
@@ -29,6 +29,7 @@
#include "ibvwrap.h"
#include "mlx5/mlx5dvwrap.h"
@@ -10,9 +10,9 @@ index 7af56a6c..5c3e3d46 100644
#include "graph/xml.h"
#define MAXSUFFIXSIZE 16
@@ -107,9 +108,31 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
@@ -110,16 +111,38 @@ struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS];
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
static std::mutex ncclIbMutex;
static int ncclIbRelaxedOrderingEnabled = 0;
+static bool rcclAinicRoce = 0;
+static bool rcclCtsInlineData = 0;
@@ -35,6 +35,13 @@ index 7af56a6c..5c3e3d46 100644
+static ncclChannelToUd nccl_channel_ud_map[MAXCHANNELS][ncclIbChannelTypeMax];
+static bool nccl_channel_last_ud[MAX_IB_DEVS][ncclIbChannelTypeMax];
// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator
// rather than once for all communicators. However, the internal plugin implementation
// still assumes the plugin is initialized only once across all communicators. The ref
// counter makes sure the plugin internally initializes only once. When per communicator
// context support is added to the plugin the ref counter can be removed.
static int netRefCount;
#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED"))
+#define NCCL_CTS_QP_SLOT_INVALID 0xFF
@@ -42,7 +49,7 @@ index 7af56a6c..5c3e3d46 100644
#define NCCL_IB_SL_DEFAULT 0
#define NCCL_IB_TC_DEFAULT 0
@@ -131,6 +154,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
@@ -141,6 +164,13 @@ NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
RCCL_PARAM(IbQpsPerP2p, "IB_QPS_PER_P2P", 0);
@@ -56,7 +63,7 @@ index 7af56a6c..5c3e3d46 100644
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
__atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
@@ -630,6 +660,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
@@ -779,6 +809,10 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
@@ -67,7 +74,7 @@ index 7af56a6c..5c3e3d46 100644
// Detect IB cards
int nIbDevs = 0;
@@ -783,6 +817,24 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
@@ -944,6 +978,23 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
@@ -88,11 +95,10 @@ index 7af56a6c..5c3e3d46 100644
+ "IB Use Inline: enabled; GDR Flush: disabled", rcclCtsInlineData ? "Enabled": "Disabled",
+ rcclCtsOffloadEnabled ? "Enabled": "Disabled");
+ }
+
pthread_mutex_unlock(&ncclIbLock);
}
exit:
@@ -1112,6 +1164,8 @@ struct ncclIbListenComm {
ibContext.trafficClass = config->trafficClass;
@@ -1271,6 +1322,8 @@ struct ncclIbListenComm {
struct ncclIbCommStage stage;
};
@@ -101,7 +107,7 @@ index 7af56a6c..5c3e3d46 100644
struct alignas(64) ncclIbSendFifo {
uint64_t addr;
uint64_t size;
@@ -1122,10 +1176,21 @@ struct alignas(64) ncclIbSendFifo {
@@ -1281,10 +1334,21 @@ struct alignas(64) ncclIbSendFifo {
char padding[16];
};
@@ -123,7 +129,7 @@ index 7af56a6c..5c3e3d46 100644
};
struct ncclIbRemSizesFifo {
@@ -1172,6 +1237,7 @@ struct ncclIbSendComm {
@@ -1331,6 +1395,7 @@ struct ncclIbSendComm {
struct ncclIbNetCommBase base;
// Start with fifo and ibv structs as they have alignment restrictions
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
@@ -131,7 +137,7 @@ index 7af56a6c..5c3e3d46 100644
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
// Each dev correlates to a mergedIbDev
@@ -1187,6 +1253,7 @@ struct ncclIbSendComm {
@@ -1346,6 +1411,7 @@ struct ncclIbSendComm {
static_assert((sizeof(struct ncclIbNetCommBase) % 32) == 0, "ncclIbNetCommBase size must be 32-byte multiple to ensure fifo is at proper offset");
static_assert((offsetof(struct ncclIbSendComm, fifo) % 32) == 0, "ncclIbSendComm fifo must be 32-byte aligned");
static_assert((sizeof(struct ncclIbSendFifo) % 32) == 0, "ncclIbSendFifo element size must be 32-byte multiples");
@@ -139,7 +145,7 @@ index 7af56a6c..5c3e3d46 100644
static_assert((offsetof(struct ncclIbSendComm, sges) % 32) == 0, "sges must be 32-byte aligned");
static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-byte aligned");
@@ -1201,6 +1268,7 @@ struct ncclIbGpuFlush {
@@ -1360,6 +1426,7 @@ struct ncclIbGpuFlush {
struct ncclIbRemFifo {
struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
@@ -147,8 +153,8 @@ index 7af56a6c..5c3e3d46 100644
uint64_t fifoTail;
uint64_t addr;
uint32_t flags;
@@ -1265,20 +1333,59 @@ returning:
return res;
@@ -1415,20 +1482,59 @@ ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) {
return ncclSuccess;
}
-ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
@@ -209,7 +215,7 @@ index 7af56a6c..5c3e3d46 100644
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_INIT;
@@ -1288,6 +1395,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
@@ -1438,6 +1544,9 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base,
NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p",
ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd);
@@ -219,16 +225,16 @@ index 7af56a6c..5c3e3d46 100644
return ncclSuccess;
}
@@ -1371,7 +1481,7 @@ fail:
@@ -1521,7 +1630,7 @@ fail:
goto exit;
}
-ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
-ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
+ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
ncclResult_t ret = ncclSuccess;
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
struct ncclIbCommStage* stage = &handle->stage;
@@ -1379,8 +1489,13 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan
@@ -1529,8 +1638,13 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
int ready;
uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED;
int isP2p = 0;
@@ -242,7 +248,7 @@ index 7af56a6c..5c3e3d46 100644
if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
@@ -1461,7 +1576,7 @@ ib_recv_dev_list:
@@ -1612,7 +1726,7 @@ ib_recv_dev_list:
for (int q = 0; q < comm->base.nqps; q++) {
ncclIbSendCommDev* commDev = comm->devs + devIndex;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
@@ -251,7 +257,7 @@ index 7af56a6c..5c3e3d46 100644
comm->base.qps[q].devIndex = devIndex;
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
@@ -1486,7 +1601,11 @@ ib_recv_dev_list:
@@ -1637,7 +1751,11 @@ ib_recv_dev_list:
devInfo->lid = ibDev->portAttr.lid;
devInfo->ibv_dev_index = commDev->base.ibDevN;
// Prepare my fifo
@@ -264,10 +270,10 @@ index 7af56a6c..5c3e3d46 100644
devInfo->fifoRkey = commDev->fifoMr->rkey;
// Pack local GID info
@@ -1528,7 +1647,11 @@ ib_recv_dev_list:
return ncclInternalError;
@@ -1680,7 +1798,11 @@ ib_recv_dev_list:
}
}
config = (ncclNetCommConfig_t*)ctx;
- meta.fifoAddr = (uint64_t)comm->fifo;
+ if (rcclCtsInlineData) {
+ meta.fifoAddr = (uint64_t)comm->fifo_inline;
@@ -277,7 +283,7 @@ index 7af56a6c..5c3e3d46 100644
meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT;
meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT;
strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME);
@@ -1673,18 +1796,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
@@ -1825,18 +1947,22 @@ ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDevicePro
return ncclSuccess;
}
@@ -302,7 +308,7 @@ index 7af56a6c..5c3e3d46 100644
if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list;
if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list;
@@ -1814,7 +1941,7 @@ ib_recv:
@@ -1966,7 +2092,7 @@ ib_recv:
// Local ibDevN
ibDevN = rComm->devs[devIndex].base.ibDevN;
ibDev = ncclIbDevs + ibDevN;
@@ -311,7 +317,7 @@ index 7af56a6c..5c3e3d46 100644
qp->devIndex = devIndex;
devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
@@ -1840,16 +1967,22 @@ ib_recv:
@@ -1992,16 +2118,22 @@ ib_recv:
useDmaBuf = (ncclIbDmaBufSupport(lComm->dev) == ncclSuccess);
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || useDmaBuf)
@@ -337,7 +343,7 @@ index 7af56a6c..5c3e3d46 100644
// Allocate Flush dummy buffer for GPU Direct RDMA
if (rComm->flushEnabled) {
@@ -1887,7 +2020,7 @@ ib_recv:
@@ -2039,7 +2171,7 @@ ib_recv:
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
rCommDev->gpuFlush.sge.length = 1;
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
@@ -346,7 +352,7 @@ index 7af56a6c..5c3e3d46 100644
struct ncclIbDevInfo devInfo;
devInfo.lid = ibDev->portAttr.lid;
devInfo.link_layer = ibDev->portAttr.link_layer;
@@ -2115,10 +2248,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
@@ -2257,10 +2389,15 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
@@ -364,7 +370,7 @@ index 7af56a6c..5c3e3d46 100644
if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
uint64_t wr_id = 0ULL;
@@ -2130,7 +2268,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
@@ -2272,7 +2409,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
sge->addr=(uintptr_t)reqs[r]->send.data;
wr->opcode = IBV_WR_RDMA_WRITE;
wr->send_flags = 0;
@@ -377,7 +383,7 @@ index 7af56a6c..5c3e3d46 100644
wr->next = wr + 1;
wr_id += (reqs[r] - comm->base.reqs) << (r*8);
#ifdef NCCL_ENABLE_NET_PROFILING
@@ -2141,7 +2283,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
@@ -2283,7 +2424,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
// Write size as immediate data. In the case of multi-send, only write
// 0 or 1 as size to indicate whether there was data sent or received.
uint32_t immData = 0;
@@ -386,7 +392,7 @@ index 7af56a6c..5c3e3d46 100644
immData = reqs[0]->send.size;
} else {
int* sizes = comm->remSizesFifo.elems[slot];
@@ -2151,22 +2293,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
@@ -2293,22 +2434,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
@@ -424,7 +430,7 @@ index 7af56a6c..5c3e3d46 100644
lastWr->next = NULL;
lastWr->send_flags = IBV_SEND_SIGNALED;
@@ -2182,7 +2326,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
@@ -2324,7 +2467,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
//ncclIbAddEvent(reqs[r], devIndex, &comm->devs[devIndex].base);
// Select proper rkey (needed even for 0-size send)
@@ -437,7 +443,7 @@ index 7af56a6c..5c3e3d46 100644
int chunkSize = DIVUP(DIVUP(reqs[r]->send.size, nqps), align) * align;
int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
@@ -2198,7 +2346,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
@@ -2340,7 +2487,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
}
}
@@ -446,7 +452,7 @@ index 7af56a6c..5c3e3d46 100644
// Also make sure lastWr writes remote sizes using the right lkey
comm->remSizesFifo.sge.lkey = comm->remSizesFifo.mrs[devIndex]->lkey;
lastWr->wr.rdma.rkey = comm->remSizesFifo.rkeys[devIndex];
@@ -2256,32 +2404,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
@@ -2398,32 +2545,46 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
@@ -511,7 +517,7 @@ index 7af56a6c..5c3e3d46 100644
}
struct ncclIbRequest* req;
@@ -2325,10 +2487,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
@@ -2467,10 +2628,12 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
}
TIME_START(0);
@@ -526,7 +532,7 @@ index 7af56a6c..5c3e3d46 100644
memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
comm->fifoHead++;
TIME_STOP(0);
@@ -2341,30 +2505,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
@@ -2483,30 +2646,60 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
struct ibv_send_wr wr;
@@ -566,10 +572,7 @@ index 7af56a6c..5c3e3d46 100644
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandles[i];
+ if (rcclCtsInlineData) {
+ localElemCtsInline[i].addr = (uint64_t)data[i];
- // Send all applicable rkeys
- for (int j = 0; j < comm->base.vProps.ndevs; j++)
- localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+
+ // Send all applicable rkeys
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
+ localElemCtsInline[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
@@ -583,14 +586,17 @@ index 7af56a6c..5c3e3d46 100644
+ } else {
+ localElem[i].addr = (uint64_t)data[i];
- // Send all applicable rkeys
- for (int j = 0; j < comm->base.vProps.ndevs; j++)
- localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+ // Send all applicable rkeys
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
+ localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
- localElem[i].nreqs = n;
- localElem[i].size = sizes[i]; // Sanity/Debugging
- localElem[i].tag = tags[i];
- localElem[i].idx = comm->remFifo.fifoTail+1;
+ // Send all applicable rkeys
+ for (int j = 0; j < comm->base.vProps.ndevs; j++)
+ localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey;
+
+ localElem[i].nreqs = n;
+ localElem[i].size = sizes[i]; // Sanity/Debugging
+ localElem[i].tag = tags[i];
@@ -600,7 +606,7 @@ index 7af56a6c..5c3e3d46 100644
}
wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
@@ -2372,8 +2566,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
@@ -2514,8 +2707,12 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
wr.wr.rdma.rkey = comm->base.remDevs[ctsQp->remDevIdx].fifoRkey;
// Set the correct sge properties
@@ -615,7 +621,7 @@ index 7af56a6c..5c3e3d46 100644
wr.sg_list = &comm->devs[ctsQp->devIndex].fifoSge;
wr.num_sge = 1;
@@ -2403,7 +2601,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
@@ -2545,7 +2742,13 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
//
// slot == devIndex - When writing to fifo slot N, and this QP lives on device index N, it should send signalled.
// This works out that each fifo posting QP gets drained
@@ -630,7 +636,7 @@ index 7af56a6c..5c3e3d46 100644
wr.send_flags |= IBV_SEND_SIGNALED;
wr.wr_id = req - comm->base.reqs;
ncclIbAddEvent(req, ctsQp->devIndex, &comm->devs[ctsQp->devIndex].base);
@@ -2418,10 +2622,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
@@ -2560,10 +2763,16 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
comm->remFifo.fifoTail++;
@@ -647,7 +653,7 @@ index 7af56a6c..5c3e3d46 100644
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->base.ready == 0) {
WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
@@ -2431,6 +2641,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
@@ -2573,6 +2782,11 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
@@ -659,7 +665,7 @@ index 7af56a6c..5c3e3d46 100644
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
req->type = NCCL_NET_IB_REQ_RECV;
@@ -2444,50 +2659,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
@@ -2586,50 +2800,64 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
req->devBases[i] = &comm->devs[i].base;
}
@@ -756,7 +762,7 @@ index 7af56a6c..5c3e3d46 100644
}
ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
@@ -2556,6 +2785,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
@@ -2698,6 +2926,8 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
}
#endif
@@ -765,7 +771,7 @@ index 7af56a6c..5c3e3d46 100644
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
@@ -2589,13 +2820,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
@@ -2731,13 +2961,18 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
int totalWrDone = 0;
int wrDone = 0;
@@ -786,7 +792,7 @@ index 7af56a6c..5c3e3d46 100644
totalWrDone += wrDone;
if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
if (wrDone == 0) continue;
@@ -2742,7 +2978,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
@@ -2889,7 +3124,7 @@ ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
}
ncclNet_t ncclNetIb = {
+1 -1
Datei anzeigen
@@ -179,4 +179,4 @@ When developing new tuner plugins:
- [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/)
- Example plugin implementations in this directory
For questions and support, refer to the NCCL community resources and documentation.
For questions and support, refer to the NCCL community resources and documentation.
+49
Datei anzeigen
@@ -0,0 +1,49 @@
# Compiled shared objects and binaries
*.so
*.o
*.a
*.out
*.exe
*.dll
*.dylib
*.bin
*.elf
# Python cache
__pycache__/
*.pyc
*.pyo
# Build and test artifacts
/build/
*.log
*.tmp
*.swp
# Ignore all CSV files except scripts/sample_performance_data.csv
*.csv
!scripts/sample_performance_data.csv
# Ignore all .conf files except nccl_tuner.conf
*.conf
!nccl_tuner.conf
my_configs
# Ignore test binary
test/test_plugin
# Editor/OS files
.DS_Store
Thumbs.db
# Backup files
*~
*.bak
# Ignore by convention
*.old
*.orig
# Git
.git/
+26
Datei anzeigen
@@ -0,0 +1,26 @@
# Sources of the example tuner plugin. The list is explicit (no globbing):
# add new translation units here.
set(SRC_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c
)

# Directory the plugin library is emitted into. Kept in one variable so the
# build rule and the clean rule below cannot drift apart.
set(TUNER_PLUGIN_OUTPUT_DIR ${CMAKE_BINARY_DIR}/test/unit/plugins)

# Create shared library
add_library(nccl-tuner-example SHARED ${SRC_FILES})

# Set include directories
target_include_directories(nccl-tuner-example PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/nccl
)

# Set output name to match Makefile
set_target_properties(nccl-tuner-example PROPERTIES
    OUTPUT_NAME "nccl-tuner-example"
    PREFIX "lib"
    POSITION_INDEPENDENT_CODE ON
    LIBRARY_OUTPUT_DIRECTORY ${TUNER_PLUGIN_OUTPUT_DIR}
)

# Add custom target for clean (equivalent to Makefile clean target).
# The library lives in TUNER_PLUGIN_OUTPUT_DIR, not in the current binary
# directory, so that is the path that has to be removed.
add_custom_target(clean-tuner-lib
    COMMAND ${CMAKE_COMMAND} -E remove -f ${TUNER_PLUGIN_OUTPUT_DIR}/libnccl-tuner-example.so
    COMMENT "Cleaning libnccl-tuner-example.so"
)
+46 -5
Datei anzeigen
@@ -45,6 +45,40 @@ typedef enum {
#define NCCL_ALGO_PROTO_IGNORE -1.0
// Hardware link types. These index the first dimension of
// ncclTunerConstants_v5_t.hwLatencies; NCCL_NUM_HW_LINKS is that dimension's size.
#define NCCL_HW_NVLINK 0
#define NCCL_HW_PCI 1
#define NCCL_HW_NET 2
#define NCCL_NUM_HW_LINKS 3
// GPU architecture slots. These index the first dimension of the bandwidth
// tables in ncclTunerConstants_v5_t (llMaxBws, perChMax*Bws);
// NCCL_NUM_COMPCAPS is that dimension's size.
#define NCCL_VOLTA_COMPCAP_IDX 0
#define NCCL_AMPERE_COMPCAP_IDX 1
#define NCCL_HOPPER_COMPCAP_IDX 2
#define NCCL_BLACKWELL_COMPCAP_IDX 3
#define NCCL_NUM_COMPCAPS 4
// Tuning scale slots. These index the second dimension of the same bandwidth
// tables; NCCL_NUM_TUNING_SCALES is that dimension's size.
// NOTE(review): the names suggest 1-, 2- and 4-node scales; how other node
// counts map onto these slots is not visible in this header - confirm.
#define NCCL_TUNING_SCALE_1NODE 0
#define NCCL_TUNING_SCALE_2NODES 1
#define NCCL_TUNING_SCALE_4NODES 2
#define NCCL_NUM_TUNING_SCALES 3
// NVLink domain summary for a communicator (tuner API v5).
// Passed to the tuner's init() through its nvlDomainInfo parameter.
typedef struct {
int nNvlDomains; // number of NVLink domains
int minRanksPerNvlDomain; // minimum ranks across all NVLink domains
int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains
} ncclNvlDomainInfo_v5_t;
// Tuning-model constants exchanged with the tuner (tuner API v5).
// Passed to the tuner's init() through its constants parameter, which is an
// input/output: a plugin may overwrite individual entries.
// Latency tables are indexed [algorithm][protocol], with a leading hardware
// link index (NCCL_HW_*) for hwLatencies. Bandwidth tables are indexed
// [NCCL_*_COMPCAP_IDX][NCCL_TUNING_SCALE_*].
// NOTE(review): the example plugin treats bandwidths as GB/s and latencies as
// microseconds; the units are not stated in this header - confirm.
typedef struct {
double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES];
} ncclTunerConstants_v5_t;
// API to be implemented by external tuner
typedef struct {
// Name of the tuner
@@ -52,12 +86,17 @@ typedef struct {
// Initializes tuner states.
// Inputs:
// - commId: communicator identifier
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
// - nNodes: number of nodes in current communicator.
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
// - nvlDomainInfo: NVL domain information struct
// Outputs:
// - context: tuner context object
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
// Input/Output:
// - constants: tuner constants
ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants);
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
// Inputs:
@@ -87,11 +126,13 @@ typedef struct {
// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v4_t;
ncclResult_t (*finalize)(void* context);
} ncclTuner_v5_t;
typedef ncclTuner_v4_t ncclTuner_t;
typedef ncclTuner_v5_t ncclTuner_t;
typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t;
typedef ncclTunerConstants_v5_t ncclTunerConstants_t;
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5"
#endif
+31 -5
Datei anzeigen
@@ -51,6 +51,7 @@ typedef struct {
size_t nRanks;
size_t nNodes;
ncclDebugLogger_t logFunction;
ncclNvlDomainInfo_v5_t nvlDomainInfo;
} TunerContext;
// Parse collective type from string
@@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) {
return ncclSuccess;
}
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction,
ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
if (NULL != constants) {
// NCCL constants tuning
// Tune NCCL's internal tuning model to improve base algo/proto selection.
// Note: Example numbers are for reference only.
// Actual numbers may vary depending on the hardware and network topology.
// These numbers are not guaranteed to be optimal for all cases.
// Limit the tree bandwidth to 15GB/s
constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0;
// Limit the ring bandwidth to 20GB/s
constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0;
// Set NVLSTree base network latency to 24us
constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0;
}
TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
if (!ctx) return ncclSystemError;
@@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t
ctx->nRanks = nRanks;
ctx->nNodes = nNodes;
ctx->logFunction = logFunction;
if (nvlDomainInfo) {
ctx->nvlDomainInfo = *nvlDomainInfo;
} else {
memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t));
}
if (logFunction) {
logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks);
"TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains",
nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains);
}
// Try to load config file from environment variable or default location
@@ -435,7 +460,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size
return ncclSuccess;
}
__hidden ncclResult_t pluginDestroy(void* context) {
__hidden ncclResult_t pluginFinalize(void* context) {
if (context) {
TunerContext* ctx = (TunerContext*)context;
if (ctx->configs) {
@@ -446,11 +471,12 @@ __hidden ncclResult_t pluginDestroy(void* context) {
return ncclSuccess;
}
#define PLUGIN_NAME "Example"
const ncclTuner_v4_t ncclTunerPlugin_v4 = {
const ncclTuner_v5_t ncclTunerPlugin_v5 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
.destroy = pluginDestroy
.finalize = pluginFinalize
};
+151 -27
Datei anzeigen
@@ -98,12 +98,12 @@ int test_plugin_init() {
void* context = NULL;
// Test successful initialization
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
TEST_PASS();
}
@@ -123,11 +123,11 @@ int test_config_parsing_valid() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(16, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_valid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -144,12 +144,12 @@ int test_config_parsing_invalid() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 1, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
// Should still succeed but with no valid configs loaded
TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_invalid.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -165,7 +165,7 @@ int test_collective_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
// Create mock cost table
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -209,7 +209,7 @@ int test_collective_matching() {
TEST_ASSERT(nChannels == 4, "Should set 4 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_match.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -226,7 +226,7 @@ int test_size_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -280,7 +280,7 @@ int test_size_matching() {
TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_size.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -298,7 +298,7 @@ int test_topology_matching() {
// Test with single node setup
void* context1 = NULL;
pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node
pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL); // 8 ranks, 1 node
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -316,11 +316,11 @@ int test_topology_matching() {
TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config");
TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels");
pluginDestroy(context1);
pluginFinalize(context1);
// Test with 4 nodes, 32 ranks setup
void* context2 = NULL;
pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes
pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL); // 32 ranks, 4 nodes
for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) {
for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) {
@@ -349,7 +349,7 @@ int test_default_channels() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -369,7 +369,7 @@ int test_default_channels() {
TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_default.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -386,7 +386,7 @@ int test_regbuff_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -437,7 +437,7 @@ int test_regbuff_matching() {
TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_regbuff.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -454,7 +454,7 @@ int test_pipeops_matching() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -504,7 +504,7 @@ int test_pipeops_matching() {
TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_pipeops.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -519,7 +519,7 @@ int test_no_match_fallback() {
setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1);
void* context = NULL;
pluginInit(8, 1, mock_logger, &context);
pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL);
float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float* cost_table_ptr[NCCL_NUM_ALGORITHMS];
@@ -543,7 +543,7 @@ int test_no_match_fallback() {
TEST_ASSERT(nChannels == 1, "Should use default channels");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink("test_fallback.conf");
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
@@ -593,7 +593,7 @@ int test_large_config() {
// Initialize plugin with large config
void* context = NULL;
ncclResult_t result = pluginInit(16, 4, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed");
TEST_ASSERT(context != NULL, "Context should be allocated");
@@ -652,7 +652,7 @@ int test_large_config() {
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(large_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
@@ -684,7 +684,7 @@ int test_very_large_config_stress() {
// Test initialization with stress config
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files");
TunerContext* ctx = (TunerContext*)context;
@@ -705,7 +705,7 @@ int test_very_large_config_stress() {
}
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(stress_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
@@ -726,7 +726,7 @@ int test_empty_config() {
setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1);
void* context = NULL;
ncclResult_t result = pluginInit(8, 2, mock_logger, &context);
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files");
TunerContext* ctx = (TunerContext*)context;
@@ -751,13 +751,134 @@ int test_empty_config() {
TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config");
// Clean up
pluginDestroy(context);
pluginFinalize(context);
unlink(empty_config_file);
unsetenv("NCCL_TUNER_CONFIG_FILE");
TEST_PASS();
}
// Test NVLink domain info handling
int test_nvl_domain_info() {
printf("Testing NVLink domain info handling...\n");
// Test NVLink domain structure with min/max ranks per domain
ncclNvlDomainInfo_v5_t nvl_domain = {
.nNvlDomains = 2, // 2 nodes = 2 domains
.minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
.maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity)
};
void* context = NULL;
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed");
// Validate NVLD info structure
TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)");
TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
// Clean up
pluginFinalize(context);
printf("NVLink domain info test passed!\n");
TEST_PASS();
}
// Test that pluginInit accepts a tuner-constants table and updates it.
//
// Every listed entry is seeded with the sentinel -1.0, the table is passed to
// pluginInit by pointer, and three entries are then compared against specific
// values (15.0, 20.0, 24.0). Those values are presumably written by the plugin
// during init -- verify against the plugin implementation.
int test_tuner_constants() {
// Initialize constants to -1.0 for testing purposes
ncclTunerConstants_v5_t constants = {
// Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
.baseLatencies = {
{-1.0, -1.0, -1.0}, // NCCL_ALGO_TREE: LL, LL128, Simple
{-1.0, -1.0, -1.0}, // NCCL_ALGO_RING: LL, LL128, Simple
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS
{-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS_TREE
{-1.0, -1.0, -1.0} // NCCL_ALGO_PAT
},
// Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]
.hwLatencies = {
// NCCL_HW_NVLINK
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
},
// NCCL_HW_PCI
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
},
// NCCL_HW_NET
{
{-1.0, -1.0, -1.0}, // TREE
{-1.0, -1.0, -1.0}, // RING
{-1.0, -1.0, -1.0}, // COLLNET_DIRECT
{-1.0, -1.0, -1.0}, // COLLNET_CHAIN
{-1.0, -1.0, -1.0}, // NVLS
{-1.0, -1.0, -1.0}, // NVLS_TREE
{-1.0, -1.0, -1.0} // PAT
}
},
// LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.llMaxBws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxRingLL128Bws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxTreeLL128Bws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
},
// Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]
.perChMaxTreeBws = {
{-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes
{-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes
}
};
// No NVLink domain info is supplied here; only the constants table is passed.
void* context = NULL;
ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants);
TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed");
// Test that the constants were set correctly
// Exact floating-point equality is used: the expected values must be stored
// verbatim, not computed.
TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should be 15GB/s");
TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s");
// NOTE(review): the message below says "NVLSTree base network latency", but the
// entry checked is hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS] (hardware latency
// table, NVLS algorithm) -- confirm which entry/wording is intended.
TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us");
// Clean up
pluginFinalize(context);
TEST_PASS();
}
// Test runner function pointer type
typedef int (*TestFunction)(void);
@@ -783,6 +904,8 @@ TestCase test_cases[] = {
{"large-config", test_large_config, "Large configuration files (dynamic allocation)"},
{"stress-config", test_very_large_config_stress, "Very large configuration stress test"},
{"empty-config", test_empty_config, "Empty configuration file handling"},
{"nvl-domain", test_nvl_domain_info, "NVL domain info handling"},
{"constants", test_tuner_constants, "Tuner constants initialization"},
{NULL, NULL, NULL} // End marker
};
@@ -826,6 +949,7 @@ int main(int argc, char* argv[]) {
if (argc == 1) {
// No arguments - run all tests
for (int i = 0; test_cases[i].name != NULL; i++) {
printf("Running test: %s\n", test_cases[i].name);
total++;
passed += test_cases[i].func();
}
+2 -2
Datei anzeigen
@@ -26,7 +26,7 @@ install_dependencies=false
install_library=false
install_prefix="${ROCM_PATH}"
log_trace=false
msccl_kernel_enabled=true
msccl_kernel_enabled=false
mscclpp_enabled=false
enable_mscclpp_clip=false
num_parallel_jobs=$(nproc)
@@ -56,7 +56,7 @@ function display_help()
echo " --debug Build debug library"
echo " --enable_backtrace Build with custom backtrace support"
echo " --disable-colltrace Build without collective trace"
echo " --disable-msccl-kernel Build without MSCCL kernels"
echo " --enable-msccl-kernel Build with MSCCL kernels"
echo " --dump-asm Disassemble code and dump assembly with inline code"
echo " --enable-mscclpp Build with MSCCL++ support"
echo " --enable-mscclpp-clip Build MSCCL++ with clip wrapper on bfloat16 and half addition routines"
+1 -6
Datei anzeigen
@@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
# SM35 is deprecated from CUDA12.0 onwards
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
endif
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
+2 -2
Datei anzeigen
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 27
NCCL_PATCH := 7
NCCL_MINOR := 28
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
+1 -1
Datei anzeigen
@@ -10,7 +10,7 @@ build : debian.build txz.build
BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
TARGETS := debian txz doc
all: ${TARGETS:%=%.build}
prep: ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
+1 -1
Datei anzeigen
@@ -1,4 +1,4 @@
bin/ncclras /usr/bin
include/nccl.h /usr/include
include/* /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
+2 -2
Datei anzeigen
@@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_bindir}
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/
install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
@@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_bindir}/ncclras
%{_includedir}/nccl.h
%{_includedir}/*
%{_libdir}/libnccl.so
%files static
+1 -1
Datei anzeigen
@@ -22,7 +22,7 @@ prep: $(TXZTARGETS)
build: prep
$(MAKE) -C ../../src clean
@printf "Building source tar.xz package\n"
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
(cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
+27 -1
Datei anzeigen
@@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix}
NCCL_BUILD=${pkg:Revision}
NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
NCCLNAME+="-apitest"
fi
tar --exclude build \
INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk")
if [ "${SRCTXZ_APITESTS}" = "1" ]; then
# Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES
for entry in $(ls $NCCLDIR/test); do
if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then
EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry"
fi
done
else
# Exclude the entire test directory
EXCLUDE_TEST+=" --exclude test"
fi
tar --exclude fortran \
--exclude doc \
--exclude plc \
--exclude build \
--exclude ".git*" \
--exclude share \
--exclude ompi \
--exclude ext-net \
--exclude pkg/srctxz \
--exclude docker \
$EXCLUDE_TEST \
--transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
+180
Datei anzeigen
@@ -0,0 +1,180 @@
# Source files
set(LIBSRCFILES
bootstrap.cc
channel.cc
ce_coll.cc
collectives.cc
debug.cc
enqueue.cc
group.cc
init.cc
init_nvtx.cc
proxy.cc
transport.cc
mnnvl.cc
allocator.cc
sym_kernels.cc
dev_runtime.cc
)
# Add compatibility shim if using static cudart
if(CUDARTLIB STREQUAL "cudart_static")
list(APPEND LIBSRCFILES enhcompat.cc)
endif()
# Configure pkg-config file
configure_file(
${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
@ONLY
)
# Add files from subdirectories
add_subdirectory(transport)
add_subdirectory(misc)
add_subdirectory(register)
add_subdirectory(graph)
add_subdirectory(plugin)
add_subdirectory(device)
add_subdirectory(nccl_device)
add_subdirectory(ras)
add_subdirectory(scheduler)
add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
# Add all source files
list(APPEND LIBSRCFILES
${TRANSPORT_SOURCES}
${MISC_SOURCES}
${REGISTER_SOURCES}
${GRAPH_SOURCES}
${PLUGIN_SOURCES}
${RAS_SOURCES}
${SYM_SOURCES}
${SCHEDULER_SOURCES}
)
###################### Create a shared NCCL library ############################
add_library(nccl SHARED)
target_sources(nccl PRIVATE ${LIBSRCFILES})
# Include directories
target_include_directories(nccl PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/device
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
${CUDAToolkit_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}/cccl
)
# Generate the public nccl.h from the nccl.h.in template by substituting the
# ${nccl:...} version placeholders with the configured version values.
# DEPENDS on the template so the header is regenerated whenever nccl.h.in is
# edited; without it the command only runs when the output file is missing.
add_custom_command(
  OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
  COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
              -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
              -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
              -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
              -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
              ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in
  BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h
  COMMENT "Generating nccl.h from nccl.h.in"
)

# Expose the generated header as a target and make the shared library wait for it.
add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h)
add_dependencies(nccl nccl_header)
# Set version and output name
set_target_properties(nccl PROPERTIES
VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
SOVERSION ${NCCL_MAJOR}
OUTPUT_NAME "nccl"
PREFIX "lib"
)
# Set CUDA specific flags
set_target_properties(nccl PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
POSITION_INDEPENDENT_CODE ON
)
# Link libraries
target_link_libraries(nccl
PRIVATE
nccl_device
pthread
rt
dl
${CUDAToolkit_LIBRARIES}
${EXTRA_LIBS}
)
# Set output directories for nccl shared library
set_target_properties(nccl PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
###################### Create a ras binary executable ############################
set(RAS_BINSRCFILES ras/client.cc)
add_executable(ncclras ${RAS_BINSRCFILES})
target_include_directories(ncclras PUBLIC
${CMAKE_BINARY_DIR}/include
${CUDAToolkit_INCLUDE_DIRS}
)
add_dependencies(ncclras nccl_header)
target_link_libraries(ncclras
PRIVATE
pthread
rt
dl
)
# Set output directory for ncclras executable
set_target_properties(ncclras PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
###################### Create a static NCCL library ############################
add_library(nccl_static STATIC ${LIBSRCFILES})
# Include directories
target_include_directories(nccl_static PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/device
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
${CUDAToolkit_INCLUDE_DIRS}
${CUDAToolkit_INCLUDE_DIRS}/cccl
)
# Add dependency on nccl_header
add_dependencies(nccl_static nccl_header)
# Link libraries
target_link_libraries(nccl_static
PRIVATE
nccl_device
pthread
rt
dl
${CUDAToolkit_LIBRARIES}
${EXTRA_LIBS}
)
# Set CUDA specific flags
set_target_properties(nccl_static PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
POSITION_INDEPENDENT_CODE ON
)
# Set output directory for nccl_static library
set_target_properties(nccl_static PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
+17 -3
Datei anzeigen
@@ -7,10 +7,12 @@ include ../makefiles/common.mk
include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h
INCEXPORTS := nccl.h nccl_device.h \
$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
LIBSRCFILES := \
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \
$(wildcard graph/*.cc) \
$(wildcard misc/*.cc) \
$(wildcard transport/*.cc) \
@@ -19,6 +21,8 @@ LIBSRCFILES := \
$(wildcard plugin/net/*.cc) \
$(wildcard plugin/tuner/*.cc) \
$(wildcard plugin/profiler/*.cc) \
$(wildcard nccl_device/*.cc) \
$(wildcard scheduler/*.cc) \
$(filter-out ras/client.cc,$(wildcard ras/*.cc))
BINSRCFILES := ras/client.cc
@@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
install -m 644 $< $@
$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)/nccl_device
install -m 644 $< $@
$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)/nccl_device/impl
install -m 644 $< $@
$(PKGDIR)/%.pc : %.pc
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(PKGDIR)
@@ -149,7 +163,7 @@ install : build
mkdir -p $(PREFIX)/bin
cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/
cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/
FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h')
+332 -62
Datei anzeigen
@@ -7,10 +7,11 @@
#include "comm.h"
#include "transport.h"
#include "group.h"
#include "nvtx.h"
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
ncclResult_t ret = ncclSuccess;
#if ROCM_VERSION >= 70000
@@ -99,7 +100,7 @@ fail:
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
ncclResult_t ncclMemFree_impl(void *ptr) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
ncclResult_t ret = ncclSuccess;
int saveDevice;
@@ -129,70 +130,339 @@ fail:
goto exit;
}
// This is a collective function and should be called by all ranks in the communicator
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
ncclResult_t ret = ncclSuccess;
void* regSymAddr = NULL;
size_t allocSize = size;
size_t granularity;
CUdevice cuDev;
CUmemAllocationProp memprop = {};
CUmemGenericAllocationHandle memHandle;
int bit = 0, cnt = 0;
////////////////////////////////////////////////////////////////////////////////
// ncclSpace:
//
// This datastructure "cuts" the line of non-negative integers into segments
// which alternate between "full" (allocated) and "empty" (not allocated). The
// cuts are sorted ascending. The segment after the last cut must be empty
// (the unallocated frontier). Knowing this we can deduce whether the segment
// ending at cut[i] is full or empty with this formula:
// isFull(i) = (i%2 != ncuts%2)
// aligment must be power of 2 as an input
while (bit < sizeof(size_t) * 8) {
if (alignment & (1L << bit)) cnt++;
if (cnt == 2) {
WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment);
goto fail;
// Put `a` into the empty state by zeroing every byte of the structure.
void ncclSpaceConstruct(struct ncclSpace* a) {
  memset(a, 0, sizeof(struct ncclSpace));
}
// Release the cut array owned by `a` and return it to the empty state.
// Clearing the fields means a repeated destruct does not free the same buffer
// twice, and the space can be used again without re-constructing it.
void ncclSpaceDestruct(struct ncclSpace* a) {
  free(a->cuts);
  a->cuts = nullptr;
  a->count = 0;
  a->capacity = 0;
}
static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) {
// Insert space for two cuts in `a->cuts[]` before `index`.
if (a->count + 2 > a->capacity) {
a->capacity *= 2;
if (a->capacity == 0) a->capacity = 16;
int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t));
for (int i=0; i < index; i++) cuts1[i] = a->cuts[i];
for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i];
free(a->cuts);
a->cuts = cuts1;
} else {
for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i];
}
a->cuts[index+0] = lo;
a->cuts[index+1] = hi;
a->count += 2;
// Filter pairs of adjacent repeated values from cuts[]. Since these mark
// boundaries where segments transition between full<->empty, dropping such a
// pair fuses two adjacent segments together. Examples:
// [1,2,3,3,4] -> [1,2,4]
// [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because it's a full<->empty transition
// [1,2,3,3,3,3,4] -> [1,2,4]
// Leading zeros don't have to be in pairs, they are always dropped:
// [0,1,2] -> [1,2]
// [0,0,1,2] -> [1,2]
int r = index, w = index; // Read and write cursors.
int64_t prev = r==0 ? 0 : a->cuts[r-1];
while (r < a->count) {
int64_t cur = a->cuts[r++];
a->cuts[w++] = cur;
if (prev == cur) { // Repeated value is an empty segment which can be deleted.
// Erase last two cuts or just one if we're at the start.
w -= w==1 ? 1 : 2;
// Zeros can only occur at the beginning (due to being sorted). We want to
// drop any number of zeros, but only even numbers of other repeated values.
// So set to zero here, which will make prev=0, thus if next value is zero
// it will be dropped but if its not zero then it will need to begin a new
// pair to be dropped.
cur = 0;
}
bit++;
prev = cur;
}
// temporarily align the alignment to NCCL_REC_PAGE_SIZE
ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);
CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleType = ncclCuMemHandleType;
memprop.location.id = cuDev;
CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
ALIGN_SIZE(allocSize, granularity);
CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
ALIGN_SIZE(comm->symAllocHead, alignment);
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
comm->symAllocHead += allocSize;
*symPtr = regSymAddr;
exit:
return ret;
fail:
*symPtr = NULL;
goto exit;
a->count = w;
}
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
CUmemGenericAllocationHandle handle;
size_t size = 0;
ncclResult_t ret = ncclSuccess;
int saveDev = comm->cudaDev;
CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail);
if (ncclCuMemEnable()) {
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail);
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail);
NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail);
NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail);
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
ncclResult_t ncclSpaceAlloc(
struct ncclSpace* a, int64_t limit, int64_t size, int align,
int64_t* outOffset
) {
// When allocating we try to locate the first empty segment which can hold
// the allocation and move its lower cut upward.
int i = a->count%2; // First empty segment ends at cuts[i]
size_t off;
while (i <= a->count) {
size_t lo = i == 0 ? 0 : a->cuts[i-1];
size_t hi = i == a->count ? limit : a->cuts[i];
off = alignUp(lo, align);
if (off + size <= hi) {
*outOffset = off;
if (i == 0 || off + size == hi) { // Slow path required.
insertSegment(a, i, off, off+size);
} else { // We can just append to the end of a full segment.
a->cuts[i-1] = off + size;
}
return ncclSuccess;
}
i += 2; // Next empty segment
}
exit:
CUDACHECK(cudaSetDevice(saveDev));
return ret;
fail:
goto exit;
WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit);
return ncclInternalError;
}
// Mark the range [offset, offset+size) as empty again.
//
// The range must lie entirely inside a single full segment of `a`; it may be
// the whole segment, a prefix, a suffix, or an interior piece (which splits
// the segment in two).
//
// Returns ncclSuccess on success, or ncclInternalError (after a WARN) when
// `size` is negative, no allocation covers `offset`, or the range extends
// past the segment that contains `offset`.
ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) {
  // A negative size would pass the range check below and then move a cut
  // backwards, breaking the ascending order of cuts[].
  if (size < 0) {
    WARN("Invalid negative size=%ld", (long)size);
    return ncclInternalError;
  }
  // The last cut is the end of the last full segment; anything at or beyond it
  // is unallocated frontier.
  if (a->count == 0 || a->cuts[a->count-1] <= offset) {
    WARN("No allocation found at offset=0x%lx", (long)offset);
    return ncclInternalError;
  }

  // This could be binary search, but since allocate is linear there's no point.
  int i = 1 - a->count%2; // First full segment ends at cuts[i]
  while (a->cuts[i] <= offset) i += 2;

  int64_t lo = i==0 ? 0 : a->cuts[i-1];
  int64_t hi = a->cuts[i];

  // Here hi > offset, and once offset >= lo (>= 0) has been established the
  // subtraction cannot overflow, unlike `offset + size`.
  if (offset < lo || hi - offset < size) {
    WARN("Given size=0x%lx extends beyond allocation.", (long)size);
    return ncclInternalError;
  }

  // First try the two fast cases which just shrink a segment from one side.
  if (i != 0 && lo == offset && offset + size != hi) {
    a->cuts[i-1] = offset + size; // Bring bottom up.
  } else if (lo != offset && offset + size == hi) {
    a->cuts[i] = offset; // Bring top down.
  } else { // Slow path.
    // Covers freeing a whole segment, a prefix of the first segment when it
    // starts at 0 (no cut exists at index -1), and interior splits.
    insertSegment(a, i, offset, offset+size);
  }
  return ncclSuccess;
}
////////////////////////////////////////////////////////////////////////////////
// ncclShadowPool:
// One device allocation carved into equally sized object slots, used by the
// shadow pool for small objects.
struct ncclShadowPage { // A contiguous block of (at most) 64 objects
// Next page in the pool's `pages` list. A page is unlinked from that list when
// its last free slot is taken and pushed back when a slot is freed again.
struct ncclShadowPage* next;
// Size in bytes of each slot (the requested size rounded up by the allocator).
int objSize;
// One bit per slot; a set bit means the slot is free.
uint64_t freeMask;
// Base device pointer of the page; slot k lives at devObjs + k*objSize.
void* devObjs;
};
// Hash-table entry pairing a device object with its host-side shadow copy.
struct ncclShadowObject {
// Next entry in the same hash bucket (singly linked chain).
struct ncclShadowObject* next;
// Device address of the object; this is the hash key.
void* devObj;
// Host shadow buffer; it is placed in the same malloc block, after this header.
void* hostObj;
struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool.
};
// Put `pool` into its empty state. No memory is allocated here: hbits == 0 is
// what ncclShadowPoolAlloc checks to decide that the memory pool and hash
// table still have to be created.
void ncclShadowPoolConstruct(struct ncclShadowPool* pool) {
  pool->pages = nullptr; // no partially filled pages yet
  pool->table = nullptr; // hash table is created on first allocation
  pool->count = 0;       // number of live objects
  pool->hbits = 0;       // zero hash bits == not yet initialized
}
// Tear down `pool`: free every remaining object (host and device side), every
// page, the hash table and the memory pool, then return the pool to the same
// state ncclShadowPoolConstruct produces.
//
// A pool that never allocated (hbits == 0) owns nothing, so it is left alone.
// Returns ncclSuccess, or the CUDACHECK error if the temporary stream cannot
// be created (in which case nothing is released).
ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) {
  if (pool->hbits != 0) {
    // Device frees are stream-ordered; use a private stream and wait on it below.
    cudaStream_t stream;
    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    if (pool->count != 0) {
      for (int i=0; i < 1<<pool->hbits; i++) {
        struct ncclShadowObject* obj = pool->table[i];
        while (obj != nullptr) {
          struct ncclShadowPage* page = obj->page;
          if (page != nullptr) {
            if (page->freeMask == 0) { // Put full pages back into page list.
              // Full pages are not on pool->pages; re-link them so the page
              // loop below frees them. Setting a bit also prevents the same
              // page from being re-linked by its other objects.
              page->freeMask = 1;
              page->next = pool->pages;
              pool->pages = page;
            }
          } else {
            // Object was allocated directly from the memory pool.
            cudaFreeAsync(obj->devObj, stream);
          }
          struct ncclShadowObject* next = obj->next;
          free(obj);
          obj = next;
        }
      }
    }

    free(pool->table);

    while (pool->pages != nullptr) {
      cudaFreeAsync(pool->pages->devObjs, stream);
      struct ncclShadowPage* next = pool->pages->next;
      free(pool->pages);
      pool->pages = next;
    }

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaMemPoolDestroy(pool->memPool);

    // Reset to the constructed state so a repeated destruct is a no-op and a
    // later alloc re-initializes instead of touching the freed table.
    pool->table = nullptr;
    pool->count = 0;
    pool->hbits = 0;
  }
  return ncclSuccess;
}
// Map a device pointer to a bucket index in [0, 2^hbits).
//
// The pointer's high half is folded into its low half, the result is
// multiplied by a 64-bit odd constant, and the top `hbits` bits of the product
// are returned. With hbits <= 0 there is only one possible bucket, so 0 is
// returned; this also avoids a shift by 64, which is undefined.
static int hashBucket(int hbits, void* devObj) {
  if (hbits <= 0) return 0;
  // Widen to 64 bits first so `h>>32` is well defined even where uintptr_t is
  // only 32 bits wide.
  uint64_t h = (uint64_t)(uintptr_t)devObj;
  h ^= h>>32;
  h *= 0x9e3779b97f4a7c13;
  return (int)(h >> (64-hbits));
}
// Push `obj` onto the front of the bucket chain selected by its device
// pointer. Uses the pool's current hbits, so the table must already be sized
// for that value.
static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) {
  struct ncclShadowObject** head = &pool->table[hashBucket(pool->hbits, obj->devObj)];
  obj->next = *head;
  *head = obj;
}
// Allocate a zero-filled device object of `size` bytes together with a
// zero-filled host shadow buffer of the same size, and record the pair in the
// pool's hash table keyed by the device pointer.
//
//   pool       - shadow pool; its memory pool and hash table are created on
//                the first non-empty allocation.
//   size       - object size in bytes; 0 yields null pointers and success.
//   outDevObj  - optional; receives the device pointer.
//   outHostObj - optional; receives the host shadow pointer.
//   stream     - stream used for the device allocation and memset.
//
// Small sizes (at least 3 fit in 64 KiB) are served from shared pages of up to
// 64 slots; larger ones are allocated directly from the memory pool.
//
// Returns ncclSuccess, ncclSystemError when host memory cannot be allocated,
// or the CUDACHECK error on a failing runtime call. On failure the pool is left
// without a half-initialized page or object.
ncclResult_t ncclShadowPoolAlloc(
    struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj,
    cudaStream_t stream
  ) {
  if (size == 0) {
    if (outDevObj) *outDevObj = nullptr;
    if (outHostObj) *outHostObj = nullptr;
    return ncclSuccess;
  }

  int hbits = pool->hbits;
  if (hbits == 0) {
    // First use: create the device memory pool and the initial hash table.
    cudaMemPoolProps props = {};
    props.allocType = cudaMemAllocationTypePinned;
    props.handleTypes = cudaMemHandleTypeNone;
    props.location.type = cudaMemLocationTypeDevice;
    CUDACHECK(cudaGetDevice(&props.location.id));

    const int initialBits = 4;
    struct ncclShadowObject** table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<initialBits);
    if (table == nullptr) {
      WARN("Failed to allocate shadow pool hash table.");
      return ncclSystemError;
    }
    cudaError_t poolStatus = cudaMemPoolCreate(&pool->memPool, &props);
    if (poolStatus != cudaSuccess) {
      free(table);
      CUDACHECK(poolStatus); // logs and returns
    }
    for (int i=0; i < 1<<initialBits; i++) table[i] = nullptr;
    // Publish only once both resources exist, so hbits != 0 implies a valid table.
    pool->table = table;
    pool->hbits = hbits = initialBits;
  }

  // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio.
  if (pool->count+1 > 2<<hbits) {
    struct ncclShadowObject** table0 = pool->table;
    struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1));
    if (table1 == nullptr) {
      WARN("Failed to grow shadow pool hash table.");
      return ncclSystemError;
    }
    pool->table = table1;
    pool->hbits = hbits+1;
    for (int i1=0; i1 < 2<<hbits; i1++) table1[i1] = nullptr;
    // Rehash every existing object into the doubled table.
    for (int i0=0; i0 < 1<<hbits; i0++) {
      struct ncclShadowObject* obj = table0[i0];
      while (obj) {
        struct ncclShadowObject* next = obj->next;
        hashInsert(pool, obj);
        obj = next;
      }
    }
    hbits += 1; // match pool->hbits
    free(table0);
  }

  // Allocate the host-side record first so that no device slot has been taken
  // yet if this fails. The shadow buffer follows the header in the same block,
  // aligned to max_align_t.
  struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc(
    sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size
  );
  if (obj == nullptr) {
    WARN("Failed to allocate shadow object.");
    return ncclSystemError;
  }

  struct ncclShadowPage* page;
  void *devObj;
  if ((64<<10)/size >= 3) {
    // Paged path: round the size up so the slot size has at most 4 significant bits.
    int shift = std::max<int>(0, (int)log2Down(size) + 1 - 4);
    int pageObjSize = ((size + (1<<shift)-1)>>shift)<<shift;
    struct ncclShadowPage** pagePtr = &pool->pages;
    while (true) {
      page = *pagePtr;
      if (page == nullptr) {
        // No page with a matching slot size has room: create one.
        size_t pageSize = std::min<size_t>(64<<10, 64*pageObjSize);
        page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage));
        if (page == nullptr) {
          free(obj);
          WARN("Failed to allocate shadow page.");
          return ncclSystemError;
        }
        cudaError_t pageStatus = cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream);
        if (pageStatus == cudaSuccess) {
          pageStatus = cudaMemsetAsync(page->devObjs, 0, pageSize, stream);
          if (pageStatus != cudaSuccess) cudaFreeAsync(page->devObjs, stream);
        }
        if (pageStatus != cudaSuccess) {
          free(page);
          free(obj);
          CUDACHECK(pageStatus); // logs and returns
        }
        page->objSize = pageObjSize;
        page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize);
        // Link the page only now that its device memory exists.
        page->next = pool->pages;
        pool->pages = page;
        // fall through...
      }
      if (page->objSize == pageObjSize) {
        int slot = popFirstOneBit(&page->freeMask);
        devObj = (char*)page->devObjs + slot*pageObjSize;
        if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list.
        break;
      }
      pagePtr = &page->next;
    }
  } else {
    // Large object: allocate it directly from the memory pool.
    page = nullptr;
    cudaError_t objStatus = cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream);
    if (objStatus == cudaSuccess) {
      objStatus = cudaMemsetAsync(devObj, 0, size, stream);
      if (objStatus != cudaSuccess) cudaFreeAsync(devObj, stream);
    }
    if (objStatus != cudaSuccess) {
      free(obj);
      CUDACHECK(objStatus); // logs and returns
    }
  }

  obj->page = page;
  obj->devObj = devObj;
  obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t));
  memset(obj->hostObj, 0, size);
  hashInsert(pool, obj);
  pool->count += 1;
  if (outDevObj) *outDevObj = devObj;
  if (outHostObj) *outHostObj = obj->hostObj;
  return ncclSuccess;
}
// Release the object identified by device pointer `devObj`: remove it from the
// hash table, return its slot to its page (or free the device memory on
// `stream` if it was allocated directly), and free its host shadow.
//
// A null `devObj` is a no-op. Returns ncclInternalError (after a WARN) when the
// pointer is not known to the pool, including when the pool has never
// allocated anything.
ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) {
  if (devObj == nullptr) return ncclSuccess;

  // hbits == 0 means no table has been created yet; looking up a bucket would
  // dereference a null table.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  // Walk the bucket chain keeping a pointer to the link that references the
  // current entry, so the entry can be unlinked in place.
  int b = hashBucket(pool->hbits, devObj);
  struct ncclShadowObject** pobj = &pool->table[b];
  while (true) {
    if (*pobj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if ((*pobj)->devObj == devObj) break;
    pobj = &(*pobj)->next;
  }
  struct ncclShadowObject* obj = *pobj;
  *pobj = obj->next;

  if (obj->page != nullptr) {
    // A full page is not on the pool's page list; put it back before marking
    // the slot free so it can serve allocations again.
    if (obj->page->freeMask == 0) {
      obj->page->next = pool->pages;
      pool->pages = obj->page;
    }
    int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize;
    obj->page->freeMask |= uint64_t(1)<<slot;
  } else {
    CUDACHECK(cudaFreeAsync(devObj, stream));
  }
  free(obj);
  pool->count -= 1;
  return ncclSuccess;
}
// Look up the host shadow buffer paired with device pointer `devObj` and store
// it in `*hostObj`.
//
// A null `devObj` maps to a null host pointer. Returns ncclInternalError
// (after a WARN, leaving `*hostObj` untouched) when the pointer is not known
// to the pool, including when the pool has never allocated anything.
ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) {
  if (devObj == nullptr) {
    *hostObj = nullptr;
    return ncclSuccess;
  }

  // hbits == 0 means no table has been created yet; looking up a bucket would
  // dereference a null table.
  if (pool->hbits == 0) {
    WARN("Device object does not exist in shadow pool.");
    return ncclInternalError;
  }

  int b = hashBucket(pool->hbits, devObj);
  struct ncclShadowObject* obj = pool->table[b];
  while (true) {
    if (obj == nullptr) {
      WARN("Device object does not exist in shadow pool.");
      return ncclInternalError;
    }
    if (obj->devObj == devObj) break;
    obj = obj->next;
  }
  *hostObj = obj->hostObj;
  return ncclSuccess;
}
+10 -15
Datei anzeigen
@@ -15,6 +15,7 @@
#include "signals.h" // [RCCL]
#include "param.h"
#include "ras.h"
#include <mutex>
#define BOOTSTRAP_N_CHECK_ABORT 10000
#define BOOTSTRAP_TAG_CONNECT (0x1 << 31)
@@ -86,13 +87,13 @@ struct bootstrapRootArgs {
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
static union ncclSocketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
static std::mutex bootstrapNetMutex;
NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0);
ncclResult_t bootstrapNetInit() {
if (bootstrapNetInitDone == 0) {
pthread_mutex_lock(&bootstrapNetLock);
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
if (bootstrapNetInitDone == 0) {
const char* env = ncclGetEnv("NCCL_COMM_ID");
int nIfs = 0;
@@ -100,21 +101,18 @@ ncclResult_t bootstrapNetInit() {
union ncclSocketAddress remoteAddr;
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidArgument;
}
NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
&nIfs));
if (nIfs <= 0) {
WARN("NET/Socket : No usable listening interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclSystemError;
}
} else {
NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidUsage;
}
}
@@ -124,7 +122,6 @@ ncclResult_t bootstrapNetInit() {
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line);
bootstrapNetInitDone = 1;
}
pthread_mutex_unlock(&bootstrapNetLock);
}
return ncclSuccess;
}
@@ -486,7 +483,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) {
static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
static int devOOB = -1;
if (devOOB < 0) {
pthread_mutex_lock(&bootstrapNetLock);
std::lock_guard<std::mutex> lock(bootstrapNetMutex);
if (devOOB < 0) {
const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
if (userIfEnv && strlen(userIfEnv) > 0) {
@@ -517,7 +514,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
else
WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv);
pthread_mutex_unlock(&bootstrapNetLock);
return ncclInvalidArgument;
}
} else {
@@ -530,13 +526,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
bool hasProp = res == ncclSuccess;
INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1);
}
pthread_mutex_unlock(&bootstrapNetLock);
}
*dev = devOOB;
return ncclSuccess;
}
static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE],
void** sendComm, ncclNetDeviceHandle_t** sendDevHandle,
void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) {
@@ -544,7 +539,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
do {
NCCLCHECK(checkAbort(abortFlag, &abortCounter));
if (!*sendComm)
NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle));
if (!*recvComm)
NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
} while (!*sendComm || !*recvComm);
@@ -660,7 +655,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
if (ncclParamBootstrapNetEnable()) {
// Create net interface for other ranks to contact me (all gather)
NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)));
NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)));
memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
// create socket for ring neighbor to contact me
@@ -714,7 +709,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
// accept and connect the ring network
if (ncclParamBootstrapNetEnable()) {
NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle,
NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag));
} else {
@@ -807,7 +802,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
// create a handle for the others to reach out to me
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail);
NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail);
memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE);
} else {
// create socket for ring neighbor to contact me
@@ -826,7 +821,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail);
if (ncclParamBootstrapNetEnable()) {
NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle,
NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle,
&STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle),
&STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag),
ret, fail);
+615
Datei anzeigen
@@ -0,0 +1,615 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "register_inline.h"
#include <cuda.h>
#include "rocmwrap.h"
#include "ce_coll.h"
#include "alloc.h"
// Static constant for graph synchronization: the value written to (and waited on in) the
// sync flags while a CUDA graph is being captured; outside capture the communicator's
// incrementing sequence number is used instead.
static const uint32_t GRAPH_SYNC_VALUE = 1;
// Static constants for intra-batch synchronization to improve CE collective performance with large scale
// Frequency of intra-batch synchronization: number of copies issued between two synchronizations
static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8;
// Message threshold for intra-batch synchronization, in bytes (512 MiB); compared against
// chunkBytes * numOps by the collectives below
static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024;
/*
 * Initializes the CE (copy engine) collective state of a communicator.
 *
 * Allocates one device buffer holding two arrays of per-rank uint32_t flags
 * ("ready" followed by "complete", each padded to 16 bytes), registers it as a
 * symmetric window and records the resulting pointers/offsets in comm->ceColl.
 *
 * Returns ncclSuccess, or the first error reported by a callee. If the buffer
 * was allocated but window registration did not succeed, the buffer is freed
 * again so that a failed init does not leak device memory.
 */
ncclResult_t ncclCeInit(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;
  uint8_t* ceDevBase = nullptr;
  bool allocated = false;
  bool windowRegistered = false;
  // Size of one flag array: one uint32_t per rank, rounded up to 16 bytes
  const size_t flagArrayBytes = alignUp(comm->nRanks*sizeof(uint32_t), 16);
  // Ready array + complete array
  const size_t ceDevBaseSize = flagArrayBytes * 2;
  ncclWindow_vidmem* ceWinDev = nullptr;
  ncclWindow_vidmem* ceWinDevHost = nullptr;

  // Ensure symmetric memory runtime is initialized
  NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail);

  // Allocate and register memory for the symmetric memory
  NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail);
  allocated = true;
  NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail);
  windowRegistered = true;
  NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail);

  // Get the ncclDevrWindow from the winHost field
  comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost;

  // Ready flags live at the start of the window, complete flags right after them
  comm->ceColl.baseUCSymReadyOffset = 0;
  comm->ceColl.baseUCSymComplOffset = flagArrayBytes;
  comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset;
  comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymComplOffset;
  comm->ceColl.ceSeqNum = 0;
  comm->ceColl.useCompletePtr = false;
  comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ;
  comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD;
  INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum);

exit:
  return ret;
fail:
  // The buffer is only handed over to the window once registration succeeded;
  // before that point it is exclusively ours and must be released here.
  // NOTE(review): a failure after successful registration is left untouched
  // because the proper unwind (deregistration) cannot be confirmed from here.
  if (allocated && !windowRegistered) {
    (void)ncclMemFree(ceDevBase);
  }
  goto exit;
}
/*
 * Releases the CE collective state of a communicator: drains and frees all
 * queued CE init tasks, then deregisters the sync window and frees its backing
 * memory. Returns the first error reported by deregistration or free; in that
 * case the ceColl pointers are left unchanged.
 */
ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;
  struct ncclDevrWindow* syncWin = NULL;

  // Drain whatever is still pending in ceInitTaskQueue
  for (;;) {
    if (ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) break;
    struct ncclCeInitTask* pending = ncclIntruQueueDequeue(&comm->ceInitTaskQueue);
    free(pending);
  }

  // Nothing else to do if the CE sync buffer was never set up
  if (comm->ceColl.baseUCSymReadyPtr == NULL) goto exit;

  syncWin = comm->ceColl.ceSyncWin;
  if (syncWin && syncWin->vidmem) {
    NCCLCHECKGOTO(ncclCommWindowDeregister(comm, syncWin->vidmem), ret, fail);
    NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail);
  }

  // Forget every pointer into the released window
  comm->ceColl.baseUCSymReadyPtr = NULL;
  comm->ceColl.baseUCSymComplPtr = NULL;
  comm->ceColl.ceSyncWin = NULL;

exit:
  return ret;
fail:
  goto exit;
}
/*
 * Reports whether a collective can be executed by the CE path.
 * Returns true only when the driver version can be queried, is at least 12050,
 * and `coll` is AllGather, AlltoAll, Scatter or Gather. `red` and `ty` are not
 * consulted.
 */
bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
  int driverVersion;
  if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false;

  // CE is supported in CUDA 12.5 and later
  if (driverVersion < 12050) return false;

  return coll == ncclFuncAllGather
      || coll == ncclFuncAlltoAll
      || coll == ncclFuncScatter
      || coll == ncclFuncGather;
}
/*
 * Prepares one cross-rank synchronization step using the multicast address.
 *
 * Bumps comm->ceColl.ceSeqNum, enqueues on `stream` an async copy of the sync
 * value into this rank's flag slot via the multicast pointer, and appends one
 * 32-bit wait operation per other rank to `batchParams`, advancing *opIdx.
 *
 * isComplete selects the "complete" flag array instead of the "ready" one.
 * The caller must provide room for nRanks-1 additional entries in batchParams.
 * Returns ncclSuccess or the first error from a callee.
 */
ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
  // Flag array used by this synchronization step
  uint32_t* flags = isComplete ? completePtrs : readyPtrs;

  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;

  // Source pointer is either the constant graph sync value or the sequence number
  // NOTE(review): outside capture the source is a stack local; this relies on the
  // copy having read the value before this function returns - confirm.
  void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)&currentSeq;
  // Wait value is either the constant graph sync value or the sequence number
  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;

  // Use multi-cast address as destination pointer
  void* mcDstPtr;
  void* dstPtr = (void*)&flags[comm->rank];
  size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
  NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail);

  // Write our own ready/complete flag to the multi-cast address
  CUDACHECKGOTO(cudaMemcpyAsync(
    mcDstPtr,
    srcPtr,
    sizeof(uint32_t),
    cudaMemcpyHostToDevice,
    stream), ret, fail);

  // Add local wait operations for every other rank
  for (int r = 0; r < comm->nRanks; ++r) {
    if (r == comm->rank) continue;
    const size_t slot = *opIdx;
    batchParams[slot] = {};
    // The operation tag must be set explicitly: a zeroed entry carries no valid operation
    batchParams[slot].waitValue.operation = hipStreamMemOpWaitValue32;
    batchParams[slot].waitValue.address = (CUdeviceptr)(void*)&flags[r];
    batchParams[slot].waitValue.value = waitValue;
    batchParams[slot].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
    *opIdx = slot + 1;
  }

exit:
  return ret;
fail:
  goto exit;
}
/*
 * Prepares one cross-rank synchronization step using unicast peer pointers.
 *
 * Bumps comm->ceColl.ceSeqNum and appends to `batchParams` (advancing *opIdx):
 *   1. one 32-bit write per other rank, storing the sync value into this
 *      rank's flag slot inside that peer's window, then
 *   2. one 32-bit wait per other rank on the local copy of that rank's flag.
 *
 * isComplete selects the "complete" flag array instead of the "ready" one.
 * The caller must provide room for 2*(nRanks-1) additional entries.
 * Returns ncclSuccess or the first error from ncclDevrGetLsaRankPtr.
 */
ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete,
                            hipStreamBatchMemOpParams* batchParams,
                            size_t* opIdx) {
  ncclResult_t ret = ncclSuccess;

  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
  // Flag array used by this synchronization step
  uint32_t* flags = isComplete ? completePtrs : readyPtrs;

  bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph);
  uint32_t currentSeq = ++comm->ceColl.ceSeqNum;
  // Constant value under graph capture, sequence number otherwise
  uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq;

  // Write our own ready/complete flag to remote ranks
  for (int r = 0; r < comm->nRanks; ++r) {
    if (r == comm->rank) continue;
    void* peerDstPtr;
    void* dstPtr = (void*)&flags[comm->rank];
    size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail);
    const size_t slot = *opIdx;
    batchParams[slot] = {};
    // The operation tag must be set explicitly: a zeroed entry carries no valid operation
    batchParams[slot].writeValue.operation = hipStreamMemOpWriteValue32;
    batchParams[slot].writeValue.address = (CUdeviceptr)peerDstPtr;
    batchParams[slot].writeValue.value = waitValue;
    // writeValue.flags stays 0 from the zero-initialization above
    *opIdx = slot + 1;
  }

  // Add local wait operations for every other rank
  for (int r = 0; r < comm->nRanks; ++r) {
    if (r == comm->rank) continue;
    const size_t slot = *opIdx;
    batchParams[slot] = {};
    batchParams[slot].waitValue.operation = hipStreamMemOpWaitValue32;
    batchParams[slot].waitValue.address = (CUdeviceptr)(void*)&flags[r];
    batchParams[slot].waitValue.value = waitValue;
    batchParams[slot].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ;
    *opIdx = slot + 1;
  }

exit:
  return ret;
fail:
  goto exit;
}
/*
 * Enqueues one cross-rank synchronization on `stream`.
 *
 * Builds a batch of stream memory operations (multicast variant when
 * comm->nvlsSupport is set, unicast variant otherwise), appends a reset of all
 * flags of the active array when a CUDA graph is being captured, submits the
 * batch with hipStreamBatchMemOp and finally toggles comm->ceColl.useCompletePtr
 * so the next call uses the other flag array.
 *
 * Returns ncclSuccess or the first error; the temporary batch array is freed on
 * every path. On error the useCompletePtr toggle is not performed.
 */
ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Get pointers to the ready and complete synchronization arrays
  uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
  uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;

  // Allocate enough slots for all possible ops
  // NOTE(review): assumes the per-rank constants also cover the nRanks reset
  // writes added under graph capture - confirm against their definitions.
  size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
  size_t opIdx = 0;

  // Prepare batch memory operations for synchronization
  hipStreamBatchMemOpParams* batchParams = nullptr;
  NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail);

  if (comm->nvlsSupport) {
    NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail);
  } else {
    NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail);
  }

  // For CUDA graph capture, add reset operation
  if (ncclCudaGraphValid(comm->planner.capturingGraph)) {
    uint32_t* flags = comm->ceColl.useCompletePtr ? completePtrs : readyPtrs;
    for (int i = 0; i < comm->nRanks; i++) {
      batchParams[opIdx] = {};
      // The operation tag must be set explicitly: a zeroed entry carries no valid operation
      batchParams[opIdx].writeValue.operation = hipStreamMemOpWriteValue32;
      batchParams[opIdx].writeValue.address = (CUdeviceptr)(void*)&flags[i];
      batchParams[opIdx].writeValue.value = 0;
      // writeValue.flags stays 0 from the zero-initialization above
      opIdx++;
    }
  }

  // Execute all memory operations in a single batch
  CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);

  // Toggle the flag for next call
  comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr;

exit:
  free(batchParams);  // free(NULL) is a no-op
  return ret;
fail:
  goto exit;
}
/*
 * Resets `params` to an empty batch and allocates its per-operation arrays
 * with `nRanks` entries each (plus the copy-attribute arrays when built with
 * CUDART_VERSION >= 12080).
 *
 * Returns ncclSuccess or the first allocation error. On error the arrays
 * allocated so far stay attached to `params`; arrays not yet allocated are
 * nullptr.
 */
ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
  ncclResult_t ret = ncclSuccess;

  // Start from an empty descriptor: no operations, no sync, no storage
  params->numOps = 0;
  params->intraBatchSync = false;
  params->sizes = nullptr;
  params->dsts = nullptr;
  params->srcs = nullptr;
#if CUDART_VERSION >= 12080
  params->numAttrs = 0;
  params->attrIdxs = nullptr;
  params->attrs = nullptr;
#endif

  // One slot per potential copy
  NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
#if CUDART_VERSION >= 12080
  NCCLCHECKGOTO(ncclCalloc(&params->attrs, nRanks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&params->attrIdxs, nRanks), ret, fail);
#endif

exit:
  return ret;
fail:
  goto exit;
}
/*
 * Frees the arrays owned by `params` and resets it to an empty batch.
 *
 * Every freed pointer is set back to nullptr and numOps to 0, so calling this
 * function twice on the same descriptor is harmless and no dangling pointers
 * are left behind.
 */
void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) {
  // free(NULL) is a no-op, so no guards are needed
  free(params->srcs);
  params->srcs = nullptr;
  free(params->dsts);
  params->dsts = nullptr;
  free(params->sizes);
  params->sizes = nullptr;
#if CUDART_VERSION >= 12080
  free(params->attrs);
  params->attrs = nullptr;
  free(params->attrIdxs);
  params->attrIdxs = nullptr;
#endif
  params->numOps = 0;
}
/*
 * Issues every queued copy as an individual device-to-device cudaMemcpyAsync
 * on `stream`. When params->intraBatchSync is set, a cross-rank ncclMemOpSync
 * is inserted after every comm->ceColl.intraBatchSyncFreq copies, except after
 * the last copy.
 */
static ncclResult_t ncclCeLaunchCopiesOneByOne(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  for (int i = 0; i < params->numOps; i++) {
    CUDACHECKGOTO(cudaMemcpyAsync(
      (void*)params->dsts[i],
      (void*)params->srcs[i],
      params->sizes[i],
      cudaMemcpyDeviceToDevice,
      stream), ret, fail);

    if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) {
      NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
    }
  }
exit:
  return ret;
fail:
  goto exit;
}

/*
 * Launches the copies described by `params` on `stream`.
 *
 * - No operations queued: returns ncclSuccess immediately.
 * - During CUDA graph capture, or when batched copies are unavailable: one
 *   cudaMemcpyAsync per operation.
 * - Otherwise (built with CUDART_VERSION >= 12080 and driver >= 12080):
 *   cudaMemcpyBatchAsync, split into chunks of intraBatchSyncFreq operations
 *   with a sync between chunks when params->intraBatchSync is set.
 *
 * The batched path is only selected when it was actually compiled in.
 * Previously a driver version >= 12080 combined with a build lacking
 * CUDART_VERSION >= 12080 selected an empty branch, so no copy was issued
 * while ncclSuccess was still returned.
 */
ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;
  bool capturing = false;
  bool useBatchCopy = false;
  int driverVersion = 0;

  // Check if there are any operations to perform
  if (params->numOps == 0) {
    return ncclSuccess;
  }

  // Check if we are in a CUDA graph capture
  capturing = ncclCudaGraphValid(comm->planner.capturingGraph);

  NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);

#if defined(CUDART_VERSION) && CUDART_VERSION >= 12080
  // cudaMemcpyBatchAsync is not supported during CUDA graph capture
  useBatchCopy = !capturing && driverVersion >= 12080;
#endif

  if (!useBatchCopy) {
    // Graph capture, older driver, or batched copies not compiled in
    NCCLCHECKGOTO(ncclCeLaunchCopiesOneByOne(comm, params, stream), ret, fail);
    goto exit;
  }

#if defined(CUDART_VERSION) && CUDART_VERSION >= 12080
  // For CUDA 12.8+, use batch memory copy for better performance
  params->attrs[0] = {};
  params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
  params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
  params->attrIdxs[0] = 0;
  params->numAttrs = 1;

  if (params->intraBatchSync) {
    // Break into multiple batches with sync between them
    int batchSize = comm->ceColl.intraBatchSyncFreq;
    for (int i = 0; i < params->numOps; i += batchSize) {
      int currentBatchSize = (i + batchSize <= params->numOps) ? batchSize : params->numOps - i;

#if CUDART_VERSION >= 13000
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
        params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
#else
      CUDACHECKGOTO(cudaMemcpyBatchAsync(
        &params->dsts[i], &params->srcs[i], &params->sizes[i], currentBatchSize,
        params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
#endif

      // Sync after each batch
      if (i + batchSize < params->numOps) {
        NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);
      }
    }
  } else {
    // Use single batch for all operations
#if CUDART_VERSION >= 13000
    CUDACHECKGOTO(cudaMemcpyBatchAsync(
      params->dsts, params->srcs, params->sizes, params->numOps,
      params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail);
#else
    CUDACHECKGOTO(cudaMemcpyBatchAsync(
      params->dsts, params->srcs, params->sizes, params->numOps,
      params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail);
#endif
  }
#endif

exit:
  return ret;
fail:
  goto exit;
}
/*
 * CE implementation of AllGather.
 *
 * Every rank pushes its send buffer into slot `comm->rank` of the receive
 * buffer of every rank (itself included, unless the operation is in place).
 * A cross-rank sync is enqueued before the first and after the last copy.
 * Returns ncclSuccess or the first error; the batch descriptor is always freed.
 */
ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes contributed by each rank
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
  // Slot of this rank inside the local receive buffer
  uint8_t* const localDst = (uint8_t*)args->recvBuff + comm->rank * chunkBytes;

  struct ncclCeBatchOpsParams batchOpsParams = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // All ranks must be ready before any data moves
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // Out-of-place: the local slot needs its own copy
  if (localDst != localSrc) {
    const auto slot = batchOpsParams.numOps++;
    batchOpsParams.srcs[slot] = (void*)localSrc;
    batchOpsParams.dsts[slot] = (void*)localDst;
    batchOpsParams.sizes[slot] = chunkBytes;
  }

  // Push the local chunk to every peer, starting with the next rank
  for (int step = 1; step < comm->nRanks; step++) {
    const int peer = (comm->rank + step) % comm->nRanks;
    void* peerDst;
    // Same offset inside the peer's window as our own slot has locally
    const size_t winOffset = localDst - (uint8_t*)args->recvWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, peer, &peerDst), ret, fail);
    const auto slot = batchOpsParams.numOps++;
    batchOpsParams.srcs[slot] = (void*)localSrc;
    batchOpsParams.dsts[slot] = (void*)peerDst;
    batchOpsParams.sizes[slot] = chunkBytes;
  }

  // Intra-batch sync only for many operations AND a large total message
  if (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq) {
    batchOpsParams.intraBatchSync = (chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);
  } else {
    batchOpsParams.intraBatchSync = false;
  }

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Wait until every rank has finished its transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
/*
 * CE implementation of AlltoAll.
 *
 * For every destination rank d (starting with itself), this rank copies chunk
 * d of its send buffer into slot `comm->rank` of d's receive buffer. A
 * cross-rank sync is enqueued before the first and after the last copy.
 *
 * Exactly one operation per rank is queued, so the batch descriptor is sized
 * for nRanks entries (it used to be sized nRanks*nRanks, which was never used
 * and is computed in int arithmetic).
 * Returns ncclSuccess or the first error; the batch descriptor is always freed.
 */
ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Calculate the size of data each rank sends to every other rank
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
  uint8_t* myRecvBuff = (uint8_t*)args->recvBuff;
  // Our data always lands in slot comm->rank of the destination's receive buffer
  uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
  void* peerRecvBuff;
  size_t offset;
  struct ncclCeBatchOpsParams batchOpsParams = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // Ensure all ranks are ready before starting transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  // Copy data to other ranks: send data chunk for each destination rank
  for (int r = 0; r < comm->nRanks; r++) {
    int dstRank = (comm->rank + r) % comm->nRanks;
    uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
    void* target;

    if (dstRank == comm->rank) {
      // Local copy for own data
      target = (void*)dstPtr;
    } else {
      // Remote copy: translate our slot offset into rank dstRank's receive window
      offset = dstPtr - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail);
      target = peerRecvBuff;
    }

    batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
    batchOpsParams.dsts[batchOpsParams.numOps] = target;
    batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes;
    batchOpsParams.numOps++;
  }

  // Check if we need to perform intra-batch synchronization
  batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold);

  // Launch the batch operations
  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Ensure all transfers are complete across all ranks
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
/*
 * CE implementation of Scatter.
 *
 * Only the root queues copies: chunk d of its send buffer goes to rank d's
 * receive buffer (and to its own receive buffer unless the call is in place).
 * Non-root ranks queue nothing but still take part in the sync before and
 * after the transfer phase.
 * Returns ncclSuccess or the first error; the batch descriptor is always freed.
 */
ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes delivered to each rank
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* const sendBase = (uint8_t*)args->sendBuff;
  uint8_t* const recvBase = (uint8_t*)args->recvBuff;
  const int rootRank = args->rootRank;

  struct ncclCeBatchOpsParams batchOpsParams = {};
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail);

  // All ranks must be ready before any data moves
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank == rootRank) {
    // In place means the receive buffer aliases root's own chunk of the send buffer
    const bool isInPlace = (recvBase == sendBase + comm->rank * chunkBytes);

    // Root's own chunk only needs a copy when out of place
    if (!isInPlace) {
      const auto slot = batchOpsParams.numOps++;
      batchOpsParams.srcs[slot] = (void*)(sendBase + comm->rank * chunkBytes);
      batchOpsParams.dsts[slot] = (void*)recvBase;
      batchOpsParams.sizes[slot] = chunkBytes;
    }

    // Hand out the remaining chunks, starting with the rank after root
    for (int step = 1; step < comm->nRanks; step++) {
      const int dstRank = (comm->rank + step) % comm->nRanks;
      uint8_t* const chunkSrc = sendBase + dstRank * chunkBytes;
      // NOTE(review): in place, the destination is root's recvBuff plus
      // dstRank*chunkBytes - confirm this equals the peer's receive offset
      // when root is not rank 0.
      uint8_t* const chunkDst = isInPlace ? recvBase + dstRank * chunkBytes : recvBase;
      void* peerDst;
      const size_t winOffset = chunkDst - (uint8_t*)args->recvWin->userPtr;
      NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, dstRank, &peerDst), ret, fail);
      const auto slot = batchOpsParams.numOps++;
      batchOpsParams.srcs[slot] = (void*)chunkSrc;
      batchOpsParams.dsts[slot] = (void*)peerDst;
      batchOpsParams.sizes[slot] = chunkBytes;
    }
  }
  // Non-root ranks have nothing to copy

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Wait until every rank has finished its transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
/*
 * CE implementation of Gather.
 *
 * Each rank queues at most one copy: non-root ranks push their send buffer
 * into slot `comm->rank` of the root's receive buffer, the root copies its own
 * data into its slot unless that slot already is the send buffer. A cross-rank
 * sync is enqueued before and after the transfer.
 * Returns ncclSuccess or the first error; the batch descriptor is always freed.
 */
ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
  ncclResult_t ret = ncclSuccess;

  // Bytes contributed by each rank
  const size_t chunkBytes = args->nElts * args->eltSize;
  uint8_t* const localSrc = (uint8_t*)args->sendBuff;
  // Slot of this rank, expressed relative to the local receive buffer
  uint8_t* const mySlot = (uint8_t*)args->recvBuff + comm->rank * chunkBytes;
  const int rootRank = args->rootRank;

  struct ncclCeBatchOpsParams batchOpsParams = {};
  // A single slot suffices: every rank issues at most one copy
  NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, 1), ret, fail);

  // All ranks must be ready before any data moves
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

  if (comm->rank != rootRank) {
    // Translate our slot offset into the root's receive window
    void* rootDst;
    const size_t winOffset = mySlot - (uint8_t*)args->recvWin->userPtr;
    NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, winOffset, rootRank, &rootDst), ret, fail);
    const auto slot = batchOpsParams.numOps++;
    batchOpsParams.srcs[slot] = (void*)localSrc;
    batchOpsParams.dsts[slot] = (void*)rootDst;
    batchOpsParams.sizes[slot] = chunkBytes;
  } else if (localSrc != mySlot) {
    // Root, out of place: move own data into its slot
    const auto slot = batchOpsParams.numOps++;
    batchOpsParams.srcs[slot] = (void*)localSrc;
    batchOpsParams.dsts[slot] = (void*)mySlot;
    batchOpsParams.sizes[slot] = chunkBytes;
  }

  NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail);

  // Wait until every rank has finished its transfers
  NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail);

exit:
  ncclCeFreeBatchOpsParams(&batchOpsParams);
  return ret;
fail:
  goto exit;
}
/*
 * Dispatches the CE collective described by plan->ceCollArgs onto the first
 * planner stream of the communicator.
 * Returns the result of the selected collective, or ncclInvalidUsage when the
 * function is not one of AllGather, AlltoAll, Scatter or Gather.
 */
ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  ncclResult_t ret = ncclSuccess;
  cudaStream_t launchStream = comm->planner.streams->stream;
  struct ncclCeCollArgs* collArgs = plan->ceCollArgs;
  const auto func = collArgs->func;

  if (func == ncclFuncAllGather) {
    NCCLCHECKGOTO(ncclCeAllGather(comm, collArgs, launchStream), ret, fail);
  } else if (func == ncclFuncAlltoAll) {
    NCCLCHECKGOTO(ncclCeAlltoAll(comm, collArgs, launchStream), ret, fail);
  } else if (func == ncclFuncScatter) {
    NCCLCHECKGOTO(ncclCeScatter(comm, collArgs, launchStream), ret, fail);
  } else if (func == ncclFuncGather) {
    NCCLCHECKGOTO(ncclCeGather(comm, collArgs, launchStream), ret, fail);
  } else {
    // No CE implementation for this collective
    ret = ncclInvalidUsage;
  }

exit:
  return ret;
fail:
  goto exit;
}
+118 -166
Datei anzeigen
@@ -23,10 +23,13 @@ const char* ncclFuncToString(ncclFunc_t fn) {
switch (fn) {
case ncclFuncAllGather: return "AllGather";
case ncclFuncAllReduce: return "AllReduce";
case ncclFuncAlltoAll: return "AlltoAll";
case ncclFuncBroadcast: return "Broadcast";
case ncclFuncGather: return "Gather";
case ncclFuncRecv: return "Recv";
case ncclFuncReduce: return "Reduce";
case ncclFuncReduceScatter: return "ReduceScatter";
case ncclFuncScatter: return "Scatter";
case ncclFuncSendRecv: return "SendRecv";
case ncclFuncSend: return "Send";
default: return "Invalid";
@@ -85,7 +88,6 @@ const char* ncclProtoToString(int proto) {
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
@@ -148,10 +150,101 @@ ncclResult_t ncclAllGather_impl(const void* sendbuff, void* recvbuff, size_t sen
}
}
RCCL_PARAM(AlltoAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream);
/*
 * Implementation of ncclAlltoAll: every rank sends `count` elements of
 * `datatype` to every rank.
 *
 * Path selection, in order:
 *   1. MSCCL, when mscclAvailable(comm) and this is not a call made by MSCCL.
 *   2. Pivot AlltoAll, when the topology enables it, enough channels exist,
 *      the per-rank payload is at least 744 KiB and not 4-byte aligned only,
 *      and the RCCL_ALL_TO_ALL_PIVOT_ENABLE parameter is set.
 *   3. GDA AlltoAll (only when built with ENABLE_ROCSHMEM), for total message
 *      sizes up to comm->rocshmemThreshold.
 *   4. The regular AlltoAll collective.
 * Returns the result of the selected enqueue call.
 */
ncclResult_t ncclAlltoAll_impl(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));

  // Record only when this is not a call made by MSCCL
  // (presumably the re-entrant call on MSCCL fallback - TODO confirm)
  if (!mscclIsCaller())
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
      count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
  }

  // Bytes exchanged with each rank, and the lowest set bit of that size
  size_t rankOffset = count * ncclTypeSize(datatype);
  size_t rankAlign = rankOffset & ((~rankOffset) + 1);

  struct ncclInfo info;
  if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
      rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAlltoAllPivotEnable()) {
    info = { ncclFuncAlltoAllPivot, "AlltoAllPivot",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
  } else {
#ifdef ENABLE_ROCSHMEM
    // Total bytes sent by this rank; only needed for the GDA threshold check
    size_t msgSize = rankOffset * comm->nRanks;
    if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
      // Distinct name: must not shadow the outer `info`
      struct ncclInfo gdaInfo = { ncclFuncAllToAllGda, "AllToAllGda",
        sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
        ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
      return ncclEnqueueCheck(&gdaInfo);
    }
#endif // ENABLE_ROCSHMEM
    info = { ncclFuncAlltoAll, "AlltoAll",
      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
  }
  return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclAlltoAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
/*
 * Implementation of ncclAlltoAllv: variable-sized all-to-all exchange.
 *
 * sendcounts[r]/sdispls[r] give the element count and element displacement of
 * the data sent to rank r; recvcounts[r]/rdispls[r] do the same for data
 * received from rank r. Uses MSCCL when available, otherwise one grouped
 * ncclSend/ncclRecv pair per rank.
 *
 * Once the group has been opened it is always closed again, and the Recorder
 * skip state is always restored, even when a send/recv call fails. The first
 * failure is the one returned.
 */
ncclResult_t ncclAlltoAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
    void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  NVTX3_FUNC_WITH_PARAMS(AlltoAllv, NcclNvtxParamsAlltoAllv,
    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
      recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));

  ncclResult_t ret = ncclSuccess;
  ncclResult_t groupEndRet = ncclSuccess;
  int nRanks;

  // Record only when this is not a call made by MSCCL
  // (presumably the re-entrant call on MSCCL fallback - TODO confirm)
  if (!mscclIsCaller())
  {
    NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
  }

  if (mscclAvailable(comm) && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
      0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
  }

  NCCLCHECK(ncclCommCount(comm, &nRanks));

  // Keep the individual send/recv calls below out of the recording
  if (!mscclIsCaller()) Recorder::instance().skip(true);

  NCCLCHECKGOTO(ncclGroupStart(), ret, restore);
  for (int r=0; r<nRanks; r++) {
    NCCLCHECKGOTO(ncclSend(
        ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
        sendcounts[r],
        datatype,
        r,
        comm,
        stream), ret, endGroup);
    NCCLCHECKGOTO(ncclRecv(
        ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
        recvcounts[r],
        datatype,
        r,
        comm,
        stream), ret, endGroup);
  }
  // No call failed hard; report success for the enqueue phase as before
  ret = ncclSuccess;

endGroup:
  // Close the group on every path so it is never left open
  groupEndRet = ncclGroupEnd();
  if (ret == ncclSuccess) {
    NCCLCHECKGOTO(groupEndRet, ret, restore);
    ret = ncclSuccess;
  }

restore:
  if (!mscclIsCaller()) Recorder::instance().skip(false);
  return ret;
}
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
@@ -202,116 +295,8 @@ ncclResult_t ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, si
return ncclEnqueueCheck(&info);
}
RCCL_PARAM(AllToAllPivotEnable, "ALL_TO_ALL_PIVOT_ENABLE", 0);
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream);
// Implementation of ncclAllToAll: each rank exchanges `count` elements of
// `datatype` with every rank. Tries, in order: MSCCL, the pivot AlltoAll
// collective, the GDA AlltoAll collective (ENABLE_ROCSHMEM builds only), and
// finally one grouped ncclSend/ncclRecv pair per rank.
ncclResult_t ncclAllToAll_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllToAll, NcclNvtxParamsAllToAll,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrAllToAll, sendbuff, recvbuff, count, datatype, comm, stream));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
}
// Bytes exchanged with each rank
size_t rankOffset = count * ncclTypeSize(datatype);
// Lowest set bit of rankOffset
size_t rankAlign = rankOffset & ((~rankOffset) + 1);
// Total bytes sent by this rank; only read in ENABLE_ROCSHMEM builds
size_t msgSize = count * ncclTypeSize(datatype) * comm->nRanks;
// Determine Pivot A2A support now that we know number of channels
if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
rankOffset >= 744 * 1024 && rankAlign != 4 && rcclParamAllToAllPivotEnable()) {
struct ncclInfo info = { ncclFuncAllToAllPivot, "AllToAllPivot",
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
return ncclEnqueueCheck(&info);
} else {
#ifdef ENABLE_ROCSHMEM
if (rcclUseAllToAllGda(comm) && msgSize <= comm->rocshmemThreshold) {
struct ncclInfo info = { ncclFuncAllToAllGda, "AllToAllGda",
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream,
ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS, nullptr };
return ncclEnqueueCheck(&info);
}
#endif
int nRanks;
//comm->isA2a = 0;
NCCLCHECK(ncclCommCount(comm, &nRanks));
// Nothing to exchange
if (count == 0) return ncclSuccess;
// Keep the individual send/recv calls below out of the recording
if (!mscclIsCaller()) Recorder::instance().skip(true);
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nRanks; r++) {
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
}
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
}
}
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
// Implementation of ncclAllToAllv: variable-sized all-to-all exchange.
// sendcounts[r]/sdispls[r] are the element count and element displacement of
// the data sent to rank r; recvcounts[r]/rdispls[r] likewise for data received
// from rank r. Uses MSCCL when available, otherwise one grouped
// ncclSend/ncclRecv pair per rank.
ncclResult_t ncclAllToAllv_impl(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(AllToAllv, NcclNvtxParamsAllToAllv,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcounts[comm->rank] * ncclTypeSize(datatype),
recvcounts[comm->rank] * ncclTypeSize(datatype), datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrAllToAllv, sendbuff, recvbuff, 0, datatype, comm, stream, -1, sendcounts, sdispls, recvcounts, rdispls));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
// Keep the individual send/recv calls below out of the recording
if (!mscclIsCaller()) Recorder::instance().skip(true);
// NOTE(review): every NCCLCHECK below returns early on failure, which skips
// both ncclGroupEnd() and skip(false) - confirm whether that is intended.
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nRanks; r++) {
// Displacements are in elements, hence the scaling by the element size
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
sendcounts[r],
datatype,
r,
comm,
stream));
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
recvcounts[r],
datatype,
r,
comm,
stream));
}
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
@@ -343,46 +328,32 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclGather_impl(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype), root, datatype));
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, sendcount, datatype, comm, stream, root));
NCCLCHECK(Recorder::instance().record(rrGather, sendbuff, recvbuff, count, datatype, comm, stream, root));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
count, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (!mscclIsCaller()) Recorder::instance().skip(true);
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
struct ncclInfo info = { ncclFuncGather, "Gather",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
GATHER_CHUNKSTEPS, GATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
@@ -408,8 +379,6 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
@@ -433,48 +402,32 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclScatter_impl(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter,
NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), root, datatype));
NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, datatype));
if (!mscclIsCaller()) // when msccl falls back to
{
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, recvcount, datatype, comm, stream, root));
NCCLCHECK(Recorder::instance().record(rrScatter, sendbuff, recvbuff, count, datatype, comm, stream, root));
}
if (mscclAvailable(comm) && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
count, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (!mscclIsCaller()) Recorder::instance().skip(true);
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
if (!mscclIsCaller()) Recorder::instance().skip(false);
return ncclSuccess;
struct ncclInfo info = { ncclFuncScatter, "Scatter",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
}
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
@@ -500,7 +453,6 @@ ncclResult_t ncclSend_impl(const void* sendbuff, size_t count, ncclDataType_t da
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv_impl(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
+2 -2
Datei anzeigen
@@ -28,7 +28,7 @@ static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
static uint64_t ncclDebugMask = 0;
uint64_t ncclDebugMask = 0;
FILE *ncclDebugFile = stdout;
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
static std::chrono::steady_clock::time_point ncclEpoch;
@@ -419,4 +419,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
va_end(vargs);
pthread_setname_np(thread, threadName);
#endif
}
}
Datei-Diff unterdrückt, da er zu groß ist Diff laden
+60
Datei anzeigen
@@ -0,0 +1,60 @@
# Run the generator scripts once at configure time to learn which source files
# they emit. Each script's standard output is captured, stripped of surrounding
# whitespace, and every entry is prefixed with the directory it was generated
# into.
# NOTE(review): list(TRANSFORM) only prefixes every file if the scripts print
# the names as a ';'-separated CMake list - confirm against generate.py.
#
# The exit status of each run is checked: without this, a failing generator
# leaves the list empty and the problem only surfaces later as a confusing
# missing-sources error.
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}"
  OUTPUT_VARIABLE files
  RESULT_VARIABLE nccl_device_generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT nccl_device_generate_result EQUAL 0)
  message(FATAL_ERROR
    "${CMAKE_CURRENT_SOURCE_DIR}/generate.py failed at configure time (result: ${nccl_device_generate_result})")
endif()
string(STRIP "${files}" files)
list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/)

execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}"
  OUTPUT_VARIABLE symmetric_files
  RESULT_VARIABLE nccl_device_symmetric_generate_result
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if(NOT nccl_device_symmetric_generate_result EQUAL 0)
  message(FATAL_ERROR
    "${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py failed at configure time (result: ${nccl_device_symmetric_generate_result})")
endif()
string(STRIP "${symmetric_files}" symmetric_files)
list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/)
# Build-time regeneration rules. Each rule declares the generated sources as
# its outputs and depends on the generator script that produces them, so the
# sources are rebuilt whenever that script changes.
# NOTE(review): the same file lists are passed as both OUTPUT and BYPRODUCTS;
# confirm that the generators in use accept the duplication.

# Device sources: generate.py -> <binary dir>/gensrc
add_custom_command(
  COMMENT "Generating device source files"
  OUTPUT ${files}
  BYPRODUCTS ${files}
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMAND
    ${Python3_EXECUTABLE}
    ${CMAKE_CURRENT_SOURCE_DIR}/generate.py
    ${CMAKE_CURRENT_BINARY_DIR}/gensrc
    "${ONLY_FUNCS}"
)

# Symmetric device sources: symmetric/generate.py -> <binary dir>/gensrc/symmetric
add_custom_command(
  COMMENT "Generating symmetric device source files"
  OUTPUT ${symmetric_files}
  BYPRODUCTS ${symmetric_files}
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  COMMAND
    ${Python3_EXECUTABLE}
    ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py
    ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric
    "${ONLY_FUNCS}"
)
# Object library holding the device code: the generated sources from both
# generator scripts plus the two hand-written translation units.
add_library(nccl_device OBJECT
  ${CMAKE_CURRENT_SOURCE_DIR}/common.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu
  ${files}
  ${symmetric_files}
)

# nccl_header must be built before any nccl_device source is compiled.
add_dependencies(nccl_device nccl_header)

# Build with separable compilation and resolve device symbols for this target.
set_property(TARGET nccl_device PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET nccl_device PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

# Header search path, exported to consumers of the target (PUBLIC).
# NOTE(review): if CUDAToolkit_INCLUDE_DIRS holds more than one directory, the
# "/cccl" suffix is appended to the last entry only - confirm that is intended.
target_include_directories(nccl_device
  PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_SOURCE_DIR}/src/include
    ${CMAKE_SOURCE_DIR}/src/include/plugin
    ${CMAKE_BINARY_DIR}/include
    ${CUDAToolkit_INCLUDE_DIRS}
    ${CUDAToolkit_INCLUDE_DIRS}/cccl
)
+6 -2
Datei anzeigen
@@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device
MANIFEST := $(OBJDIR)/manifest
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
CXXFLAGS += $(INCFLAGS)
@@ -47,7 +47,11 @@ endif
define COMPILE_SYM
@$(SAY) "Compiling" $2;\
mkdir -p $(dir $1);\
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
if [[ -n "$3" ]]; then\
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\
else\
touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\
fi
endef
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
+1 -1
Datei anzeigen
@@ -75,7 +75,7 @@ namespace {
}
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
struct RunWorkColl<ncclFuncAlltoAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int nThreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
runRing<T, RedOp, Proto>(tid, nThreads, work);
+8 -8
Datei anzeigen
@@ -150,7 +150,7 @@ struct ncclShmemData {
struct ncclDevKernelArgs args;
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclKernelComm comm;
alignas(16) struct ncclDevChannel channel;
#ifdef ENABLE_WARP_SPEED
int warpComm;
@@ -502,7 +502,7 @@ __device__ __forceinline__ void profiler(int action) {
ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
}
ncclShmem.channel.workCounter += ncclShmem.nWorks;
if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
}
}
}
@@ -579,7 +579,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
/* set abort flag to 0 */
if (tid == 0) {
ncclShmem.aborted = 0;
ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
}
// Use first 2 warps to load comm and channel, and remaining load work batch.
@@ -587,14 +587,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
case 0:
{ void* dst = &ncclShmem.comm;
void* src = ncclShmem.args.comm;
int bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
int bytes = sizeof(ncclKernelComm);
static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn.");
copyToShmem16(tid, dst, src, bytes);
} break;
case 1:
{ // Get address of channel without incurring indirect load from ncclDevComm::channels
{ // Get address of channel without incurring indirect load from ncclKernelComm::channels
void* dst = &ncclShmem.channel;
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
@@ -641,7 +641,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
__syncthreads();
if(ncclShmem.warpChannelId[localWarpId] >= 0) {
void* dst = &ncclShmem.warpChannel[localWarpId];
void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.warpChannelId[localWarpId]];
int bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
// assert((tid-localWarpId*WARP_SIZE) >= 0 && (tid-localWarpId*WARP_SIZE) < WARP_SIZE);
+17 -13
Datei anzeigen
@@ -3,9 +3,10 @@ import os
import sys
import subprocess
from dataclasses import dataclass
import shutil
# Order of colls, redops, tys, protos, algos must match src/include/device.h
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AllToAllPivot", "AllToAllGda"]
all_colls = ["Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "", "", "AlltoAllPivot", "AllToAllGda"]
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
all_protos = ["LL","LL128","SIMPLE"]
@@ -24,8 +25,11 @@ gensrc = sys.argv[1]
if os.path.exists(gensrc):
for name in os.listdir(gensrc):
os.remove(os.path.join(gensrc, name))
#os.truncate(os.path.join(gensrc, name), 0)
path = os.path.join(gensrc, name)
if os.path.isfile(path):
os.remove(path)
elif os.path.isdir(path):
shutil.rmtree(path)
else:
os.makedirs(gensrc)
@@ -64,7 +68,7 @@ else:
# make ONLY_FUNCS="AllReduce RING SIMPLE * *|ReduceScatter RING LL * f32"
# --- or ---
# make ONLY_FUNCS="AllReduce RING SIMPLE|ReduceScatter RING LL * f32"
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AllToAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
# make ONLY_FUNCS="AllReduce RING/TREE LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|AllGather RING LL/SIMPLE Sum i8|AlltoAllPivot RING SIMPLE Sum i8|Broadcast RING LL/SIMPLE Sum i8|Reduce RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|ReduceScatter RING LL/SIMPLE Sum/MinMax i8/u8/f16/f32/f64/bf16/f8e4m3/f8e5m2|SendRecv RING SIMPLE Sum i8"
# Paste all non-None arguments together with `sep`.
def paste(sep, *args):
@@ -79,14 +83,14 @@ func_pattern = sys.argv[6:7]
if func_pattern and func_pattern[0]:
func_pattern = func_pattern[0]
else:
func_pattern = "AllGather|AllReduce|AllToAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"
func_pattern = "AllGather|AllReduce|AlltoAllPivot|AllToAllGda|Broadcast|Reduce|ReduceScatter|SendRecv"
################################################################################
algos_of_coll = {
"AllGather": ["RING", "PAT"],
"AllReduce": ["RING", "TREE"],
"AllToAllPivot": ["RING"],
"AlltoAllPivot": ["RING"],
"AllToAllGda": ["RING"],
"Broadcast": ["RING"],
"Reduce": ["RING"],
@@ -97,7 +101,7 @@ algos_of_coll = {
protos_of_coll = {
"AllGather": all_protos,
"AllReduce": all_protos,
"AllToAllPivot": ["SIMPLE"],
"AlltoAllPivot": ["SIMPLE"],
"AllToAllGda": ["SIMPLE"],
"Broadcast": all_protos,
"Reduce": all_protos,
@@ -108,7 +112,7 @@ protos_of_coll = {
redops_of_coll = {
"AllGather": ["Sum"],
"AllReduce": all_redops,
"AllToAllPivot": ["Sum"],
"AlltoAllPivot": ["Sum"],
"AllToAllGda": ["Sum"],
"Broadcast": ["Sum"],
"Reduce": all_redops,
@@ -119,7 +123,7 @@ redops_of_coll = {
tys_of_coll = {
"AllGather": ["i8"],
"AllReduce": all_tys,
"AllToAllPivot": ["i8"],
"AlltoAllPivot": ["i8"],
"AllToAllGda": ["i8"],
"Broadcast": ["i8"],
"Reduce": all_tys,
@@ -130,7 +134,7 @@ tys_of_coll = {
acc_of_coll = {
"AllGather": ["0"],
"AllReduce": all_accs,
"AllToAllPivot": ["0"],
"AlltoAllPivot": ["0"],
"AllToAllGda": ["0"],
"Broadcast": ["0"],
"Reduce": ["0"],
@@ -141,7 +145,7 @@ acc_of_coll = {
pipelines_of_coll = {
"AllGather": ["0"],
"AllReduce": all_pipelines,
"AllToAllPivot": ["0"],
"AlltoAllPivot": ["0"],
"AllToAllGda": ["0"],
"Broadcast": ["0"],
"Reduce": all_pipelines,
@@ -153,7 +157,7 @@ pipelined_types = ["bf16"]
coll_camel_to_lower = {
"AllGather": "all_gather",
"AllReduce": "all_reduce",
"AllToAllPivot": "alltoall_pivot",
"AlltoAllPivot": "alltoall_pivot",
"AllToAllGda": "alltoall_gda",
"Broadcast": "broadcast",
"Reduce": "reduce",
@@ -510,7 +514,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
)
if fn.coll == "Broadcast":
key = ((coll_idx & 0x3F) | ((proto_idx & 0x3F) << 8))
if fn.coll in ["SendRecv", "AllToAllPivot", "AllToAllGda"]:
if fn.coll in ["SendRecv", "AlltoAllPivot", "AllToAllGda"]:
key = ((coll_idx & 0x3F))
out(f' {{{key}, {fn_id}}}, {comment}\n')
+7 -7
Datei anzeigen
@@ -93,7 +93,7 @@ __device__ __forceinline__ static void mscclReduce(int c, int numReductions, int
template<typename T, typename RedOp, typename Proto, bool fullOps>
__device__ __forceinline__ void mscclRunInterpreter(
struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int nthreads = MSCCL_MAX_NTHREADS;
@@ -120,12 +120,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
bytes = sizeof(ncclKernelComm);
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
// Get address of channel without incurring indirect load from ncclKernelComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
src = &((ncclKernelCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
break;
case 2:
@@ -372,13 +372,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
}
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclKernelComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS, 0, 2>, fullOps>(comm, algo, work); \
}
+128 -124
Datei anzeigen
@@ -1,35 +1,36 @@
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "symmetric.h"
#include "sym_kernels.h"
#include "symmetric/kernel.h"
#include "symmetric/primitives.h"
template<int BytePerPack, int UnrollPacks, int UnrollPeers>
static __device__ void bcastDeep(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
char* inputHere, char* outputRank0, bool inPlace, int nIters
ncclSymkArgsHandler const& handler, int tn, int t,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
ncclSymPtr<char> input, ncclSymPtr<char> output, bool inPlace, int nIters
) {
using Pack = BytePack<BytePerPack>;
int wn = tn/WARP_SIZE;
int w = t/WARP_SIZE;
int lane = t%WARP_SIZE;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack tmp[UnrollPacks];
nIters -= w;
if (0 < nIters) {
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp[u] = inpHere[u*WARP_SIZE];
tmp[u] = inpPacks[u*WARP_SIZE];
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
if (0 < nIters) {
while (true) {
@@ -47,21 +48,21 @@ static __device__ void bcastDeep(
if (partial && dr == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u];
}
if (++r == nRanks) r = 0;
}
}
}
inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
nIters -= wn;
if (nIters <= 0) break;
// Load data for next iteration.
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp[u] = inpHere[u*WARP_SIZE];
tmp[u] = inpPacks[u*WARP_SIZE];
}
}
}
@@ -69,18 +70,17 @@ static __device__ void bcastDeep(
template<int UnrollPeers, typename T>
static __device__ void bcastEnds(
ncclSymPrims& prim, int tn, int t,
T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
ncclSymkArgsHandler const& handler, int tn, int t,
ncclSymPtr<T> input, ncclSymPtr<T> output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
) {
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
BytePack<sizeof(T)>* inpPacks = (BytePack<sizeof(T)>*)input.localPtr();
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
#pragma unroll 1
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
BytePack<sizeof(T)> tmp = inpHere[elt];
BytePack<sizeof(T)> tmp = inpPacks[elt];
int dr = inPlace ? 1 : 0;
int r = rank + dr;
if (r == nRanks) r = 0;
@@ -88,14 +88,14 @@ static __device__ void bcastEnds(
for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
#pragma unroll UnrollPeers
for (int u=0; u < UnrollPeers; u++) {
*add4G(outRank0+elt, r*stride4G) = tmp;
outPacks.lsaPtr(r)[elt] = tmp;
if (++r == nRanks) r = 0;
}
}
#pragma unroll UnrollPeers
for (int u=0; u < UnrollPeers; u++) {
if (dr+u == nRanks) break;
*add4G(outRank0+elt, r*stride4G) = tmp;
outPacks.lsaPtr(r)[elt] = tmp;
if (++r == nRanks) r = 0;
}
}
@@ -103,95 +103,95 @@ static __device__ void bcastEnds(
template<typename T>
static __device__ void bcast(
ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
bool inPlace = (input == output);
// Mpve to rank=0
output = prim.peerPtr(0, output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
uint32_t nPreBytes = (128u - inputUptr)%128u;
uint32_t nPreBytes = (16 - input.offset)%16;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t cursor = nPreBytes;
constexpr int MinWarpPerBlock = 4;
if ((inputUptr-outputUptr)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
if ((input.offset - output.offset)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
prim, tn, t, waitNeeded,
(char*)input + cursor, (char*)output + cursor, inPlace,
chunks*MinWarpPerBlock
handler, tn, t, waitNeeded, bar,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
inPlace, chunks*MinWarpPerBlock
);
cursor = cursorAfter;
waitNeeded = false;
}
}
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
prim, tn, t, waitNeeded,
(char*)input + cursor, (char*)output + cursor, inPlace,
chunks*MinWarpPerBlock
handler, tn, t, waitNeeded, bar,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
inPlace, chunks*MinWarpPerBlock
);
cursor = cursorAfter;
waitNeeded = false;
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
constexpr int UnrollPeers = 8;
size_t nSufElts = (nBytes-cursor)/sizeof(T);
bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
bcastEnds<UnrollPeers>(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
}
__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
int const& rank = prim.rank;
__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
};
int const& rank = handler.comm.rank;
// Threads numbered over rank.
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int btn = prim.nBlocks*blockDim.x;
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bool waitNeeded = true;
handler.forEachWork<char>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
// Threads numbered over rank.
int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int btn = nBlocks*blockDim.x;
bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts);
bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
waitNeeded = false;
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename T>
static __device__ void bcastMultimem(
ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
// Move output to multimem
output = prim.multimemPtr(output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
uint32_t nPreBytes = (16-inputUptr)%16;
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input.localPtr());
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output.multimemPtr(handler.comm.lsaMultimem));
uint32_t nPreBytes = (16 - input.offset)%16;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t nSufBytes;
@@ -230,51 +230,52 @@ static __device__ void bcastMultimem(
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
multimem_st_global(outputUptr + cursor, val);
cursor += tn*sizeof(T);
}
}
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
int const& rank = prim.rank;
__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar(
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
);
int const& rank = handler.comm.rank;
char* input = args->input;
char* output = args->output;
size_t bytes = args->nElts;
// Round robin memory to blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = prim.nBlocks*blockDim.x;
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
handler.forEachWork<char>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
// Round robin memory to blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = nBlocks*blockDim.x;
bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes);
bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts);
}
);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename EltType>
static __device__ void allgather_LL_body(
ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
) {
using Pack = BytePack<8>;
constexpr int EltPerPack = 8/sizeof(EltType);
ncclCoopCta cta;
int rank = prim.rank;
int nRanks = prim.nRanks;
constexpr int tn = ncclSymMaxThreads;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
int t = threadIdx.x;
constexpr int tn = ncclSymkMaxThreads;
#pragma unroll 1
while (0 < nElts) {
int nIterPacks = min(nPacks, tn);
if (t < nIterPacks) {
Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
lla2a.bcast(/*slot=*/nIterPacks*rank + t, x);
}
int tn_div_nPacks = tn/nIterPacks;
@@ -287,7 +288,7 @@ static __device__ void allgather_LL_body(
#pragma unroll 1
for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
Pack got[Unroll];
prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
lla2a.template recvUnrolled<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
#pragma unroll
for (int u=0; u < Unroll; u++) {
storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
@@ -302,7 +303,7 @@ static __device__ void allgather_LL_body(
if (i + n*tn < nRanks*nIterPacks) n += 1;
if (n != 0) {
Pack got[Unroll];
prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got);
#pragma unroll
for (int u=0; u < Unroll; u++) {
if (u != 0 && u == n) break;
@@ -316,7 +317,7 @@ static __device__ void allgather_LL_body(
// The non-unrolled but "obviously correct" implementation for reference.
#pragma unroll 1
for (int i = t; i < nRanks*nIterPacks; i += tn) {
Pack got = prim.template recvLL<Pack>(i);
Pack got = lla2a.template recv<Pack>(i);
storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
peer += tn_div_nPacks;
pack += tn_mod_nPacks;
@@ -324,7 +325,7 @@ static __device__ void allgather_LL_body(
}
#endif
prim.endLL(cta);
lla2a.endEpoch(ncclCoopCta());
input += tn*EltPerPack;
output += tn*EltPerPack;
@@ -333,38 +334,41 @@ static __device__ void allgather_LL_body(
}
}
static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
ncclSymkArgsHandler handler{args};
ncclLLA2ASession<ncclCoopCta> lla2a(
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
);
using Pack = BytePack<8>;
constexpr int BytePerPack = 8;
int nElts = args->nElts;
int nPacks = divUp(nElts, BytePerPack);
uint32_t nPackPerBlock, nPackModBlock;
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
int nBlockPacks = blockPackEnd - blockPackBegin;
int nBlockElts = nElts - blockPackBegin*BytePerPack;
nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);
char* blockInput = args->input + blockPackBegin*BytePerPack;
char* blockOutput = args->output + blockPackBegin*BytePerPack;
handler.singleWork<char>(
[&]__device__(int nElts, int nAllElts,
ncclSymPtr<char> input, ncclSymPtr<char> output) {
int nPacks = divUp(nElts, BytePerPack);
uint32_t lowBits = args->nElts;
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
if (__builtin_expect(lowBits%8 == 0, true)) {
// NOTE: Specializing for 8-byte alignment in one case helps at size=65K: 8.9us vs 5.6us
allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
} else {
allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
}
char* blockInput = input.localPtr();
char* blockOutput = output.localPtr();
uint32_t lowBits = nElts;
lowBits |= (uintptr_t)blockInput;
lowBits |= (uintptr_t)blockOutput;
if (__builtin_expect(lowBits%8 == 0, true)) {
// NOTE: Specializing for 8-byte alignment in one case helps at size=65K: 8.9us vs 5.6us
allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput,
nElts/8, nPacks, nAllElts/8);
} else {
allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts);
}
}
);
}
__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false);
__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false);
}
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true);
__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true);
}
+182 -147
Datei anzeigen
@@ -1,38 +1,41 @@
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
#include "symmetric.h"
#include "sym_kernels.h"
#include "nccl_device.h"
#include "symmetric/kernel.h"
#include "symmetric/primitives.h"
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
static __device__ __forceinline__ void allreduceDeep(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, char* inputRank0, char* outputRank0, int32_t nIters
ncclSymkArgsHandler const& handler, int tn, int t,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
) {
using Pack = BytePack<BytePerPack>;
using Acc = typename Red::EltType;
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
ncclTeam world = ncclTeamWorld(handler.comm);
int wn = tn/WARP_SIZE;
int w = t/WARP_SIZE;
int lane = t%WARP_SIZE;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack acc0[UnrollPacks];
nIters -= w;
if (0 < nIters) {
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
if (0 < nIters) {
while (true) {
@@ -42,7 +45,7 @@ static __device__ __forceinline__ void allreduceDeep(
{ Pack tmp1[UnrollPacks];
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
@@ -67,7 +70,7 @@ static __device__ __forceinline__ void allreduceDeep(
if (partial && ur!=0 && dr+ur == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
if (++r == nRanks) r = 0;
}
@@ -98,22 +101,22 @@ static __device__ __forceinline__ void allreduceDeep(
if (partial && dr == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u];
}
if (++r == nRanks) r = 0;
}
}
}
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
nIters -= wn;
if (nIters <= 0) break;
// Load data for next iteration.
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
}
@@ -121,21 +124,23 @@ static __device__ __forceinline__ void allreduceDeep(
template<int UnrollPeers, typename Red, typename T>
static __device__ __forceinline__ void allreduceEnds(
ncclSymPrims& prim, int tn, int t, Red red,
T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
ncclSymkArgsHandler const& handler, int tn, int t, Red red,
ncclSymPtr<T> input, ncclSymPtr<T> output,
size_t nElts, uint32_t nPreElts, size_t nSufElts
) {
using Acc = typename Red::EltType;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
ncclTeam world = ncclTeamWorld(handler.comm);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
#pragma unroll 1
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
BytePack<sizeof(Acc)> acc1;
BytePack<sizeof(T)> tmp[UnrollPeers];
int dr = 1;
@@ -154,7 +159,7 @@ static __device__ __forceinline__ void allreduceEnds(
#pragma unroll
for (int u=0; u < UnrollPeers-partial; u++) {
if (partial && u!=0 && dr+u == nRanks) break;
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
tmp[u] = inpPacks.peerPtr(world, r)[elt];
r += 1;
if (r == nRanks) r = 0;
}
@@ -182,7 +187,7 @@ static __device__ __forceinline__ void allreduceEnds(
#pragma unroll
for (int u=0; u < UnrollPeers-partial; u++) {
if (partial && dr+u == nRanks) break;
*add4G(outRank0+elt, r*stride4G) = acc0;
outPacks.peerPtr(world, r)[elt] = acc0;
r += 1;
if (r == nRanks) r = 0;
}
@@ -193,35 +198,33 @@ static __device__ __forceinline__ void allreduceEnds(
template<typename Red, typename T>
static __device__ void allreduce(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, T* input, T* output, size_t nElts
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
int nRanks = prim.nRanks;
int nBlocks = prim.nBlocks;
// Move to rank=0
input = prim.peerPtr(0, input);
output = prim.peerPtr(0, output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
int const& nRanks = handler.comm.nRanks;
int const& nRanks_rcp32 = handler.nRanks_rcp32;
size_t nBytes = nElts*sizeof(T);
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);
uint32_t nPreBytes = (16u - inputUptr)%16u;
uint32_t nPreBytes = (16u - input.offset)%16u;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t cursor = nPreBytes;
constexpr int MinWarpPerBlock = 4;
if ((inputUptr-outputUptr)%16 == 0) {
if ((input.offset - output.offset)%16 == 0) {
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -229,16 +232,17 @@ static __device__ void allreduce(
}
}
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) {
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor,
(ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -246,46 +250,51 @@ static __device__ void allreduce(
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
constexpr int UnrollPeers = 8;
size_t nSufElts = (nBytes-cursor)/sizeof(T);
allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
allreduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
int /*const&*/ rank = prim.rank;
int /*const&*/ nRanks = prim.nRanks;
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
};
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = nRanks*prim.nBlocks*blockDim.x;
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bool waitNeeded = true;
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = nRanks*nBlocks*blockDim.x;
allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts);
waitNeeded = false;
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<typename Red, typename T>
static __device__ void allreduceMultimem(
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
int tn, int t, Red red, T* input, T* output, size_t nElts
) {
// Move to multimem
input = prim.multimemPtr(input);
output = prim.multimemPtr(output);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
@@ -330,106 +339,132 @@ static __device__ void allreduceMultimem(
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
multimem_st_global(outputUptr + cursor, val);
cursor += tn*sizeof(T);
}
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
};
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.rank, prim.nRanks,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
auto const& multimem = handler.comm.lsaMultimem;
allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Threads numbered globally such that we round robin warps by rank then block.
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
rank, nRanks,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int gtn = nRanks*nBlocks*blockDim.x;
allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts);
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_release);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
int /*const&*/ rank = prim.rank;
using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
Red<Acc> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) {
ncclSymkArgsHandler handler{args};
ncclLLA2ASession<ncclCoopCta> lla2a(
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A,
blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem
);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
using Acc = typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type;
Red<Acc> red(handler.devWork->redOpArg);
using Pack = BytePack<8>;
using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
constexpr int EltPerPack = 8/sizeof(T);
int nElts = args->nElts;
int nPacks = divUp(nElts, EltPerPack);
bool packAligned = 8 <= alignof(T) || (
args->nElts*sizeof(T) |
(uint32_t)reinterpret_cast<uintptr_t>(args->input) |
(uint32_t)reinterpret_cast<uintptr_t>(args->output)
)%8 == 0;
handler.singleWork<T>(
[&]__device__(int nElts, int nAllElts,
ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
int nPacks = divUp(nElts, EltPerPack);
uint32_t nPackPerBlock, nPackModBlock;
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
T* input = (T*)inputPtr.localPtr();
T* output = (T*)outputPtr.localPtr();
nPacks = end - begin;
nElts -= begin*EltPerPack;
nElts = min(nElts, nPacks*EltPerPack);
T* input = (T*)args->input + begin*EltPerPack;
T* output = (T*)args->output + begin*EltPerPack;
bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0;
ncclCoopCta cta;
int t = threadIdx.x;
int tn = ncclSymMaxThreads;
ncclCoopCta cta;
int t = threadIdx.x;
int tn = ncclSymkMaxThreads;
if (__builtin_expect(packAligned, true)) {
#pragma unroll 1
while (0 < nPacks) {
if (t < nPacks) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
storePack((Pack*)output, t, nPacks, out);
if (__builtin_expect(packAligned, true)) {
#pragma unroll 1
while (0 < nPacks) {
if (t < nPacks) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
return applyCast<T, Acc>(x);
},
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
return applyReduce(red, a, b);
}
);
storePack((Pack*)output, t, nPacks, applyCast<Acc, T>(out));
}
lla2a.endEpoch(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nPacks -= tn;
}
} else {
#pragma unroll 1
while (0 < nElts) {
if (t*EltPerPack < nElts) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp);
AccPack out = lla2a.template recvReduce</*Unroll=*/8, Pack>(
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
return applyCast<T, Acc>(x);
},
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
return applyReduce(red, a, b);
}
);
storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(out));
}
lla2a.endEpoch(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nElts -= tn*EltPerPack;
nPacks -= tn;
}
}
}
prim.endLL(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nPacks -= tn;
}
} else {
#pragma unroll 1
while (0 < nElts) {
if (t*EltPerPack < nElts) {
int nIterPacks = min(nPacks, tn);
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
storePack(output, t*EltPerPack, nElts, out);
}
prim.endLL(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
nElts -= tn*EltPerPack;
nPacks -= tn;
}
}
);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) {
ncclSymkRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
}
+25 -19
Datei anzeigen
@@ -4,6 +4,7 @@
#!/usr/bin/env python3
import os
import sys
import shutil
################################################################################
# The first command line argument is the path to the directory to generate and
@@ -13,8 +14,11 @@ gensrc = sys.argv[1]
if os.path.exists(gensrc):
for name in os.listdir(gensrc):
os.remove(os.path.join(gensrc, name))
#os.truncate(os.path.join(gensrc, name), 0)
path = os.path.join(gensrc, name)
if os.path.isfile(path):
os.remove(path)
elif os.path.isdir(path):
shutil.rmtree(path)
else:
os.mkdir(gensrc)
@@ -97,7 +101,7 @@ def enumerate_kernels():
yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty)
def required_cuda(k):
cudart, arch, specific_sms = 0, 0, None
cudart, arch, specific_sms = 0, 600, None
is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, [])
if is_nvls:
cudart = max(cudart, 12010)
@@ -136,13 +140,13 @@ def kernel_gencode(k):
def kernel_cname(k):
if k.coll in reductions:
return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty)
return paste("_", "ncclSymkDevKernel", k.coll, k.algo, k.red, k.ty)
else:
return paste("_", "ncclSymDevKernel", k.coll, k.algo)
return paste("_", "ncclSymkDevKernel", k.coll, k.algo)
def kernel_conds(k):
cudart, arch, specific_sms = required_cuda(k)
if cudart == 0: return (None, None)
if cudart == 0 and arch == 0: return (None, None)
cudart_cond = "CUDART_VERSION >= %d"%cudart
if not specific_sms:
@@ -153,13 +157,13 @@ def kernel_conds(k):
def instantiate(k):
form_red_ty = (
"__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const *args) {{\n"
" ncclSymRun_{id}<{red}, {ty}>(args);\n"
"__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const *args4K) {{\n"
" ncclSymkRun_{id}<{red}, {ty}>(args4K->args);\n"
"}}"
)
form = (
"__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const *args) {{\n"
" ncclSymRun_{id}(args);\n"
"__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const *args4K) {{\n"
" ncclSymkRun_{id}(args4K->args);\n"
"}}"
)
@@ -172,7 +176,7 @@ def instantiate(k):
return inst
def prototype(k):
return "__global__ void {cname}(ncclSymDevArgs const *args);".format(cname=kernel_cname(k))
return "__global__ void {cname}(ncclSymkDevWorkArgs4K const *args4K);".format(cname=kernel_cname(k))
################################################################################
@@ -194,20 +198,22 @@ for coll in set(k.coll for k in enumerate_kernels()):
if (fname, coll) not in kernels_by_file:
kernels_by_file[fname, coll] = []
files_to_print = ""
# Generate each kernel instantiation file
for (fname, coll), ks in kernels_by_file.items():
files_to_print += fname + ";"
with open(os.path.join(gensrc, fname), "w") as f:
print("-- Generating %s" % os.path.join(gensrc, fname))
emitln(f, '#include "symmetric.h"')
emitln(f, '#include "sym_kernels.h"')
emitln(f, '#include "symmetric/kernel.h"')
emitln(f, '#include "symmetric/{coll}.h"'.format(coll=coll_to_lower[coll]))
for k in ks:
emitln(f, instantiate(k))
# Generate <gensrc>/symmetric_host.cc
with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f:
# Generate <gensrc>/sym_kernels_host.cc
with open(os.path.join(gensrc, "sym_kernels_host.cc"), "w") as f:
# Report the path that is actually opened above (sym_kernels_host.cc), not the old file name.
print("-- Generating %s" % os.path.join(gensrc, "sym_kernels_host.cc"))
emitln(f, '#include "symmetric.h"')
emitln(f, '#include "sym_kernels.h"')
emitln(f, '#include "device.h"')
emitln(f, '')
@@ -215,19 +221,19 @@ with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f:
emitln(f, prototype(k))
emitln(f, '')
emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels())))
emitln(f, 'extern void* const ncclSymKernelList[] = {')
emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels())))
emitln(f, 'extern void* const ncclSymkKernelList[] = {')
for k in enumerate_kernels():
emitln(f, '(void*){cname},'.format(cname=kernel_cname(k)))
emitln(f, 'nullptr};')
emitln(f, '')
emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {')
emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {')
indents += 1
emitln(f, 'switch (id) {')
emitln(f, 'default: return nullptr;')
for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items():
emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':')
emitln(f, 'case ncclSymkKernelId_'+coll+'_'+algo+':')
indents += 1
if len(coll_algo_ks) == 1:
emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';')
+12 -12
Datei anzeigen
@@ -4,27 +4,27 @@
#ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_
#define NCCL_DEVICE_SYMMETRIC_KERNEL_H_
#include "symmetric.h"
#include "sym_kernels.h"
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymkDevWorkArgs const* args);
__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllGather_LL(struct ncclSymkDevWorkArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(struct ncclSymkDevWorkArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllGather_ST(struct ncclSymkDevWorkArgs const* args);
__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(struct ncclSymkDevWorkArgs const* args);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args);
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(struct ncclSymkDevWorkArgs const* args);
#endif
+89 -418
Datei anzeigen
@@ -4,7 +4,7 @@
#ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_
#define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_
#include "symmetric.h"
#include "sym_kernels.h"
#include "bitops.h"
#include "collectives.h"
#include "op128.h"
@@ -28,453 +28,124 @@ static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) {
return pos + size*flattenIx(more...);
}
// Precomputed integer reciprocals for denominator values 1..64 inclusive.
// Pass these to idivFast64() for fast division on the GPU.
//
// `x` is used directly as the index into a 65-entry table; no range check is
// performed here. Entry 0 repeats the reciprocal of 1, so x == 0 returns the
// same value as x == 1.
// NOTE(review): callers presumably guarantee 0 <= x <= 64 -- confirm.
static __device__ uint64_t idivRcp64_upto64(int x) {
static constexpr uint64_t table[65] = {
idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03),
idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07),
idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b),
idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f),
idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13),
idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17),
idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b),
idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f),
idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23),
idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27),
idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b),
idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f),
idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33),
idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37),
idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b),
idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f),
idivRcp64(0x40)
};
return table[x];
}
// 32-bit flavour of idivRcp64_upto64(): returns the upper 32 bits of the
// 64-bit table entry selected by `x`.
static __device__ uint32_t idivRcp32_upto64(int x) {
  uint64_t const rcp64 = idivRcp64_upto64(x);
  return rcp64 >> 32;
}
// Thread-group helpers. Each type exposes the same three operations:
//   sync()  - synchronize the threads of the group,
//   self()  - index of the calling thread within the group,
//   count() - number of threads in the group.
namespace {
// Group made of every thread of the thread block.
struct ncclCoopCta {
__device__ void sync() { __syncthreads(); }
__device__ int self() { return threadIdx.x; }
__device__ int count() { return blockDim.x; }
};
// Group of 2^log2_nWarps warps, i.e. (32<<log2_nWarps) consecutive threads.
struct ncclCoopWarps {
int log2_nWarps;
// Issues an inline-asm `barrier.sync` whose first operand is
// 1 + (index of this group within the block) and whose second operand is the
// number of threads in the group.
// NOTE(review): this is PTX inline assembly; confirm it is never compiled for
// AMD targets.
__device__ void sync() {
asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<<log2_nWarps) : "memory");
}
// Masks threadIdx.x with (group size - 1) to get the position inside the group.
__device__ int self() { return threadIdx.x & ((32<<log2_nWarps)-1); }
__device__ int count() { return 32<<log2_nWarps; }
};
// Group of 32 threads; membership is derived from threadIdx.x%32.
struct ncclCoopWarp {
__device__ void sync() { __syncwarp(); }
__device__ int self() { return threadIdx.x%32; }
__device__ int count() { return 32; }
};
}
struct ncclSymkArgsHandler {
ncclDevComm const& comm;
ncclLLA2AHandle const& lsaLLA2A;
struct ncclSymkChannelWorkRange* channelWorkRange;
struct ncclSymkDevWork* devWork;
uint32_t nRanks_rcp32;
namespace {
static constexpr int ncclSymPrims_UseBarrier = 1;
static constexpr int ncclSymPrims_UseLL = 2;
static constexpr int ncclSymPrims_UseMultimem = 4;
struct ncclSymPrims {
int flags;
int const &rank;
int const &nRanks;
uint32_t const &nRanks_rcp32;
int block, nBlocks;
uint32_t nBlocks_rcp32;
uint32_t nBlocks_nWarps_rcp32;
uint32_t nRanks_nBlocks_rcp32;
uint32_t nWarpPerRank, nWarpPerRank_rcp32;
struct ncclSymDevBase* const &base;
uintptr_t offsetMc;
__device__ ncclSymkArgsHandler(ncclSymkDevWorkArgs const* args):
comm(args->kcomm.devComm),
lsaLLA2A(args->kcomm.lsaLLA2A) {
channelWorkRange = args->getWorkRange();
uint32_t const &stride4G;
uint32_t barEpoch;
uint32_t llEpoch;
__device__ ncclSymPrims(ncclSymDevComm const &comm, int flags):
flags(flags),
rank(comm.rank),
nRanks(comm.nRanks),
nRanks_rcp32(comm.nRanks_rcp32),
block(blockIdx.x),
nBlocks(gridDim.x),
nBlocks_rcp32(idivRcp32_upto64(nBlocks)),
nBlocks_nWarps_rcp32(imulRcp32(nBlocks, nBlocks_rcp32, blockDim.x/32, idivRcp32_upto64(blockDim.x/32))),
nRanks_nBlocks_rcp32(imulRcp32(nRanks, nRanks_rcp32, gridDim.x, nBlocks_rcp32)),
nWarpPerRank(idivFast32(nBlocks*blockDim.x/32, nRanks, nRanks_rcp32)),
nWarpPerRank_rcp32(idivRcp32_upto64(nWarpPerRank)),
base(comm.base),
offsetMc((flags & ncclSymPrims_UseMultimem) ? (char*)comm.baseMc - (char*)base : 0x0),
stride4G(comm.stride4G) {
#if CUDART_VERSION >= 12030 && __CUDA_ARCH__ >= 900
cudaGridDependencySynchronize();
#endif
if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) {
barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block];
}
if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2;
}
__device__ ~ncclSymPrims() {
if (threadIdx.x == 0) {
if (flags & ncclSymPrims_UseBarrier) {
((flags & ncclSymPrims_UseMultimem) ? base->barEpochMc : base->barEpochUc)[block] = barEpoch;
}
if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2;
}
devWork = args->getWorks(args->nMaxChannels);
nRanks_rcp32 = comm.nRanks_rcp32;
}
template<typename T>
__device__ T* peerPtr(int peer, T* selfPtr) {
return add4G(selfPtr, (peer-rank)*stride4G);
// Computes which slice of the work list belongs to channel `block`.
//   block   - index into channelWorkRange
//   workLo  - [out] index of the first work item this block touches
//   indexLo - [out] first element inside work `workLo` (multiple of EltPerCell)
//   workHi  - [out] index of the last work item this block touches
//   indexHi - [out] end element, clamped to nElts
// fracLo/fracHi are 16-bit fixed-point fractions of a work's cell count, where
// 0x10000 stands for "the whole work". Callers may pass the same variable for
// workLo and workHi (singleWork does).
__device__ void getWorkRange(int block,
uint16_t& workLo, size_t& indexLo, uint16_t& workHi, size_t& indexHi) {
// Number of T elements that fit in one cell.
constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T);
uint32_t fracLo, fracHi;
// Where the work begins
workLo = (block==0) ? 0 : channelWorkRange[block-1].workHi; // start where predecessor ends
fracLo = (block==0) ? 0 : channelWorkRange[block-1].fracHi + 1;
// If the predecessor ended on the work boundary, then we step to the beginning of the next work.
// This ensures we never have empty parts.
if (fracLo == 0x10000) {
workLo++;
fracLo = 0;
}
struct ncclSymkDevWork const& dw = devWork[workLo];
// Scale the fraction by the number of cells, then convert cells back to elements.
indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
// Where the work ends
workHi = channelWorkRange[block].workHi;
fracHi = channelWorkRange[block].fracHi + 1;
// NOTE(review): `dw` was bound to devWork[workLo] above, so indexHi is scaled
// and clamped with the first work's nElts even when workHi != workLo.
// Confirm this is intended.
indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts);
}
template<typename T>
__device__ T* multimemPtr(T* selfPtr) {
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(selfPtr) + offsetMc);
// Computes the element range of work item `w` as seen by the group of
// channels [dw.sChannelId, dw.sChannelId + dw.nChannels) assigned to it.
//   blockIdx - channel index of the caller (this parameter shadows the
//              built-in of the same name inside this function)
//   w        - index into devWork
//   block    - [out] caller's position relative to dw.sChannelId
//   nBlocks  - [out] dw.nChannels
//   indexLo  - [out] first element of the range (multiple of EltPerCell)
//   indexHi  - [out] end element of the range, clamped to dw.nElts
__device__ void getWorkRangeFused(int blockIdx, int w,
int& block, int& nBlocks, size_t& indexLo, size_t& indexHi) {
// Number of T elements that fit in one cell.
constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T);
struct ncclSymkDevWork const& dw = devWork[w];
uint32_t fracLo, fracHi;
int lastBlock;
block = blockIdx - dw.sChannelId;
nBlocks = dw.nChannels;
lastBlock = dw.sChannelId+dw.nChannels-1;
// Where the work begins
// The & 0xFFFF folds a predecessor that ended at 0x10000 (whole work) to 0.
fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF);
indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
// If the last channel of the group ends inside `w`, stop at its fraction;
// otherwise the range runs to the end of the work (0x10000).
fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000;
indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts);
}
__device__ void barrierArrive(ncclCoopCta cta, bool release) {
cta.sync();
#if __CUDA_ARCH__ < 700
if (release) {
if (cta.self() == 0) __threadfence_system();
cta.sync();
}
#endif
if (flags & ncclSymPrims_UseMultimem) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (cta.self() == 0) {
uint32_t* inbox = &multimemPtr(base)->barInboxMc[block];
if (release) {
asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
template<typename T, typename Fn>
__device__ void forEachWork(Fn const& fn) {
uint16_t workLo, workHi;
size_t indexLo, indexHi;
getWorkRange<T>(blockIdx.x, workLo, indexLo, workHi, indexHi);
size_t currentIndexLo = indexLo;
#pragma unroll 1
for (int w = workLo; w <= workHi; w++) {
struct ncclSymkDevWork const& dw = devWork[w];
size_t const& nAllElts = dw.nElts;
size_t currentIndexHi;
int block, nBlocks;
if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) {
getWorkRangeFused<T>(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi);
} else {
asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
currentIndexHi = (w < workHi) ? nAllElts : indexHi;
block = 0;
nBlocks = 1;
}
fn(block, nBlocks, currentIndexHi - currentIndexLo, nAllElts,
ncclSymPtr<T>(dw.inputWin, dw.inputOff) + currentIndexLo,
ncclSymPtr<T>(dw.outputWin, dw.outputOff) + currentIndexLo);
currentIndexLo = 0;
}
#endif
} else {
int r = cta.self();
if (r != rank && r < nRanks) {
uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank];
#if __CUDA_ARCH__ >= 700
if (release) {
asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
} else {
asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
}
#else
if (release) {
__atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELEASE);
} else {
__atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELAXED);
}
// asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
#endif
}
}
}
// Waits until the barrier inbox(es) of this block reach the next epoch, then
// advances barEpoch and synchronizes the block.
//   cta     - block-wide group used for self()/sync()
//   acquire - when true the polling load uses acquire ordering, otherwise relaxed
// Epoch comparisons use `got - target <= 0x7fffffff` on uint32_t, so they
// remain valid when the counters wrap around.
__device__ void barrierWait(ncclCoopCta cta, bool acquire) {
if (flags & ncclSymPrims_UseMultimem) {
// Multimem path: only compiled for __CUDA_ARCH__ >= 900. Thread 0 alone polls
// a single inbox until it has advanced by nRanks, and only thread 0 updates
// barEpoch.
#if __CUDA_ARCH__ >= 900
if (cta.self() == 0) {
uint32_t* inbox = &base->barInboxMc[block];
while (true) {
uint32_t got;
if (acquire) {
asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
} else {
asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
}
if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break;
}
barEpoch += nRanks;
}
#endif
} else {
// Per-peer path: thread r (for r < nRanks, r != rank) polls the inbox entry
// indexed by peer r until it has reached barEpoch+1.
int r = cta.self();
if (r != rank && r < nRanks) {
uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r];
while (true) {
uint32_t got;
#if __CUDA_ARCH__ >= 700
if (acquire) {
asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
} else {
asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
}
#else
if (acquire) {
got = __atomic_load_n(inbox, __ATOMIC_ACQUIRE);
} else {
got = __atomic_load_n(inbox, __ATOMIC_RELAXED);
}
// asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
#endif
if (got-(barEpoch+1) <= uint32_t(-1)>>1) break;
}
}
// When __CUDA_ARCH__ < 700 (also the case when the macro is undefined), an
// acquiring wait adds a block sync followed by a fence issued by thread 0.
#if __CUDA_ARCH__ < 700
if (acquire) {
cta.sync();
if (cta.self() == 0) __threadfence();
}
#endif
// Executed by every thread of the block in this path.
barEpoch += 1;
}
cta.sync();
}
template<typename T, typename Fn>
__device__ void singleWork(Fn const& fn) {
uint16_t w;
size_t indexLo, indexHi;
// Finishes the current LL epoch and moves llEpoch to the next one.
//
// When llEpoch is one of the two largest uint32_t values, the buffer returned
// by ncclSymDevBase_getLLBuf() for this epoch is first cleared to zero by the
// whole block. The epoch then advances by 1, except at 0xFFFFFFFF where it
// advances by 3 (wrapping around to 2).
__device__ void endLL(ncclCoopCta cta) {
  bool const nearWrap = (llEpoch >= -2u);
  if (__builtin_expect(nearWrap, false)) {
    cta.sync();
    uint4* llBuf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch);
    int limit = ncclSymLLEpochSize(nRanks);
    // Each thread clears every count()-th 16-byte entry, starting at self().
    #pragma unroll 4
    for (int idx = cta.self(); idx*16 < limit; idx += cta.count()) {
      llBuf[idx] = uint4{0, 0, 0, 0};
    }
  }
  cta.sync();
  if (llEpoch == -1u) {
    llEpoch += 3;
  } else {
    llEpoch += 1;
  }
}
getWorkRange<T>(blockIdx.x, w, indexLo, w, indexHi);
// Writes `val` into LL slot `slot` of rank `peer`'s buffer for the current epoch.
// The value is split into 8-byte units; unit u is stored as one 16-byte vector
// {low 32 bits, llEpoch, high 32 bits, llEpoch} at
// buf + ncclSymLLMaxSlots(sizeof(T))*u.
template<typename T>
__device__ void sendLL(int peer, int slot, T val) {
// Anonymous union used to view the bytes of `val` as pairs of 32-bit words.
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
tmp = val;
uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot;
#pragma unroll
for (int u=0; u < divUp(sizeof(T),8); u++) {
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = u32[u][0];
i4[1] = llEpoch;
i4[2] = u32[u][1];
i4[3] = llEpoch;
// gfx950 uses an explicit flat_store_dwordx4 with the sc0 sc1 nt modifiers;
// every other target goes through the non-temporal store builtin.
#if defined(__gfx950__)
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
#else
__builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
#endif
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
}
}
struct ncclSymkDevWork const& dw = devWork[w];
// Writes `val` into LL slot `slot` for every rank.
// With ncclSymPrims_UseMultimem set, a single store sequence goes through the
// multimem alias of the LL buffer. Otherwise the value is stored nRanks times,
// visiting ranks in the order rank, rank+1, ... (wrapping to 0 at nRanks), so
// the caller's own buffer is written as well.
// Each 8-byte unit of the value is stored as {low 32 bits, llEpoch, high 32
// bits, llEpoch}, using the same store instructions as sendLL.
template<typename T>
__device__ void bcastLL(int slot, T val) {
if (flags & ncclSymPrims_UseMultimem) {
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
tmp = val;
uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot;
#pragma unroll
for (int u=0; u < divUp(sizeof(T),8); u++) {
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = u32[u][0];
i4[1] = llEpoch;
i4[2] = u32[u][1];
i4[3] = llEpoch;
#if defined(__gfx950__)
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
#else
__builtin_nontemporal_store(i4, (Vec*)(bufmc + ncclSymLLMaxSlots(sizeof(T))*u));
#endif
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
}
} else {
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
tmp = val;
// Slot address in rank 0's buffer; rank r's address is derived from it with
// add4G(buf0, r*stride4G).
uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot;
// dr counts how many ranks have been written; r is the rank being written.
int dr = 0;
int r = rank;
// Main loop: full groups of 8 ranks.
#pragma unroll 1
for (; dr+8 <= nRanks; dr += 8) {
#pragma unroll
for (int ur=0; ur < 8; ur++) {
uint4* buf = add4G(buf0, r*stride4G);
#pragma unroll
for (int u=0; u < divUp(sizeof(T),8); u++) {
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = u32[u][0];
i4[1] = llEpoch;
i4[2] = u32[u][1];
i4[3] = llEpoch;
#if defined(__gfx950__)
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
#else
__builtin_nontemporal_store(i4, (Vec*)((buf + ncclSymLLMaxSlots(sizeof(T))*u)));
#endif
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
}
r += 1;
if (r == nRanks) r = 0;
}
}
// Tail: the remaining (fewer than 8) ranks; stops as soon as dr reaches nRanks.
#pragma unroll
for (int ur=0; ur < 8; ur++, dr++) {
if (dr == nRanks) break;
uint4* buf = add4G(buf0, r*stride4G);
#pragma unroll
for (int u=0; u < divUp(sizeof(T),8); u++) {
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = u32[u][0];
i4[1] = llEpoch;
i4[2] = u32[u][1];
i4[3] = llEpoch;
#if defined(__gfx950__)
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
#else
__builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
#endif
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
}
r += 1;
if (r == nRanks) r = 0;
}
}
}
// Receives up to nSlotsMax values from this rank's LL buffer.
//   slot0  - first slot to read
//   nSlots - number of slots requested; slots u with u < nSlotsMin are polled
//            regardless of nSlots
//   stride - distance, in slots, between consecutive values
//   elts   - [out] received values
// Spins until both flag words (.y and .w) of every polled 16-byte entry equal
// llEpoch, then reassembles each value from the .x and .z payload words.
template<int nSlotsMin, int nSlotsMax, typename T>
__device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) {
uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0;
uint4 tmp[nSlotsMax][divUp(sizeof(T),8)];
//int spins=0;
while (true) {
// Load every polled entry first, then validate all of them below.
#pragma unroll
for (int u=0; u < nSlotsMax; u++) {
if (u < nSlotsMin || u < nSlots) {
#pragma unroll
for (int v=0; v < divUp(sizeof(T),8); v++) {
// NOTE(review): this is a plain (non-volatile) load inside a spin loop, whereas
// the commented-out variant below used ld.volatile. Confirm the compiler
// cannot hoist or cache this load across iterations.
tmp[u][v] = *(buf + u * stride + v * ncclSymLLMaxSlots(sizeof(T)));
// asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T))));
}
}
}
bool okAll = true;
#pragma unroll
for (int u=0; u < nSlotsMax; u++) {
#pragma unroll
for (int v=0; v < divUp(sizeof(T),8); v++) {
if (u < nSlotsMin || u < nSlots) {
bool ok = tmp[u][v].y == llEpoch &&
tmp[u][v].w == llEpoch;
okAll &= ok;
}
}
}
if (__builtin_expect(okAll, true)) break;
//if (spins++ == 10<<20) spins=0;
}
#pragma unroll
for (int u=0; u < nSlotsMax; u++) {
// Stop once u reaches nSlots, but never before nSlotsMin values were produced.
if (nSlotsMin <= u && u == nSlots) break;
union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; };
#pragma unroll
for (int v=0; v < divUp(sizeof(T),8); v++) {
u32[v][0] = tmp[u][v].x;
u32[v][1] = tmp[u][v].z;
}
elts[u] = val;
}
}
// Receives one Pack per rank from the LL buffer and reduces them.
//   slot   - slot of the first rank's contribution
//   stride - distance, in slots, between contributions of consecutive ranks
//   red    - reduction functor; accumulation happens in Red::EltType
// Every received pack is converted from T to the accumulator type with
// applyCast, combined with applyReduce, and the result is cast back to T.
template<typename Pack, typename T, typename Red, int Unroll=8>
__device__ Pack recvReduceLL(int slot, int stride, Red red) {
using Acc = typename Red::EltType;
using AccPack = BytePack<sizeof(Pack)*sizeof(Acc)/sizeof(T)>;
AccPack acc;
// `first` is true until acc holds a value, so the first pack initializes acc
// instead of being reduced into it.
bool first = true;
int r = 0;
// Full batches of Unroll ranks.
#pragma unroll 1
for (; r+Unroll <= nRanks; r += Unroll) {
Pack got[Unroll];
this->template recvLL</*Min=*/Unroll>(slot + r*stride, Unroll, stride, got);
AccPack acc0 = applyCast<T, Acc>(got[0]);
acc = first ? acc0 : applyReduce(red, acc, acc0);
first = false;
#pragma unroll
for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
}
// Remainder: nRanks-r ranks are left, which is fewer than Unroll after the loop above.
if (r < nRanks) {
Pack got[Unroll];
this->template recvLL</*Min=*/1>(slot + r*stride, nRanks-r, stride, got);
AccPack acc0 = applyCast<T, Acc>(got[0]);
acc = first ? acc0 : applyReduce(red, acc, acc0);
// At most Unroll-1 entries are valid here, so index Unroll-1 is never visited.
#pragma unroll
for (int i=1; i < Unroll-1; i++) {
if (r+i < nRanks) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
}
}
return applyCast<Acc, T>(acc);
}
// Convenience overload: receives exactly one value of type T from LL slot
// `slot` by delegating to the multi-slot recvLL with a one-element array.
template<typename T>
__device__ T recvLL(int slot) {
  T received[1];
  this->template recvLL</*nSlotsMin=*/1, /*nSlotsMax=*/1, T>(
    /*slot0=*/slot, /*nSlots=*/1, /*stride=*/0, received);
  return received[0];
}
template<typename Coop, typename T>
__device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) {
int me = coop.self();
if (me < nSlots) {
uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me;
uint4 got[divUp(sizeof(T), 8)];
//int spins=0;
#pragma unroll 1
while (true) {
#pragma unroll
for (int u=0; u < divUp(sizeof(T), 8); u++) {
got[u] = *((buf + u * ncclSymLLMaxSlots(sizeof(T))));
// asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T))));
}
bool ok = true;
#pragma unroll
for (int u=0; u < divUp(sizeof(T), 8); u++) {
ok &= got[u].y == llEpoch;
ok &= got[u].w == llEpoch;
}
if (__builtin_expect(ok, true)) break;
//if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); }
}
union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; };
#pragma unroll
for (int u=0; u < divUp(sizeof(T), 8); u++) {
u32[u][0] = got[u].x;
u32[u][1] = got[u].z;
}
dst[slot0 + me] = val;
}
fn(indexHi - indexLo, dw.nElts,
ncclSymPtr<T>(dw.inputWin, dw.inputOff) + indexLo,
ncclSymPtr<T>(dw.outputWin, dw.outputOff) + indexLo);
}
};
}
template<template<typename> typename Red, typename T, bool nvls>
struct ncclSymAccumType { using Type = T; };
struct ncclSymkAccumType { using Type = T; };
// Only Red's whose opArg is invariant w.r.t. the datatype can have a different
// accumulator type. At the moment this excludes integer min/max, sumpostdiv,
// and premulsum.
template<> struct ncclSymAccumType<FuncSum, __half, false> { using Type = float; };
template<> struct ncclSymkAccumType<FuncSum, __half, false> { using Type = float; };
#if defined(__CUDA_BF16_TYPES_EXIST__)
template<> struct ncclSymAccumType<FuncSum, __nv_bfloat16, false> { using Type = float; };
template<> struct ncclSymkAccumType<FuncSum, __nv_bfloat16, false> { using Type = float; };
#endif
#if defined(__CUDA_FP8_TYPES_EXIST__)
template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e4m3, false> { using Type = float; };
template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e5m2, false> { using Type = float; };
template<> struct ncclSymkAccumType<FuncSum, __nv_fp8_e4m3, false> { using Type = float; };
template<> struct ncclSymkAccumType<FuncSum, __nv_fp8_e5m2, false> { using Type = float; };
#endif
#endif
+138 -109
Datei anzeigen
@@ -1,38 +1,39 @@
// Modification Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "symmetric.h"
#include "sym_kernels.h"
#include "symmetric/kernel.h"
#include "symmetric/primitives.h"
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
static __device__ void reduceDeep(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, char* inputRank0, char* outputHere, int32_t nIters
ncclSymkArgsHandler const& handler, int tn, int t,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<char> input, ncclSymPtr<char> output, int32_t nIters
) {
using Pack = BytePack<BytePerPack>;
using Acc = typename Red::EltType;
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
ncclTeam world = ncclTeamWorld(handler.comm);
int wn = tn/WARP_SIZE;
int w = t/WARP_SIZE;
int lane = t%WARP_SIZE;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<Pack> inpPacks = (ncclSymPtr<Pack>)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
ncclSymPtr<Pack> outPacks = (ncclSymPtr<Pack>)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
Pack acc0[UnrollPacks];
nIters -= w;
if (0 < nIters) {
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
if (0 < nIters) {
while (true) {
@@ -42,7 +43,7 @@ static __device__ void reduceDeep(
{ Pack tmp1[UnrollPacks];
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
@@ -68,7 +69,7 @@ static __device__ void reduceDeep(
if (partial && ur!=0 && dr+ur == nRanks) break;
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) {
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE];
}
r += 1;
if (r == nRanks) r = 0;
@@ -88,17 +89,17 @@ static __device__ void reduceDeep(
for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast<Acc, T>(acc1[u]);
#pragma unroll UnrollPacks
for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u];
for (int u=0; u < UnrollPacks; u++) outPacks.localPtr()[u*WARP_SIZE] = acc0[u];
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE;
nIters -= wn;
if (nIters <= 0) break;
// Load data for next iteration.
#pragma unroll
for (int u=0; u < UnrollPacks; u++) {
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE];
}
}
}
@@ -106,20 +107,22 @@ static __device__ void reduceDeep(
template<int UnrollPeers, typename Red, typename T>
static __device__ void reduceEnds(
ncclSymPrims& prim, int tn, int t, Red red,
T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts
ncclSymkArgsHandler const& handler, int tn, int t, Red red,
ncclSymPtr<T> input, ncclSymPtr<T> output,
size_t nElts, uint32_t nPreElts, size_t nSufElts
) {
using Acc = typename Red::EltType;
int const& rank = prim.rank;
int const& nRanks = prim.nRanks;
uint32_t const& stride4G = prim.stride4G;
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
BytePack<sizeof(T)>* outHere = (BytePack<sizeof(T)>*)outputHere;
ncclTeam world = ncclTeamWorld(handler.comm);
int const& rank = handler.comm.rank;
int const& nRanks = handler.comm.nRanks;
ncclSymPtr<BytePack<sizeof(T)>> inpPacks = (ncclSymPtr<BytePack<sizeof(T)>>)input;
ncclSymPtr<BytePack<sizeof(T)>> outPacks = (ncclSymPtr<BytePack<sizeof(T)>>)output;
#pragma unroll 1
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
BytePack<sizeof(T)> acc0 = inpPacks.peerPtr(world, rank)[elt];
BytePack<sizeof(Acc)> acc1;
BytePack<sizeof(T)> tmp[UnrollPeers];
int dr = 1;
@@ -138,7 +141,7 @@ static __device__ void reduceEnds(
#pragma unroll
for (int u=0; u < UnrollPeers-partial; u++) {
if (partial && u!=0 && dr+u == nRanks) break;
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
tmp[u] = inpPacks.peerPtr(world, r)[elt];
r += 1;
if (r == nRanks) r = 0;
}
@@ -155,26 +158,25 @@ static __device__ void reduceEnds(
}
acc0 = applyCast<Acc, T>(acc1);
outHere[elt] = acc0;
outPacks.localPtr()[elt] = acc0;
}
}
template<typename Red, typename T>
static __device__ void reduce(
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
Red red, T* input, T* output, size_t nElts
ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks,
bool waitNeeded, ncclLsaBarrierSession<ncclCoopCta>& bar,
Red red, ncclSymPtr<T> input, ncclSymPtr<T> output, size_t nElts
) {
int nRanks = prim.nRanks;
int nBlocks = prim.nBlocks;
// Move input to rank=0
input = prim.peerPtr(0, input);
int const& nRanks = handler.comm.nRanks;
int const& nRanks_rcp32 = handler.nRanks_rcp32;
uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks);
uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
uint32_t alignment = uint32_t(inputUptr - outputUptr);
uint32_t alignment = uint32_t(input.offset - output.offset);
size_t nBytes = nElts*sizeof(T);
uint32_t nPreBytes = (16u - inputUptr)%16u;
uint32_t nPreBytes = (16u - input.offset)%16u;
nPreBytes = min((size_t)nPreBytes, nBytes);
uintptr_t cursor = nPreBytes;
@@ -184,12 +186,12 @@ static __device__ void reduce(
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
reduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor, (ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -201,12 +203,12 @@ static __device__ void reduce(
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32);
if (chunks != 0) {
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
reduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
prim, tn, t, waitNeeded, red,
(char*)input + cursor, (char*)output + cursor,
handler, tn, t, waitNeeded, bar, red,
(ncclSymPtr<char>)input + cursor, (ncclSymPtr<char>)output + cursor,
chunks*MinWarpPerBlock
);
cursor = cursorAfter;
@@ -214,42 +216,47 @@ static __device__ void reduce(
}
}
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed);
constexpr int UnrollPeers = 8;
size_t nSufElts = (nBytes-cursor)/sizeof(T);
reduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
reduceEnds<UnrollPeers>(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x
};
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);
int const& rank = handler.comm.rank;
// Round robin warps over blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = prim.nBlocks*blockDim.x;
bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bool waitNeeded = true;
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Round robin warps over blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = nBlocks*blockDim.x;
reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts);
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
waitNeeded = false;
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
}
template<typename Red, typename T>
static __device__ void reduceMultimem(
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
int tn, int t, Red red, T* input, T* output, size_t nElts
) {
// Move input to multimem
input = prim.multimemPtr(input);
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
size_t nBytes = nElts*sizeof(T);
@@ -294,41 +301,52 @@ static __device__ void reduceMultimem(
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
*reinterpret_cast<BytePack<sizeof(T)>*>(outputUptr + cursor) = val;
cursor += tn*sizeof(T);
}
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLsaBarrierSession<ncclCoopCta> bar{
ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true
};
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/true>::Type> red(handler.devWork->redOpArg);
// Round robin warps over blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
prim.block, prim.nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = prim.nBlocks*blockDim.x;
int const& rank = handler.comm.rank;
auto const& multimem = handler.comm.lsaMultimem;
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
handler.forEachWork<T>(
[&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts,
ncclSymPtr<T> input, ncclSymPtr<T> output) {
// Round robin warps over blocks.
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
block, nBlocks,
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
int tn = nBlocks*blockDim.x;
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts);
}
);
bar.sync(ncclCoopCta(), cuda::memory_order_relaxed);
}
// T is user type, EltType is the most aligned type
template<typename T, typename Red, typename EltType>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) {
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL_body(
ncclSymkArgsHandler& handler, ncclLLA2ASession<ncclCoopCta>& lla2a,
Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) {
using Pack = BytePack<8>;
using Acc = typename Red::EltType;
using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
constexpr int EltPerPack = 8/sizeof(EltType);
int nRanks = prim.nRanks;
int rank = prim.rank;
int const& nRanks = handler.comm.nRanks;
int const& rank = handler.comm.rank;
int t = threadIdx.x;
int tn = ncclSymMaxThreads;
constexpr int tn = ncclSymkMaxThreads;
ncclCoopCta cta;
#pragma unroll 1
@@ -342,17 +360,25 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
#pragma unroll 1
for (int i = t; i < nRanks*nIterPacks; i += tn) {
Pack got = loadPack<Pack>(input + peer*nStrideElts, pack*EltPerPack, nElts);
prim.sendLL(peer, rank*nIterPacks + pack, got);
lla2a.send(peer, rank*nIterPacks + pack, got);
peer += tn_div_nPacks;
pack += tn_mod_nPacks;
if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
}
if (t < nIterPacks) {
Pack got = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
storePack(output, t*EltPerPack, nElts, got);
AccPack got = lla2a.template recvReduce</*Unroll=*/8, Pack>(
/*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks,
/*eltToAcc=*/[&] __device__ (Pack x)->AccPack {
return applyCast<T, Acc>(x);
},
/*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack {
return applyReduce(red, a, b);
}
);
storePack(output, t*EltPerPack, nElts, applyCast<Acc, T>(got));
}
prim.endLL(cta);
lla2a.endEpoch(cta);
input += tn*EltPerPack;
output += tn*EltPerPack;
@@ -360,31 +386,34 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
nPacks -= tn;
}
}
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) {
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL);
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
template<template<typename> typename Red, typename T>
__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs const* args) {
ncclSymkArgsHandler handler{args};
ncclLLA2ASession<ncclCoopCta> lla2a(
ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, ncclSymkMaxThreads
);
Red<typename ncclSymkAccumType<Red, T, /*nvls=*/false>::Type> red(handler.devWork->redOpArg);
using Pack = BytePack<8>;
constexpr int EltPerPack = 8/sizeof(T);
int nAllElts = args->nElts;
int nAllPacks = divUp(nAllElts, EltPerPack);
uint32_t nPackPerBlock, nPackModBlock;
idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32);
int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
int nPacks = blockPackEnd - blockPackBegin;
int nElts = nAllElts - blockPackBegin*EltPerPack;
nElts = min(nElts, nPacks*EltPerPack);
T* input = (T*)args->input + blockPackBegin*EltPerPack;
T* output = (T*)args->output + blockPackBegin*EltPerPack;
uint32_t lowBits = args->nElts*sizeof(T);
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
if (__builtin_expect(lowBits%8 == 0, true)) {
ncclSymRun_ReduceScatter_LL_body<T>(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack);
} else {
ncclSymRun_ReduceScatter_LL_body<T>(prim, red, input, output, nElts, nPacks, nAllElts);
}
handler.singleWork<T>(
[&]__device__(int nElts, int nAllElts,
ncclSymPtr<T> inputPtr, ncclSymPtr<T> outputPtr) {
int nPacks = divUp(nElts, EltPerPack);
T* input = (T*)inputPtr.localPtr();
T* output = (T*)outputPtr.localPtr();
uint32_t lowBits = nElts*sizeof(T);
lowBits |= (uintptr_t)input;
lowBits |= (uintptr_t)output;
if (__builtin_expect(lowBits%8 == 0, true)) {
ncclSymkRun_ReduceScatter_LL_body<T>(handler, lla2a, red, (Pack*)input, (Pack*)output,
nPacks, nPacks, divUp(nAllElts, EltPerPack));
} else {
ncclSymkRun_ReduceScatter_LL_body<T>(handler, lla2a, red, input, output, nElts, nPacks, nAllElts);
}
}
);
}
+417 -215
Datei anzeigen
@@ -22,6 +22,9 @@
#include "profiler.h"
#include "transport.h"
#include "register_inline.h"
#include "ce_coll.h"
#include "nvtx.h"
#include "scheduler.h"
#include "common.h"
#include "api_trace.h"
@@ -248,6 +251,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
size_t workBytes = plan->workBytes;
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
if (plan->isSymColl) return;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#else
plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);
@@ -364,7 +368,6 @@ bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
struct ncclKernelPlanner* planner = &comm->planner;
if (planner->isSymColl) return ncclSuccess;
struct ncclTaskColl *task;
task = ncclIntruQueueHead(&planner->collTaskQueue);
while (task != nullptr) {
@@ -448,6 +451,7 @@ next:
ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) {
struct ncclKernelPlanner* planner = &comm->planner;
planner->persistent = ncclCudaGraphValid(planner->capturingGraph);
// Tasks from the sorter come out ordered size descending.
struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter);
// Tasks are assembled by (fn,op,ty) size ascending.
@@ -456,36 +460,8 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes];
int fnOpTyCount = 0;
if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) {
void* sendSymPtr;
void* recvSymPtr;
struct ncclReg* sendReg;
struct ncclReg* recvReg;
size_t size = task->count*ncclTypeSize(task->datatype);
NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg));
NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg));
bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype);
if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) {
enum ncclSymKernelId kernel;
int nChannels, nWarps;
float estTimeUs = 1.e18;
NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps));
// We should only use symmetric kernel if it beats the asymmetric kernel. But the
// perf model accuracy from asymmetric kernels is too inaccurate and reports too high
// of a bandwidth. For now just always use symmetric if available.
if (kernel != ncclSymKernelId_Count) {
task->sendbuff = sendSymPtr;
task->recvbuff = recvSymPtr;
task->devFuncId = (int)kernel;
task->nMaxChannels = nChannels;
task->nWarps = nWarps;
ncclIntruQueueEnqueue(&planner->collTaskQueue, task);
planner->isSymColl = true;
return ncclSuccess;
}
}
if (comm->symmetricSupport) {
NCCLCHECK(ncclMakeSymmetricTaskList(comm, task, &planner->collSymTaskQueue, &task));
}
// Walk the size sorted tasks, binning them by (fn,op,ty).
@@ -677,7 +653,7 @@ static ncclResult_t scheduleCollTasksToPlan(
size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
comm->nChannels, comm->nvlsChannels};
comm->nChannels, std::min(comm->nChannels, comm->nvlsChannels)};
constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
do {
size_t workBytes = 0;
@@ -888,6 +864,7 @@ static ncclResult_t scheduleCollTasksToPlan(
}
proxyOp->eActivationMask = task->eActivationMask;
proxyOp->incWorkCounter = true;
proxyOp->nChannels = nChannels;
proxyOp->connIndex = 0;
if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
@@ -920,6 +897,8 @@ static ncclResult_t scheduleCollTasksToPlan(
plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
}
// Profiler
plan->groupApiEventHandle = task->groupApiEventHandle;
if (comm->rank == 0) {
INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}",
@@ -993,8 +972,9 @@ static ncclResult_t addP2pToPlan(
int sendRank, void* sendAddr, ssize_t sendBytes,
int recvRank, void* recvAddr, ssize_t recvBytes,
uint64_t sendOpCount, uint64_t recvOpCount,
struct ncclTaskP2p** p2pTasks
const int planTotalTasks[], struct ncclTaskP2p** p2pTasks
) {
ncclResult_t ret = ncclSuccess;
int connIndex[2] = {1, 1};
bool selfSend = (sendRank == comm->rank);
// recv: dir=0, send: dir=1
@@ -1012,6 +992,8 @@ static ncclResult_t addP2pToPlan(
//replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine
//channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below
uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv);
struct ncclProxyOp proxyOps[2] = {};
int nProxyOps = selfSend ? 0 : 2;
if (comm->p2pNet) {
for (int dir = 0; dir <= 1; dir++) {
if (bytes[dir] > rcclParamP2pNetThreshold())
@@ -1072,7 +1054,7 @@ static ncclResult_t addP2pToPlan(
bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1;
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) {
int regFlag = 0;
NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax));
NCCLCHECKGOTO(ncclCalloc(&handles[dir], nChannelsMax), ret, cleanup);
for (int part = 0; part < nChannelsMax; part++) {
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, nChannelsMax, comm->nNodes);
struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
@@ -1095,7 +1077,7 @@ static ncclResult_t addP2pToPlan(
void* regAddr = NULL;
if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) {
// We require users registering buffers on both sides
NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, &regFlag, &regAddr, &plan->cleanupQueue));
NCCLCHECKGOTO(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, &regFlag, &regAddr, &plan->cleanupQueue), ret, cleanup);
if (regFlag) {
if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr;
else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr;
@@ -1120,14 +1102,17 @@ static ncclResult_t addP2pToPlan(
if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir];
}
struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
struct ncclWorkList* workNode;
workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
workNode->workType = ncclDevWorkTypeP2p;
workNode->size = sizeof(struct ncclDevWorkP2p);
ncclIntruQueueEnqueue(&plan->workQueue, workNode);
uint32_t workOffset = plan->workBytes;
uint32_t workOffset;
workOffset = plan->workBytes;
plan->workBytes += sizeof(struct ncclDevWorkP2p);
struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1);
struct ncclDevWorkP2p* work;
work = (struct ncclDevWorkP2p*)(workNode+1);
work->nP2pChannels = comm->p2pnChannels;
work->channelBase = base;
work->nSendChannels = nChannels[1];
@@ -1152,8 +1137,6 @@ static ncclResult_t addP2pToPlan(
work->recvConnIndex = connIndex[0];
work->recvOpCount = recvOpCount;
struct ncclProxyOp proxyOps[2] = {};
int nProxyOps = selfSend ? 0 : 2;
for (int dir=0; dir < nProxyOps; dir++) {
struct ncclProxyOp* op = &proxyOps[dir];
op->root = dir ? sendRank : recvRank;
@@ -1166,6 +1149,7 @@ static ncclResult_t addP2pToPlan(
op->chunkSize = chunkSize[dir];
op->reg = netRegistered[dir];
op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
op->collAPI = p2pTasks[dir] ? p2pTasks[dir]->collAPI : 0;
op->task.p2p = p2pTasks[dir];
op->rank = comm->rank;
op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0;
@@ -1178,6 +1162,15 @@ static ncclResult_t addP2pToPlan(
}
nChannelsMax = std::max(nChannels[0], nChannels[1]);
// Determine how many peers this plan will target concurrently. Make a
// simplifying assumption that each task targets a different peer.
// Each task is striped across 'nChannelsMax' of 'p2pnChannels' channels.
// Each channel runs up to NCCL_MAX_DEV_WORK_P2P_PER_BATCH tasks concurrently.
int maxConcurrent;
int concurrentTasks[2];
maxConcurrent = comm->p2pnChannels / nChannelsMax * NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
concurrentTasks[0] = std::min(planTotalTasks[0], maxConcurrent);
concurrentTasks[1] = std::min(planTotalTasks[1], maxConcurrent);
for (int part=0; part < nChannelsMax; part++) {
int incWorkCounter = -1;
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, comm->p2pnChannelsPerPeer, comm->nNodes);
@@ -1234,13 +1227,17 @@ static ncclResult_t addP2pToPlan(
// equal one plus the batch index this p2p settled in.
proxyOps[dir].channelId = channelId;
proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1;
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
proxyOps[dir].nChannels = nChannels[dir];
proxyOps[dir].nPeers = concurrentTasks[dir];
NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup);
NCCLCHECKGOTO(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup);
}
}
}
return ncclSuccess;
cleanup:
free(handles[0]);
free(handles[1]);
return ret;
}
static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
@@ -1275,6 +1272,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
// Try to use all channels, but one channel per operation.
while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
// Save the total count of send/recv tasks in the plan
int planTotalTasks[2] = {comm->planner.nTasksP2pRecv, comm->planner.nTasksP2pSend};
while (comm->planner.nTasksP2p != 0) {
for (int round=0; round < nRanks; round++) {
int sendRank = comm->p2pSchedule[round].sendRank;
@@ -1306,22 +1305,30 @@ static ncclResult_t scheduleP2pTasksToPlan(
ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send);
ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv);
comm->planner.nTasksP2p -= 2;
comm->planner.nTasksP2pSend -= 1;
comm->planner.nTasksP2pRecv -= 1;
} else {
// Ensure room for worst case of one new batch per channel.
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
return ncclSuccess;
}
struct ncclTaskP2p* p2pTasks[2] = { recv, send };
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, send ? send->opCount : 0, recv ? recv->opCount : 0, p2pTasks));
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, send ? send->opCount : 0, recv ? recv->opCount : 0, planTotalTasks, p2pTasks));
if (send != nullptr) {
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
// Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group
plan->groupApiEventHandle = send->groupApiEventHandle;
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
comm->planner.nTasksP2p -= 1;
comm->planner.nTasksP2pSend -= 1;
}
if (recv != nullptr) {
ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
// Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group
plan->groupApiEventHandle = recv->groupApiEventHandle;
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
comm->planner.nTasksP2p -= 1;
comm->planner.nTasksP2pRecv -= 1;
}
}
}
@@ -1372,7 +1379,7 @@ namespace {
}
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
if (plan->isSymColl) return ncclSuccess;
if (plan->isSymColl || plan->isCeColl) return ncclSuccess;
size_t workBytes = plan->workBytes;
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
@@ -1544,7 +1551,7 @@ static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelP
}
static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_;
ncclResult_t result = hostStreamPlanTask(plan->comm, plan);
if (result != ncclSuccess) {
@@ -1565,6 +1572,9 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
}
}
if (plan->isSymColl) {
free(plan->kernelSymArgs);
}
// Free coll tasks
struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
while (ct != nullptr) {
@@ -1645,7 +1655,9 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
planner->persistent = persistent;
int nPlans = 0;
if (planner->nTasksColl + planner->nTasksP2p != 0) {
if (planner->nTasksColl + planner->nTasksP2p != 0 ||
!ncclIntruQueueEmpty(&planner->collSymTaskQueue) ||
!ncclIntruQueueEmpty(&planner->collCeTaskQueue)) {
do {
memset(&planner->wipPlan, 0, sizeof(planner->wipPlan));
@@ -1657,55 +1669,55 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
: ncclDevWorkStorageTypeFifo;
if (planner->isSymColl) {
plan->workStorageType = ncclDevWorkStorageTypeArgs;
if (!ncclIntruQueueEmpty(&planner->collCeTaskQueue)) {
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collCeTaskQueue);
plan->isCeColl = true;
plan->ceCollArgs = ncclMemoryStackAlloc<struct ncclCeCollArgs>(&comm->memScoped);
plan->ceCollArgs->rootRank = task->root;
plan->ceCollArgs->nElts = task->count;
plan->ceCollArgs->eltSize = ncclTypeSize(task->datatype);
plan->ceCollArgs->sendBuff = (uint8_t*)task->sendbuff;
plan->ceCollArgs->recvBuff = (uint8_t*)task->recvbuff;
plan->ceCollArgs->func = task->func;
plan->ceCollArgs->sendWin = task->sendWin;
plan->ceCollArgs->recvWin = task->recvWin;
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
plan->isSymColl = true;
plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype);
plan->threadPerBlock = task->nWarps*WARP_SIZE;
for (int i = 0; i < MAXCHANNELS/64; i++)
plan->channelMask.masks[i] = uint64_t(-1) >> (64-task->nMaxChannels);
// plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels);
plan->kernelArgsSize = sizeof(struct ncclSymDevArgs);
plan->kernelSymArgs = ncclMemoryStackAlloc<struct ncclSymDevArgs>(&comm->memScoped);
plan->kernelSymArgs->comm = comm->symDevComm;
plan->kernelSymArgs->rootRank = task->root;
plan->kernelSymArgs->redOpArg = task->opDev.scalarArg;
plan->kernelSymArgs->nElts = task->count;
plan->kernelSymArgs->input = (char*)task->sendbuff;
plan->kernelSymArgs->output = (char*)task->recvbuff;
planner->nTasksColl -= 1;
ncclIntruQueueEnqueue(&planner->planQueue, plan);
INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d",
ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock);
ncclIntruQueueDequeue(&planner->collCeTaskQueue);
ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task);
nPlans += 1;
} else {
struct ncclKernelPlanBudget budget;
budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
// Non-persistent kernels fill up at most half of our fifo per kernel.
budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;
if (!ncclIntruQueueEmpty(&planner->collSymTaskQueue)) {
NCCLCHECKGOTO(ncclSymmetricTaskScheduler(comm, &planner->collSymTaskQueue, plan), result, failure);
}
else {
struct ncclKernelPlanBudget budget;
budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
// Non-persistent kernels fill up at most half of our fifo per kernel.
budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;
// Drain coll tasks first. This is essential since we partition tasks based
// on the work budget and p2p work isn't collective. If we were to drain p2p
// first, the place where we cut the kernel could vary by rank which would
// cause the "shortest channel first" channel picker to have divergent results.
if (planner->nTasksColl != 0) {
NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
}
// And only drain p2p tasks once colls are depleted.
if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
// Drain coll tasks first. This is essential since we partition tasks based
// on the work budget and p2p work isn't collective. If we were to drain p2p
// first, the place where we cut the kernel could vary by rank which would
// cause the "shortest channel first" channel picker to have divergent results.
if (planner->nTasksColl != 0) {
NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
}
// And only drain p2p tasks once colls are depleted.
if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
}
}
finishPlan(comm, plan);
if (plan->workBytes != 0) {
ncclIntruQueueEnqueue(&planner->planQueue, plan);
nPlans += 1;
}
}
} while (planner->nTasksColl + planner->nTasksP2p != 0);
} while (planner->nTasksColl + planner->nTasksP2p != 0 ||
!ncclIntruQueueEmpty(&planner->collSymTaskQueue) ||
!ncclIntruQueueEmpty(&planner->collCeTaskQueue));
struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue);
planner->unlaunchedPlansHead = planHead;
@@ -1789,7 +1801,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
#endif
NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
ncclResult_t ret = ncclSuccess;
struct ncclKernelPlanner* planner = &comm->planner;
@@ -1804,6 +1815,9 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
cudaStream_t launchStream = planner->streams->stream;
NCCLCHECK(ncclProfilerStartKernelLaunchEvent(plan, launchStream));
void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};
auto event = latency_profiler::collTraceAquireEventBaseline(plan, launchStream);
@@ -1860,25 +1874,24 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
}
#endif
#if CUDART_VERSION >= 12030
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
enum ncclImplicitOrder implicitOrder;
NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return);
NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, plan->persistent, driverVersion), ret, do_return);
if (implicitOrder == ncclImplicitOrderLaunch) {
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT;
launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent;
launchAttrs[attrs].value.launchCompletionEvent.flags = 0;
attrs++;
}
if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) {
if (plan->isSymColl && compCap >= 90 && driverVersion >= 12030) {
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1;
attrs++;
}
#endif
#if CUDART_VERSION >= 13000
if (compCap >= 90 && driverVersion >= 13000) {
if (compCap >= 100 && driverVersion >= 13000) {
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING;
launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable();
launchAttrs[attrs].value.nvlinkUtilCentricScheduling = comm->config.nvlinkCentricSched;
attrs++;
}
#endif
@@ -1911,6 +1924,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
latency_profiler::collTraceRecordEndEvent(comm, plan, launchStream, std::move(event));
do_return:
NCCLCHECK(ncclProfilerStopKernelLaunchEvent(plan));
return ret;
}
@@ -2047,7 +2061,7 @@ static ncclResult_t updateCollCostTable(
float** collCostTable) {
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
if (comm->nRanks == 1 || info->func == ncclFuncAllToAllPivot || info->func == ncclFuncAllToAllGda) {
if (comm->nRanks == 1 || info->func == ncclFuncAlltoAllPivot || info->func == ncclFuncAllToAllGda) {
table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
return ncclSuccess;
}
@@ -2056,6 +2070,8 @@ static ncclResult_t updateCollCostTable(
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
// CollNetDirect is only supported for up to 8 local GPUs
if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
// Disable CollNet Chain for more than 8 local GPUs
if (a == NCCL_ALGO_COLLNET_CHAIN && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
/* Tree reduceScatter doesn't support scaling yet */
@@ -2160,7 +2176,11 @@ static ncclResult_t topoGetAlgoInfo(
}
} else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
// NVLS should not need more than 16 channels to get peak BW.
nc = comm->nvlsChannels;
if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
nc = std::min(comm->nvlsChannels, comm->nChannels);
} else {
nc = comm->nvlsChannels;
}
} else {
rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
@@ -2348,7 +2368,7 @@ static ncclResult_t calcCollChunking(
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
ncclPatternRing;
break;
case ncclFuncAllToAllPivot:
case ncclFuncAlltoAllPivot:
pattern = ncclPatternRing;
break;
case ncclFuncAllToAllGda:
@@ -2510,6 +2530,7 @@ static ncclResult_t calcCollChunking(
}
proxyOp->pattern = pattern;
proxyOp->coll = info->func;
proxyOp->collAPI = info->func;
proxyOp->root = info->root;
proxyOp->isOneRPN = comm->isOneRPN;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
@@ -2573,6 +2594,35 @@ static ncclResult_t calcCollChunking(
proxyOp->nbytes = DIVUP(nBytes, nChannels);
}
// Set peer count hints used by network plugin
switch (proxyOp->pattern) {
case ncclPatternRing:
case ncclPatternRingTwice:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternPatUp:
case ncclPatternPatDown:
proxyOp->nPeers = 1;
break;
case ncclPatternTreeUp:
case ncclPatternTreeDown:
case ncclPatternTreeUpDown:
case ncclPatternNvlsTree:
proxyOp->nPeers = (NCCL_MAX_TREE_ARITY - 1) * 2;
break;
case ncclPatternCollnetChain:
case ncclPatternCollnetDirect:
case ncclPatternNvls:
case ncclPatternProfiler:
// Peer count hints unused
break;
case ncclPatternSend:
case ncclPatternRecv:
default:
WARN("Unknown pattern %d", pattern);
return ncclInternalError;
}
*outChunkSize = proxyOp->chunkSize;
return ncclSuccess;
}
@@ -2673,128 +2723,8 @@ static ncclResult_t hostToDevRedOp(
return ncclSuccess;
}
// Converts `info` to a task and adds it to `comm->planner`. The exception is with
// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and
// thus don't need a task.
static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
static ncclResult_t ncclPlannerSetCapturingGraph(struct ncclComm* comm, struct ncclInfo* info) {
struct ncclKernelPlanner *planner = &comm->planner;
if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
int peer = info->root;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
bool isSendNotRecv = info->coll == ncclFuncSend;
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
p2p->func = info->coll;
p2p->buff = (void*)info->recvbuff;
p2p->count = info->count;
p2p->datatype = info->datatype;
p2p->root = info->root;
p2p->bytes = nBytes;
p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
p2p->opCount = comm->opCount;
ncclIntruQueueEnqueue(
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
p2p);
planner->nTasksP2p += 1;
// Mark channels that need pre-connect
if (comm->rank != peer) {
if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
// planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;
int round = 0;
while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
: comm->p2pSchedule[round].recvRank)) {
round += 1;
}
uint8_t base = ncclP2pChannelBaseForRound(comm, round, rcclParamP2pBatchEnable());
for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
if (isSendNotRecv) {
if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
// the send/recv connector is shared among split shared comms. We need to set hasSeen to
// 1 in order to avoid duplicate connection setup if user group sendrecv ops with split
// shared comms together.
comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
//comm->connectSend[peer] |= (1UL<<channelId);
comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
ncclGroupCommPreconnect(comm);
}
if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
//comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
ncclGroupCommPreconnect(comm);
}
} else {
if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
//comm->connectRecv[peer] |= (1UL<<channelId);
comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
ncclGroupCommPreconnect(comm);
}
if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
//comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
ncclGroupCommPreconnect(comm);
}
}
}
}
}
} else {
// Empty collectives can be discarded.
if (info->count == 0) return ncclSuccess;
if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
if (comm->minCompCap < 90) {
WARN("FP8 reduction support begins with sm90 capable devices.");
return ncclInvalidArgument;
}
}
// Copy reduction op state from op handle into info struct here since the
// op handle may be destroyed before ncclGroupEnd().
struct ncclDevRedOpFull opDev;
NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm));
if (comm->nRanks == 1) {
NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream));
return ncclSuccess;
} else {
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
t->func = info->coll;
t->sendbuff = info->sendbuff;
t->recvbuff = info->recvbuff;
t->count = info->count;
t->root = info->root;
t->datatype = info->datatype;
size_t elementSize = ncclTypeSize(t->datatype);
if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast || t->func == ncclFuncAllToAllPivot || t->func == ncclFuncAllToAllGda) {
t->count *= elementSize;
t->datatype = ncclInt8;
elementSize = 1;
}
t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks);
t->opHost = info->op;
t->opDev = opDev; // C++ struct assignment
t->chunkSteps = info->chunkSteps;
t->sliceSteps = info->sliceSteps;
t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
t->opCount = comm->opCount;
t->acc = info->acc;
planner->nTasksColl += 1;
ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
}
}
if (info->stream != planner->streamRecent || planner->streams == nullptr) {
planner->streamRecent = info->stream;
struct ncclCudaStreamList* l = planner->streams;
@@ -2823,7 +2753,279 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
return ncclSuccess;
}
// Queues one point-to-point task (send or receive) on `comm->planner` and flags
// the channels towards `peer` whose connectors still have to be set up.
//
//   comm      communicator the task is queued on
//   info      description of the user call; `info->coll` is overwritten with `coll`
//   coll      ncclFuncSend selects the send path, any other value the receive path
//   collAPI   value stored in the task's `collAPI` field
//   buff      buffer of the transfer
//   count     number of elements of `datatype` in `buff`
//   datatype  element type
//   peer      remote rank; also indexes `planner->peers`
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
static ncclResult_t p2pTaskAppend(
    struct ncclComm* comm,
    struct ncclInfo* info,
    ncclFunc_t coll,
    ncclFunc_t collAPI,
    void* buff,
    size_t count,
    ncclDataType_t datatype,
    int peer) {
  struct ncclKernelPlanner *planner = &comm->planner;

  // Determine peer and basic parameters.
  ssize_t nBytes = count*ncclTypeSize(datatype);
  bool isSendNotRecv = coll == ncclFuncSend;

  // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
  ncclGroupCommJoin(comm, ncclGroupTaskTypeCollective);

  info->coll = coll;
  // Set capturing graph. Called here so that profiler can emit a group API event with this information
  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
  bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph);
  NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured));
  NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
  NCCLCHECK(ncclProfilerStartP2pApiEvent(info, isGraphCaptured));

  struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
  p2p->func = coll;
  p2p->collAPI = collAPI;
  p2p->buff = buff;
  p2p->count = count;
  p2p->datatype = datatype;
  p2p->root = peer;
  p2p->bytes = nBytes;
  // Profiler handles are taken from the API state filled in by the Start*ApiEvent calls above.
  p2p->eActivationMask = ncclProfilerApiState.eActivationMask;
  p2p->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle;
  p2p->p2pApiEventHandle = ncclProfilerApiState.p2pApiEventHandle;
  ncclIntruQueueEnqueue(
    isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
    p2p);
  planner->nTasksP2p += 1;
  if (isSendNotRecv)
    planner->nTasksP2pSend += 1;
  else
    planner->nTasksP2pRecv += 1;

  // Mark channels that need pre-connect
  if (comm->rank != peer) {
    if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
      // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
      (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;

      // Find the schedule round in which this rank talks to `peer`.
      // NOTE(review): the search has no upper bound on `round`; it relies on every
      // peer being present in comm->p2pSchedule - confirm against the schedule setup.
      int round = 0;
      while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
                                    : comm->p2pSchedule[round].recvRank)) {
        round += 1;
      }
      uint8_t base = ncclP2pChannelBaseForRound(comm, round);
      for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
        int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
        // connectSend/connectRecv keep one multi-word bitmask per entry:
        // word channelId/64, bit channelId%64.
        if (isSendNotRecv) {
          if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
            // the send/recv connector is shared among split shared comms. We need to set hasSeen to
            // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split
            // shared comms together.
            comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
            comm->channels[channelId].peers[peer]->send[1].p2pOnly = 1;
            comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
            ncclGroupCommPreconnect(comm);
          }
          if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
            // Mark the connector that the guard above tests. Writing index 1 here
            // (as before) left the P2P_NET connector unmarked, so it was requested
            // again by every later group.
            comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen = 1;
            comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
            ncclGroupCommPreconnect(comm);
          }
        } else {
          if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
            comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
            comm->channels[channelId].peers[peer]->recv[1].p2pOnly = 1;
            comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
            ncclGroupCommPreconnect(comm);
          }
          if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
            // Same as on the send side: mark the P2P_NET connector, not connector 1.
            comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen = 1;
            comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
            ncclGroupCommPreconnect(comm);
          }
        }
      }
    }
  }

  ncclProfilerStopP2pApiEvent();
  return ncclSuccess;
}
// Builds a kernel collective task from `info` and inserts it into the planner's
// collective sorter, keyed by the task's traffic in bytes.
//
//   comm   communicator whose planner receives the task
//   info   description of the user call (buffers, count, datatype, root, op, ...)
//   opDev  device-side reduction op; copied by value into the task
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
static ncclResult_t collTaskAppend(
    struct ncclComm* comm,
    struct ncclInfo* info,
    struct ncclDevRedOpFull opDev) {
  // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
  ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);

  // The capturing graph is resolved before the profiler events so that the
  // group API event can carry this information.
  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));
  struct ncclKernelPlanner* planner = &comm->planner;
  bool graphCaptured = ncclCudaGraphValid(planner->capturingGraph);
  NCCLCHECK(ncclProfilerStartGroupApiEvent(info, graphCaptured));
  NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop));
  NCCLCHECK(ncclProfilerStartCollApiEvent(info, graphCaptured));

  struct ncclTaskColl* task = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);

  // Straight copies from the call description.
  task->func = info->coll;
  task->sendbuff = info->sendbuff;
  task->recvbuff = info->recvbuff;
  task->root = info->root;
  task->opHost = info->op;
  task->opDev = opDev; // C++ struct assignment
  task->chunkSteps = info->chunkSteps;
  task->sliceSteps = info->sliceSteps;
  task->opCount = comm->opCount;
  task->acc = info->acc;

  // AllGather, Broadcast and AlltoAllPivot are re-expressed in bytes:
  // the count is scaled by the element size and the datatype becomes ncclInt8.
  task->count = info->count;
  task->datatype = info->datatype;
  size_t bytesPerElem = ncclTypeSize(task->datatype);
  const bool retypeToBytes =
      task->func == ncclFuncAllGather ||
      task->func == ncclFuncBroadcast ||
      task->func == ncclFuncAlltoAllPivot;
  if (retypeToBytes) {
    task->count *= bytesPerElem;
    task->datatype = ncclInt8;
    bytesPerElem = 1;
  }
  task->trafficBytes = task->count*bytesPerElem*ncclFuncTrafficPerByte(task->func, comm->nRanks);

  // Profiler handles come from the API state set up by the Start*ApiEvent calls above.
  task->eActivationMask = ncclProfilerApiState.eActivationMask;
  task->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle;
  task->collApiEventHandle = ncclProfilerApiState.collApiEventHandle;

  planner->nTasksColl += 1;
  ncclTaskCollSorterInsert(&planner->collSorter, task, task->trafficBytes);

  ncclProfilerStopCollApiEvent();
  return ncclSuccess;
}
// Builds a collective task for the CE path and appends it to the planner's
// `collCeTaskQueue`. If CE has not been set up yet for this communicator
// (no ready pointer and no init task pending), an init task is queued first.
//
//   comm     communicator whose planner receives the task
//   info     description of the user call
//   sendWin  window recorded as the task's send window
//   recvWin  window recorded as the task's receive window
//   opDev    device-side reduction op; copied by value into the task
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
static ncclResult_t ceCollTaskAppend(
    struct ncclComm* comm,
    struct ncclInfo* info,
    struct ncclDevrWindow* sendWin,
    struct ncclDevrWindow* recvWin,
    struct ncclDevRedOpFull opDev) {
  // Check if CE needs initialization
  const bool ceNeedsInit =
      comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue);
  if (ceNeedsInit) {
    struct ncclCeInitTask* initTask;
    NCCLCHECK(ncclCalloc(&initTask, 1));
    initTask->comm = comm;
    ncclIntruQueueEnqueue(&comm->ceInitTaskQueue, initTask);
    ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister);
  }

  // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
  ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
  NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info));

  struct ncclTaskColl* task = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);

  // Straight copies from the call description.
  task->func = info->coll;
  task->sendbuff = info->sendbuff;
  task->recvbuff = info->recvbuff;
  task->root = info->root;
  task->opHost = info->op;
  task->opDev = opDev; // C++ struct assignment
  task->chunkSteps = info->chunkSteps;
  task->sliceSteps = info->sliceSteps;
  task->sendWin = sendWin;
  task->recvWin = recvWin;

  // AllGather and Broadcast are re-expressed in bytes:
  // the count is scaled by the element size and the datatype becomes ncclInt8.
  task->count = info->count;
  task->datatype = info->datatype;
  size_t bytesPerElem = ncclTypeSize(task->datatype);
  const bool retypeToBytes = task->func == ncclFuncAllGather || task->func == ncclFuncBroadcast;
  if (retypeToBytes) {
    task->count *= bytesPerElem;
    task->datatype = ncclInt8;
    bytesPerElem = 1;
  }
  task->trafficBytes = task->count*bytesPerElem*ncclFuncTrafficPerByte(task->func, comm->nRanks);

  task->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);

  struct ncclKernelPlanner* planner = &comm->planner;
  ncclIntruQueueEnqueue(&planner->collCeTaskQueue, task);
  return ncclSuccess;
}
// Converts `info` to one or more tasks and adds them to `comm->planner`.
//
//  * Send/Recv become a single P2P task.
//  * Collectives with a zero count are discarded.
//  * On a single-rank communicator the collective is issued right away through
//    ncclLaunchOneRank and no task is created.
//  * Otherwise the call goes to the CE queue when the conditions tested below
//    hold; AlltoAll, Gather and Scatter are decomposed into P2P tasks; every
//    other collective becomes a kernel collective task.
//
// Returns ncclSuccess, ncclInvalidArgument for an FP8 call rejected by the
// capability check, or the first error propagated by NCCLCHECK.
static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
  // p2pTaskAppend() overwrites info->coll, so keep the function the user called.
  ncclFunc_t collAPI = info->coll;

  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
    NCCLCHECK(p2pTaskAppend(comm, info, info->coll, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root));
    return ncclSuccess;
  }

  // Empty collectives can be discarded.
  if (info->count == 0) return ncclSuccess;

  if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) {
    // AllGather, Broadcast, AlltoAll, Scatter and Gather are exempt from the capability check.
    if (comm->minCompCap < 90 && info->coll != ncclFuncAllGather && info->coll != ncclFuncBroadcast && info->coll != ncclFuncAlltoAll && info->coll != ncclFuncScatter && info->coll != ncclFuncGather) {
      WARN("FP8 reduction support begins with sm90 capable devices.");
      return ncclInvalidArgument;
    }
  }

  // Copy reduction op state from op handle into info struct here since the
  // op handle may be destroyed before ncclGroupEnd().
  struct ncclDevRedOpFull opDev;
  NCCLCHECK(hostToDevRedOp(&opDev, info->op, info->datatype, comm));

  if (comm->nRanks == 1) {
    NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream));
    return ncclSuccess;
  }

  // The lookups' return codes are not checked, so start from nullptr: if a
  // lookup leaves its output untouched the CE test below simply fails instead
  // of reading an indeterminate pointer.
  struct ncclDevrWindow* sendWin = nullptr;
  struct ncclDevrWindow* recvWin = nullptr;
  ncclDevrFindWindow(comm, info->sendbuff, &sendWin);
  ncclDevrFindWindow(comm, info->recvbuff, &recvWin);
  bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype);

  // Append CE collective task if CE is supported and requested by user
  if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) {
    NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev));
    return ncclSuccess;
  }

  // Append kernel-based collective.
  // Bytes occupied by one rank's block; neither count nor datatype is changed
  // by p2pTaskAppend(), so this stays valid across the loops below.
  const size_t blockBytes = info->count * ncclTypeSize(info->datatype);

  if (info->coll == ncclFuncAlltoAll) {
    // One send and one receive per rank, block r of each buffer goes to/comes from rank r.
    for (int r = 0; r < comm->nRanks; r++) {
      NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)((char*)info->sendbuff + r*blockBytes), info->count, info->datatype, r));
      NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)((char*)info->recvbuff + r*blockBytes), info->count, info->datatype, r));
    }
  } else if (info->coll == ncclFuncGather) {
    // Every rank sends to the root; the root additionally receives one block per rank.
    NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)info->sendbuff, info->count, info->datatype, info->root));
    if (comm->rank == info->root) {
      size_t offset = 0;
      for (int r = 0; r < comm->nRanks; r++) {
        void* buff = (void*)((char*)info->recvbuff + offset);
        NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, buff, info->count, info->datatype, r));
        offset += blockBytes;
      }
    }
  } else if (info->coll == ncclFuncScatter) {
    // The root sends one block per rank; every rank receives from the root.
    if (comm->rank == info->root) {
      size_t offset = 0;
      for (int r = 0; r < comm->nRanks; r++) {
        void* buff = (void*)((char*)info->sendbuff + offset);
        NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, buff, info->count, info->datatype, r));
        offset += blockBytes;
      }
    }
    NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root));
  } else {
    NCCLCHECK(collTaskAppend(comm, info, opDev));
  }

  return ncclSuccess;
}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
// Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth
// updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls
if (ncclProfilerApiState.profilerGroupDepth > 0) {
ncclProfilerApiState.profilerGroupDepth++;
}
NCCLCHECK(ncclGroupStartInternal());
ncclResult_t ret = ncclSuccess;
int devOld = -1;
+14
Datei anzeigen
@@ -0,0 +1,14 @@
# Graph sources: the .cc files listed below, each addressed relative to the
# directory of this CMakeLists file (CMAKE_CURRENT_SOURCE_DIR).
set(GRAPH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/topo.cc
${CMAKE_CURRENT_SOURCE_DIR}/tuning.cc
${CMAKE_CURRENT_SOURCE_DIR}/xml.cc
${CMAKE_CURRENT_SOURCE_DIR}/search.cc
${CMAKE_CURRENT_SOURCE_DIR}/paths.cc
${CMAKE_CURRENT_SOURCE_DIR}/connect.cc
${CMAKE_CURRENT_SOURCE_DIR}/rings.cc
${CMAKE_CURRENT_SOURCE_DIR}/trees.cc
)
# Add graph sources to parent scope: the set() above only defines the variable
# in this directory's scope, so it is re-set with PARENT_SCOPE to make the list
# visible to the including CMakeLists.
set(GRAPH_SOURCES ${GRAPH_SOURCES} PARENT_SCOPE)
+22 -18
Datei anzeigen
@@ -24,6 +24,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
topoRanks->crossNicRing = graphs[NCCL_ALGO_RING]->crossNic;
topoRanks->nvlsHeadNum = 0;
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
@@ -430,7 +431,6 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
INFO(NCCL_GRAPH, "%s", line);
channel->collnetChain.depth = comm->nRanks/comm->nNodes;
}
free(heads);
return ncclSuccess;
@@ -447,7 +447,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
}
for (int c=0; c<comm->nChannels; c++) {
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nHeads;
for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
@@ -499,7 +499,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
}
// Set prev/next in all channels (NVLS compute channels work
// orthogonally to NVLS search channels).
for (int c=0; c<comm->nChannels; c++) {
for (int c=0; c<comm->nvlsChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->nvls.treeUp = treeUp[c%2];
channel->nvls.treeDown[0] = channel->nvls.down;
@@ -731,17 +731,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
// Alternate rings to avoid crossing rails
if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
for (int r=0; r<comm->nRanks; r++) {
if (comm->rankToNode[r] % 2 == 1) {
// Exchange rings
for (int c=0; c<nChannels; c+=2) {
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
}
// Alternate rings to avoid crossing rails.
// CrossNic values could be not the same on all nodes as it depends on the number of net devs and the NVLink bandwidth.
// Therefore, it's only done if the rank obtained a solution with crossNic=2.
for (int r = 0; r < comm->nRanks; r++) {
if (allTopoRanks[r]->crossNicRing == 2 && (nChannels % 2) == 0 && (comm->rankToNode[r] % 2) == 1) {
// Exchange rings
for (int c=0; c<nChannels; c+=2) {
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
}
}
}
@@ -858,7 +858,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int collNetNchannels = std::min(maxChannels, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
for (int c = 0; c < comm->nChannels; c++) {
comm->channels[c].collnetChain.depth = comm->nRanks/comm->nNodes;
}
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
}
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -910,9 +917,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
comm->nvlsChannels = parent->nvlsResources->nChannels;
}
if (comm->nChannels < comm->nvlsChannels) {
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
}
NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
#endif
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
+11 -10
Datei anzeigen
@@ -391,11 +391,15 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo;
nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo;
// A zero UUID means we don't have MNNVL fabric info
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
unsigned long uuid0 = 0;
unsigned long uuid1 = 0;
memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0));
memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1));
if ((uuid0 | uuid1) == 0) return ncclSuccess;
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
info2->busId, uuid0, uuid1, fabricInfo2->cliqueId);
*ret = 1;
}
return ncclSuccess;
@@ -936,9 +940,6 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
free(system);
}
NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1);
NCCL_PARAM(NChannelsPerPeer, "NCHANNELS_PER_PEER", -2);
static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct ncclTopoSystem* system = comm->topo;
@@ -959,10 +960,10 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
}
} else {
// Remote rank, use network
int nNetChannels = ncclParamNChannelsPerNetPeer();
if (nNetChannels == -1) {
//start from 2 channels per NIC and reduce with scale
nNetChannels = 2;
int nNetChannels = comm->config.nChannelsPerNetPeer;
if (nNetChannels == NCCL_CONFIG_UNDEF_INT) {
//start from 2 channels per NIC and reduce with scale
nNetChannels = 2;
// check if we need to use more than one NIC, hence more than one channel
int netCountByBw = 1, nChannelsMax = nNetChannels;
@@ -1014,7 +1015,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
} else {
// Round to next pow2 nChannelsPerPeer and nChannels
comm->p2pnChannelsPerPeer = (ncclParamNChannelsPerPeer() == -2 ? pow2Up(minChannels) : ncclParamNChannelsPerPeer());
comm->p2pnChannelsPerPeer = pow2Up(minChannels);
// Doubling P2P channels per peer on single node
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))) comm->p2pnChannelsPerPeer *= 2;
comm->p2pnChannels = std::min(pow2Up(comm->p2pnChannels), 4*CHANNEL_LIMIT);
+254 -118
Datei anzeigen
@@ -9,6 +9,7 @@
#include "graph.h"
#include "topo.h"
#include "comm.h"
#include "nccl.h"
#include "nvmlwrap.h"
#include "coll_net.h"
#include "transport.h"
@@ -16,6 +17,7 @@
#include <fcntl.h>
#include "cpuset.h"
#include "bootstrap.h"
#include <mutex>
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
@@ -427,6 +429,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
#define PCI_BRIDGE_DEVICE_CLASS "0x060400"
// struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, {"0x080100", /*CX8 data direct*/PCI}, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
struct kvDict kvDictPciGen[] = {
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
@@ -1069,8 +1072,7 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par
return ncclSuccess;
}
ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps,
struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, ncclNetVDeviceProps_t* vProps, struct ncclXmlNode** physNetNodes) {
if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
return ncclInternalError;
@@ -1084,7 +1086,7 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev
// Trigger the merge, then get the new device's properties
int vDevIndex = 0;
ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps);
if (ret != ncclSuccess) {
INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
@@ -1102,9 +1104,10 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev
return ncclSuccess;
}
ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) {
ncclResult_t ret = ncclSuccess;
INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
const char* str = netInfo->forceMerge;
INFO(NCCL_ENV | NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
char* ncStr;
NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
strcpy(ncStr, str);
@@ -1140,7 +1143,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs,
goto fail;
}
ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes);
if (ret == ncclSuccess) {
// Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
for (int i = 0; i < vProps.ndevs; i++) {
@@ -1162,7 +1165,7 @@ fail:
goto exit;
}
ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) {
// Compute the path type between each device
int* paths = NULL;
ncclResult_t res = ncclSuccess;
@@ -1192,7 +1195,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD
// Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i"
// (Don't merge the same device with itself)
for (int j = 0; j < nPhysDevs; j++) {
if (paths[i*nPhysDevs + j] <= mergeLevel &&
if (paths[i*nPhysDevs + j] <= netInfo->mergeLevel &&
placedDevs[j] == 0 && j != i) {
vProps.devs[vProps.ndevs++] = j;
placedDevs[j] = 1;
@@ -1206,7 +1209,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD
return ncclInternalError;
}
ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
ncclResult_t ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes);
// Merging failed.
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS)
@@ -1244,6 +1247,92 @@ struct kvDict nicPathKvList[] = {
{ NULL, 0 }
};
// Recursive step of the link-width search over the XML subtree rooted at `node`.
//
//   node          subtree root to inspect
//   physNetNodes  array of `ndevs` XML nodes to look for
//   ndevs         number of entries in `physNetNodes`
//   foundPhysNet  out: 1 if `node` or any descendant is one of `physNetNodes`, else 0
//   linkWidth     out: width attributed to this subtree
//                   - 0 when no listed node is in the subtree,
//                   - this node's own "link_width" when the children that contain a
//                     listed node reported a total of 0,
//                   - otherwise the children's total, capped by this node's own
//                     width when that width is positive.
//
// Only "pci" nodes contribute an own width; for every other node it is 0.
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
ncclResult_t ncclTopoFindLinkWidthRec(ncclXmlNode* node, ncclXmlNode** physNetNodes, int ndevs, int* foundPhysNet, int* linkWidth) {
  int ownWidth = 0;
  if (strcmp(node->name, "pci") == 0) {
    NCCLCHECK(xmlGetAttrInt(node, "link_width", &ownWidth));
#ifdef ENABLE_TRACE
    const char *busidAttr, *linkAttr;
    NCCLCHECK(xmlGetAttrStr(node, "busid", &busidAttr));
    NCCLCHECK(xmlGetAttr(node, "link_width", &linkAttr));
    TRACE(NCCL_GRAPH, "Found link_width (%s)=%d for busid=%s", linkAttr, ownWidth, busidAttr);
#endif
  }

  // Detect if a physical child is found. This information will be propagated up the stack.
  *foundPhysNet = 0;
  for (int d = 0; d < ndevs; d++) {
    if (node == physNetNodes[d]) {
      *foundPhysNet = 1;
      break;
    }
  }

  // Sum the widths of those children whose subtree contains a listed node.
  int childWidthSum = 0;
  for (int s = 0; s < node->nSubs; s++) {
    int childHasNet = 0;
    int childWidth = 0;
    NCCLCHECK(ncclTopoFindLinkWidthRec(node->subs[s], physNetNodes, ndevs, &childHasNet, &childWidth));
    if (!childHasNet) continue;
    *foundPhysNet = 1;
    childWidthSum += childWidth;
  }

  if (*foundPhysNet == 0) {
    // No child NICs were found, do not accrue any detected link_width
    *linkWidth = 0;
    INFO(NCCL_GRAPH, "Did not find child net device. Returning link_width=%d totalChildLinkWidth=%d", *linkWidth, childWidthSum);
    return ncclSuccess;
  }

  if (childWidthSum == 0) {
    // A listed node is in this subtree but the children reported no width:
    // use this node's own width (first pci node right above the physNetNode).
    *linkWidth = ownWidth;
  } else if (ownWidth > 0) {
    // Standard accrual: this node's width is the bottleneck for its children's sum.
    *linkWidth = std::min(ownWidth, childWidthSum);
  } else {
    *linkWidth = childWidthSum;
  }
  INFO(NCCL_GRAPH, "Found child net device for %s. Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, childWidthSum);
  return ncclSuccess;
}
// Depth-first search below `parent`: adds up the link widths reported by the
// direct children of `parent` whose subtree contains one of `physNetNodes`.
// Children without any of the listed nodes below them contribute nothing.
//
//   parent        common parent whose direct children are visited
//   physNetNodes  array of `ndevs` XML nodes to look for
//   ndevs         number of entries in `physNetNodes`
//   linkWidth     out: accumulated width (reset to 0 on entry)
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
ncclResult_t ncclTopoFindLinkWidth(ncclXmlNode* parent, ncclXmlNode** physNetNodes, int ndevs, int* linkWidth) {
  *linkWidth = 0;
  for (int s = 0; s < parent->nSubs; s++) {
    int hasNet = 0;
    int width = 0;
    NCCLCHECK(ncclTopoFindLinkWidthRec(parent->subs[s], physNetNodes, ndevs, &hasNet, &width));
    if (!hasNet) continue;
    *linkWidth += width;
  }
  return ncclSuccess;
}
// Computes the combined link width below `parent` for the given nodes and
// writes it as "link_width" on every "pci" node on the path from each of
// `physNetNodes` up to `parent`, and on `parent` itself when it is a "pci" node.
//
//   physNetNodes  array of `ndevs` XML nodes; each is walked upwards via `->parent`
//   ndevs         number of entries in `physNetNodes`
//   parent        node at which every upward walk stops
//
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
ncclResult_t ncclTopoWidenLinks(ncclXmlNode** physNetNodes, int ndevs, ncclXmlNode* parent) {
  int combinedWidth = 0;
  NCCLCHECK(ncclTopoFindLinkWidth(parent, physNetNodes, ndevs, &combinedWidth));

  for (int d = 0; d < ndevs; d++) {
    // Walk from the device towards `parent`, excluding `parent` itself.
    for (ncclXmlNode* cur = physNetNodes[d]; cur != parent; cur = cur->parent) {
      if (strcmp(cur->name, "pci") != 0) continue;
      NCCLCHECK(xmlSetAttrInt(cur, "link_width", combinedWidth));
      TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", combinedWidth, cur->name);
    }
  }

  // `parent` is handled once, after all the walks.
  if (strcmp(parent->name, "pci") == 0) {
    NCCLCHECK(xmlSetAttrInt(parent, "link_width", combinedWidth));
    TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", combinedWidth, parent->name);
  }
  return ncclSuccess;
}
ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) {
ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC];
ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC];
@@ -1257,54 +1346,50 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper
int path = PATH_LOC;
NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent));
if (path == PATH_LOC) {
*parent = NULL;
} else if (parent && strcmp((*parent)->name, "pci") == 0) {
// Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist
const char* c;
NCCLCHECK(xmlGetAttrStr(*parent, "class", &c));
if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) {
if (path == PATH_PHB || path == PATH_PXB || path == PATH_PIX) {
INFO(NCCL_GRAPH, "Widening links");
NCCLCHECK(ncclTopoWidenLinks(physNetNodes, vProps->ndevs, *parent));
}
if (*parent) {
if (strcmp((*parent)->name, "pci") == 0) {
// Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist
const char* c;
NCCLCHECK(xmlGetAttrStr(*parent, "class", &c));
if (c && strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) {
// If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid
NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
}
} else if (strcmp((*parent)->name, "cpu") == 0) {
// If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid
NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
}
}
TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path);
return ncclSuccess;
}
ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int physicalDevs) {
int* placedDevs = NULL;
struct ncclXmlNode** physNetNodes = NULL;
ncclNetProperties_t* props = NULL;
ncclResult_t res = ncclSuccess;
if (physicalDevs == 0) return ncclSuccess;
ncclCalloc(&physNetNodes, physicalDevs);
ncclResult_t res = ncclSuccess;
ncclNetProperties_t* props = NULL;
ncclCalloc(&props, physicalDevs);
NCCLCHECK(ncclCalloc(&physNetNodes, physicalDevs));
NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
NCCLCHECK(ncclCalloc(&props, physicalDevs));
for (int i = 0; i < physicalDevs; i++) {
NCCLCHECKGOTO(getProperties(i, props + i), res, out);
NCCLCHECKGOTO(netInfo->getProperties(i, props + i), res, out);
struct ncclXmlNode* physNetNode;
NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out);
physNetNodes[i] = physNetNode;
TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i, props[i].name);
}
// By default, don't merge any devices
int mergeLevel;
mergeLevel = PATH_PORT;
{ // Avoids warnings related to jumping to "out"
const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE");
NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
memset(placedDevs, 0, sizeof(int)*physicalDevs);
if (forceMerge) {
NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
}
}
NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
if (netInfo->forceMerge) NCCLCHECKGOTO(ncclTopoForceMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out);
NCCLCHECKGOTO(ncclTopoAutoMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out);
out:
free(physNetNodes);
@@ -1313,10 +1398,10 @@ out:
return res;
}
static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) {
static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, struct ncclTopoNetInfo* netInfo, int virtualNics) {
for (int n = startIndex; n < endIndex; n++) {
ncclNetProperties_t props;
NCCLCHECK(getProperties(n, &props));
NCCLCHECK(netInfo->getProperties(n, &props));
struct ncclXmlNode* netNode = NULL;
struct ncclXmlNode* parent = NULL;
if (virtualNics) {
@@ -1324,7 +1409,7 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name));
// In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC
// Only run this if the net doesn't exist locally - this may alter the XML state
if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent));
if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, netInfo->getProperties, &props.vProps, &parent));
}
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent));
@@ -1335,18 +1420,18 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
int dev;
xmlGetAttrIntDefault(netNode, "dev", &dev, -1);
if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n);
if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netInfo->name, dev, n);
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netInfo->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netInfo->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
// Only set coll if it's not 0
if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll));
if (netInfo->coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", netInfo->coll));
const char* keepAttr;
NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
@@ -1359,51 +1444,45 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn
}
// Calls to network plugin APIs should be protected. This function should be called inside a per-process lock.
ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) {
int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL);
if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics));
// Enumerate physical devices
NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport));
ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net) {
bool usePhysicalDevices = (dumpXmlFile || net->makeVDevice == NULL);
int nPhysicalNics, nVirtualNics;
NCCLCHECK(net->getDevCount(net->netPluginIndex, &nPhysicalNics, &nVirtualNics));
// List the physical devices in the topo
NCCLCHECK(ncclTopoPopulateNics(xml, 0, nPhysicalNics, net, /*virtual=*/false));
if (!usePhysicalDevices) {
if (state->nVirtualNics == -1) {
NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics));
// Virtual devices are only created once per network
if (nVirtualNics == NCCL_UNDEF_DEV_COUNT) {
NCCLCHECK(ncclTopoMakeVNics(xml, net, nPhysicalNics));
// Update the number of virtual devices both locally and in the state tracking the plugin.
// Note: 0 is a valid number of virtual devices
int nDevs;
NCCLCHECK(devices(&nDevs));
state->nVirtualNics = nDevs - state->nPhysicalNics;
NCCLCHECK(net->devices(&nDevs));
nVirtualNics = nDevs - nPhysicalNics;
NCCLCHECK(net->setVirtDevCount(net->netPluginIndex, nVirtualNics));
}
if (state->nVirtualNics > 0) {
// Populate new devices
NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport));
// populate the virtual devices if any
if (nVirtualNics > 0) {
NCCLCHECK(ncclTopoPopulateNics(xml, nPhysicalNics, nPhysicalNics + nVirtualNics, net, /*virtual=*/true));
}
}
return ncclSuccess;
}
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {};
ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {};
ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) {
INFO(NCCL_GRAPH, "Retrieving state for %s", name);
for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) {
// Empty slot
if (states[i].name == NULL) {
states[i].nVirtualNics = -1;
states[i].nPhysicalNics = -1;
states[i].name = strdup(name);
*state = states + i;
INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name);
return ncclSuccess;
// Found my slot
} else if (strcmp(states[i].name, name) == 0) {
*state = states + i;
return ncclSuccess;
}
ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge) {
if (forceMerge) *forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE");
const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
if (mergeLevelEnv) {
kvConvertToInt(mergeLevelEnv, mergeLevel, nicPathKvList);
} else {
*mergeLevel = PATH_PORT;
}
WARN("NET/TOPO : Couldn't find net with name %s", name);
return ncclInternalError;
return ncclSuccess;
}
static std::mutex netMutex;
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) {
ncclResult_t ret = ncclSuccess;
struct ncclXml* xml;
@@ -1411,7 +1490,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
int* localRanks = NULL;
struct ncclXml* rankXml;
int localRank = -1, nLocalRanks = 0;
int netLockHeld = 0;
struct ncclTopoNetInfo netInfo = {0};
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
@@ -1451,21 +1530,35 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
pthread_mutex_lock(&netLock);
netLockHeld = 1;
INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
ncclTopoNetState* state;
state = NULL;
if (collNetSupport(comm)) {
NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state,
comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail);
{
std::lock_guard<std::mutex> lock(netMutex);
INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology");
if (collNetSupport(comm)) {
netInfo.coll = 1;
netInfo.netPluginIndex = comm->netPluginIndex;
netInfo.dmaBufSupport = comm->dmaBufSupport;
netInfo.getDevCount = ncclCollNetGetDevCount;
netInfo.setVirtDevCount = ncclCollNetSetVirtDevCount;
netInfo.name = comm->ncclCollNet->name;
netInfo.getProperties = comm->ncclCollNet->getProperties;
netInfo.makeVDevice = comm->ncclCollNet->makeVDevice;
netInfo.devices = comm->ncclCollNet->devices;
NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge));
NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail);
}
netInfo.coll = 0;
netInfo.netPluginIndex = comm->netPluginIndex;
netInfo.dmaBufSupport = comm->dmaBufSupport;
netInfo.getDevCount = ncclNetGetDevCount;
netInfo.setVirtDevCount = ncclNetSetVirtDevCount;
netInfo.name = comm->ncclNet->name;
netInfo.getProperties = comm->ncclNet->getProperties;
netInfo.makeVDevice = comm->ncclNet->makeVDevice;
netInfo.devices = comm->ncclNet->devices;
NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge));
NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail);
}
NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state,
comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail);
pthread_mutex_unlock(&netLock);
netLockHeld = 0;
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
@@ -1523,7 +1616,6 @@ exit:
free(xml);
return ret;
fail:
if (netLockHeld) pthread_mutex_unlock(&netLock);
goto exit;
}
@@ -1578,6 +1670,38 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
return ncclSuccess;
}
// Policy controlling the netsPerGpu value that ncclTopoGetLocalNet() uses when
// spreading channels over a GPU's local network devices. Selected through the
// NCCL_NETDEVS_POLICY environment variable (parsed by getNetDevsPolicyOnce()).
enum netDevsPolicy {
// netsPerGpu = DIVUP(localNetCount, localGpuCount); also the fallback when the
// environment variable is unset or not recognized.
NETDEVS_POLICY_AUTO = 0x0,
// netsPerGpu = localNetCount.
NETDEVS_POLICY_ALL = 0x1,
// netsPerGpu = min(netDevsPolicyNum, localNetCount); requested as "MAX:<n>".
NETDEVS_POLICY_MAX = 0x2,
// Initial value: the environment variable has not been parsed yet.
NETDEVS_POLICY_UNDEF = 0xffffffff
};
// Policy in effect; written by getNetDevsPolicyOnce().
static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF;
// <n> from "MAX:<n>"; stays at -1 unless the MAX policy was selected.
static int netDevsPolicyNum = -1;
// Parses the NCCL_NETDEVS_POLICY environment variable into netDevsPolicy and
// netDevsPolicyNum.
//
// Recognized values (case-insensitive): "AUTO", "ALL" and "MAX:<n>" with n > 0.
// Any other value is reported through INFO and the policy falls back to AUTO;
// an unset variable selects AUTO silently.
//
// Takes no arguments and returns nothing so that it can be passed to
// pthread_once(), which is how ncclTopoGetLocalNet() invokes it.
static void getNetDevsPolicyOnce() {
  const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY");
  if (envStr) {
    if (strcasecmp(envStr, "AUTO") == 0) {
      netDevsPolicy = NETDEVS_POLICY_AUTO;
    } else if (strcasecmp(envStr, "ALL") == 0) {
      netDevsPolicy = NETDEVS_POLICY_ALL;
    } else if (strncasecmp(envStr, "MAX:", strlen("MAX:")) == 0) {
      // strtol() rather than atoi(): atoi() has undefined behavior when the
      // converted value does not fit in an int. strtol() saturates at LONG_MAX
      // instead, and the round-trip comparison below rejects anything that is
      // not representable as an int. In-range input yields the same number
      // atoi() would have produced.
      long parsed = strtol(envStr + strlen("MAX:"), NULL, 10);
      int envNum = (int)parsed;
      if (parsed > 0 && (long)envNum == parsed) {
        netDevsPolicy = NETDEVS_POLICY_MAX;
        netDevsPolicyNum = envNum;
      }
    }
    // Still UNDEF here means the string matched none of the accepted forms
    // (including "MAX:" followed by a non-positive or out-of-range number).
    if (netDevsPolicy == NETDEVS_POLICY_UNDEF)
      INFO(NCCL_ENV, "Unable to recognize NCCL_NETDEVS_POLICY=%s, using NCCL_NETDEVS_POLICY_AUTO instead.", envStr);
    else
      INFO(NCCL_ENV, "NCCL_NETDEVS_POLICY set by environment to %s", envStr);
  }
  if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO;
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true));
@@ -1592,13 +1716,30 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
return ncclInternalError;
}
int localGpus[NCCL_TOPO_MAX_NODES];
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once,getNetDevsPolicyOnce);
int netsPerGpu = 0;
if (netDevsPolicy == NETDEVS_POLICY_AUTO) {
int localGpus[NCCL_TOPO_MAX_NODES];
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
netsPerGpu = DIVUP(localNetCount, localGpuCount);
} else if (netDevsPolicy == NETDEVS_POLICY_ALL) {
netsPerGpu = localNetCount;
} else if (netDevsPolicy == NETDEVS_POLICY_MAX) {
if (netDevsPolicyNum <= 0) {
WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum);
return ncclInternalError;
}
netsPerGpu = std::min(netDevsPolicyNum, localNetCount);
} else {
WARN("Unknown netDevs policy");
return ncclInternalError;
}
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
net += channelId%(DIVUP(localNetCount,localGpuCount));
net += channelId%(netsPerGpu);
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
return ncclSuccess;
@@ -1656,25 +1797,10 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
cpu_set_t mask;
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev,
ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr)));
}
#endif
// Get the affinity of the CPU close to our GPU.
cpu_set_t cpuMask = cpu->cpu.affinity;
#ifdef ENABLE_TRACE
{
char affinityStr[sizeof(cpu_set_t)*2];
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev,
ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr)));
}
#endif
// Get the final affinity
cpu_set_t finalMask;
if (ncclParamIgnoreCpuAffinity())
// Ignore the CPU affinity set and use the GPU one instead
@@ -1685,12 +1811,22 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
memcpy(affinity, &finalMask, sizeof(cpu_set_t));
// If there is a non empty set, use it to set affinity
// display the final affinity
char msg[1024] = "";
snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "Affinity for GPU %d is ", gpu->gpu.dev);
if (CPU_COUNT(&finalMask)) {
char affinityStr[sizeof(cpu_set_t)*2];
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev,
ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr)));
(void)ncclCpusetToRangeStr(&finalMask, msg + strlen(msg), sizeof(msg) - strlen(msg));
} else {
snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "empty, ignoring");
}
snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ". (GPU affinity = ");
(void)ncclCpusetToRangeStr(&cpuMask, msg + strlen(msg), sizeof(msg) - strlen(msg));
if (!ncclParamIgnoreCpuAffinity()) {
snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " ; CPU affinity = ");
(void)ncclCpusetToRangeStr(&mask, msg + strlen(msg), sizeof(msg) - strlen(msg));
}
snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ").");
INFO(NCCL_INIT, "%s: %s", __func__, msg);
return ncclSuccess;
}
+20 -4
Datei anzeigen
@@ -229,12 +229,26 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int*
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink);
struct ncclTopoNetState {
int nVirtualNics;
int nPhysicalNics;
// Everything ncclTopoProcessNet() needs to know about one network plugin.
// ncclTopoGetSystem() fills one of these for the collNet plugin (when supported)
// and then again for the regular net plugin.
struct ncclTopoNetInfo {
// Set to 1 for collNet and 0 for net; a non-zero value is written as the
// "coll" attribute of the net XML nodes.
bool coll;
// communicator-specific information
// Passed as first argument to getDevCount / setVirtDevCount.
int netPluginIndex;
// When true, NCCL_PTR_DMABUF support in the device properties counts as GPU Direct RDMA support.
bool dmaBufSupport;
// NIC fusion
// Filled by ncclTopoGetFusionEnv() from NCCL_NET_MERGE_LEVEL (PATH_PORT if unset)
// and NCCL_NET_FORCE_MERGE (NULL if unset).
int mergeLevel;
const char* forceMerge;
// dev count tracking functions (not part of ncclNet)
// Called as getDevCount(netPluginIndex, &nPhysicalNics, &nVirtualNics).
ncclResult_t (*getDevCount)(int, int*, int*);
// Called as setVirtDevCount(netPluginIndex, nVirtualNics) once virtual devices were created.
ncclResult_t (*setVirtDevCount)(int, int);
// ncclNet API functions
// Plugin name, used in log messages.
const char* name;
ncclResult_t (*getProperties)(int, ncclNetProperties_t*);
// May be NULL, in which case only physical devices are listed.
ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*);
ncclResult_t (*devices)(int*);
};
ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport);
ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net);
ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge);
#define NCCL_TOPO_XML_MAX_NODES 8192
#define NCCL_GRAPH_XML_MAX_NODES 8192
@@ -279,6 +293,8 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
return ncclInternalError;
}
extern struct kvDict nicPathKvList[];
static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) {
*netDev = -1;
for (int i=0; i<system->nodes[NET].count; i++) {
+98 -51
Datei anzeigen
@@ -10,6 +10,7 @@
#include "device.h"
#include "comm.h"
#include "topo.h"
#include "nccl_tuner.h"
NCCL_PARAM(Nthreads, "NTHREADS", -2);
NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
@@ -484,40 +485,73 @@ static struct tuningModel rcclTuningModel[] = {
tuning_model_7,
};
/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2
#define BLACKWELL_COMPCAP_IDX 3
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
// LL128 max BW per channel
static const double llMaxBws[][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
/* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
/* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0},
/* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0},
// NVLS efficiency factor.
// Indexed by the compute-capability index (compCapIndex in ncclTopoTuneModel),
// where it scales graphs[a]->bwIntra when estimating NVLS / NVLS Tree bandwidth.
static const float nvlsEfficiency[NCCL_NUM_COMPCAPS] = {
0.0f, // Volta
0.0f, // Ampere
0.85f, // Hopper
0.74f, // Blackwell
};
static const double perChMaxRingLL128Bws[][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7},
// Default tuner constants
// ncclTopoInitTunerConstants() copies this table into comm->tunerConstants, and
// ncclTopoTuneModel() then reads (and may adjust) that per-communicator copy.
//
// Bandwidth tables have one column per node-count bucket, selected in
// ncclTopoTuneModel() by index2 = nNodes <= 2 ? nNodes-1 : 2, i.e.
// N1 = one node, N2 = two nodes, N4 = more than two nodes.
static const ncclTunerConstants_t ncclTunerConstantsDefaults = {
// One entry per algorithm, in the order given by the trailing comments.
// NOTE(review): the three values per entry presumably follow the same
// LL/LL128/Simple protocol order as hwLatencies below -- confirm.
.baseLatencies = {
{ 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
{ 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain
{ 0, 0, 0 }, { 0, 0, 0 }, // NVLS, NVLS Tree
{ 8.0, 8.0, 8.0 } // PAT
},
// Accessed as hwLatencies[hw][algo][proto]; one group per link type below.
.hwLatencies = {
/* NVLINK */
{ { .6, 1.25, 4.0 }, { .6, 1.9, 3.4 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
{ 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
{ 0, 0, 25 }, { 0, 0, 25 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
{ 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/
},
/* PCI */
{ { 1.0, 1.9, 4.0 }, { 1.0, 2.5, 5.7 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
{ 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
{ 0, 0, 0 }, { 0, 0, 0 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
{ 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/
},
/* NET */
{ { 5.0, 8.5, 14 }, { 2.7, 4.0, 14.0 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/
{ 0, 0, 31 }, { 0, 0, 30 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/
{ 0, 0, 18 }, { 0, 0, 14 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/
{ 0, 0, 14 } /* PAT (LL/LL128/Simple)*/
},
},
// LL max BW. The row is chosen differently from the other tables: on a single
// node it is the GPU compute-capability index; on multiple nodes it is 1 when
// the CPU vendor is AMD or mixed and 0 otherwise. Hence the mixed row labels.
.llMaxBws = {
{39.0, 39.0, 20.4}, /* (Volta-N1/Intel-N2/Intel-N4) */
{87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* (Ampere-N1/AMD-N2/AMD-N4) */
{141.0, 45.0 /*avg of ring & tree*/, 35.0}, /* (Hopper-N1/AMD-N2/AMD-N4) */
{2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, /* (Blackwell-N1/AMD-N2/AMD-N4) */
},
// The remaining tables are read as [compCapIndex][index2].
.perChMaxRingLL128Bws = {
{20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
{20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
{36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */
{2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */
},
.perChMaxTreeLL128Bws = {
{20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
{20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
{36.7, 36.7, 29.0}, /* Hopper (N1/N2/N4) */
{55.6, 31.67, 20.0}, /* Blackwell (N1/N2/N4) */
},
.perChMaxTreeBws = {
{26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
{24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
{38.7, 41.4, 36.0}, /* Hopper (N1/N2/N4) */
{70.0, 42.8, 24.0}, /* Blackwell (N1/N2/N4) */
},
// Upper bound applied to the NVLS Tree bandwidth estimate in ncclTopoTuneModel().
.perChMaxNVLSTreeBws = {
{26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
{24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
{0.0, 57.7, 45.5}, /* Hopper (N1/N2/N4) */
{0.0, 96.0, 43.1} /* Blackwell (N1/N2/N4) */
}
};
static const double perChMaxTreeLL128Bws[][3] = {
/* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
/* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
/* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0},
};
static const double perChMaxTreeBws[][3] = {
/* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
/* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
/* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
};
#endif
NCCL_PARAM(PatEnable, "PAT_ENABLE", 0);
static int ncclPatEnable(struct ncclComm* comm) {
@@ -542,6 +576,13 @@ static float getNetOverhead(struct ncclComm* comm) {
NCCL_PARAM(Ll128C2c, "LL128_C2C", 1);
// Seeds the communicator's tuner constants with the built-in defaults.
// Each communicator holds its own copy (comm->tunerConstants), which
// ncclTopoTuneModel() reads and may adjust. Always returns ncclSuccess.
ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) {
  const ncclTunerConstants_t& defaults = ncclTunerConstantsDefaults;
  comm->tunerConstants = defaults;
  return ncclSuccess;
}
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
static int rcclMaxThreads[NCCL_NUM_PROTOCOLS] = {0};
@@ -576,18 +617,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
int compCapIndex = minCompCap >= 100 ? NCCL_BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? NCCL_HOPPER_COMPCAP_IDX : minCompCap >= 80 ? NCCL_AMPERE_COMPCAP_IDX : NCCL_VOLTA_COMPCAP_IDX);
int index2 = nNodes <= 2 ? nNodes-1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
int index1 = nNodes == 1 ? compCapIndex :
(comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
double llMaxBw = llMaxBws[index1][index2];
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
#endif
double llMaxBw = comm->tunerConstants.llMaxBws[index1][index2];
double perChMaxTreeBw = comm->tunerConstants.perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = comm->tunerConstants.perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = comm->tunerConstants.perChMaxTreeLL128Bws[compCapIndex][index2];
double perChMaxNVLSTreeBw = comm->tunerConstants.perChMaxNVLSTreeBws[compCapIndex][index2];
// De-penalize Tree/Simple latency on Power systems to favor Tree over Ring
//if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
#endif
float ppn = (float)nRanks / nNodes;
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
@@ -621,18 +663,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
&& a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
//INFO(NCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", ncclAlgoStr[a], ncclProtoStr[p], busBw, comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
if (a == NCCL_ALGO_NVLS) {
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
if (a == NCCL_ALGO_NVLS_TREE || a == NCCL_ALGO_NVLS)
{
// NVLS/NVLStree needs at least 2 channels
if (graphs[a]->nChannels < 2 ) continue;
// Convert to NVLS busBW/channel
float intraBw = graphs[a]->bwIntra * nvlsEfficiency[compCapIndex] * (graphs[a]->nChannels - 1) / graphs[a]->nChannels;
// AllReduce pipelines two operations.
if (coll == ncclFuncAllReduce) {
bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
intraBw *= 2.0f;
} else {
// allgather and reducescatter
bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f);
intraBw *= (ppn - 1) / ppn;
}
}
if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
// Handle 2 node case of NVLSTree
float interBw = graphs[a]->bwInter * ((nNodes <= 2 && a == NCCL_ALGO_NVLS_TREE) ? 2 : 1);
bw = std::min( {intraBw, interBw, a == NCCL_ALGO_NVLS_TREE ? (float)perChMaxNVLSTreeBw : std::numeric_limits<float>::max()} );
};
#endif
float busBw = graphs[a]->nChannels * bw;
// Various model refinements
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
@@ -686,8 +735,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Convert bus BW to algorithm BW
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
float ratio = 1.0f;
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
if (a == NCCL_ALGO_RING || a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= (1.0 * nRanks) / nsteps;
else ratio *= .5;
busBw *= ratio;
}
@@ -735,8 +783,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_PAT) {
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
comm->latencies[coll][a][p] = 8 // Base time
+ log2i(nNodes) * (interLat/3.5) // Log latency
comm->latencies[coll][a][p] += log2i(nNodes) * (interLat/3.5) // Log latency
+ nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
}
}
+23 -21
Datei anzeigen
@@ -1008,31 +1008,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
if (*netNode != NULL) return ncclSuccess;
const char* pciSysPath = pciPath;
if (pciSysPath) {
char subSystem[PATH_MAX];
NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
// This is not a PCI device (virtual, usb, ...).
if (strcmp(subSystem, "pci") != 0) {
INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
pciSysPath = NULL;
}
}
struct ncclXmlNode* parent = NULL;
if (forceParent) {
parent = forceParent;
} else if (pciSysPath) {
int offset;
for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
strcpy(busId, pciSysPath+offset+1);
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
} else {
// Virtual NIC, no PCI device, attach to first CPU
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
const char* pciSysPath = pciPath;
if (pciSysPath) {
char subSystem[PATH_MAX];
NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
// This is not a PCI device (virtual, usb, ...).
if (strcmp(subSystem, "pci") != 0 && !forceParent) {
INFO(NCCL_NET | NCCL_GRAPH, "Topology detection: network path (name = %s) %s is not a PCI device (%s). Attaching to first CPU", netName, pciSysPath, subSystem);
pciSysPath = NULL;
}
}
if (pciSysPath) {
int offset;
for (offset = strlen(pciSysPath) - 1; pciSysPath[offset] != '/'; offset--);
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
strcpy(busId, pciSysPath + offset + 1);
NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent));
NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02"));
NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
} else {
// Virtual NIC, no PCI device, attach to first CPU
NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
}
}
struct ncclXmlNode* nicNode = NULL;
+7
Datei anzeigen
@@ -128,6 +128,13 @@ static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrN
return ncclSuccess;
}
// Reads attribute attrName of node as an unsigned 64-bit integer.
// When xmlGetAttr() yields no string for the attribute, *value receives
// defaultValue. Otherwise the string is converted with strtoull() using base 0,
// so decimal, 0x-prefixed hex and 0-prefixed octal are accepted.
// Returns ncclSuccess, or the error propagated from xmlGetAttr().
static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) {
  const char* text;
  NCCLCHECK(xmlGetAttr(node, attrName, &text));
  if (text == NULL) {
    *value = defaultValue;
  } else {
    *value = strtoull(text, NULL, 0);
  }
  return ncclSuccess;
}
static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) {
const char* str;
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
+158 -107
Datei anzeigen
@@ -14,6 +14,9 @@
#include "api_trace.h"
#include <assert.h>
#include "bootstrap.h"
#include "ce_coll.h"
#include "profiler.h"
#include "nvtx.h"
#include "msccl/msccl_lifecycle.h"
@@ -101,7 +104,7 @@ ncclResult_t ncclGroupStart_impl() {
NCCLCHECK(Recorder::instance().record(rrGroupStart, ncclGroupDepth));
}
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
NCCLCHECK(ncclGroupStartInternal());
TRACE_CALL("ncclGroupStart()");
@@ -123,7 +126,7 @@ ncclResult_t ncclGroupEnd_impl() {
NCCLCHECK(Recorder::instance().record(rrGroupEnd, ncclGroupDepth));
}
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit);
TRACE_CALL("ncclGroupEnd()");
exit:
@@ -137,7 +140,7 @@ ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) {
Recorder::instance().record(ncclGroupDepth, simInfo);
}
ncclResult_t ret = ncclSuccess;
NVTX3_FUNC_RANGE_IN(nccl_domain);
NCCL_NVTX3_FUNC_RANGE;
NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit);
TRACE_CALL("ncclGroupSimulateEnd()");
exit:
@@ -150,65 +153,88 @@ struct ncclPreconnectJob {
bool* algoNeedConnect;
};
// Argument block for ncclPrepareTasksAndCollPreconnectFunc().
struct ncclPrepareTasksAndCollPreconnectJob {
// Must stay the first member: the job function receives a struct ncclAsyncJob*
// and casts it back to this type.
struct ncclAsyncJob base;
// Communicator whose tasks are prepared and, if needed, pre-connected.
struct ncclComm* comm;
// Forwarded unchanged to ncclPrepareTasks().
ncclSimInfo_t* simInfo;
};
ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
struct ncclComm* comm = job->comm;
CUDACHECK(cudaSetDevice(comm->cudaDev));
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
if (comm->p2pNet) NCCLCHECK(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P_NET));
return ncclSuccess;
}
// Runs the connection / buffer setup routine of every algorithm flagged in
// algoNeedConnect.
//
// comm:            communicator to set up.
// algoNeedConnect: NCCL_NUM_ALGORITHMS flags indexed by NCCL_ALGO_*; entries
//                  that are false are skipped.
//
// Algorithms are handled in increasing index order. The first failing setup
// call aborts the loop and its error is returned (via NCCLCHECK); otherwise
// ncclSuccess is returned.
static ncclResult_t ncclCollPreconnect(struct ncclComm* comm, bool* algoNeedConnect) {
  for (int algo = 0; algo < NCCL_NUM_ALGORITHMS; ++algo) {
    if (!algoNeedConnect[algo]) continue;

    switch (algo) {
    case NCCL_ALGO_RING:
      NCCLCHECK(ncclTransportRingConnect(comm));
      break;
    case NCCL_ALGO_TREE:
      NCCLCHECK(ncclTransportTreeConnect(comm));
      break;
    case NCCL_ALGO_NVLS:
      /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
       * NVLS intra-node buffer */
      NCCLCHECK(ncclNvlsBufferSetup(comm));
      break;
    case NCCL_ALGO_NVLS_TREE:
      NCCLCHECK(ncclNvlsTreeConnect(comm));
      break;
    case NCCL_ALGO_COLLNET_CHAIN:
      NCCLCHECK(ncclCollNetChainBufferSetup(comm));
      break;
    case NCCL_ALGO_COLLNET_DIRECT:
      NCCLCHECK(ncclCollNetDirectBufferSetup(comm));
      break;
    case NCCL_ALGO_PAT:
      NCCLCHECK(ncclTransportPatConnect(comm));
      break;
    // Unreachable as long as every algorithm has a case above; kept as a
    // safety net, so the dead-code report is silenced.
    // coverity[dead_error_begin]
    default:
      NCCLCHECK(ncclInternalError);
    }
  }
  return ncclSuccess;
}
// Async job body: prepares the communicator's tasks and, when required,
// pre-connects the collective algorithms those tasks need.
//
// job_ must point at the `base` member of a ncclPrepareTasksAndCollPreconnectJob.
// Steps:
//   1. select the communicator's CUDA device;
//   2. when not running on the main thread, apply the communicator's CPU
//      affinity if it is non-empty;
//   3. call ncclPrepareTasks(), which reports which algorithms need connecting;
//   4. if comm->cuMemSupport is set and a connection is needed, run
//      ncclCollPreconnect().
// Returns ncclSuccess or the first error raised by the steps above.
ncclResult_t ncclPrepareTasksAndCollPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclPrepareTasksAndCollPreconnectJob* prepJob = (struct ncclPrepareTasksAndCollPreconnectJob*)job_;
  struct ncclComm* comm = prepJob->comm;

  bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
  bool needConnect;
  memset(algoNeedConnect, 0, sizeof(algoNeedConnect));

  CUDACHECK(cudaSetDevice(comm->cudaDev));
  if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) {
    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
  }

  NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, prepJob->simInfo));
  if (!(comm->cuMemSupport && needConnect)) return ncclSuccess;

  NCCLCHECK(ncclCollPreconnect(comm, algoNeedConnect));
  return ncclSuccess;
}
ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
struct ncclComm* comm = job->comm;
ncclResult_t ret = ncclSuccess;
CUDACHECK(cudaSetDevice(comm->cudaDev));
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) {
if (job->algoNeedConnect[i]) {
switch (i) {
case NCCL_ALGO_RING: {
NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
break;
}
case NCCL_ALGO_TREE: {
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
break;
}
case NCCL_ALGO_NVLS: {
/* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
* NVLS intra-node buffer */
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
break;
}
case NCCL_ALGO_NVLS_TREE: {
NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
break;
}
case NCCL_ALGO_COLLNET_CHAIN: {
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
break;
}
case NCCL_ALGO_COLLNET_DIRECT: {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
break;
}
case NCCL_ALGO_PAT: {
NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
break;
}
// Yes, it's a dead code. That's fine...
// coverity[dead_error_begin]
default: {
ret = ncclInternalError;
goto fail;
}
}
}
}
if (!job_->isThreadMain) CUDACHECK(cudaSetDevice(comm->cudaDev));
if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
NCCLCHECKGOTO(ncclCollPreconnect(comm, job->algoNeedConnect), ret, fail);
exit:
free(job->algoNeedConnect);
@@ -222,52 +248,33 @@ struct ncclGroupSymmetricJob {
struct ncclComm* comm;
};
NCCL_PARAM(WinStride, "WIN_STRIDE", -1);
ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) {
struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_;
struct ncclComm* comm = job->comm;
ncclResult_t ret = ncclSuccess;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
if (comm->baseStride == 0) {
cudaStream_t hostStream;
// first time to allocate symmetric VA space.
// calling into this function means symmetric is supported.
struct ncclSymDevBase* symBase = NULL;
size_t size = ncclSymDevBase::size(comm->localRanks);
if (ncclParamWinStride() != -1) {
comm->baseStride = ncclParamWinStride();
} else {
size_t maxStride = 0;
for (int r = 0; r < comm->nRanks; ++r)
if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem;
comm->baseStride = maxStride;
}
INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30);
NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail);
NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail);
comm->symAllocHead = 0;
// Allocate symmetric memory for NCCL internal usage
NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail);
assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride));
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail);
CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);
comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride);
comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr;
comm->symDevComm.nRanks = comm->localRanks;
comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks);
comm->symDevComm.rank = comm->localRank;
comm->symDevComm.stride4G = comm->baseStride >> 32;
while (!ncclIntruQueueEmpty(&comm->devrState.regTaskQueue)) {
struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&comm->devrState.regTaskQueue);
NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(
comm, task->userPtr, task->userSize, task->winFlags, task->outWinDev),
ret, fail);
free(task);
}
while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) {
struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue);
NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail);
while (!ncclIntruQueueEmpty(&comm->devrState.commCreateTaskQueue)) {
struct ncclDevrCommCreateTask* task = ncclIntruQueueDequeue(&comm->devrState.commCreateTaskQueue);
NCCLCHECKGOTO(ncclDevrCommCreateInternal(
comm, (struct ncclDevCommRequirements const*)task->reqs, task->outDevComm),
ret, fail);
freeDevCommRequirements(task->reqs); // free additional task memory for reqs
free(task);
}
while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue);
NCCLCHECKGOTO(ncclCeInit(task->comm), ret, fail);
free(task);
}
@@ -324,7 +331,11 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
comm->planner.unlaunchedPlansHead = plan->next;
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
if (plan->isCeColl) {
NCCLCHECKGOTO(ncclLaunchCeColl(comm, plan), result, failure);
} else {
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
}
}
// Barrier reduction input indicates if we require further rounds.
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
@@ -422,6 +433,12 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
if (job->next == nullptr) {
job->isThreadMain = true;
ncclAsyncJobMain(job);
job->state = ncclGroupJobJoined;
return job->result;
}
do {
PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
job = job->next;
@@ -474,6 +491,51 @@ fail:
goto exit;
}
NCCL_PARAM(SingleProcMemRegEnable, "SINGLE_PROC_MEM_REG_ENABLE", 0);
// Prepares the tasks of `comm` and queues any follow-up work on `asyncCollJobs`.
//
// Two modes, selected by the SINGLE_PROC_MEM_REG_ENABLE parameter:
//  * enabled  : nothing is prepared here; a job running
//               ncclPrepareTasksAndCollPreconnectFunc is enqueued instead.
//  * disabled : ncclPrepareTasks() runs immediately, and a job running
//               ncclCollPreconnectFunc is enqueued only when the communicator
//               has cuMemSupport and at least one algorithm needs connecting.
//
// Returns ncclSuccess, or the first error reported through CUDACHECK/NCCLCHECK.
static ncclResult_t ncclPrepareTasksAndCollPreconnect(struct ncclComm* comm, ncclSimInfo_t* simInfo, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncCollJobs) {
  if (ncclParamSingleProcMemRegEnable()) {
    struct ncclPrepareTasksAndCollPreconnectJob* deferred;
    NCCLCHECK(ncclCalloc(&deferred, 1));
    deferred->comm = comm;
    deferred->simInfo = simInfo;
    deferred->base.func = ncclPrepareTasksAndCollPreconnectFunc;
    deferred->base.undo = nullptr;
    deferred->base.destructor = free;
    deferred->base.state = ncclGroupJobRunning;
    deferred->base.abortFlag = comm->abortFlag;
    deferred->base.abortFlagDev = comm->abortFlagDev;
    ncclIntruQueueEnqueue(asyncCollJobs, &deferred->base);
    return ncclSuccess;
  }

  // Synchronous preparation on the calling thread.
  bool connectFlags[NCCL_NUM_ALGORITHMS];
  bool anyConnect = false;
  memset(connectFlags, 0, sizeof(connectFlags));
  CUDACHECK(cudaSetDevice(comm->cudaDev));
  NCCLCHECK(ncclPrepareTasks(comm, connectFlags, &anyConnect, simInfo));
  if (!comm->cuMemSupport || !anyConnect) return ncclSuccess;

  struct ncclPreconnectJob* preconnect;
  NCCLCHECK(ncclCalloc(&preconnect, 1));
  preconnect->comm = comm;
  preconnect->base.func = ncclCollPreconnectFunc;
  preconnect->base.undo = nullptr;
  preconnect->base.destructor = free;
  preconnect->base.state = ncclGroupJobRunning;
  preconnect->base.abortFlag = comm->abortFlag;
  preconnect->base.abortFlagDev = comm->abortFlagDev;

  // The job carries its own heap copy of the per-algorithm flags; if that
  // allocation fails the half-built job is released before the error is
  // propagated.
  ncclResult_t allocRes = ncclCalloc(&preconnect->algoNeedConnect, NCCL_NUM_ALGORITHMS);
  if (allocRes != ncclSuccess) {
    free(preconnect);
    NCCLCHECK(allocRes);
  }
  memcpy(preconnect->algoNeedConnect, connectFlags, sizeof(connectFlags));
  ncclIntruQueueEnqueue(asyncCollJobs, &preconnect->base);
  return ncclSuccess;
}
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
ncclResult_t ret = ncclSuccess;
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
@@ -548,27 +610,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
// at the same time.
comm = cliqueHead;
do {
bool needConnect = false;
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
if (comm->cuMemSupport && needConnect) {
struct ncclPreconnectJob* job;
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->base.func = ncclCollPreconnectFunc;
job->base.undo = nullptr;
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
}
NCCLCHECKGOTO(ncclPrepareTasksAndCollPreconnect(comm, simInfo, &asyncCollJobs), ret, fail);
comm = comm->groupNext[ncclGroupTaskTypeCollective];
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
// connect
@@ -650,6 +692,13 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
if (mscclAvailable() && !mscclIsCaller()) {
NCCLCHECK(mscclGroupEnd());
}
if (ncclProfilerApiState.profilerGroupDepth > 0) {
ncclProfilerApiState.profilerGroupDepth--;
}
if (ncclProfilerApiState.profilerGroupDepth == 0) {
NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupEndApiStart));
}
if ((--ncclGroupDepth) > 0) goto exit;
@@ -735,6 +784,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
groupLocalResetJobState();
exit:
// Profiler group API start is called inside taskAppend to get graph capture information for the event
NCCLCHECK(ncclProfilerStopGroupApiEvent());
return ret;
fail:
if (groupJob) {
+50 -2
Datei anzeigen
@@ -7,7 +7,55 @@
#ifndef NCCL_ALLOCATOR_H_
#define NCCL_ALLOCATOR_H_
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
////////////////////////////////////////////////////////////////////////////////
// ncclSpace: Allocates contiguous segments of non-negative integers. Useful
// as a memory allocator when we can't put allocator state within the memory
// being allocated.
// Allocator state manipulated by ncclSpaceConstruct/Destruct/Alloc/Free.
struct ncclSpace {
// NOTE(review): presumably the number of entries of `cuts` currently in use -- confirm against the implementation.
int count;
// NOTE(review): presumably the number of entries allocated for `cuts` -- confirm against the implementation.
int capacity;
// Pointer to 64-bit values. NOTE(review): the name suggests segment boundaries within the space -- confirm.
int64_t* cuts;
};
void ncclSpaceConstruct(struct ncclSpace* a);
void ncclSpaceDestruct(struct ncclSpace* a);
ncclResult_t ncclSpaceAlloc(struct ncclSpace* a, int64_t spaceLimit, int64_t objSize, int objAlign, int64_t* outObjOffset);
ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t objOffset, int64_t objSize);
////////////////////////////////////////////////////////////////////////////////
// ncclShadowPool: Allocates device-side objects, their host-side shadows, and
// maintains the device->host object address mapping.
struct ncclShadowObject;
struct ncclShadowPage;
// Pool state passed to every ncclShadowPool* function.
struct ncclShadowPool {
// NOTE(review): looks like an object count and a hash-bit width for `table` -- confirm against the implementation.
int count, hbits;
// Array of pointers to ncclShadowObject (type only forward-declared here).
// NOTE(review): presumably the lookup table used by ncclShadowPoolToHost -- confirm.
struct ncclShadowObject** table;
// CUDA memory pool handle. NOTE(review): presumably the source of the device-side objects -- confirm.
cudaMemPool_t memPool;
// Pointer to ncclShadowPage (type only forward-declared here). NOTE(review): presumably a list of pages -- confirm.
struct ncclShadowPage* pages;
};
void ncclShadowPoolConstruct(struct ncclShadowPool*);
ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool*);
ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool*, size_t size, void** outDevObj, void** outHostObj, cudaStream_t stream);
ncclResult_t ncclShadowPoolFree(struct ncclShadowPool*, void* devObj, cudaStream_t stream);
ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool*, void* devObj, void** outHostObj);
// Typed convenience overload of ncclShadowPoolAlloc: requests sizeof(T) bytes
// from the untyped overload and hands the resulting pointers back as T*.
//
// pool       : pool to allocate from.
// outDevObj  : if non-null, receives the device-side pointer.
// outHostObj : if non-null, receives the host-side pointer.
// stream     : forwarded unchanged to the untyped overload.
//
// Returns whatever the untyped overload returns. The out-pointers are written
// regardless of the result; they receive nullptr when the untyped overload
// did not store a pointer.
template<typename T>
static inline ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool* pool, T** outDevObj, T** outHostObj, cudaStream_t stream) {
  // Initialized so that a failing allocation can never leak an indeterminate
  // pointer value to the caller.
  void* devObj = nullptr;
  void* hostObj = nullptr;
  ncclResult_t got = ncclShadowPoolAlloc(pool, sizeof(T), &devObj, &hostObj, stream);
  if (outDevObj) *outDevObj = (T*)devObj;
  if (outHostObj) *outHostObj = (T*)hostObj;
  return got;
}
// Typed convenience overload of ncclShadowPoolToHost: erases the element type
// of both arguments and forwards to the untyped (void*, void**) overload,
// returning its result unchanged.
template<typename T>
static inline ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, T* devObj, T** hostObj) {
  void* untypedDev = (void*)devObj;
  void** untypedHostOut = (void**)hostObj;
  return ncclShadowPoolToHost(pool, untypedDev, untypedHostOut);
}
#endif
+8 -6
Datei anzeigen
@@ -31,7 +31,7 @@
#define RCCL_API_TRACE_VERSION_MAJOR 0
// should be increased every time new members are added to existing dispatch tables
#define RCCL_API_TRACE_VERSION_PATCH 2
#define RCCL_API_TRACE_VERSION_PATCH 3
#if !defined(RCCL_EXTERN_C_INIT)
# ifdef __cplusplus
@@ -65,10 +65,10 @@ typedef ncclResult_t (*ncclAllReduceWithBias_fn_t)(const void* sendbuff, void* r
size_t count, ncclDataType_t datatype,
ncclRedOp_t op, struct ncclComm* comm,
hipStream_t stream, const void* acc);
typedef ncclResult_t (*ncclAllToAll_fn_t)(const void* sendbuff, void* recvbuff,
typedef ncclResult_t (*ncclAlltoAll_fn_t)(const void* sendbuff, void* recvbuff,
size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream);
typedef ncclResult_t (*ncclAllToAllv_fn_t)(
typedef ncclResult_t (*ncclAlltoAllv_fn_t)(
const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
@@ -162,7 +162,7 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* userPtr, size_t userSize, ncclWindow_t* outWinDev, int winFlags);
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
@@ -172,8 +172,8 @@ typedef struct rcclApiFuncTable
uint64_t size;
ncclAllGather_fn_t ncclAllGather_fn;
ncclAllReduce_fn_t ncclAllReduce_fn;
ncclAllToAll_fn_t ncclAllToAll_fn;
ncclAllToAllv_fn_t ncclAllToAllv_fn;
ncclAlltoAll_fn_t ncclAllToAll_fn;
ncclAlltoAllv_fn_t ncclAllToAllv_fn;
ncclBroadcast_fn_t ncclBroadcast_fn;
ncclGather_fn_t ncclGather_fn;
ncclReduce_fn_t ncclReduce_fn;
@@ -211,6 +211,8 @@ typedef struct rcclApiFuncTable
ncclCommShrink_fn_t ncclCommShrink_fn;
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
ncclAlltoAll_fn_t ncclAlltoAll_fn;
ncclAlltoAllv_fn_t ncclAlltoAllv_fn;
// ADD NEW FUNCTIONS HERE ONLY
} rcclApiFuncTable;
+22 -7
Datei anzeigen
@@ -41,6 +41,9 @@ constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) {
#endif
}
// BIT(x): unsigned long with only bit number x set.
// x must be smaller than the bit width of unsigned long.
#define BIT(x) (1UL << (x))
// MASK(x): unsigned long with the x lowest bits set (MASK(0) == 0).
// The argument is parenthesized so that expressions built from operators with
// lower precedence than << (e.g. `a | b`, `c ? a : b`) expand correctly.
// x must be smaller than the bit width of unsigned long.
#define MASK(x) ((1UL << (x)) - 1UL)
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
@@ -68,14 +71,26 @@ static __host__ __device__ constexpr Z roundDown(X x, Y y) {
}
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
static __host__ __device__ constexpr Z alignUp(X x, int a) {
return (x + a-1) & Z(-a);
template<typename X, typename Y, typename Z = decltype(X()+Y())>
static __host__ __device__ constexpr Z alignUp(X x, Y a) {
return (x + a-1) & -Z(a);
}
template<typename T>
static __host__ __device__ T* alignUp(T* x, size_t a) {
static_assert(sizeof(T) == 1, "Only single byte types allowed.");
return reinterpret_cast<T*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a));
}
// assumes second argument is a power of 2
template<typename X, typename Z = decltype(X()+int())>
static __host__ __device__ constexpr Z alignDown(X x, int a) {
return x & Z(-a);
template<typename X, typename Y, typename Z = decltype(X()+Y())>
static __host__ __device__ constexpr Z alignDown(X x, Y a) {
return x & -Z(a);
}
template<typename T>
static __host__ __device__ T* alignDown(T* x, size_t a) {
static_assert(sizeof(T) == 1, "Only single byte types allowed.");
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a));
}
template<typename Int>
@@ -341,7 +356,7 @@ static __host__ __device__ UInt reverseSubBits(UInt x) {
default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type.");
}
return reverseSubBits<UInt, 8>(x);
} else if (nSubBits == 1) {
} else if (nSubBits <= 1) {
return x;
} else {
UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1);
+76
Datei anzeigen
@@ -0,0 +1,76 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CE_COLL_H_
#define NCCL_CE_COLL_H_
#include "nccl.h"
#include "nccl_common.h"
#include "bitops.h"
// Memory operations per rank for different synchronization protocols
#define NCCL_CE_SYNC_OPS_PER_RANK_MC 2
#define NCCL_CE_SYNC_OPS_PER_RANK_UC 3
// Per-communicator state for CE collectives.
// NOTE(review): field meanings below are inferred from names only -- confirm against ce_coll.cc.
struct ncclCeColl {
// Byte pointers; NOTE(review): presumably the "ready" and "complete" synchronization areas -- confirm.
uint8_t* baseUCSymReadyPtr;
uint8_t* baseUCSymComplPtr;
// NOTE(review): presumably the offsets that correspond to the two pointers above -- confirm.
size_t baseUCSymReadyOffset;
size_t baseUCSymComplOffset;
// NOTE(review): presumably a running sequence number for CE operations -- confirm.
uint32_t ceSeqNum;
// NOTE(review): presumably selects between the ready and complete areas -- confirm.
bool useCompletePtr;
// NOTE(review): presumably tuning knobs for synchronization inside a batch -- confirm units and defaults.
uint32_t intraBatchSyncFreq;
uint64_t intraBatchSyncMsgThreshold;
// Window used for CE synchronization. NOTE(review): ownership/lifetime not visible here -- confirm.
struct ncclDevrWindow* ceSyncWin;
};
// A queued request to run ncclCeInit() for a communicator.
struct ncclCeInitTask {
// Intrusive link; this member is the `next` pointer named by ncclIntruQueue<ncclCeInitTask, &ncclCeInitTask::next>.
struct ncclCeInitTask *next;
// Communicator the initialization is requested for.
struct ncclComm* comm;
};
// Argument bundle passed to the ncclCe* collective entry points
// (ncclCeAllGather, ncclCeScatter, ncclCeGather, ncclCeAlltoAll).
// The struct is over-aligned to 16 bytes.
struct alignas(16) ncclCeCollArgs {
// Which collective this describes.
ncclFunc_t func;
// NOTE(review): presumably only meaningful for rooted collectives -- confirm.
int rootRank;
// NOTE(review): presumably element count and size in bytes of one element -- confirm whether nElts is per rank or total.
size_t nElts;
size_t eltSize;
// Send and receive buffers, addressed as bytes.
uint8_t* sendBuff;
uint8_t* recvBuff;
// NOTE(review): presumably the registered windows that contain sendBuff / recvBuff -- confirm.
struct ncclDevrWindow* sendWin;
struct ncclDevrWindow* recvWin;
};
// Parameters describing a batch of CE copy operations.
struct ncclCeBatchOpsParams {
// NOTE(review): presumably three parallel arrays of numOps entries (destination, source, byte size) -- confirm.
void** dsts;
void** srcs;
size_t* sizes;
size_t numOps;
// NOTE(review): presumably requests synchronization inside the batch -- confirm.
bool intraBatchSync;
// The attribute fields below exist only when building against CUDA runtime 12.8 or newer.
#if CUDART_VERSION >= 12080
// NOTE(review): presumably numAttrs attributes plus the index mapping used by the batched memcpy API -- confirm.
cudaMemcpyAttributes* attrs;
size_t* attrIdxs;
size_t numAttrs;
#endif
};
bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
ncclResult_t ncclCeInit(struct ncclComm* comm);
ncclResult_t ncclCeFinalize(struct ncclComm* comm);
ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream);
#endif /* NCCL_CE_COLL_H_ */
+4 -3
Datei anzeigen
@@ -17,16 +17,17 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound, int p2pBatchEnable = 0) {
int base;
if (comm->nNodes > 1) {
int nodeDelta = p2pRound/comm->maxLocalRanks;
int localDelta = p2pRound%comm->maxLocalRanks;
int batchSize = (comm->nNodes > 2 && p2pBatchEnable) ? NCCL_MAX_DEV_WORK_P2P_PER_BATCH : 1;
int base = nodeDelta*divUp(comm->maxLocalRanks, batchSize);
base = nodeDelta*divUp(comm->maxLocalRanks, batchSize);
base += localDelta/batchSize;
return base & 0xff;
} else {
return p2pRound & 0xff;
base = p2pRound;
}
return base & 0xff;
}
#endif
+2 -1
Datei anzeigen
@@ -16,7 +16,7 @@ typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(comm->collNetContext, dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
@@ -29,6 +29,7 @@ static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* d
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
// Forwards to the CollNet plugin's finalize() entry with the given context.
// Returns ncclSuccess, or the plugin's error as propagated by NCCLCHECK.
static ncclResult_t collNetFinalize(struct ncclComm* comm, void* ctx) {
  NCCLCHECK(comm->ncclCollNet->finalize(ctx));
  return ncclSuccess;
}
// Returns 1 when the communicator has a CollNet plugin attached, 0 otherwise.
static int collNetSupport(struct ncclComm* comm) {
  if (comm->ncclCollNet == nullptr) return 0;
  return 1;
}
+7 -1
Datei anzeigen
@@ -10,7 +10,7 @@
#define NCCL_COLLECTIVES_H_
#include "nccl.h"
#include "nccl_common.h"
#include "nccl_tuner.h"
#include "device.h"
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
@@ -25,11 +25,17 @@
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLTOALL_SLICESTEPS 1
#define ALLTOALL_CHUNKSTEPS 1
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_SLICESTEPS_SINGLE_NODE (NCCL_STEPS/2)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define GATHER_SLICESTEPS 1
#define GATHER_CHUNKSTEPS 1
#define SCATTER_SLICESTEPS 1
#define SCATTER_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above
+38 -15
Datei anzeigen
@@ -20,6 +20,9 @@
#include "nvmlwrap.h"
#include "profiler.h"
#include "allocator.h"
#include "dev_runtime.h"
#include "sym_kernels.h"
#include "ce_coll.h"
#include "latency_profiler/CollTrace.h"
#include "rccl_common.h"
#include "recorder.h"
@@ -217,13 +220,15 @@ struct ncclTaskColl {
#endif
int32_t nWarps:8;
int32_t algorithm:8, protocol:8, pipeline:8;
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
uint32_t isCollnet:1, isNvls:1, isSymLast:1;
uint32_t devFuncId:29;
int regBufType;
uint64_t opCount;
// number of elements in planner->ipcMemQueue associated with this collective
int nCleanupQueueElts;
struct ncclDevrWindow* sendWin;
struct ncclDevrWindow* recvWin;
void* sendMhandle;
void* recvMhandle;
void** sendNetHandles;
@@ -237,12 +242,16 @@ struct ncclTaskColl {
// Profiler plugin
int eActivationMask;
void* groupApiEventHandle;
void* collApiEventHandle;
void* eventHandle;
uint8_t nChannels;
};
struct ncclTaskP2p {
struct ncclTaskP2p* next;
ncclFunc_t func;
ncclFunc_t collAPI;
void* buff;
size_t count;
ncclDataType_t datatype;
@@ -252,6 +261,8 @@ struct ncclTaskP2p {
// Profiler plugin
int eActivationMask;
void* groupApiEventHandle;
void* p2pApiEventHandle;
void* eventHandle;
uint8_t nChannels;
};
@@ -267,12 +278,14 @@ struct ncclKernelPlan {
bool persistent; // aka captured in a graph
bool isHostCbEnq;
bool isSymColl;
bool isCeColl;
enum ncclDevWorkStorageType workStorageType;
bool kernelSpecialized;
void* kernelFn;
union {
struct ncclDevKernelArgs* kernelArgs;
struct ncclSymDevArgs* kernelSymArgs;
void* kernelSymArgs;
struct ncclCeCollArgs* ceCollArgs;
};
size_t kernelArgsSize;
struct channelMasks channelMask;
@@ -291,6 +304,8 @@ struct ncclKernelPlan {
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
// Profiler plugin
void* groupApiEventHandle;
void* kernelLaunchEventHandle;
void* groupEventHandle;
};
@@ -381,9 +396,8 @@ struct ncclKernelPlanner {
struct ncclTaskCollSorter collSorter;
struct Peer* peers/*[nRanks]*/;
int nTasksColl, nTasksP2p;
int nTasksP2pSend, nTasksP2pRecv;
bool persistent;
bool isSymColl;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// Keep track of the number of user streams
@@ -401,6 +415,8 @@ struct ncclKernelPlanner {
//////////////////////////////////////////////////////////////////////////////
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collCeTaskQueue;
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collSymTaskQueue;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> tmpCollWorkQueue;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
@@ -459,6 +475,8 @@ typedef enum ncclGroupTaskType {
ncclGroupTaskTypeNum = 2,
} ncclGroupTaskType_t;
struct ncclCommSymTeams;
struct ncclComm {
uint64_t startMagic;
struct ncclMemoryStack memPermanent, memScoped;
@@ -478,10 +496,12 @@ struct ncclComm {
bool peerInfoValid;
ncclNet_t* ncclNet;
void* netContext;
int netPluginIndex;
int ncclNetVer;
ncclNetDeviceType netDeviceType;
ncclCollNet_t* ncclCollNet;
void* collNetContext;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
struct channelMasks* connectSend;
@@ -517,6 +537,7 @@ struct ncclComm {
int localRank;
int localRanks;
int maxLocalRanks;
int minLocalRanks;
int* rankToNode;
int* rankToLocalRank;
int* localRankToRank;
@@ -527,6 +548,9 @@ struct ncclComm {
struct cliqueInfo clique; // Our MNNVL clique information
int cliqueRank; // Our rank within the MNNVL clique
// NVL Domain info
ncclNvlDomainInfo_v5_t nvlDomainInfo;
bool checkPointers;
bool dmaBufSupport;
@@ -553,7 +577,8 @@ struct ncclComm {
int p2pChunkSize;
int nvlsChunkSize;
// Algorithm/Protocols thresholds
// Tuner values
ncclTunerConstants_t tunerConstants;
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -579,8 +604,7 @@ struct ncclComm {
bool hasFineGrain;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
struct ncclSymDevComm symDevComm;
struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm
uint32_t workArgsBytes; // max size of kernel args
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
@@ -703,6 +727,10 @@ struct ncclComm {
uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
struct ncclProfilerProxy profiler;
// CE Collective
struct ncclCeColl ceColl;
struct ncclIntruQueue<struct ncclCeInitTask, &ncclCeInitTask::next> ceInitTaskQueue;
// buffer registration cache
struct ncclRegCache regCache;
int isAllNvlink;
@@ -712,13 +740,8 @@ struct ncclComm {
bool useGdr;
int splitCount;
// symmetric buffer
uint8_t* baseUCSymPtr;
uint8_t* baseMCSymPtr;
size_t baseStride;
size_t symAllocHead;
CUmemGenericAllocationHandle symMCHandle;
struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
struct ncclDevrState devrState; // The symmetric runtime state
struct ncclSymkState symkState; // The symmetric kernels state (built on previous)
// unroll factor for comm [RCCL]
int unroll;
+1
Datei anzeigen
@@ -17,6 +17,7 @@
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden Mehr anzeigen