Merge remote-tracking branch 'nccl/master' into develop
Этот коммит содержится в:
+2
-1
@@ -2,7 +2,7 @@
|
||||
|
||||
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
|
||||
|
||||
## Unreleased - RCCL 2.25.1 for ROCm 7.0.0
|
||||
## Unreleased - RCCL 2.26.6 for ROCm 7.0.0
|
||||
|
||||
### Resolved issues
|
||||
|
||||
@@ -29,6 +29,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
* Compatibility with NCCL 2.23.4
|
||||
* Compatibility with NCCL 2.24.3
|
||||
* Compatibility with NCCL 2.25.1
|
||||
* Compatibility with NCCL 2.26.6
|
||||
|
||||
## RCCL 2.22.3 for ROCm 6.4.1
|
||||
|
||||
|
||||
+39
-9
@@ -423,7 +423,6 @@ set(SRC_FILES
|
||||
src/init.cc
|
||||
src/init_nvtx.cc
|
||||
src/mnnvl.cc
|
||||
src/net.cc
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
src/rccl_wrap.cc
|
||||
@@ -491,9 +490,6 @@ set(SRC_FILES
|
||||
src/include/ipcsocket.h
|
||||
src/include/mnnvl.h
|
||||
src/include/nccl_common.h
|
||||
src/include/nccl_net.h
|
||||
src/include/nccl_profiler.h
|
||||
src/include/nccl_tuner.h
|
||||
src/include/net_device.h
|
||||
src/include/net.h
|
||||
src/include/nvmlwrap.h
|
||||
@@ -566,6 +562,25 @@ set(SRC_FILES
|
||||
src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
|
||||
src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
|
||||
src/include/nvtx3/nvtxDetail/nvtxTypes.h
|
||||
src/include/plugin/nccl_net.h
|
||||
src/include/plugin/nccl_profiler.h
|
||||
src/include/plugin/nccl_tuner.h
|
||||
src/include/plugin/plugin.h
|
||||
src/include/plugin/net/net_v6.h
|
||||
src/include/plugin/net/net_v7.h
|
||||
src/include/plugin/net/net_v8.h
|
||||
src/include/plugin/net/net_v9.h
|
||||
src/include/plugin/net/net_v10.h
|
||||
src/include/plugin/profiler/net_ib_v1.h
|
||||
src/include/plugin/profiler/net_ib.h
|
||||
src/include/plugin/profiler/net_socket_v1.h
|
||||
src/include/plugin/profiler/net_socket.h
|
||||
src/include/plugin/profiler/profiler_v1.h
|
||||
src/include/plugin/profiler/profiler_v2.h
|
||||
src/include/plugin/profiler/profiler_v3.h
|
||||
src/include/plugin/tuner/tuner_v2.h
|
||||
src/include/plugin/tuner/tuner_v3.h
|
||||
src/include/plugin/tuner/tuner_v4.h
|
||||
src/misc/alt_rsmi.cc
|
||||
src/misc/archinfo.cc
|
||||
src/misc/argcheck.cc
|
||||
@@ -580,7 +595,6 @@ set(SRC_FILES
|
||||
# src/misc/nvmlwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/param.cc
|
||||
src/misc/profiler.cc
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
src/misc/rocmwrap.cc
|
||||
src/misc/roctx.cc
|
||||
@@ -589,12 +603,26 @@ set(SRC_FILES
|
||||
src/misc/signals.cc
|
||||
src/misc/socket.cc
|
||||
src/misc/strongstream.cc
|
||||
src/misc/tuner.cc
|
||||
src/misc/utils.cc
|
||||
src/misc/msccl/msccl_lifecycle.cc
|
||||
src/misc/msccl/msccl_parser.cc
|
||||
src/misc/msccl/msccl_setup.cc
|
||||
src/misc/msccl/msccl_status.cc
|
||||
src/plugin/net.cc
|
||||
src/plugin/plugin_open.cc
|
||||
src/plugin/profiler.cc
|
||||
src/plugin/tuner.cc
|
||||
src/plugin/net/net_v6.cc
|
||||
src/plugin/net/net_v7.cc
|
||||
src/plugin/net/net_v8.cc
|
||||
src/plugin/net/net_v9.cc
|
||||
src/plugin/net/net_v10.cc
|
||||
src/plugin/profiler/profiler_v1.cc
|
||||
src/plugin/profiler/profiler_v2.cc
|
||||
src/plugin/profiler/profiler_v3.cc
|
||||
src/plugin/tuner/tuner_v2.cc
|
||||
src/plugin/tuner/tuner_v3.cc
|
||||
src/plugin/tuner/tuner_v4.cc
|
||||
src/ras/client.cc
|
||||
src/ras/client_support.cc
|
||||
src/ras/collectives.cc
|
||||
@@ -612,6 +640,7 @@ set(SRC_FILES
|
||||
src/transport/net_socket.cc
|
||||
src/transport/nvls.cc
|
||||
src/transport/p2p.cc
|
||||
src/transport/profiler.cc
|
||||
src/transport/shm.cc
|
||||
)
|
||||
|
||||
@@ -737,6 +766,7 @@ add_dependencies(rccl git_version_check) #
|
||||
target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src) # for hipfied headers
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
|
||||
@@ -1024,7 +1054,7 @@ endif()
|
||||
#==================================================================================================
|
||||
## Specify install targets
|
||||
rocm_install_targets(TARGETS rccl)
|
||||
rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/nccl_net.h
|
||||
rocm_install(FILES ${PROJECT_BINARY_DIR}/include/rccl/rccl.h src/include/plugin/nccl_net.h
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl)
|
||||
rocm_install(FILES src/include/api_trace.h
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rccl/amd_detail)
|
||||
@@ -1050,10 +1080,10 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
|
||||
|
||||
### install the wrapper header file to package
|
||||
rocm_install(
|
||||
FILES ${PROJECT_BINARY_DIR}/rccl/include/rccl.h src/include/nccl_net.h
|
||||
FILES ${PROJECT_BINARY_DIR}/rccl/include/rccl.h src/include/plugin/nccl_net.h
|
||||
DESTINATION "./rccl/${CMAKE_INSTALL_INCLUDEDIR}/" )
|
||||
rocm_install(
|
||||
FILES ${PROJECT_BINARY_DIR}/include/rccl.h src/include/nccl_net.h
|
||||
FILES ${PROJECT_BINARY_DIR}/include/rccl.h src/include/plugin/nccl_net.h
|
||||
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/" )
|
||||
endif()
|
||||
|
||||
|
||||
@@ -60,20 +60,20 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v9)
|
||||
# API (v10)
|
||||
|
||||
Below is the main `ncclNet_v9` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclNet_v10` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
@@ -83,13 +83,13 @@ typedef struct {
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
@@ -98,10 +98,10 @@ typedef struct {
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
@@ -200,6 +200,9 @@ the plugin code adding the following definitions:
|
||||
#define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
```
|
||||
|
||||
The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and
|
||||
record its own events with the NCCL profiler plugin.
|
||||
|
||||
`devices`
|
||||
|
||||
Once the plugin is initialized, NCCL will query the number of devices available. It should not
|
||||
@@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should
|
||||
should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it
|
||||
succeeds.
|
||||
|
||||
The `connect` API takes a `ncclNetCommConfig_t`, which contains a `trafficClass` field.
|
||||
This field can be used by the network plugin to specify the QoS level of the connection. By default,
|
||||
`trafficClass` is set to -1 but can be configured by the application during communicator initialization
|
||||
to select a plugin-supported QoS level.
|
||||
|
||||
`closeListen`/`closeSend`/`closeRecv`
|
||||
|
||||
Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call
|
||||
@@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal
|
||||
the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call
|
||||
`isend` again later.
|
||||
|
||||
The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin
|
||||
to support network defined events.
|
||||
|
||||
`irecv`
|
||||
|
||||
To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument
|
||||
@@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to
|
||||
completions on such irecvs (for example, complete the request immediately). The plugin is still
|
||||
expected to set a valid request pointer on return which NCCL can poll to check for completion.
|
||||
|
||||
The `pHandles` argument allows NCCL to pass an array of opaque handles that can be used by the
|
||||
network plugin to support network defined events.
|
||||
|
||||
Note: for a given connection, send/receive operations should always match in the order they were
|
||||
posted. Tags provided for receive operations are only used to assign a given send operation to one
|
||||
of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag
|
||||
|
||||
@@ -2,14 +2,15 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_H_
|
||||
#define NCCL_NET_H_
|
||||
#ifndef NET_H_
|
||||
#define NET_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
#include "net_device.h"
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
|
||||
@@ -22,6 +23,9 @@
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
|
||||
|
||||
#include "net_v10.h"
|
||||
#include "net_v9.h"
|
||||
#include "net_v8.h"
|
||||
#include "net_v7.h"
|
||||
@@ -31,4 +35,9 @@
|
||||
#include "net_v3.h"
|
||||
#include "net_v2.h"
|
||||
|
||||
typedef ncclNet_v10_t ncclNet_t;
|
||||
typedef ncclNetProperties_v10_t ncclNetProperties_t;
|
||||
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
|
||||
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -26,6 +26,7 @@ typedef struct {
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
|
||||
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
|
||||
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
|
||||
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NET_V10_H_
|
||||
#define NET_V10_H_
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4
|
||||
// Virtual NIC description for the v10 net API. It is the `props` argument of
// ncclNet_v10_t::makeVDevice and the `vProps` member of ncclNetProperties_v10_t.
typedef struct {
|
||||
// Presumably the number of valid entries in devs[] -- TODO confirm against a plugin implementation.
int ndevs;
|
||||
// Fixed-capacity array holding at most NCCL_NET_MAX_DEVS_PER_NIC_V10 entries
// (presumably the indices of the devices backing the virtual NIC -- TODO confirm).
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
|
||||
} ncclNetVDeviceProps_v10_t;
|
||||
|
||||
|
||||
#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1
|
||||
// Per-connection configuration for the v10 net API; passed as the `config`
// argument of ncclNet_v10_t::connect.
typedef struct {
|
||||
// Plugin-specific TC (traffic class) value. The plugin README describes it as
// selecting the QoS level of the connection and defaulting to -1, which is the
// value of NCCL_NET_TRAFFIC_CLASS_UNDEF.
int trafficClass;
|
||||
} ncclNetCommConfig_v10_t;
|
||||
|
||||
|
||||
// Device properties reported by a v10 network plugin; filled in through
// ncclNet_v10_t::getProperties for a given device index.
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int forceFlush; // Force a flush on receives
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency (unit is not stated in this header)
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
// Virtual device description (ndevs/devs); same type that makeVDevice takes.
ncclNetVDeviceProps_v10_t vProps;
|
||||
size_t maxP2pBytes; // Max transfer size for point-to-point operations
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
} ncclNetProperties_v10_t;
|
||||
|
||||
// Function table a network plugin exports for the v10 net API.
// Every entry point returns an ncclResult_t. Compared with the v9 table shown
// in the plugin README, init gains a profiler callback, connect gains a
// ncclNetCommConfig_v10_t* argument, and isend/irecv gain phandle(s) arguments.
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
// logFunction is the logger the plugin should use; profFunction is described in
// the plugin README as a core callback that lets the plugin define and record
// its own events with the profiler plugin.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
// config carries per-connection settings (currently only trafficClass).
|
||||
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
// phandle is an opaque handle the plugin may use for network-defined events
// (per the plugin README).
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
// phandles is an array of opaque handles the plugin may use for network-defined
// events (per the plugin README).
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
|
||||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
|
||||
// what index this new vNIC exists at
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
|
||||
} ncclNet_v10_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -2,8 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V2_H_
|
||||
#define NCCL_NET_V2_H_
|
||||
#ifndef NET_V2_H_
|
||||
#define NET_V2_H_
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V3_H_
|
||||
#define NCCL_NET_V3_H_
|
||||
#ifndef NET_V3_H_
|
||||
#define NET_V3_H_
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V3 16
|
||||
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V4_H_
|
||||
#define NCCL_NET_V4_H_
|
||||
#ifndef NET_V4_H_
|
||||
#define NET_V4_H_
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE_V4 64
|
||||
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V5_H_
|
||||
#define NCCL_NET_V5_H_
|
||||
#ifndef NET_V5_H_
|
||||
#define NET_V5_H_
|
||||
|
||||
typedef ncclNetProperties_v6_t ncclNetProperties_v5_t;
|
||||
typedef struct {
|
||||
|
||||
@@ -2,10 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V6_H_
|
||||
#define NCCL_NET_V6_H_
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
#ifndef NET_V6_H_
|
||||
#define NET_V6_H_
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
|
||||
@@ -2,10 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V7_H_
|
||||
#define NCCL_NET_V7_H_
|
||||
|
||||
#include "net_device.h"
|
||||
#ifndef NET_V7_H_
|
||||
#define NET_V7_H_
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
|
||||
@@ -2,10 +2,8 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V8_H_
|
||||
#define NCCL_NET_V8_H_
|
||||
|
||||
#include "net_device.h"
|
||||
#ifndef NET_V8_H_
|
||||
#define NET_V8_H_
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
|
||||
@@ -2,18 +2,14 @@
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_NET_V9_H_
|
||||
#define NCCL_NET_V9_H_
|
||||
|
||||
#include "net_device.h"
|
||||
#ifndef NET_V9_H_
|
||||
#define NET_V9_H_
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
|
||||
typedef struct {
|
||||
int ndevs;
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
|
||||
} ncclNetVDeviceProps_v9_t;
|
||||
typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
@@ -35,8 +31,6 @@ typedef struct {
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
} ncclNetProperties_v9_t;
|
||||
|
||||
typedef ncclNetProperties_v9_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
@@ -93,7 +87,7 @@ typedef struct {
|
||||
|
||||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
|
||||
// what index this new vNIC exists at
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
|
||||
} ncclNet_v9_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
int max_requests = NCCL_NET_MAX_REQUESTS;
|
||||
|
||||
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
|
||||
__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
|
||||
@@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;}
|
||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
|
||||
__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
|
||||
@@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
|
||||
|
||||
#define PLUGIN_NAME "Plugin"
|
||||
|
||||
ncclNet_v9_t ncclNetPlugin_v9 = {
|
||||
const ncclNet_v10_t ncclNetPlugin_v10 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.devices = pluginDevices,
|
||||
@@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = {
|
||||
.makeVDevice = pluginMakeVDevice,
|
||||
};
|
||||
|
||||
// Compat shim for pre-v10 init signatures, which take no profiler callback.
// Forwards to the current pluginInit with a NULL profFunction and returns its
// result. The v4-v9 plugin tables in this file use it as their .init entry.
__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) {
|
||||
return pluginInit(logFunction, NULL);
|
||||
}
|
||||
|
||||
// v9 getProperties entry point: forwards to the current pluginGetProperties and
// returns its result.
// NOTE(review): the cast treats a ncclNetProperties_v9_t as a ncclNetProperties_t
// (the v10 type), which is only valid while both structs share the same layout --
// confirm whenever either struct changes.
__hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) {
|
||||
return pluginGetProperties(dev, (ncclNetProperties_t*)props);
|
||||
}
|
||||
|
||||
// Compat shim for connect signatures without a comm config argument.
// Forwards to the current pluginConnect, passing NULL for `config`, and returns
// its result.
__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){
|
||||
return pluginConnect(dev, NULL, handle, sendComm, sendDevComm);
|
||||
}
|
||||
|
||||
// v9 isend entry point: the v9 signature has no profiler handle, so this
// forwards to the current pluginIsend with a NULL `phandle` and returns its result.
__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) {
|
||||
return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request);
|
||||
}
|
||||
|
||||
// v9 irecv entry point: the v9 signature has no profiler handles, so this
// forwards to the current pluginIrecv with a NULL `phandles` array and returns
// its result. All other arguments are passed through unchanged.
__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) {
|
||||
return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request);
|
||||
}
|
||||
|
||||
// v9 makeVDevice entry point for this example plugin: a stub that ignores both
// arguments and always returns ncclInternalError.
__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; }
|
||||
|
||||
// v9 function table exported by the example plugin.
// Entries whose signature differs from the current API (init, getProperties,
// connect, isend, irecv, makeVDevice) point at the *_v9 shims; the remaining
// entries reuse the current plugin functions directly.
const ncclNet_v9_t ncclNetPlugin_v9 = {
|
||||
.name = PLUGIN_NAME,
|
||||
// Shim: drops the profiler callback argument.
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
// Shim: casts the v9 properties pointer to the current properties type.
.getProperties = pluginGetProperties_v9,
|
||||
.listen = pluginListen,
|
||||
// Shim: supplies a NULL comm config.
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
.deregMr = pluginDeregMr,
|
||||
// Shims: supply NULL profiler handle(s).
.isend = pluginIsend_v9,
|
||||
.irecv = pluginIrecv_v9,
|
||||
.iflush = pluginIflush,
|
||||
.test = pluginTest,
|
||||
.closeSend = pluginCloseSend,
|
||||
.closeRecv = pluginCloseRecv,
|
||||
.closeListen = pluginCloseListen,
|
||||
.getDeviceMr = pluginGetDeviceMr,
|
||||
.irecvConsumed = pluginIrecvConsumed,
|
||||
// Stub that always returns ncclInternalError.
.makeVDevice = pluginMakeVDevice_v9,
|
||||
};
|
||||
|
||||
__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) {
|
||||
ncclNetProperties_t props;
|
||||
ncclResult_t ret = pluginGetProperties(dev, &props);
|
||||
@@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||
return pluginIsend(sendComm, data, (int)size, tag, mhandle, request);
|
||||
return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request);
|
||||
}
|
||||
|
||||
__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||
size_t sizesOut[NCCL_PLUGIN_MAX_RECVS];
|
||||
for (int i=0; i<n; i++) sizesOut[i] = sizes[i];
|
||||
return pluginIrecv(recvComm, 1, data, sizesOut, tags, mhandles, request);
|
||||
return pluginIrecv(recvComm, 1, data, sizesOut, tags, mhandles, NULL, request);
|
||||
}
|
||||
|
||||
const ncclNet_v8_t ncclNetPlugin_v8 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v8,
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect,
|
||||
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
@@ -168,11 +213,11 @@ __hidden ncclResult_t pluginRegMr_v7(void* collComm, void* data, int size, int t
|
||||
|
||||
const ncclNet_v7_t ncclNetPlugin_v7 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v7,
|
||||
.listen = pluginListen,
|
||||
.connect = pluginConnect,
|
||||
.connect = pluginConnect_v9,
|
||||
.accept = pluginAccept,
|
||||
.regMr = pluginRegMr_v7,
|
||||
.regMrDmaBuf = pluginRegMrDmaBuf,
|
||||
@@ -209,7 +254,7 @@ __hidden ncclResult_t pluginAccept_v6(void* listenComm, void** recvComm) { retur
|
||||
|
||||
const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v6,
|
||||
.listen = pluginListen,
|
||||
@@ -230,7 +275,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = {
|
||||
/* v5 Compat */
|
||||
const ncclNet_v5_t ncclNetPlugin_v5 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v6,
|
||||
.listen = pluginListen,
|
||||
@@ -275,7 +320,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) {
|
||||
ncclResult_t ret;
|
||||
do {
|
||||
ncclNetDeviceHandle_v7_t* handle = NULL;
|
||||
ret = pluginConnect(dev, handle, sendComm, &handle);
|
||||
ret = pluginConnect(dev, NULL, handle, sendComm, &handle);
|
||||
} while (ret == ncclSuccess && *sendComm == NULL);
|
||||
return ret;
|
||||
}
|
||||
@@ -289,7 +334,7 @@ static ncclResult_t pluginAccept_v4(void* listenComm, void** recvComm) {
|
||||
}
|
||||
const ncclNet_v4_t ncclNetPlugin_v4 = {
|
||||
.name = PLUGIN_NAME,
|
||||
.init = pluginInit,
|
||||
.init = pluginInit_v9,
|
||||
.devices = pluginDevices,
|
||||
.getProperties = pluginGetProperties_v4,
|
||||
.listen = pluginListen,
|
||||
@@ -318,7 +363,7 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan
|
||||
}
|
||||
static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) {
|
||||
max_requests = NCCL_NET_MAX_REQUESTS_V3;
|
||||
return pluginInit(logFunction);
|
||||
return pluginInit(logFunction, NULL);
|
||||
}
|
||||
#include <string.h>
|
||||
static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) {
|
||||
|
||||
+128
-14
@@ -49,9 +49,9 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v2)
|
||||
# API (v3)
|
||||
|
||||
Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
@@ -70,7 +70,7 @@ typedef struct {
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
@@ -82,13 +82,13 @@ typedef struct {
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
} ncclProfiler_v3_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
@@ -156,7 +156,6 @@ typedef struct {
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
size_t trafficBytes; // number of transfer bytes
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
@@ -185,12 +184,22 @@ typedef struct {
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and
|
||||
`ncclProfileNetPlugin`.
|
||||
|
||||
#### stopEvent
|
||||
|
||||
@@ -236,7 +245,7 @@ typedef enum {
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_v2_t;
|
||||
} ncclProfilerEventState_v3_t;
|
||||
```
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
@@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
|
||||
network requests for the GPU kernel. This includes everything else that the proxy thread might be
|
||||
doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
|
||||
|
||||
`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel
|
||||
processes work items for the enqueued NCCL operations.
|
||||
|
||||
`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define
|
||||
their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and
|
||||
the `ncclProfilerCallback_t` NCCL core callback. The network and profiler plugins can agree on the
|
||||
network defined event definition using the plugin id in the event descriptor. The plugin identifier
|
||||
is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next
|
||||
16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are
|
||||
unused and available for future extensions.
|
||||
|
||||
A network IB plugin can use this infrastructure to define a QP event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileQp = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
```
|
||||
|
||||
The network event infrastructure is network agnostic. A different network socket plugin can
|
||||
use it to define a socket event as:
|
||||
|
||||
```C
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileSocket = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int fd;
|
||||
int op;
|
||||
size_t length;
|
||||
} sock;
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
```
|
||||
|
||||
The network plugin creates an event (descriptor) and passes it to the profiler callback,
|
||||
along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin`
|
||||
event descriptor, attaches the network plugin defined event as external data, and calls
|
||||
the profiler `startEvent` function.
|
||||
|
||||
```C
|
||||
ncclResult_t isend(..., void* phandle, ...) {
|
||||
...
|
||||
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
ncclProfilerNetIbDescr_v1_t eDescr = { };
|
||||
eDescr.type = ncclProfileQp;
|
||||
eDescr.qp = { ... };
|
||||
ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr);
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
State transitions for the events described can also come with event attribute updates. For this
|
||||
reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
|
||||
|
||||
@@ -264,7 +356,7 @@ typedef union {
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
@@ -279,14 +371,22 @@ Group event
|
||||
+- Collective event
|
||||
| |
|
||||
| +- ProxyOp event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| | |
|
||||
| | +- ProxyStep event
|
||||
| | |
|
||||
| | +- NetPlugin event
|
||||
| |
|
||||
| +- KernelCh event
|
||||
|
|
||||
+- Point-to-point event
|
||||
|
|
||||
+- ProxyOp event
|
||||
|
|
||||
+- ProxyStep event
|
||||
| |
|
||||
| +- ProxyStep event
|
||||
| |
|
||||
| +- NetPlugin event
|
||||
|
|
||||
+- KernelCh event
|
||||
|
||||
ProxyCtrl event
|
||||
```
|
||||
@@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p
|
||||
dereference a pointer from another address space the event descriptor includes the PID of the originator.
|
||||
The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the
|
||||
parent event.
|
||||
|
||||
# Known Limitations
|
||||
|
||||
In intra-node communication, or whenever a rank does not have any network activity, proxy events
|
||||
are unavailable and the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from
|
||||
enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the
|
||||
collective. However, this time only represents the launch time of the collective and not the actual
|
||||
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
|
||||
|
||||
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
|
||||
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
|
||||
the proxy is serving network requests, the kernel profiling probes can be delayed, causing loss of
|
||||
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
|
||||
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
|
||||
|
||||
@@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so
|
||||
default: $(PLUGIN_SO)
|
||||
|
||||
$(PLUGIN_SO): plugin.c event.c print_event.c
|
||||
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
$(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
|
||||
|
||||
clean:
|
||||
rm -f $(PLUGIN_SO)
|
||||
|
||||
@@ -33,10 +33,42 @@
|
||||
|
||||
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
|
||||
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
|
||||
|
||||
#define MAX_COMM_CLIQUES (32 * 8)
|
||||
#define MAX_EVENTS_PER_REQ (8)
|
||||
|
||||
struct proxyOp;
|
||||
struct proxyStep;
|
||||
|
||||
// Example-profiler record of one network-plugin event; struct proxyStep keeps an array of these in net[].
struct netPlugin {
|
||||
uint8_t type; // event tag; exampleProfilerStartEvent stores ncclProfileNetPlugin here
|
||||
int pluginType; // plugin id masked with NCCL_PROFILER_NET_TYPE_MASK
|
||||
int pluginVer; // plugin id masked with NCCL_PROFILER_NET_VER_MASK
|
||||
uint8_t pluginEvent; // plugin-defined event kind (ncclProfileQp or ncclProfileSocket)
|
||||
union {
|
||||
struct { // copied from ncclProfilerNetIbDescr_v1_t for ncclProfileQp events
|
||||
int device; // network device id
|
||||
int qpNum; // QP number
|
||||
int opcode; // ibv opcode
|
||||
uint64_t wr_id; // work request id
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
struct { // copied from ncclProfilerNetSockDescr_v1_t for ncclProfileSocket events
|
||||
int fd;
|
||||
int op;
|
||||
size_t length;
|
||||
} sock;
|
||||
};
|
||||
double startTs; // gettime() - startTime, taken when the event is started
|
||||
double stopTs; // gettime() - startTime, taken in updateEvent when the event stops
|
||||
struct proxyStep* parent; // NOTE(review): presumably the owning proxy step; the visible start path does not assign it - confirm
|
||||
};
|
||||
|
||||
// Example-profiler record of one kernel-channel event; collective and p2p events hold one per channel in kernel[].
struct kernelCh {
|
||||
uint8_t type; // event tag; exampleProfilerStartEvent stores ncclProfileKernelCh here
|
||||
uint8_t channelId; // copied from eDescr->kernelCh.channelId; also the index into the parent's kernel[] array
|
||||
struct taskEventBase* parent; // collective or p2p event this channel belongs to; updateEvent is called on it at stop
|
||||
double startTs; // gettime() - startTime, taken when the event is started
|
||||
double stopTs; // gettime() - startTime, taken in updateEvent when the event stops
|
||||
};
|
||||
|
||||
struct proxyStep {
|
||||
uint8_t type; // type of event: network transfer
|
||||
@@ -46,6 +78,8 @@ struct proxyStep {
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct proxyOp* parent;
|
||||
struct netPlugin net[MAX_EVENTS_PER_REQ];
|
||||
int nNetEvents;
|
||||
};
|
||||
|
||||
struct proxyOp {
|
||||
@@ -101,7 +135,6 @@ struct collective {
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
size_t trafficBytes;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nMaxChannels;
|
||||
@@ -111,6 +144,7 @@ struct collective {
|
||||
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
|
||||
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
|
||||
int nProxyOps[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
|
||||
struct p2p {
|
||||
@@ -121,6 +155,7 @@ struct p2p {
|
||||
const char* datatype;
|
||||
int peer;
|
||||
struct proxyOp op[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
|
||||
struct group {
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_IB_V1_H_
|
||||
#define NET_IB_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileQp = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_SOCKET_V1_H_
|
||||
#define NET_SOCKET_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileSocket = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
// Version 1 event descriptor a socket network plugin passes through the profiler callback.
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct { // payload for ncclProfileSocket events
|
||||
int fd; // presumably the socket file descriptor - TODO confirm against the socket plugin
|
||||
int op; // plugin-defined operation code - TODO confirm encoding
|
||||
size_t length; // presumably the transfer length in bytes - TODO confirm
|
||||
} sock;
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -4,8 +4,8 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
#ifndef PROFILER_H_
|
||||
#define PROFILER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@@ -13,7 +13,54 @@
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
// Profiler event types. Each value is a single bit; init() reports the active events as a bitmask (eActivationMask).
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
ncclProfileKernelCh = (1 << 6), // kernel channel event type
|
||||
ncclProfileNetPlugin = (1 << 7), // network plugin-defined events
|
||||
};
|
||||
|
||||
// Event state transitions reported through recordEventState.
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
|
||||
#include "profiler_v3.h"
|
||||
#include "profiler_v2.h"
|
||||
#include "profiler_v1.h"
|
||||
#include "profiler_net.h"
|
||||
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_NET_H_
|
||||
#define PROFILER_NET_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_VER_BITS (16) // number of low bits of the plugin id that carry the plugin event version
|
||||
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) // selects the version bits (0x0000FFFF when unsigned int is 32 bits)
|
||||
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) // selects the plugin type bits (0xFFFF0000 when unsigned int is 32 bits)
|
||||
|
||||
// Network plugin type values; each is placed above the NCCL_PROFILER_NET_VER_BITS version bits of the plugin id.
typedef enum {
|
||||
NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), // IB network plugin
|
||||
NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), // socket network plugin
|
||||
} ncclProfilerNetType;
|
||||
|
||||
#include "net_ib_v1.h"
|
||||
#include "net_socket_v1.h"
|
||||
|
||||
#endif
|
||||
@@ -4,8 +4,8 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_V1_H_
|
||||
#define NCCL_PROFILER_V1_H_
|
||||
#ifndef PROFILER_V1_H_
|
||||
#define PROFILER_V1_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -59,8 +59,16 @@ typedef struct {
|
||||
};
|
||||
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
|
||||
// Optional attribute updates passed to recordEventState together with a state transition (v1 layout).
typedef union {
|
||||
struct { // NOTE(review): presumably the attributes to update for ncclProfileProxyOp - confirm
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
@@ -4,20 +4,11 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_V2_H_
|
||||
#define NCCL_PROFILER_V2_H_
|
||||
#ifndef PROFILER_V2_H_
|
||||
#define PROFILER_V2_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
@@ -65,32 +56,6 @@ typedef struct {
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
} ncclProfilerEventState_v2_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
@@ -138,9 +103,4 @@ typedef struct {
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v2_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V3_H_
|
||||
#define PROFILER_V3_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// Version 3 event descriptor handed to the profiler's startEvent; `type` selects which union member is meaningful.
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct { // collective events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
const char* proto; // presumably the protocol name, by analogy with algo - TODO confirm
|
||||
} coll;
|
||||
|
||||
struct { // point-to-point events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct { // proxyOp events metadata
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct { // proxyStep events metadata
|
||||
int step; // individual step in `ncclProxyOp`
|
||||
} proxyStep;
|
||||
|
||||
struct { // kernelCh events metadata
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
} kernelCh;
|
||||
|
||||
struct { // netPlugin events metadata
|
||||
int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions)
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
|
||||
// Optional attribute updates passed to recordEventState together with a state transition (v3 layout).
typedef union {
|
||||
struct { // NOTE(review): presumably the attributes to update for ncclProfileProxyOp - confirm
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
|
||||
// Version 3 profiler plugin interface: a name plus the entry points NCCL calls.
typedef struct {
|
||||
const char* name; // plugin name string (the example plugin uses "Example-profiler")
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// NOTE(review): context is a void** in the signature, so it looks like an output filled in by the plugin - confirm
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
@@ -58,6 +58,7 @@ __hidden double gettime(void) {
|
||||
|
||||
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pid_t pid;
|
||||
static int* eActivationMaskPtr;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
||||
pthread_mutex_lock(&lock);
|
||||
@@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
||||
// first thread initializes event mask, environment and detach pool
|
||||
const char* str;
|
||||
str = getenv("NCCL_PROFILE_EVENT_MASK");
|
||||
__atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED);
|
||||
|
||||
str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE");
|
||||
groupPoolSize = str ? atoi(str) : defaultGroupPoolSize;
|
||||
@@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
||||
}
|
||||
pthread_mutex_unlock(&lock);
|
||||
|
||||
// store pointer to activation mask globally
|
||||
eActivationMaskPtr = eActivationMask;
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||
@@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
if (base->type == ncclProfileColl) {
|
||||
struct collective* c = (struct collective *)base;
|
||||
// reset event proxyOps & proxySteps
|
||||
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
||||
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS);
|
||||
memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS);
|
||||
// release collective events in the group and return them to the collective pool
|
||||
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
||||
@@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->count = eDescr->coll.count;
|
||||
event->root = eDescr->coll.root;
|
||||
event->datatype = eDescr->coll.datatype;
|
||||
event->trafficBytes = eDescr->coll.trafficBytes;
|
||||
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
event->nWarps = eDescr->coll.nWarps;
|
||||
event->algo = eDescr->coll.algo;
|
||||
@@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
}
|
||||
} else if (eDescr->type == ncclProfileProxyStep) {
|
||||
} else if (eDescr->type == ncclProfileProxyStep) {
|
||||
// the parent might be null if we run out of events
|
||||
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
@@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->isSend = parent->isSend;
|
||||
event->parent = parent;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->nNetEvents = 0;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "ProxyStepStart");
|
||||
} else if (eDescr->type == ncclProfileKernelCh) {
|
||||
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
|
||||
if (eventBase == NULL) return ncclSuccess;
|
||||
if (eventBase->type == ncclProfileColl) {
|
||||
struct collective* parent = (struct collective *)eDescr->parentObj;
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "KernelChStart");
|
||||
} else { // ncclProfileP2p
|
||||
struct p2p* parent = (struct p2p *)eDescr->parentObj;
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "KernelChStart");
|
||||
}
|
||||
} else if (eDescr->type == ncclProfileNetPlugin) {
|
||||
struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
int64_t pluginId = eDescr->netPlugin.id;
|
||||
int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK;
|
||||
int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK;
|
||||
if (type == NCCL_PROFILER_NET_TYPE_IB) {
|
||||
if (ver == 1) {
|
||||
ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data;
|
||||
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||
event->type = ncclProfileNetPlugin;
|
||||
event->pluginType = type;
|
||||
event->pluginVer = ver;
|
||||
if (descr->type == ncclProfileQp) {
|
||||
event->pluginEvent = ncclProfileQp;
|
||||
event->qp.device = descr->qp.device;
|
||||
event->qp.wr_id = descr->qp.wr_id;
|
||||
event->qp.opcode = descr->qp.opcode;
|
||||
event->qp.qpNum = descr->qp.qpNum;
|
||||
event->qp.length = descr->qp.length;
|
||||
}
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "NetPluginStart");
|
||||
}
|
||||
} else if (type == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||
if (ver == 1) {
|
||||
ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data;
|
||||
struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED);
|
||||
event->type = ncclProfileNetPlugin;
|
||||
event->pluginType = type;
|
||||
event->pluginVer = ver;
|
||||
if (descr->type == ncclProfileSocket) {
|
||||
event->pluginEvent = ncclProfileSocket;
|
||||
event->sock.fd = descr->sock.fd;
|
||||
event->sock.op = descr->sock.op;
|
||||
event->sock.length = descr->sock.length;
|
||||
}
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "NetPluginStart");
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -445,6 +515,15 @@ void updateEvent(void* handle) {
|
||||
struct proxyCtrl* event = (struct proxyCtrl *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "ProxyCtrlStop");
|
||||
} else if (type == ncclProfileKernelCh) {
|
||||
struct kernelCh* event = (struct kernelCh *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
updateEvent(event->parent);
|
||||
debugEvent(event, "KernelChStop");
|
||||
} else if (type == ncclProfileNetPlugin) {
|
||||
struct netPlugin* event = (struct netPlugin *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "NetPluginStop");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclProfiler_t ncclProfiler_v2 = {
|
||||
ncclProfiler_t ncclProfiler_v3 = {
|
||||
"Example-profiler",
|
||||
exampleProfilerInit,
|
||||
exampleProfilerStartEvent,
|
||||
@@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = {
|
||||
exampleProfilerRecordEventState,
|
||||
exampleProfilerFinalize,
|
||||
};
|
||||
|
||||
int exampleProfilerStart(int eActivationMask) {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int exampleProfilerStop(void) {
|
||||
if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) {
|
||||
__atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PLUGIN_H_
|
||||
#define PLUGIN_H_
|
||||
|
||||
int exampleProfilerStart(int eActivationMask);
|
||||
int exampleProfilerStop(void);
|
||||
|
||||
#endif
|
||||
@@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
|
||||
}
|
||||
|
||||
static __thread int proxyStepId;
|
||||
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
@@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
@@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
||||
}
|
||||
}
|
||||
|
||||
__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
@@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
}
|
||||
}
|
||||
|
||||
static __thread int kernelId;
|
||||
__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
|
||||
if (event->type != ncclProfileKernelCh) return;
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
|
||||
}
|
||||
|
||||
__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
|
||||
if (event->type != ncclProfileKernelCh) return;
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int proxyCtrlId;
|
||||
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
const char* str;
|
||||
@@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
str, proxyCtrlId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int ibQpId, sockId;
|
||||
__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) {
|
||||
if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) {
|
||||
if (event->pluginVer == 1) {
|
||||
if (event->pluginEvent == ncclProfileQp) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n",
|
||||
"Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Qp", ibQpId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
}
|
||||
} else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) {
|
||||
if (event->pluginVer == 1) {
|
||||
if (event->pluginEvent == ncclProfileSocket) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n",
|
||||
"Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Sock", sockId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#define DEBUG_EVENTS
|
||||
void debugEvent(void* eHandle, const char* tag) {
|
||||
#ifdef DEBUG_EVENTS
|
||||
@@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
|
||||
fprintf(fh, " parent = %p\n", event->base.parent);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||
for (int j = 0; j < MAX_OPS; j++) {
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||
}
|
||||
fprintf(fh, " startTs = %f\n", event->base.startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
@@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
} else if (type == ncclProfileKernelCh) {
|
||||
struct kernelCh* event = (struct kernelCh *)eHandle;
|
||||
fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " parent = %p\n", event->parent);
|
||||
fprintf(fh, " channel = %d\n", event->channelId);
|
||||
} else if (type == ncclProfileNetPlugin) {
|
||||
struct netPlugin* event = (struct netPlugin *)eHandle;
|
||||
fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " pluginType = %d\n", event->pluginType);
|
||||
fprintf(fh, " pluginVer = %d\n", event->pluginVer);
|
||||
fprintf(fh, " pluginEvent = %d\n", event->pluginEvent);
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
}
|
||||
fclose(fh);
|
||||
#endif
|
||||
@@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) {
|
||||
struct collective* c = (struct collective *)handle;
|
||||
printCollEventHeader(fh, c);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printKernelChEventHeader(fh, &c->kernel[i]);
|
||||
for (int j = 0; j < c->nProxyOps[i]; j++) {
|
||||
printEvent(fh, &c->send[i][j]);
|
||||
printEvent(fh, &c->recv[i][j]);
|
||||
}
|
||||
printKernelChEventTrailer(fh, &c->kernel[i]);
|
||||
}
|
||||
printCollEventTrailer(fh, c);
|
||||
} else if (type == ncclProfileP2p) {
|
||||
struct p2p* p = (struct p2p *)handle;
|
||||
printP2pEventHeader(fh, p);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printKernelChEventHeader(fh, &p->kernel[i]);
|
||||
printEvent(fh, &p->op[i]);
|
||||
printKernelChEventTrailer(fh, &p->kernel[i]);
|
||||
}
|
||||
printP2pEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
@@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) {
|
||||
printProxyOpEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* p = (struct proxyStep *)handle;
|
||||
printProxyStepEvent(fh, p);
|
||||
printProxyStepEventHeader(fh, p);
|
||||
for (int q = 0; q < p->nNetEvents; q++) {
|
||||
printNetPluginEvent(fh, &p->net[q]);
|
||||
}
|
||||
printProxyStepEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* p = (struct proxyCtrl *)handle;
|
||||
printProxyCtrlEvent(fh, p);
|
||||
|
||||
@@ -16,6 +16,7 @@ WERROR ?= 0
|
||||
PROFAPI ?= 1
|
||||
NVTX ?= 1
|
||||
RDMA_CORE ?= 0
|
||||
NET_PROFILER ?= 0
|
||||
|
||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
@@ -137,3 +138,7 @@ endif
|
||||
ifneq ($(RDMA_CORE), 0)
|
||||
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
|
||||
endif
|
||||
|
||||
ifneq ($(NET_PROFILER), 0)
|
||||
CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1
|
||||
endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 25
|
||||
NCCL_PATCH := 1
|
||||
NCCL_MINOR := 26
|
||||
NCCL_PATCH := 6
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+8
-3
@@ -10,11 +10,15 @@ include ../makefiles/version.mk
|
||||
INCEXPORTS := nccl.h
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
|
||||
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc) \
|
||||
$(wildcard register/*.cc) \
|
||||
$(wildcard plugin/*.cc) \
|
||||
$(wildcard plugin/net/*.cc) \
|
||||
$(wildcard plugin/tuner/*.cc) \
|
||||
$(wildcard plugin/profiler/*.cc) \
|
||||
$(filter-out ras/client.cc,$(wildcard ras/*.cc))
|
||||
BINSRCFILES := ras/client.cc
|
||||
|
||||
@@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
|
||||
BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o)
|
||||
DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d)
|
||||
LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
|
||||
INCPLUGIN := include/plugin
|
||||
|
||||
DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
|
||||
|
||||
@@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc
|
||||
$(OBJDIR)/%.o : %.cc $(INCTARGETS)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
mkdir -p `dirname $@`
|
||||
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
|
||||
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
|
||||
$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@
|
||||
@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp)
|
||||
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
|
||||
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
|
||||
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
|
||||
|
||||
@@ -154,7 +154,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz
|
||||
int* done) {
|
||||
if (*done) return ncclSuccess;
|
||||
if (!*sendReq) {
|
||||
NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq));
|
||||
NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq));
|
||||
}
|
||||
if (*sendReq) {
|
||||
NCCLCHECK(net->test(*sendReq, done, NULL));
|
||||
@@ -168,8 +168,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz
|
||||
int* done) {
|
||||
if (*done) return ncclSuccess;
|
||||
if (!*recvReq) {
|
||||
size_t size64 = size;
|
||||
NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq));
|
||||
size_t size64 = size;
|
||||
NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq));
|
||||
}
|
||||
if (*recvReq) {
|
||||
NCCLCHECK(net->test(*recvReq, done, NULL));
|
||||
@@ -485,7 +485,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) {
|
||||
if (devOOB < 0) {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
if (devOOB < 0) {
|
||||
char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME");
|
||||
const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME");
|
||||
if (userIfEnv && strlen(userIfEnv) > 0) {
|
||||
INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv);
|
||||
bool searchNot = userIfEnv && userIfEnv[0] == '^';
|
||||
@@ -541,7 +541,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis
|
||||
do {
|
||||
NCCLCHECK(checkAbort(abortFlag, &abortCounter));
|
||||
if (!*sendComm)
|
||||
NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle));
|
||||
NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle));
|
||||
if (!*recvComm)
|
||||
NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle));
|
||||
} while (!*sendComm || !*recvComm);
|
||||
@@ -741,6 +741,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {
|
||||
rasRanks[rank].pid = getpid();
|
||||
rasRanks[rank].cudaDev = comm->cudaDev;
|
||||
rasRanks[rank].nvmlDev = comm->nvmlDev;
|
||||
rasRanks[rank].hostHash = getHostHash();
|
||||
rasRanks[rank].pidHash = getPidHash();
|
||||
if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error");
|
||||
// We should still participate in the ringAllInfo below as the peers will be waiting for us.
|
||||
@@ -972,7 +974,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s
|
||||
NCCLCHECK(socketAccept(commState, peer, tag, &sock));
|
||||
TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size);
|
||||
NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail);
|
||||
NCCLCHECK(ncclSocketClose(&sock));
|
||||
NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail);
|
||||
return ret;
|
||||
fail:
|
||||
(void)ncclSocketClose(&sock);
|
||||
@@ -1067,7 +1069,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i
|
||||
* Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet,
|
||||
* "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988"
|
||||
*/
|
||||
int data[1];
|
||||
int data[1] = {0};
|
||||
for (int mask = 1; mask < nranks; mask <<= 1) {
|
||||
int src = (rank - mask + nranks) % nranks;
|
||||
int dst = (rank + mask) % nranks;
|
||||
|
||||
+19
-18
@@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
channel->workFifoProduced = 0;
|
||||
|
||||
struct ncclSharedResources* sharedRes = comm->sharedRes;
|
||||
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
|
||||
cudaStream_t deviceStream;
|
||||
NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
if (channel->peers == NULL) {
|
||||
// The extra on nRanks+1 is for collnet root (i.e. network)
|
||||
@@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
|
||||
if (channel->devPeers == NULL) {
|
||||
if (sharedRes->devPeers[channelId] == NULL) {
|
||||
NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream));
|
||||
}
|
||||
/* channel->devPeers is not shared, so just free it when calling commFree() */
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream));
|
||||
ncclCommPushCudaFree(comm, channel->devPeers);
|
||||
NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers));
|
||||
for (int r = 0; r < nRanks; r++) {
|
||||
uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]);
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream));
|
||||
channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr;
|
||||
}
|
||||
}
|
||||
|
||||
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream));
|
||||
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
|
||||
|
||||
/* guarantee addr has been copied into channel->devPeers */
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) {
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
struct ncclSharedResources* sharedRes = comm->sharedRes;
|
||||
cudaStream_t deviceStream;
|
||||
|
||||
if (channel->nvlsPeers != NULL)
|
||||
return ncclSuccess;
|
||||
@@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
|
||||
if (channel->id == -1)
|
||||
NCCLCHECK(initChannel(comm, channelId));
|
||||
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
int nvlsRanks = comm->localRanks;
|
||||
|
||||
@@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo
|
||||
int tr = comm->topParentLocalRanks[r];
|
||||
uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr);
|
||||
channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr;
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
|
||||
channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
|
||||
ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount);
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream));
|
||||
for (int r = 0; r < nvlsRanks; ++r) {
|
||||
uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r);
|
||||
channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r;
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream));
|
||||
channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr;
|
||||
ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount);
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
struct ncclSharedResources* sharedRes = comm->sharedRes;
|
||||
uintptr_t addr;
|
||||
cudaStream_t deviceStream;
|
||||
|
||||
if (channel->collnetPeers != NULL)
|
||||
return ncclSuccess;
|
||||
@@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
|
||||
if (channel->id == -1)
|
||||
NCCLCHECK(initChannel(comm, channelId));
|
||||
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
if (share) {
|
||||
channel->collnetPeers = parent->channels[channelId].collnetPeers;
|
||||
channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers;
|
||||
addr = (uintptr_t)parent->channels[channelId].collnetDevPeers;
|
||||
channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers;
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
|
||||
channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
|
||||
ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount);
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream));
|
||||
addr = (uintptr_t)channel->collnetDevPeers;
|
||||
channel->peers[comm->nRanks] = channel->collnetPeers;
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream));
|
||||
NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream));
|
||||
channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr;
|
||||
ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount);
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+149
-14
@@ -6,6 +6,7 @@
|
||||
|
||||
#include "core.h"
|
||||
#include "nccl_net.h"
|
||||
#include <ctime>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
@@ -16,6 +17,11 @@
|
||||
#include "param.h"
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on
|
||||
static char ncclDebugTimestampFormat[256]; // with space for subseconds
|
||||
static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts
|
||||
static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio
|
||||
static int ncclDebugTimestampSubsecondDigits; // Number of digits to display
|
||||
static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
@@ -114,6 +120,88 @@ static void ncclDebugInit() {
|
||||
ncclWarnSetDebugInfo = value;
|
||||
}
|
||||
|
||||
// Determine which debug levels will have timestamps.
|
||||
const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS");
|
||||
if (timestamps == nullptr) {
|
||||
ncclDebugTimestampLevels = (1<<NCCL_LOG_WARN);
|
||||
} else {
|
||||
int invert = 0;
|
||||
if (timestamps[0] == '^') { invert = 1; ++timestamps; }
|
||||
ncclDebugTimestampLevels = invert ? ~0U : 0U;
|
||||
char *timestampsDup = strdup(timestamps);
|
||||
char *level = strtok(timestampsDup, ",");
|
||||
while (level != NULL) {
|
||||
uint32_t mask = 0;
|
||||
if (strcasecmp(level, "ALL") == 0) {
|
||||
mask = ~0U;
|
||||
} else if (strcasecmp(level, "VERSION") == 0) {
|
||||
mask = (1<<NCCL_LOG_VERSION);
|
||||
} else if (strcasecmp(level, "WARN") == 0) {
|
||||
mask = (1<<NCCL_LOG_WARN);
|
||||
} else if (strcasecmp(level, "INFO") == 0) {
|
||||
mask = (1<<NCCL_LOG_INFO);
|
||||
} else if (strcasecmp(level, "ABORT") == 0) {
|
||||
mask = (1<<NCCL_LOG_ABORT);
|
||||
} else if (strcasecmp(level, "TRACE") == 0) {
|
||||
mask = (1<<NCCL_LOG_TRACE);
|
||||
} else {
|
||||
// Silently fail.
|
||||
}
|
||||
if (mask) {
|
||||
if (invert) ncclDebugTimestampLevels &= ~mask;
|
||||
else ncclDebugTimestampLevels |= mask;
|
||||
}
|
||||
level = strtok(NULL, ",");
|
||||
}
|
||||
free(timestampsDup);
|
||||
}
|
||||
|
||||
// Store a copy of the timestamp format with space for the subseconds, if used.
|
||||
const char* tsFormat = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_FORMAT");
|
||||
if (tsFormat == nullptr) tsFormat = "[%F %T] ";
|
||||
ncclDebugTimestampSubsecondsStart = -1;
|
||||
// Find where the subseconds are in the format.
|
||||
for (int i=0; tsFormat[i] != '\0'; ++i) {
|
||||
if (tsFormat[i]=='%' && tsFormat[i+1]=='%') { // Next two chars are "%"
|
||||
// Skip the next character, too, and restart checking after that.
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (tsFormat[i]=='%' && // Found a percentage
|
||||
('1' <= tsFormat[i+1] && tsFormat[i+1] <= '9') && // Next char is a digit between 1 and 9 inclusive
|
||||
tsFormat[i+2]=='f' // Two characters later is an "f"
|
||||
) {
|
||||
constexpr int replaceLen = sizeof("%Xf") - 1;
|
||||
ncclDebugTimestampSubsecondDigits = tsFormat[i+1] - '0';
|
||||
if (ncclDebugTimestampSubsecondDigits + strlen(tsFormat) - replaceLen > sizeof(ncclDebugTimestampFormat) - 1) {
|
||||
// Won't fit; fall back on the default.
|
||||
break;
|
||||
}
|
||||
ncclDebugTimestampSubsecondsStart = i;
|
||||
ncclDebugTimestampMaxSubseconds = 1;
|
||||
|
||||
memcpy(ncclDebugTimestampFormat, tsFormat, i);
|
||||
for (int j=0; j<ncclDebugTimestampSubsecondDigits; ++j) {
|
||||
ncclDebugTimestampFormat[i+j] = ' ';
|
||||
ncclDebugTimestampMaxSubseconds *= 10;
|
||||
}
|
||||
strcpy(ncclDebugTimestampFormat+i+ncclDebugTimestampSubsecondDigits, tsFormat+i+replaceLen);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ncclDebugTimestampSubsecondsStart == -1) {
|
||||
if (strlen(tsFormat) < sizeof(ncclDebugTimestampFormat)) {
|
||||
strcpy(ncclDebugTimestampFormat, tsFormat);
|
||||
} else {
|
||||
strcpy(ncclDebugTimestampFormat, "[%F %T] ");
|
||||
}
|
||||
}
|
||||
|
||||
// Replace underscore with spaces... it is hard to put spaces in command line parameters.
|
||||
for (int i=0; ncclDebugTimestampFormat[i] != '\0'; ++i) {
|
||||
if (ncclDebugTimestampFormat[i]=='_') ncclDebugTimestampFormat[i] = ' ';
|
||||
}
|
||||
|
||||
// Cache pid and hostname
|
||||
getHostName(hostname, 1024, '.');
|
||||
pid = getpid();
|
||||
@@ -194,39 +282,86 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
tid = syscall(SYS_gettid);
|
||||
}
|
||||
|
||||
char buffer[1024];
|
||||
size_t len = 0;
|
||||
|
||||
// WARNs come with an extra newline at the beginning.
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
buffer[len++] = '\n';
|
||||
};
|
||||
|
||||
// Add the timestamp to the buffer if they are turned on for this level.
|
||||
if (ncclDebugTimestampLevels & (1<<level)) {
|
||||
if (ncclDebugTimestampFormat[0] != '\0') {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_REALTIME, &ts); // clock_gettime failure should never happen
|
||||
std::tm nowTm;
|
||||
localtime_r(&ts.tv_sec, &nowTm);
|
||||
|
||||
// Add the subseconds portion if it is part of the format.
|
||||
char localTimestampFormat[sizeof(ncclDebugTimestampFormat)];
|
||||
const char* pformat = ncclDebugTimestampFormat;
|
||||
if (ncclDebugTimestampSubsecondsStart != -1) {
|
||||
pformat = localTimestampFormat; // Need to use the local version which has subseconds
|
||||
memcpy(localTimestampFormat, ncclDebugTimestampFormat, ncclDebugTimestampSubsecondsStart);
|
||||
snprintf(localTimestampFormat + ncclDebugTimestampSubsecondsStart,
|
||||
ncclDebugTimestampSubsecondDigits+1,
|
||||
"%0*ld", ncclDebugTimestampSubsecondDigits,
|
||||
ts.tv_nsec / (1000000000UL/ncclDebugTimestampMaxSubseconds));
|
||||
strcpy( localTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits,
|
||||
ncclDebugTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits);
|
||||
}
|
||||
|
||||
// Format the time. If it runs out of space, fall back on a simpler format.
|
||||
int adv = std::strftime(buffer+len, sizeof(buffer)-len, pformat, &nowTm);
|
||||
if (adv==0 && ncclDebugTimestampFormat[0] != '\0') {
|
||||
// Ran out of space. Fall back on the default. This should never fail.
|
||||
adv = std::strftime(buffer+len, sizeof(buffer)-len, "[%F %T] ", &nowTm);
|
||||
}
|
||||
len += adv;
|
||||
}
|
||||
}
|
||||
len = std::min(len, sizeof(buffer)-1); // prevent overflows
|
||||
|
||||
// Add hostname, pid and tid portion of the log line.
|
||||
if (level != NCCL_LOG_VERSION) {
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "%s:%d:%d ", hostname, pid, tid);
|
||||
len = std::min(len, sizeof(buffer)-1); // prevent overflows
|
||||
}
|
||||
|
||||
int cudaDev = 0;
|
||||
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
|
||||
(void)cudaGetDevice(&cudaDev);
|
||||
}
|
||||
|
||||
char buffer[4096];
|
||||
size_t len = 0;
|
||||
// Add level specific formatting.
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
|
||||
hostname, pid, tid, cudaDev, filefunc, line);
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %s:%d NCCL WARN ", cudaDev, filefunc, line);
|
||||
if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (level == NCCL_LOG_INFO) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] NCCL INFO ", cudaDev);
|
||||
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "NCCL CALL ");
|
||||
} else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
|
||||
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line);
|
||||
}
|
||||
len = std::min(len, sizeof(buffer)-1); // prevent overflows
|
||||
|
||||
// Add the message as given by the call site.
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
if (len) {
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
// Rewind len so that we can replace the final \0 by "\n"
|
||||
len = std::min(len, sizeof(buffer)-1); // prevent overflows
|
||||
|
||||
// Add a newline and write it to the debug file. No terminating null is
|
||||
// necessary since we write bytes instead of the string.
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
|
||||
NCCL_API(void, ncclResetDebugInit);
|
||||
|
||||
@@ -141,7 +141,7 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
// Final wait/copy.
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -220,25 +220,63 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
size_t count, channelOffset, channelCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg);
|
||||
static constexpr int nworkers = NCCL_PAT_NWORKERS;
|
||||
struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
|
||||
uint64_t pollCount = 0;
|
||||
__syncthreads(); // Don't start using shared mem until everyone arrives
|
||||
for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
|
||||
if (tid == 0) shmem->localAccSize = 0;
|
||||
if (tid == nworkers) shmem->parallelFactor = 0;
|
||||
__syncthreads();
|
||||
|
||||
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int last = 0;
|
||||
while (!last) {
|
||||
int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
|
||||
size_t inpIx, outIx;
|
||||
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
|
||||
prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);
|
||||
if (tid == nworkers) { // Algo computation thread
|
||||
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
|
||||
int step = 0;
|
||||
while (1) {
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
|
||||
while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
|
||||
patAlgo.getNextOp(ps);
|
||||
int last = ps->last;
|
||||
step++;
|
||||
if (last == 2) break;
|
||||
}
|
||||
} else if (tid < nworkers) { // Worker threads
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
int parallelFactor = 0;
|
||||
volatile int* pfPtr = &shmem->parallelFactor;
|
||||
while (parallelFactor == 0) parallelFactor = *pfPtr;
|
||||
|
||||
int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE;
|
||||
int group = tid / groupSize;
|
||||
int nGroups = nworkers / groupSize;
|
||||
int tidInGroup = tid - group*groupSize;
|
||||
// We don't use recvPeers/sendPeers so let's pass shmem structs instead
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatAg);
|
||||
|
||||
int step = group;
|
||||
while(1) {
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
|
||||
while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
|
||||
int last = ps->last;
|
||||
prims.patCopy(ps, shmem);
|
||||
if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
|
||||
if (last) break;
|
||||
step += nGroups;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -190,7 +190,7 @@ namespace {
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -329,7 +329,7 @@ namespace {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
@@ -528,7 +528,7 @@ namespace {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
@@ -1055,7 +1055,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.directRecv(offset, offset, nelem, /*postOp*/true);
|
||||
prims.directRecv(offset, nelem, /*postOp*/true);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -1082,7 +1082,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
}
|
||||
} else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
|
||||
@@ -83,7 +83,7 @@ namespace {
|
||||
prims.directCopySend(offset, offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
prims.directRecv(offset, nelem);
|
||||
} else {
|
||||
prims.directRecvCopyDirectSend(offset, offset, nelem);
|
||||
}
|
||||
|
||||
+54
-30
@@ -144,6 +144,8 @@ struct ncclShmemData {
|
||||
int nWorks;
|
||||
int workSize;
|
||||
uint32_t workConsumed;
|
||||
uint64_t workCounter;
|
||||
bool profilerEnabled;
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
|
||||
|
||||
@@ -236,24 +238,6 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
// Barrier combined with an OR-reduction of `vote`, implemented with the PTX
// `barrier.red.or.pred.aligned` instruction on barrier resource `name`.
// No thread-count operand is passed to the instruction in this overload.
// Returns true when the reduced predicate is set, false otherwise.
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
|
||||
int ans;
|
||||
// p = (vote != 0); p = barrier-wide OR of p; ans = p ? 1 : 0
|
||||
asm volatile("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred.aligned p, %2, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
// Barrier combined with an OR-reduction of `vote`, implemented with the PTX
// `barrier.red.or.pred.aligned` instruction on barrier resource `name`, with
// `nThreads` passed as the instruction's thread-count operand.
// Returns true when the reduced predicate is set, false otherwise.
// NOTE(review): this asm statement is not marked `volatile`, unlike the
// two-argument overload of the same name — confirm that is intentional.
__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) {
|
||||
int ans;
|
||||
// p = (vote != 0); p = barrier-wide OR of p; ans = p ? 1 : 0
|
||||
asm("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred.aligned p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
: "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
|
||||
return bool(ans);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
@@ -455,6 +439,48 @@ struct RunWorkBatch {
|
||||
}
|
||||
};
|
||||
|
||||
// Action selectors accepted by profiler(int action).
// START: advance the channel work counter and record it as "started".
// STOP:  record the channel work counter as "completed".
// FINI:  any other value; profiler() treats it as the final write-back branch.
#define START 0
|
||||
#define STOP 1
|
||||
#define FINI 2
|
||||
|
||||
// Returns true if at least one work item of the batch currently held in
// ncclShmem has its `profilerEnabled` field set, false otherwise.
// The batch is read from ncclShmem.workStorage, which is viewed as an array of
// ncclDevWorkP2p or ncclDevWorkColl depending on ncclShmem.workType, and
// ncclShmem.nWorks gives the number of entries examined.
__device__ __forceinline__ bool profilerEnabled(void) {
|
||||
// Check if any of the workItems in the batch is profiled. If so, there is an equivalent
|
||||
// profiler ProxyOp waiting for the counter update in the host thread. If this check was
|
||||
// done only for the first workItem the profiler counter for other workItems in the batch
|
||||
// could never be updated, leaving the host thread spinning forever for the counter update
|
||||
// and causing a hang.
|
||||
bool enabled = false;
|
||||
// The `!enabled` term ends the scan at the first profiled work item.
|
||||
for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) {
|
||||
if (ncclShmem.workType == ncclDevWorkTypeP2p)
|
||||
enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled;
|
||||
else
|
||||
enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled;
|
||||
}
|
||||
return enabled;
|
||||
}
|
||||
|
||||
// Updates the per-channel work counters for the current batch.
// Only the thread with threadIdx.x == 0 performs any update; for every other
// thread the call has no effect.
//
//   action == START : add ncclShmem.nWorks to ncclShmem.channel.workCounter,
//                     then, if profilerEnabled(), publish the counter in
//                     ncclShmem.comm.workStarted[channelId].
//   action == STOP  : if profilerEnabled(), publish the counter in
//                     ncclShmem.comm.workCompleted[channelId].
//   anything else   : (FINI) store the counter back into the channel entry
//                     reached through ncclShmem.args.comm, then, if
//                     profilerEnabled(), publish it in workCompleted[channelId].
__device__ __forceinline__ void profiler(int action) {
|
||||
if (action == START) {
|
||||
if (threadIdx.x == 0) {
|
||||
// increment workCounter regardless of the profiler being active or not
|
||||
ncclShmem.channel.workCounter += ncclShmem.nWorks;
|
||||
// Counter bookkeeping is done; the published value is only needed when profiling.
|
||||
if(!profilerEnabled()) return;
|
||||
ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
}
|
||||
} else if (action == STOP) {
|
||||
if (threadIdx.x == 0 && profilerEnabled()) {
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
}
|
||||
} else { // FINI
|
||||
if (threadIdx.x == 0) {
|
||||
// store the workCounter back to vidmem regardless of the profiler being active or not
|
||||
((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
|
||||
if (!profilerEnabled()) return;
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int SpecializedFnId, typename SpecializedRunWorkBatch, bool COLLTRACE, int COLL_UNROLL>
|
||||
__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
|
||||
const int tid = threadIdx.x;
|
||||
@@ -517,8 +543,13 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
break;
|
||||
}
|
||||
__syncthreads(); // publish ncclShmem.{args, channelId}
|
||||
/* set abort flag to 0 */
|
||||
if (tid == 0) {
|
||||
ncclShmem.aborted = 0;
|
||||
ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter;
|
||||
}
|
||||
|
||||
// Use first 2 warps to load comm and channel, and reamaining load work batch.
|
||||
// Use first 2 warps to load comm and channel, and remaining load work batch.
|
||||
switch (tid/WARP_SIZE) {
|
||||
case 0:
|
||||
{ void* dst = &ncclShmem.comm;
|
||||
@@ -566,9 +597,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
while (ncclShmem.aborted == 0) {
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
|
||||
profiler(START);
|
||||
if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
|
||||
SpecializedRunWorkBatch().run();
|
||||
} else {
|
||||
@@ -586,21 +617,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
default:
|
||||
break;
|
||||
}
|
||||
profiler(STOP);
|
||||
loadWorkBatchToShmem(tid%WARP_SIZE, tn, args, batchIx);
|
||||
__syncthreads();
|
||||
|
||||
// Check whether the last operation was aborted and make sure all threads exit
|
||||
bool aborted = false;
|
||||
if (tid == 0) aborted = *ncclShmem.comm.abortFlag;
|
||||
aborted = __any(aborted); // publish ncclShmem.work
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or()
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
if (aborted) {
|
||||
if(COLLTRACE && tid%WARP_SIZE == 0) traceAbort();
|
||||
break;
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType, batchIx);
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "common_kernel.h"
|
||||
#include "common.h"
|
||||
|
||||
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
|
||||
#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000
|
||||
|
||||
#define barrier_by_group_common(__THREAD_FENCE) do { \
|
||||
if (nthreads == NCCL_MAX_NTHREADS) { \
|
||||
@@ -154,7 +154,7 @@ struct PrimitivesWithoutDirect {
|
||||
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
|
||||
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
|
||||
}
|
||||
__device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
|
||||
__device__ void directRecv(intptr_t outIx, int eltN) {
|
||||
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
|
||||
}
|
||||
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
@@ -178,6 +178,18 @@ struct PrimitivesWithoutDirect {
|
||||
}
|
||||
};
|
||||
|
||||
// Rate-limited abort check intended to be called from spin-wait loops.
//
//   abortCache : caller-owned word that remembers a previously seen abort;
//                `abortValue` is OR-ed into it once an abort is observed.
//   abortValue : bit mask tested against / set in `abortCache`.
//   spins      : caller-owned call counter; incremented on each call and reset
//                to 0 whenever the abort flag is actually read.
//
// Returns non-zero when an abort is known or newly detected, 0 otherwise.
// The cached path returns exactly 1; the polling path returns the value loaded
// from *ncclShmem.comm.abortFlag.
__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) {
|
||||
// Abort already recorded by an earlier call: skip the flag load.
|
||||
if (abortCache & abortValue) return 1;
|
||||
// Read the abort flag only once every NCCL_SPINS_BEFORE_CHECK_ABORT calls.
|
||||
if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0;
|
||||
spins = 0;
|
||||
int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
if (abort) {
|
||||
// Record the abort in ncclShmem.aborted and in the caller's cache.
|
||||
__atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST);
|
||||
abortCache |= abortValue;
|
||||
}
|
||||
return abort;
|
||||
}
|
||||
|
||||
#include "prims_simple.h"
|
||||
#include "prims_ll.h"
|
||||
#include "prims_ll128.h"
|
||||
|
||||
@@ -85,15 +85,18 @@ private:
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t abort = 0;
|
||||
int abort = 0;
|
||||
|
||||
inline __device__ int checkAbort(int &spins, int send) {
|
||||
spins++;
|
||||
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) {
|
||||
if (abortCache == 0 && ++spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
spins = 0;
|
||||
if (abort) {
|
||||
__atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST);
|
||||
abortCache |= abortValue;
|
||||
}
|
||||
}
|
||||
return abort;
|
||||
return abortCache;
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
@@ -108,7 +111,7 @@ private:
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
|
||||
__builtin_amdgcn_s_sleep(1);
|
||||
sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
|
||||
if (checkAbort(spins, 1)) break;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
}
|
||||
if (sendConnFifo) {
|
||||
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
|
||||
@@ -168,7 +171,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
} while ((i4.flag1 != flag) || (i4.flag2 != flag));
|
||||
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
|
||||
#else
|
||||
@@ -177,7 +180,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
#endif
|
||||
@@ -241,7 +244,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
} while(line[i].flag1 != flag || line[i].flag2 != flag);
|
||||
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
|
||||
|
||||
|
||||
@@ -86,16 +86,18 @@ private:
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t abort = 0;
|
||||
uint32_t* sync;
|
||||
int abort = 0;
|
||||
|
||||
inline __device__ int checkAbort(int &spins, int i, int send) {
|
||||
spins++;
|
||||
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
|
||||
__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) {
|
||||
if (abortCache == 0 && ++spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
int abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
spins = 0;
|
||||
if (abort) {
|
||||
__atomic_store_n(&ncclShmem.aborted, abort, __ATOMIC_SEQ_CST);
|
||||
abortCache |= abortValue;
|
||||
}
|
||||
}
|
||||
return abort;
|
||||
return abortCache;
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
@@ -104,7 +106,7 @@ private:
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
|
||||
__builtin_amdgcn_s_sleep(1);
|
||||
sendConnHeadCache = __atomic_load_n(sendConnHeadPtr, __ATOMIC_RELAXED);
|
||||
if (checkAbort(spins, wid, 1)) break;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
}
|
||||
if (sendConnFifo) {
|
||||
sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes;
|
||||
@@ -241,7 +243,7 @@ private:
|
||||
load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
|
||||
needReload |= flagThread && (vr[u+1] != flag);
|
||||
}
|
||||
needReload &= (0 == checkAbort(spins, 0, 0));
|
||||
needReload &= (0 == checkAbort(abort, 1, spins));
|
||||
} while (__any(needReload));
|
||||
#pragma unroll
|
||||
for (int u=0; u<ELEMS_PER_THREAD; u+=2)
|
||||
@@ -287,7 +289,7 @@ private:
|
||||
load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]);
|
||||
needReload |= flagThread && (vr[u+1] != flag);
|
||||
}
|
||||
needReload &= (0 == checkAbort(spins, i, 0));
|
||||
needReload &= (0 == checkAbort(abort, 1, spins));
|
||||
} while (__any(needReload));
|
||||
|
||||
#pragma unroll
|
||||
|
||||
+243
-160
@@ -59,7 +59,7 @@ class Primitives<
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
int connStepSize; // Connection step size
|
||||
void* netDeviceHandle;
|
||||
uint64_t accSize; // Accumulated size. Used by PAT operations
|
||||
uint64_t accSize;
|
||||
uint32_t* next_hdp_reg;
|
||||
uint64_t* barriers;
|
||||
uint64_t barrier_next = 0;
|
||||
@@ -86,19 +86,21 @@ private:
|
||||
#endif
|
||||
}
|
||||
// Synchronizes the worker threads: uses __syncwarp() when the worker count is
// exactly WARP_SIZE, and falls back to barrier() for any other worker count.
inline __device__ void subBarrier() {
|
||||
if (nworkers == WARP_SIZE) __syncwarp();
|
||||
else
|
||||
barrier();
|
||||
}
|
||||
|
||||
// Barrier wrapper named for the PAT primitives path; it forwards directly to
// barrier() with no additional logic.
inline __device__ void patBarrier() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
inline __device__ bool checkAbort(int &spins) {
|
||||
spins++;
|
||||
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
if (__atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST)) {
|
||||
flags |= Aborted;
|
||||
ncclShmem.aborted = 1;
|
||||
}
|
||||
spins = 0;
|
||||
}
|
||||
return flags & Aborted;
|
||||
inline __device__ void barrierAny() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
inline __device__ void subBarrierAny() {
|
||||
barrier();
|
||||
}
|
||||
|
||||
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
|
||||
@@ -129,7 +131,7 @@ private:
|
||||
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
|
||||
__builtin_amdgcn_s_sleep(1);
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
|
||||
if (spins == 0 && repeat > 0) {
|
||||
repeat --;
|
||||
@@ -482,13 +484,8 @@ public:
|
||||
peerPtr->recv[connIndex].step += steps;
|
||||
st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step);
|
||||
while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) {
|
||||
if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
if (*ncclShmem.comm.abortFlag) {
|
||||
ncclShmem.aborted = 1;
|
||||
break;
|
||||
}
|
||||
spins = 0;
|
||||
}
|
||||
int abort = 0;
|
||||
if (checkAbort(abort, 1, spins)) break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -503,7 +500,7 @@ public:
|
||||
int spins = 0;
|
||||
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts
|
||||
: ncclShmem.groups[group].srcs;
|
||||
@@ -754,6 +751,9 @@ public:
|
||||
flags = 0;
|
||||
index = -1;
|
||||
if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
|
||||
// // For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
// this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
|
||||
|
||||
int nrecv=0, nsend=0;
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
@@ -783,68 +783,84 @@ public:
|
||||
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
|
||||
|
||||
// Coverity thinks that index could be -1 here but that's not actually the case.
|
||||
// coverity[negative_returns:FALSE]
|
||||
int sendIpcReg;
|
||||
int recvIpcReg;
|
||||
int sendNetReg;
|
||||
int recvNetReg;
|
||||
if (P2p) {
|
||||
sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
|
||||
recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
|
||||
sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
|
||||
recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
|
||||
} else {
|
||||
recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
|
||||
recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
|
||||
}
|
||||
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
|
||||
|
||||
// if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
// flags |= AnyNetDeviceUnpack;
|
||||
// // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
|
||||
// // have NetDeviceUnpack.
|
||||
// uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
|
||||
// if (tid == 0) {
|
||||
// ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
|
||||
// }
|
||||
// }
|
||||
|
||||
// coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
|
||||
// coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
|
||||
// coverity[uninit_member] => coverity thinks fan.n is not initialized
|
||||
} else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
|
||||
flags |= PatMode;
|
||||
accSize = 0;
|
||||
const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput };
|
||||
if (tid < 5) flags |= roles[tid];
|
||||
|
||||
int nranks = ncclShmem.comm.nRanks;
|
||||
int rank = ncclShmem.comm.rank;
|
||||
// A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer.
|
||||
index = tid % 32;
|
||||
uint32_t delta = 1 << index;
|
||||
const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};
|
||||
int block = tid / 32;
|
||||
if (block < 4 && delta < nranks) {
|
||||
int role = roles[block];
|
||||
if (mode == primsModePatRs) {
|
||||
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
|
||||
if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
|
||||
} else if (mode == primsModePatAg) {
|
||||
if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
|
||||
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
|
||||
}
|
||||
flags |= role;
|
||||
} else if (tid == 128) {
|
||||
flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
|
||||
if (tid < 32 && ((1UL<<tid) < nranks)) {
|
||||
int rank = ncclShmem.comm.rank;
|
||||
uint32_t delta = 1 << tid;
|
||||
// Load recv peer
|
||||
int recvPeer = mode == primsModePatRs ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
|
||||
struct ncclPatPeer* peer = ((struct ncclPatPeer*)recvPeers)+tid;
|
||||
struct ncclConnInfo* conn = peer->conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv;
|
||||
peer->step = conn->step;
|
||||
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
peer->stepCache = loadStepValue(peer->tailPtr = conn->tail);
|
||||
peer->headPtr = conn->head;
|
||||
peer->accSize = 0;
|
||||
peer->connStepSize = conn->stepSize/sizeof(T);
|
||||
// Load send peer
|
||||
int sendPeer = mode == primsModePatAg ? (rank - delta + nranks) % nranks : (rank + delta) % nranks;
|
||||
peer = ((struct ncclPatPeer*)sendPeers)+tid;
|
||||
conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend;
|
||||
peer->step = conn->step;
|
||||
peer->connFifo = conn->connFifo;
|
||||
peer->buff = conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
peer->stepCache = loadStepValue(peer->headPtr = conn->head);
|
||||
peer->tailPtr = conn->tail;
|
||||
peer->accSize = 0;
|
||||
peer->connStepSize = conn->stepSize/sizeof(T);
|
||||
}
|
||||
if (tid==0) {
|
||||
ncclShmem.groups[group].userInput = (void*)inputBuf;
|
||||
ncclShmem.groups[group].userOutput = (void*)outputBuf;
|
||||
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
|
||||
}
|
||||
patBarrier();
|
||||
}
|
||||
|
||||
// Coverity thinks that index could be -1 here but that's not actually the case.
|
||||
// coverity[negative_returns:FALSE]
|
||||
int sendIpcReg;
|
||||
int recvIpcReg;
|
||||
int sendNetReg;
|
||||
int recvNetReg;
|
||||
if (P2p) {
|
||||
sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0;
|
||||
recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0;
|
||||
sendNetReg = p2pWork ? p2pWork->sendNetReg : 0;
|
||||
recvNetReg = p2pWork ? p2pWork->recvNetReg : 0;
|
||||
} else {
|
||||
recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0;
|
||||
recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0;
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg);
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg);
|
||||
|
||||
// if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
// flags |= AnyNetDeviceUnpack;
|
||||
// // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers
|
||||
// // have NetDeviceUnpack.
|
||||
// uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0);
|
||||
// if (tid == 0) {
|
||||
// ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask;
|
||||
// }
|
||||
// }
|
||||
|
||||
// coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case
|
||||
// coverity[var_deref_model] => coverity thinks work can be dereferenced if NULL but this is not the case
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer);
|
||||
// coverity[uninit_member] => coverity thinks fan.n is not initialized
|
||||
}
|
||||
|
||||
__forceinline__ __device__ ~Primitives() {
|
||||
if (flags&PatMode) return;
|
||||
// Save steps for the next operation
|
||||
if (flags & (RolePostSend|RolePostRecv)) conn->step = step;
|
||||
if ((flags & NetRegMode) && (flags & RoleWaitSend)) {
|
||||
@@ -854,7 +870,7 @@ public:
|
||||
uint64_t prevStep = step - StepPerSlice;
|
||||
volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size);
|
||||
int spins = 0;
|
||||
while (*ptr != -1) if (checkAbort(spins)) break;
|
||||
while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
|
||||
if (flags & NetDeviceUnpack) {
|
||||
@@ -872,7 +888,7 @@ public:
|
||||
int spins = 0;
|
||||
volatile uint64_t* tail = conn->tail;
|
||||
volatile uint64_t* head = conn->head;
|
||||
while (*tail > *head) if (checkAbort(spins)) break;
|
||||
while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -895,7 +911,7 @@ public:
|
||||
if (slot) {
|
||||
T* exchgPtr;
|
||||
directBuff = (T*)outputBuf;
|
||||
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
|
||||
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(flags, Aborted, spins));
|
||||
if (P2p) {
|
||||
exchgPtr = (T*)outputBuf;
|
||||
} else {
|
||||
@@ -912,7 +928,7 @@ public:
|
||||
void* ptr;
|
||||
while (slot) {
|
||||
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
|
||||
if (slot) {
|
||||
@@ -931,7 +947,7 @@ public:
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
T* exchgPtr;
|
||||
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
|
||||
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins));
|
||||
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
|
||||
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
|
||||
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
|
||||
@@ -961,7 +977,7 @@ public:
|
||||
void* ptr;
|
||||
while (slot) {
|
||||
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
@@ -972,7 +988,7 @@ public:
|
||||
while (true) {
|
||||
arg0 = *argSlot0;
|
||||
arg1 = *argSlot1;
|
||||
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
|
||||
if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
|
||||
}
|
||||
@@ -1020,8 +1036,8 @@ public:
|
||||
// Thin forwarder to genericOp: passes -1 in the input-index position and
// outIx as the output index; eltN and postOp are handed through unchanged
// (postOp defaults to false).
// NOTE(review): the meaning of the template arguments <0, 0, 1, 0, -1, Output>
// is defined by genericOp, which is not visible here — confirm there.
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);
|
||||
// Single-index form of directRecv: outIx is passed to genericOp as BOTH the
// input index and the output index; eltN and postOp are handed through
// unchanged (postOp defaults to false).
// Differs from recv() only in the first template argument (1 instead of 0)
// and in passing outIx rather than -1 as the first call argument.
// NOTE(review): presumably the leading template argument selects the
// "direct" receive path — confirm against genericOp's template parameters.
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
|
||||
@@ -1099,54 +1115,65 @@ public:
|
||||
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
|
||||
nelem = nelem < 0 ? 0 : nelem;
|
||||
__device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
|
||||
if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped
|
||||
int nelem = ps->nelem < 0 ? 0 : ps->nelem;
|
||||
T* userInput = (T*)ncclShmem.groups[group].userInput;
|
||||
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
|
||||
|
||||
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
|
||||
int spins = 0;
|
||||
while (connStepCache < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
if (postRecv) step += StepPerSlice;
|
||||
bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv));
|
||||
bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend));
|
||||
bool postRecv = ps->postRecv && recv;
|
||||
bool postSend = ps->postSend && send;
|
||||
struct ncclPatPeer* peer = NULL;
|
||||
if (recv) {
|
||||
peer = shmem->recvDims+ps->recvDim;
|
||||
step = peer->step;
|
||||
}
|
||||
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
|
||||
if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) {
|
||||
// New data, add our own data to it.
|
||||
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
|
||||
accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
|
||||
if (flags & ConnFifoEnabled)
|
||||
connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
|
||||
} else {
|
||||
// There is already data in there, accumulate instead of writing to it.
|
||||
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
|
||||
}
|
||||
if (postSend) step += StepPerSlice;
|
||||
if (send) {
|
||||
peer = shmem->sendDims+ps->sendDim;
|
||||
step = peer->step;
|
||||
}
|
||||
if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
|
||||
ncclShmem.groups[group].dsts[0] = userOutput + outIx;
|
||||
if (accSize < outIx + nelem) {
|
||||
|
||||
if (recv && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset;
|
||||
int spins = 0;
|
||||
while (peer->stepCache < step + StepPerSlice) {
|
||||
peer->stepCache = loadStepValue(peer->tailPtr);
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
}
|
||||
if (send && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) {
|
||||
peer->stepCache = loadStepValue(peer->headPtr);
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset;
|
||||
if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) {
|
||||
// New data, add our own data to it.
|
||||
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
|
||||
accSize = outIx + nelem;
|
||||
ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx;
|
||||
} else {
|
||||
// There is already data in there, accumulate instead of writing to it.
|
||||
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
long long int localAccSize = shmem->localAccSize;
|
||||
if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
|
||||
ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx;
|
||||
if (localAccSize < ps->outIx + nelem) {
|
||||
// New data, add our own data to it.
|
||||
ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx;
|
||||
localAccSize = ps->outIx + nelem;
|
||||
} else {
|
||||
// There is already data in there, accumulate instead of writing to it.
|
||||
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
|
||||
}
|
||||
}
|
||||
patBarrier();
|
||||
int nSrcs = 2;
|
||||
void** srcs = ncclShmem.groups[group].srcs;
|
||||
if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
|
||||
if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
|
||||
|
||||
int workSize = ncclShmem.aborted ? 0 : nelem;
|
||||
|
||||
@@ -1154,59 +1181,92 @@ public:
|
||||
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
|
||||
nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);
|
||||
|
||||
barrier();
|
||||
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
|
||||
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
|
||||
// Store conn step here inside the two barriers to make sure next reload will see the update.
|
||||
if (postSend && (flags & RolePostSend)) {
|
||||
if (peer->connFifo) {
|
||||
peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
|
||||
}
|
||||
peer->step = step += StepPerSlice;
|
||||
st_relaxed_sys_global(&peer->conn->step, step);
|
||||
}
|
||||
if (postRecv && (flags & RolePostRecv)) {
|
||||
peer->step = step += StepPerSlice;
|
||||
st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
|
||||
}
|
||||
|
||||
// Update accSize
|
||||
if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize);
|
||||
if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
|
||||
|
||||
patBarrier();
|
||||
|
||||
if (postSend && (flags & RolePostSend)) {
|
||||
if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
|
||||
st_relaxed_sys_global(peer->tailPtr, step);
|
||||
}
|
||||
if (postRecv && (flags & RolePostRecv)) {
|
||||
st_relaxed_sys_global(peer->headPtr, step);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
|
||||
nelem = nelem < 0 ? 0 : nelem;
|
||||
__device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) {
|
||||
if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped
|
||||
int nelem = ps->nelem < 0 ? 0 : ps->nelem;
|
||||
T* userInput = (T*)ncclShmem.groups[group].userInput;
|
||||
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
|
||||
|
||||
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
|
||||
int spins = 0;
|
||||
while (connStepCache < step + recvStepOffset + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) {
|
||||
// New data, copy to our output buffer.
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
|
||||
accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
|
||||
} else {
|
||||
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
|
||||
}
|
||||
if (postRecv) step += StepPerSlice;
|
||||
bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv));
|
||||
bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend));
|
||||
bool postRecv = ps->postRecv && recv;
|
||||
bool postSend = ps->postSend && send;
|
||||
struct ncclPatPeer* peer = NULL;
|
||||
if (recv) {
|
||||
peer = shmem->recvDims+ps->recvDim;
|
||||
step = peer->step;
|
||||
}
|
||||
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
while (connStepCache + NCCL_STEPS < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
|
||||
if (postSend) {
|
||||
if (flags & ConnFifoEnabled)
|
||||
connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
|
||||
step += StepPerSlice;
|
||||
}
|
||||
if (send) {
|
||||
peer = shmem->sendDims+ps->sendDim;
|
||||
step = peer->step;
|
||||
}
|
||||
if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
|
||||
ncclShmem.groups[group].srcs[0] = userInput + inpIx;
|
||||
if (accSize < inpIx + nelem) {
|
||||
|
||||
if (recv && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset;
|
||||
int spins = 0;
|
||||
while (peer->stepCache < step + ps->stepOffset + StepPerSlice) {
|
||||
peer->stepCache = loadStepValue(peer->tailPtr);
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) {
|
||||
// New data, copy to our output buffer.
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
|
||||
accSize = inpIx + nelem;
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx;
|
||||
} else {
|
||||
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
if (send && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) {
|
||||
peer->stepCache = loadStepValue(peer->headPtr);
|
||||
if (checkAbort(flags, Aborted, spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset;
|
||||
}
|
||||
long long int localAccSize = shmem->localAccSize;
|
||||
if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer
|
||||
ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx;
|
||||
if (localAccSize < ps->inpIx + nelem) {
|
||||
// New data, copy to our output buffer.
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx;
|
||||
localAccSize = ps->inpIx + nelem;
|
||||
} else {
|
||||
// Already done
|
||||
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0];
|
||||
}
|
||||
}
|
||||
patBarrier();
|
||||
int nDsts = 2;
|
||||
void** dsts = ncclShmem.groups[group].dsts;
|
||||
if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
|
||||
if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
|
||||
if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.
|
||||
|
||||
int workSize = ncclShmem.aborted ? 0 : nelem;
|
||||
@@ -1215,9 +1275,32 @@ public:
|
||||
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
|
||||
1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
|
||||
|
||||
barrier();
|
||||
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
|
||||
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
|
||||
// Store conn step here inside the two barriers to make sure next reload will see the update.
|
||||
if (postSend && (flags & RolePostSend)) {
|
||||
if (peer->connFifo) {
|
||||
peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T);
|
||||
}
|
||||
peer->step = step += StepPerSlice;
|
||||
st_relaxed_sys_global(&peer->conn->step, step);
|
||||
}
|
||||
if (postRecv && (flags & RolePostRecv)) {
|
||||
peer->step = step += StepPerSlice;
|
||||
st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op
|
||||
}
|
||||
|
||||
// Update accSize
|
||||
if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize);
|
||||
if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize);
|
||||
|
||||
patBarrier();
|
||||
|
||||
if (postSend && (flags & RolePostSend)) {
|
||||
if (nelem > 0 || peer->connFifo) fence_acq_rel_sys();
|
||||
st_relaxed_sys_global(peer->tailPtr, step);
|
||||
}
|
||||
if (postRecv && (flags & RolePostRecv)) {
|
||||
st_relaxed_sys_global(peer->headPtr, step);
|
||||
}
|
||||
}
|
||||
|
||||
// MSCCL primitives
|
||||
|
||||
@@ -170,29 +170,66 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
size_t count, channelOffset, channelCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs);
|
||||
static constexpr int nworkers = NCCL_PAT_NWORKERS;
|
||||
struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
|
||||
uint64_t pollCount = 0;
|
||||
__syncthreads(); // Don't start using shared mem until everyone arrives
|
||||
for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
|
||||
if (tid == 0) shmem->localAccSize = 0;
|
||||
if (tid == nworkers) shmem->parallelFactor = 0;
|
||||
__syncthreads();
|
||||
|
||||
PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int last = 0;
|
||||
while (!last) {
|
||||
int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
|
||||
size_t inpIx, outIx;
|
||||
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
|
||||
prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
|
||||
if (tid == nworkers) { // Algo computation thread
|
||||
PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
|
||||
int step = 0;
|
||||
while (1) {
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
|
||||
while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
|
||||
patAlgo.getNextOp(ps);
|
||||
int last = ps->last;
|
||||
step++;
|
||||
if (last == 2) break;
|
||||
}
|
||||
} else if (tid < nworkers) { // Worker threads
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
int parallelFactor = 0;
|
||||
volatile int* pfPtr = &shmem->parallelFactor;
|
||||
while (parallelFactor == 0) parallelFactor = *pfPtr;
|
||||
|
||||
int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE;
|
||||
int group = tid / groupSize;
|
||||
int nGroups = nworkers / groupSize;
|
||||
int tidInGroup = tid - group*groupSize;
|
||||
// We don't use recvPeers/sendPeers so let's pass shmem structs instead
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs);
|
||||
|
||||
int step = group;
|
||||
while(1) {
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> poll(ps->flags);
|
||||
while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread
|
||||
int last = ps->last;
|
||||
prims.patReduce(ps, shmem);
|
||||
if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread
|
||||
if (last) break;
|
||||
step += nGroups;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
|
||||
@@ -122,7 +122,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
size_t cursor = 0;
|
||||
do {
|
||||
int n = min(size_t(chunkSize), bytes-cursor);
|
||||
prims.directRecv(cursor, cursor, n);
|
||||
prims.directRecv(cursor, n);
|
||||
cursor += n;
|
||||
} while (cursor < bytes);
|
||||
|
||||
|
||||
+214
-89
@@ -84,7 +84,6 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
|
||||
ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
|
||||
constexpr int KernelCount = sizeof(rcclKernelTable)/sizeof(rcclKernelTable[0]);
|
||||
ncclResult_t result = ncclSuccess;
|
||||
int print = 0;
|
||||
|
||||
if (maxStackSize) *maxStackSize = 0;
|
||||
int carveout = ncclParamL1SharedMemoryCarveout();
|
||||
@@ -115,11 +114,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
|
||||
if (ncclMaxSharedMem != 0) {
|
||||
int sharedMemSize = ncclMaxSharedMem;
|
||||
if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
|
||||
if (print++ == 0)
|
||||
INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
|
||||
// Reduce requested MaxDynamicSharedMemorySize attribute
|
||||
sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
|
||||
WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
|
||||
return ncclSystemError;
|
||||
}
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
|
||||
@@ -366,6 +363,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
|
||||
devWork.isOneRPN = comm->isOneRPN;
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
|
||||
if (task->regBufType & NCCL_NET_REG_BUFFER)
|
||||
devWork.netRegUsed = 1;
|
||||
if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
|
||||
@@ -467,6 +465,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
struct ncclTaskColl* next = aggBeg->next;
|
||||
aggBeg->algorithm = agg.algorithm;
|
||||
aggBeg->protocol = agg.protocol;
|
||||
if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4;
|
||||
aggBeg->nMaxChannels = agg.nMaxChannels;
|
||||
aggBeg->nWarps = agg.nWarps;
|
||||
aggBeg->devFuncId = agg.devFuncId;
|
||||
@@ -526,6 +525,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
|
||||
devWork.oneNode = (comm->nNodes == 1);
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
|
||||
if (task->regBufType & NCCL_NET_REG_BUFFER)
|
||||
devWork.netRegUsed = 1;
|
||||
if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER))
|
||||
@@ -559,6 +559,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Submit `op` through addProxyOpIfNeeded() with its pattern temporarily
// replaced by ncclPatternProfiler.
//
// The caller's `op` is reused as-is: only op->pattern is overwritten for the
// duration of the call and it is put back afterwards, so the caller observes
// the same pattern it passed in whether or not the call succeeded.
//
// Returns the result of addProxyOpIfNeeded() unchanged.
static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) {
  // Remember the pattern the caller configured before overriding it.
  const int savedPattern = op->pattern;

  op->pattern = ncclPatternProfiler;
  const ncclResult_t status = addProxyOpIfNeeded(comm, plan, op);

  // Restore unconditionally (no early return above) so a failure does not
  // leave the op tagged with the profiler pattern.
  op->pattern = savedPattern;
  return status;
}
|
||||
|
||||
RCCL_PARAM(IntraNetThreshold, "INTRANET_THRESHOLD", 8388608);
|
||||
|
||||
static ncclResult_t scheduleCollTasksToPlan(
|
||||
@@ -571,7 +579,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
|
||||
int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
|
||||
comm->nChannels, comm->nvlsChannels};
|
||||
constexpr size_t MinTrafficPerChannel = 512; // Traffic as minimal
|
||||
constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
|
||||
do {
|
||||
size_t workBytes = 0;
|
||||
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
|
||||
@@ -634,11 +642,16 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
proxyOp.opCount = proxyOpId;
|
||||
proxyOp.task.coll = task;
|
||||
proxyOp.rank = comm->rank;
|
||||
proxyOp.eActivationMask = task->eActivationMask;
|
||||
proxyOp.incWorkCounter = true;
|
||||
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
|
||||
// Set pattern to profiler to add a proxy profiler for kernel events
|
||||
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
|
||||
NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp));
|
||||
}
|
||||
} else { // not task->isCollnet
|
||||
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
|
||||
if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4;
|
||||
size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
|
||||
int elementsPerCell = cellSize/elementSize;
|
||||
size_t cells = divUp(task->count*elementSize, cellSize);
|
||||
@@ -762,6 +775,8 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
}
|
||||
proxyOp->ringAlgo->incRefCount();
|
||||
}
|
||||
proxyOp->eActivationMask = task->eActivationMask;
|
||||
proxyOp->incWorkCounter = true;
|
||||
proxyOp->connIndex = 0;
|
||||
if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
|
||||
if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
|
||||
@@ -773,6 +788,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
// determine if that's actually true but it's also not clear if that would be an issue.
|
||||
// coverity[uninit_use_in_call:FALSE]
|
||||
NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
|
||||
NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -915,7 +931,8 @@ static ncclResult_t addP2pToPlan(
|
||||
if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
|
||||
|
||||
if (network[dir]) {
|
||||
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) {
|
||||
bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1;
|
||||
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) {
|
||||
int regFlag = 0;
|
||||
NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax));
|
||||
for (int part = 0; part < nChannelsMax; part++) {
|
||||
@@ -991,6 +1008,7 @@ static ncclResult_t addP2pToPlan(
|
||||
work->recvRank = recvRank;
|
||||
work->recvAddr = recvAddr;
|
||||
work->recvBytes = recvBytes==-1 ? 0 : recvBytes;
|
||||
work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh);
|
||||
work->recvConnIndex = connIndex[0];
|
||||
work->recvOpCount = recvOpCount;
|
||||
|
||||
@@ -1010,6 +1028,7 @@ static ncclResult_t addP2pToPlan(
|
||||
op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
|
||||
op->task.p2p = p2pTasks[dir];
|
||||
op->rank = comm->rank;
|
||||
op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0;
|
||||
op->connIndex = connIndex[dir];
|
||||
// The following are modified per channel part in addWorkToChannels():
|
||||
// op->buffer, op->nbytes, op->nsteps = ...;
|
||||
@@ -1017,6 +1036,7 @@ static ncclResult_t addP2pToPlan(
|
||||
|
||||
nChannelsMax = std::max(nChannels[0], nChannels[1]);
|
||||
for (int part=0; part < nChannelsMax; part++) {
|
||||
int incWorkCounter = -1;
|
||||
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part, comm->p2pnChannelsPerPeer, comm->nNodes);
|
||||
plan->channelMask.masks[channelId/64] |= uint64_t(1)<<(channelId%64);
|
||||
// Add batch first.
|
||||
@@ -1058,12 +1078,19 @@ static ncclResult_t addP2pToPlan(
|
||||
}
|
||||
}
|
||||
|
||||
// Increment work counter for <send, recv> pair rather than individual p2p
|
||||
if (proxyOps[dir].nsteps && incWorkCounter < 0) {
|
||||
proxyOps[dir].incWorkCounter = true;
|
||||
incWorkCounter = dir;
|
||||
}
|
||||
|
||||
if (proxyOps[dir].nsteps != 0) {
|
||||
// Calculate the opCount after adding batch since then the batch count will
|
||||
// equal one plus the batch index this p2p settled in.
|
||||
proxyOps[dir].channelId = channelId;
|
||||
proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1;
|
||||
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
|
||||
NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1289,22 +1316,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
struct uploadWork_cleanup_t* cleanup = nullptr;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
void* fifoBufDev = nullptr;
|
||||
cudaStream_t deviceStream;
|
||||
|
||||
CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail);
|
||||
|
||||
// Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
|
||||
// user's graph will be launched later, and it also acquires the deviceStream,
|
||||
// it will observe this upload.
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail);
|
||||
// Acquire deviceStream. Since the user's graph will be launched later and it also
|
||||
// acquires the deviceStream, it will observe this upload.
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail);
|
||||
|
||||
CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail);
|
||||
CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail);
|
||||
plan->workBufPersistent = fifoBufDev;
|
||||
plan->kernelArgs->workBuf = fifoBufDev;
|
||||
|
||||
// coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail);
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail);
|
||||
cudaEvent_t memcpyDone;
|
||||
CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail);
|
||||
CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail);
|
||||
CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail);
|
||||
cleanup->base.fn = uploadWork_cleanup_fn;
|
||||
@@ -1312,7 +1340,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
cleanup->hostBuf = fifoBufHost;
|
||||
ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup);
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail);
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail);
|
||||
|
||||
finish_scope:
|
||||
@@ -1386,15 +1414,38 @@ static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
|
||||
if (result != ncclSuccess) {
|
||||
WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
|
||||
}
|
||||
if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs);
|
||||
if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs);
|
||||
return;
|
||||
}
|
||||
|
||||
static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) {
|
||||
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
|
||||
if (plan->persistent) {
|
||||
comm->persistentRefs -= 1;
|
||||
NCCLCHECK(ncclCudaFree(plan->workBufPersistent));
|
||||
comm->sharedRes->persistentRefs -= 1;
|
||||
comm->localPersistentRefs -= 1;
|
||||
if (plan->workStorageType == ncclDevWorkStorageTypePersistent) {
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECK(cudaFree(plan->workBufPersistent));
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
}
|
||||
}
|
||||
// Free coll tasks
|
||||
struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
|
||||
while (ct != nullptr) {
|
||||
struct ncclTaskColl* ct1 = ct->next;
|
||||
free(ct->sendNetHandles);
|
||||
free(ct->recvNetHandles);
|
||||
free(ct->srecvNetHandles);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
|
||||
ct = ct1;
|
||||
}
|
||||
// Free p2p tasks
|
||||
struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
|
||||
while (pt != nullptr) {
|
||||
struct ncclTaskP2p* pt1 = pt->next;
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
|
||||
pt = pt1;
|
||||
}
|
||||
// Free proxy ops
|
||||
struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
|
||||
@@ -1427,6 +1478,32 @@ static void persistentDestructor(void* plans_) {
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0);
|
||||
|
||||
// File-local enum selecting how launches are implicitly ordered against the
// per-context `launchOrder` strong stream. The value is chosen by
// getImplicitOrder() and consumed by ncclLaunchPrepare(), ncclLaunchKernel()
// and ncclLaunchFinish().
namespace {
|
||||
enum ncclImplicitOrder {
|
||||
// No implicit ordering: callers skip all launchOrder handling.
ncclImplicitOrderNone,
|
||||
// Order via the completion event (scratchEvent); ncclLaunchFinish() describes
// this as serializing execution.
ncclImplicitOrderSerial,
|
||||
// Order via the launch-completion event (launchEvent) attached to the kernel
// launch with CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT.
ncclImplicitOrderLaunch
|
||||
};
|
||||
}
|
||||
|
||||
// Decide which implicit launch-ordering mode applies and store it in *mode.
//
//   mode      - output; always written on the success paths below.
//   capturing - true when the launch is being captured into a graph.
//   driver    - driver version; when negative (the default) it is queried
//               via ncclCudaDriverVersion() on the non-HIP path.
//
// Returns ncclSuccess on every path visible here; NCCLCHECK presumably
// propagates a failure from ncclCudaDriverVersion() -- confirm against the
// macro definition.
static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) {
|
||||
// Implicit ordering is opt-in through the LAUNCH_ORDER_IMPLICIT parameter.
if (ncclParamLaunchOrderImplicit()) {
|
||||
// This branch is compiled unless both __HIP_PLATFORM_AMD__ and __HIPCC__ are defined.
#if !defined(__HIP_PLATFORM_AMD__) || !defined(__HIPCC__)
|
||||
// Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs
|
||||
if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
|
||||
// Only query the driver version when the caller did not supply one.
if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); }
|
||||
// Launch mode needs both the runtime and the driver to be at least 12030;
// otherwise fall back to serial ordering.
*mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
|
||||
#else
|
||||
// HIP build: implicit ordering is reported as disabled even when the parameter is set.
*mode = ncclImplicitOrderNone;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Parameter not set: no implicit ordering.
*mode = ncclImplicitOrderNone;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
@@ -1474,63 +1551,65 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
|
||||
if (nPlans == 0) return ncclSuccess;
|
||||
|
||||
// Semantically we want these dependencies for the kernels launched:
|
||||
// 1. Launch host task on hostStream.
|
||||
// 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...}
|
||||
// 3. {deviceStream, userStream[i]...} depend on kernel.
|
||||
// We achieve this by:
|
||||
// 1. userStream[0] waits on deviceStream
|
||||
// 2. deviceStream waits on each of userStream[1...]
|
||||
// 3. host task launch on hostStream
|
||||
// 4. userStream[0] waits on hostStream
|
||||
// 5. kernel launch on userStream[0]
|
||||
// 6. deviceStream waits on userStream[0]
|
||||
// 7. userStream[1...] each waits on deviceStream
|
||||
// The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
|
||||
// at least one of the two streams to be strong-stream.
|
||||
cudaStream_t launchStream = planner->streams->stream;
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure);
|
||||
cudaStream_t deviceStream, launchOrder;
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure);
|
||||
|
||||
if (planner->numStreams != 1 || persistent) {
|
||||
// Create dependency for device stream on user streams. First from extra user
|
||||
// streams to deviceStream. Then deviceStream to first user stream.
|
||||
if (persistent || planner->numStreams != 1) {
|
||||
// userStream[0] waits on each userStream[i]...
|
||||
for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
|
||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
|
||||
CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure);
|
||||
CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
|
||||
// userStream[0] waits on deviceStream
|
||||
NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure);
|
||||
} else if (planner->streams->stream != comm->lastStream && comm->lastStream != nullptr && !persistent) {
|
||||
// Stream changed from last call, create dependency against last NCCL kernel launch
|
||||
CUDACHECK(hipStreamWaitEvent(planner->streams->stream, comm->doneEvent, 0));
|
||||
CUDACHECKGOTO(hipStreamWaitEvent(planner->streams->stream, comm->doneEvent, 0), result, failure);
|
||||
}
|
||||
|
||||
if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) {
|
||||
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
|
||||
enum ncclImplicitOrder implicitOrder;
|
||||
NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure);
|
||||
|
||||
if (implicitOrder != ncclImplicitOrderNone) {
|
||||
// userStream[0] waits on per-device (context) launchOrder. Concurrent strong stream access is
|
||||
// required if this is a graph capture, non-captured cannot be concurrent because that would violate
|
||||
// deterministic program order of launches.
|
||||
bool concurrent = capturing;
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure);
|
||||
NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure);
|
||||
}
|
||||
|
||||
if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) {
|
||||
// We have to launch host tasks to push proxy args. We are careful to only
|
||||
// do this if necessary since host tasks impose a high performance cost in CUDA.
|
||||
bool acquired = false;
|
||||
cudaStream_t hostStream;
|
||||
for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) {
|
||||
if (plan->hasProxyOps) {
|
||||
if (!acquired) {
|
||||
acquired = true;
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure);
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure);
|
||||
}
|
||||
if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs);
|
||||
if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs);
|
||||
plan->isHostCbEnq = true;
|
||||
NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
|
||||
CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure);
|
||||
}
|
||||
}
|
||||
if (acquired) {
|
||||
// Make to-be-launched kernels dependent on just-launched host stream tasks.
|
||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure);
|
||||
NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure);
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure);
|
||||
}
|
||||
}
|
||||
|
||||
if (persistent) {
|
||||
comm->persistentRefs += nPlans;
|
||||
comm->sharedRes->persistentRefs += nPlans;
|
||||
comm->localPersistentRefs += nPlans;
|
||||
NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure);
|
||||
}
|
||||
}
|
||||
|
||||
failure:
|
||||
return result;
|
||||
}
|
||||
@@ -1549,6 +1628,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
|
||||
#endif
|
||||
|
||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
int nChannels = 0;
|
||||
for (int i = 0; i < MAXCHANNELS/64; i++)
|
||||
@@ -1561,23 +1641,28 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};
|
||||
|
||||
if (planner->numStreams == 1 && !plan->persistent) {
|
||||
CUDACHECK(hipExtLaunchKernel(plan->kernelFn, grid, block, extra, 0, launchStream, NULL, comm->doneEvent, 0));
|
||||
comm->lastStream = planner->streams->stream;
|
||||
CUDACHECKGOTO(hipExtLaunchKernel(plan->kernelFn, grid, block, extra, 0, launchStream, NULL, comm->doneEvent, 0), ret, do_return);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// CUfunction fn;
|
||||
// CUDACHECK(cudaGetFuncBySymbol(&fn, sym));
|
||||
|
||||
#if CUDART_VERSION >= 11080
|
||||
#if !defined(__HIP_PLATFORM_AMD__) || !defined(__HIPCC__)
|
||||
int driverVersion;
|
||||
NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
|
||||
if (driverVersion >= 11080) {
|
||||
NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return);
|
||||
|
||||
CUfunction fn;
|
||||
CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return);
|
||||
|
||||
if (CUDART_VERSION >= 11080 && driverVersion >= 11080) {
|
||||
#if CUDART_VERSION >= 11080
|
||||
int compCap = comm->compCap;
|
||||
unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
|
||||
|
||||
CUlaunchConfig launchConfig = {0};
|
||||
CUlaunchAttribute launchAttrs[3];
|
||||
CUlaunchAttribute launchAttrs[4] = {};
|
||||
int attrs = 0;
|
||||
/* Cooperative Group Array (CGA)
|
||||
* On sm90 and later we have an extra level of hierarchy where we
|
||||
@@ -1604,6 +1689,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain();
|
||||
}
|
||||
#endif
|
||||
#if CUDART_VERSION >= 12030
|
||||
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
|
||||
enum ncclImplicitOrder implicitOrder;
|
||||
NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return);
|
||||
if (implicitOrder == ncclImplicitOrderLaunch) {
|
||||
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT;
|
||||
launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent;
|
||||
launchAttrs[attrs].value.launchCompletionEvent.flags = 0;
|
||||
attrs++;
|
||||
}
|
||||
#endif
|
||||
launchConfig.gridDimX = grid.x;
|
||||
launchConfig.gridDimY = grid.y;
|
||||
launchConfig.gridDimZ = grid.z;
|
||||
@@ -1615,15 +1711,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
launchConfig.numAttrs = attrs;
|
||||
launchConfig.hStream = launchStream;
|
||||
|
||||
//CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args));
|
||||
CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra));
|
||||
return ncclSuccess;
|
||||
}
|
||||
CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return);
|
||||
#endif
|
||||
} else {
|
||||
// Standard kernel launch
|
||||
CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return);
|
||||
}
|
||||
#endif
|
||||
// Standard kernel launch
|
||||
//cuLaunchKernel(sym, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra);
|
||||
CUDACHECK(cudaLaunchKernel(sym, grid, block, extra, smem, launchStream));
|
||||
return ncclSuccess;
|
||||
CUDACHECKGOTO(cudaLaunchKernel(sym, grid, block, extra, smem, launchStream), ret, do_return);
|
||||
|
||||
do_return:
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
@@ -1643,36 +1743,51 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
|
||||
}
|
||||
|
||||
ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
bool persistent = ncclCudaGraphValid(planner->capturingGraph);
|
||||
|
||||
if (!ncclIntruQueueEmpty(&planner->planQueue)) {
|
||||
// Reset queue to empty without destroying plans since those will be sent
|
||||
// back to us for reclaiming via callbackQueue.
|
||||
ncclIntruQueueConstruct(&planner->planQueue);
|
||||
|
||||
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
|
||||
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
|
||||
// Create dependency for deviceStream on launchStream. We know that deviceStream
|
||||
// hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
|
||||
// so we can say that launchStream subsumes it.
|
||||
if (persistent || planner->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
|
||||
resume1:
|
||||
// Create dependency for other user streams (skip launch stream) on deviceStream.
|
||||
// Again, the user streams haven't been touched since deviceStream waited on them
|
||||
// so we can say they are subsumed by deviceStream.
|
||||
struct ncclCudaStreamList* sl = planner->streams->next;
|
||||
planner->streams = nullptr; // Reset comm->planner.streams to empty.
|
||||
while (sl != nullptr && (planner->numStreams != 1 || persistent)) {
|
||||
NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
|
||||
resume2:
|
||||
sl = sl->next;
|
||||
cudaStream_t deviceStream, launchOrder;
|
||||
|
||||
if (capturing || planner->numStreams != 1) {
|
||||
// CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream));
|
||||
// deviceStream waits on userStream[0]
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
// We know that deviceStream is strictly behind the launchStream because launchStream
|
||||
// synced with it before kernel launch. This allows us to see deviceStream waiting
|
||||
// on launchStream as a fast-forward. When building CUDA graphs fast forwards should
|
||||
// be handled specially so as not to create graphs with a blowup in the number of edges.
|
||||
// So we could do this:
|
||||
// CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0));
|
||||
// But instead we do:
|
||||
NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent));
|
||||
|
||||
// Each userStream[i] waits on userStream[0]
|
||||
for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
|
||||
CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0));
|
||||
}
|
||||
}
|
||||
planner->numStreams = 0;
|
||||
// Release device stream as acquired in ncclLaunchPrepare()
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
|
||||
resume3:;
|
||||
enum ncclImplicitOrder implicitOrder;
|
||||
NCCLCHECK(getImplicitOrder(&implicitOrder, capturing));
|
||||
if (implicitOrder != ncclImplicitOrderNone) {
|
||||
// As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured.
|
||||
bool concurrent = capturing;
|
||||
// Incorporate launch event into per-device (context) launch order.
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder));
|
||||
// If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution).
|
||||
CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent));
|
||||
// Release launchOrder as acquired in ncclLaunchPrepare()
|
||||
NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent));
|
||||
}
|
||||
// Release deviceStream as acquired in ncclLaunchPrepare()
|
||||
NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false));
|
||||
}
|
||||
return result;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
@@ -1780,11 +1895,11 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
|
||||
char ncclAlgoEnvStr[1024] = "";
|
||||
char ncclProtoEnvStr[1024] = "";
|
||||
char* algoEnv = getenv("NCCL_ALGO");
|
||||
const char* algoEnv = ncclGetEnv("NCCL_ALGO");
|
||||
if (algoEnv) {
|
||||
snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv);
|
||||
}
|
||||
char* protoEnv = getenv("NCCL_PROTO");
|
||||
const char* protoEnv = ncclGetEnv("NCCL_PROTO");
|
||||
if (protoEnv) {
|
||||
snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv);
|
||||
}
|
||||
@@ -2265,12 +2380,13 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
|
||||
p2p->buff = (void*)info->recvbuff;
|
||||
p2p->count = info->count;
|
||||
p2p->datatype = info->datatype;
|
||||
p2p->root = info->root;
|
||||
p2p->bytes = nBytes;
|
||||
p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
|
||||
p2p->opCount = comm->opCount;
|
||||
ncclIntruQueueEnqueue(
|
||||
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
|
||||
@@ -2280,6 +2396,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
// Mark channels that need pre-connect
|
||||
if (comm->rank != peer) {
|
||||
if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) {
|
||||
// planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway.
|
||||
(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true;
|
||||
int round = 0;
|
||||
while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
|
||||
@@ -2290,23 +2407,30 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c, comm->p2pnChannelsPerPeer, comm->nNodes);
|
||||
if (isSendNotRecv) {
|
||||
if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
|
||||
if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
|
||||
// the send/recv connector is shared among split shared comms. We need to set hasSeen to
|
||||
// 1 in order to avoid duplicate connection setup if the user groups sendrecv ops with split
|
||||
// shared comms together.
|
||||
comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
|
||||
//comm->connectSend[peer] |= (1UL<<channelId);
|
||||
comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
|
||||
ncclGroupCommPreconnect(comm);
|
||||
}
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
|
||||
comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
|
||||
//comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
|
||||
comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
|
||||
ncclGroupCommPreconnect(comm);
|
||||
}
|
||||
} else {
|
||||
if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
|
||||
if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector
|
||||
comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
|
||||
//comm->connectRecv[peer] |= (1UL<<channelId);
|
||||
comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
|
||||
ncclGroupCommPreconnect(comm);
|
||||
}
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].hasSeen == 0) {
|
||||
comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1;
|
||||
//comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
|
||||
comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
|
||||
ncclGroupCommPreconnect(comm);
|
||||
@@ -2337,7 +2461,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
} else {
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
struct ncclTaskColl* t = ncclMemoryStackAlloc<struct ncclTaskColl>(&comm->memScoped);
|
||||
struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
|
||||
t->func = info->coll;
|
||||
t->sendbuff = info->sendbuff;
|
||||
t->recvbuff = info->recvbuff;
|
||||
@@ -2355,6 +2479,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
t->opDev = opDev; // C++ struct assignment
|
||||
t->chunkSteps = info->chunkSteps;
|
||||
t->sliceSteps = info->sliceSteps;
|
||||
t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
|
||||
t->opCount = comm->opCount;
|
||||
|
||||
planner->nTasksColl += 1;
|
||||
|
||||
@@ -731,7 +731,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
|
||||
|
||||
// Alternate rings to avoid crossing rails
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) {
|
||||
for (int r=0; r<comm->nRanks; r++) {
|
||||
if (comm->rankToNode[r] % 2 == 1) {
|
||||
// Exchange rings
|
||||
|
||||
+78
-20
@@ -380,8 +380,8 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
|
||||
if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess;
|
||||
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
|
||||
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
|
||||
INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
|
||||
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
|
||||
TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
|
||||
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
|
||||
*ret = 1;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -389,9 +389,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
|
||||
|
||||
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
int ncclTopoUserGdrLevel = -1;
|
||||
const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" };
|
||||
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) {
|
||||
*useGdr = 0;
|
||||
NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0);
|
||||
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) {
|
||||
*gdrMode = ncclTopoGdrModeDisable;
|
||||
|
||||
// Get GPU and NET
|
||||
int n, g;
|
||||
@@ -434,7 +437,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
|
||||
else {
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME) {
|
||||
int i, d1 = -1, d2 = -1;
|
||||
for (i = 0; i < system->nodes[CPU].count; i++)
|
||||
if (system->nodes[GPU].nodes[g].paths[CPU][i].count == 2) break;
|
||||
@@ -452,25 +455,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
|
||||
int distance = gpu->paths[NET][n].type;
|
||||
if (distance == PATH_PXN) {
|
||||
// In case of PXN, use the intermediate GPU distance instead
|
||||
int proxyRank, g;
|
||||
int proxyRank;
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
|
||||
struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
|
||||
distance = proxyGpu->paths[NET][n].type;
|
||||
gpu = system->nodes[GPU].nodes+g;
|
||||
distance = gpu->paths[NET][n].type;
|
||||
}
|
||||
|
||||
int c;
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||
if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) {
|
||||
// On C2C platforms we can still use GDRDMA on NICs connected to the CPUs
|
||||
INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c);
|
||||
distance = PATH_C2C;
|
||||
}
|
||||
|
||||
if (distance > netGdrLevel) {
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
*useGdr = 1;
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read);
|
||||
// Force PCIe mapping if path goes through PCI on a C2C system
|
||||
if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci;
|
||||
else *gdrMode = ncclTopoGdrModeDefault;
|
||||
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) {
|
||||
int netNum = system->nodes[NET].count;
|
||||
int useGdr = 0;
|
||||
enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable;
|
||||
*avail = false;
|
||||
for (int n = 0; n < netNum; n++) {
|
||||
int64_t netId = system->nodes[NET].nodes[n].id;
|
||||
@@ -492,7 +507,7 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a
|
||||
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
|
||||
|
||||
// Determine whether we need to flush the GDR recv buffers
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) {
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush) {
|
||||
*flush = 1;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(comm->ncclNet->getProperties(netDev, &props));
|
||||
@@ -506,6 +521,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int*
|
||||
#else
|
||||
// Flush is required on Ampere and earlier
|
||||
if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
|
||||
// On C2C platforms, data could go through a PCI switch while completions and
|
||||
// flags would go through C2C. In that case, force a flush.
|
||||
int c, n;
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
||||
if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) {
|
||||
*flush = 1;
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -576,7 +599,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
|
||||
int ncclPxnDisable(struct ncclComm* comm) {
|
||||
static int pxnDisable = -1;
|
||||
if (pxnDisable == -1) {
|
||||
if (comm && ncclNetVersion(comm) == 4) {
|
||||
if (comm && comm->ncclNetVer == 4) {
|
||||
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
|
||||
pxnDisable = 1;
|
||||
} else {
|
||||
@@ -599,9 +622,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
|
||||
int proxyRank;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank));
|
||||
if (proxyRank == comm->rank) continue;
|
||||
int useGdr;
|
||||
enum ncclTopoGdrMode useGdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr));
|
||||
if (useGdr == 0) continue;
|
||||
if (useGdr == ncclTopoGdrModeDisable) continue;
|
||||
int found = 0;
|
||||
for (int r=0; r<nr; r++) {
|
||||
if (ranks[r] == proxyRank) found = 1;
|
||||
@@ -746,7 +769,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
}
|
||||
if (gpu->paths[NET][n].type < PATH_PHB) {
|
||||
// Update path when we dont want to / can't use GPU Direct RDMA.
|
||||
int gdr;
|
||||
enum ncclTopoGdrMode gdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr));
|
||||
if (gdr == 0) {
|
||||
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
|
||||
@@ -770,7 +793,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
int myDomain = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int remove = 1;
|
||||
int gdr = 1;
|
||||
enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDefault;
|
||||
bool allXgmi = true;
|
||||
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
|
||||
NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
|
||||
@@ -839,10 +862,10 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||
int64_t netId;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &netId, nullptr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netId, 1, &gdr), ret, fail);
|
||||
if (!gdr) break;
|
||||
NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netId, 1, &useGdr), ret, fail);
|
||||
if (!useGdr) break;
|
||||
}
|
||||
if (gdr && !allXgmi) {
|
||||
if (useGdr && !allXgmi) {
|
||||
remove = 0;
|
||||
system->type |= RCCL_TOPO_GDR_ALL;
|
||||
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
|
||||
@@ -1014,3 +1037,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink
|
||||
*allNvLink = maxPath >= PATH_PIX ? 0 : 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Check whether we are in a split NVLink situation, with two NVLink domains, not
|
||||
// connected through NVLink (e.g. QPI).
|
||||
ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
int nvlDomains = 0;
|
||||
int *nvlDomain = NULL, *nvlDomainCount = NULL;
|
||||
// Compute NVLink domains
|
||||
NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit);
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) nvlDomain[g] = g;
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
int domain = nvlDomain[g];
|
||||
for (int p=g+1; p<system->nodes[GPU].count; p++) {
|
||||
if (gpu->paths[GPU][p].type == PATH_NVL) {
|
||||
nvlDomain[p] = domain;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute number of GPUs per NVLink domain.
|
||||
NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit);
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
nvlDomainCount[nvlDomain[g]]++;
|
||||
}
|
||||
// Count the number of NVLink domains
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
if (nvlDomainCount[g] > 1) nvlDomains++;
|
||||
}
|
||||
*splitNvLink = nvlDomains == 2 ? 1 : 0;
|
||||
|
||||
exit:
|
||||
if(nvlDomain) free(nvlDomain);
|
||||
if(nvlDomainCount) free(nvlDomainCount);
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -2075,7 +2075,7 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
|
||||
|
||||
for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
|
||||
bool ignore_cpu = checkOption(romeTopoModels[i].options, "noCpuCheck");
|
||||
if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME))
|
||||
if (!ignore_cpu && (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME))
|
||||
continue;
|
||||
bool ignore_numa = checkOption(romeTopoModels[i].options, "disableNumaMatching");
|
||||
if (!ignore_numa && romeTopo.nCpus != romeTopoModels[i].nCpus) continue;
|
||||
@@ -2225,7 +2225,7 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
|
||||
// only valid on Rome
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME)
|
||||
return ncclSuccess;
|
||||
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
@@ -2396,7 +2396,7 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
|
||||
// only valid on Rome
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_MODEL_AMD_ROME)
|
||||
return ncclSuccess;
|
||||
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
@@ -2460,7 +2460,7 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
|
||||
}
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME)
|
||||
system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
parseOptions(system, rome_model_68.options);
|
||||
// create 4P4H based on reference and remapped ids
|
||||
|
||||
+35
-43
@@ -483,12 +483,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
|
||||
// 2. add other NETs satisfying typeInter but not already in the list.
|
||||
|
||||
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
|
||||
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int netCount = 0;
|
||||
int localNetCount;
|
||||
int* localNets;
|
||||
NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS));
|
||||
int localNets[MAXCHANNELS];
|
||||
|
||||
// First add the preferred NICs
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
@@ -497,8 +496,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
for (int c = 0; c<MAXCHANNELS; c++) {
|
||||
int64_t netId;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
|
||||
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
|
||||
localNetCount++;
|
||||
}
|
||||
@@ -506,7 +505,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
for (int i=0; i<localNetCount; i++) {
|
||||
int n = localNets[i];
|
||||
int found = 0;
|
||||
while (nets[found] != n && found<netCount) found++;
|
||||
while (found<netCount && nets[found] != n) found++;
|
||||
if (found == netCount) nets[netCount++] = n;
|
||||
}
|
||||
}
|
||||
@@ -525,22 +524,17 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
for (int i=0; i<localNetCount; i++) {
|
||||
int n = localNets[i];
|
||||
int found = 0;
|
||||
while (nets[found] != n && found<netCount) found++;
|
||||
while (found<netCount && nets[found] != n) found++;
|
||||
if (found == netCount) nets[netCount++] = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*netCountRet = netCount;
|
||||
exit:
|
||||
free(localNets);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if ((*time) <= 0) return ncclSuccess;
|
||||
(*time)--;
|
||||
|
||||
@@ -562,7 +556,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
|
||||
int g = gpu - system->nodes[GPU].nodes;
|
||||
int* nets = NULL;
|
||||
int nets[NCCL_TOPO_MAX_NODES];
|
||||
if (step == backToNet) {
|
||||
// first get back to NIC
|
||||
if (system->nodes[NET].count) {
|
||||
@@ -570,8 +564,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
|
||||
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
|
||||
int netCount;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
|
||||
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
|
||||
for (int i=0; i<netCount; i++) {
|
||||
int n = nets[i];
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
@@ -592,14 +585,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
graph->bwInter /= 2;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
graph->bwInter = bwInterSave;
|
||||
if (net) {
|
||||
graph->inter[graph->nChannels*2+1] = net->id;
|
||||
NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
|
||||
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
|
||||
graph->bwInter = bwInterSave;
|
||||
}
|
||||
}
|
||||
@@ -638,21 +631,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
// Next path
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
|
||||
}
|
||||
exit:
|
||||
if (nets) free(nets);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
const int bw = graph->bwInter;
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
int nets[NCCL_TOPO_MAX_NODES];
|
||||
int netCount;
|
||||
int graphFound = 0;
|
||||
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
|
||||
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
|
||||
for (int i=0; i<netCount; i++) {
|
||||
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
|
||||
int n = nets[(graph->nChannels+i)%netCount];
|
||||
@@ -676,7 +663,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
|
||||
if (graph->nChannels < netCount) {
|
||||
int gpu;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
|
||||
if (gpu != -1) {
|
||||
int duplicate = 0;
|
||||
// check whether there is duplicate head when one GPU connects with multiple NICs
|
||||
@@ -687,7 +674,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
}
|
||||
if (!duplicate) {
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
|
||||
graphFound = 1;
|
||||
}
|
||||
}
|
||||
@@ -696,8 +683,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
if (graph->nChannels > 0) {
|
||||
// Try to replay the last channel
|
||||
int g;
|
||||
NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
|
||||
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
|
||||
}
|
||||
if (graph->nChannels == 0 || graph->sameChannels == 0) {
|
||||
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
|
||||
@@ -708,16 +695,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i = 0; i<system->nodes[GPU].count; i++) {
|
||||
if (paths[i].count <= paths[f].count) {
|
||||
// prefer GPU direct RDMA
|
||||
int gdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr));
|
||||
if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) {
|
||||
enum ncclTopoGdrMode useGdr;
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &useGdr));
|
||||
if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && useGdr)) {
|
||||
f = i;
|
||||
f_gdr = gdr;
|
||||
f_gdr = useGdr;
|
||||
}
|
||||
}
|
||||
}
|
||||
int t = 1 << 10;
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
|
||||
if (t == -1) *time = -1;
|
||||
}
|
||||
|
||||
@@ -737,7 +724,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
int g = (graph->nChannels+i)%system->nodes[GPU].count;
|
||||
if (paths[g].bw == maxBw && paths[g].count == minHops) {
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -751,11 +738,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
}
|
||||
}
|
||||
exit:
|
||||
free(nets);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Search Patterns
|
||||
@@ -1061,7 +1044,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
graph->pattern == NCCL_TOPO_PATTERN_RING ? system->hostIdx % 2 : 0));
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) {
|
||||
if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_MODEL_AMD_ROME) {
|
||||
system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
}
|
||||
}
|
||||
@@ -1107,6 +1090,15 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
graph->minChannels = graph->maxChannels;
|
||||
}
|
||||
|
||||
int splitNvLink;
|
||||
NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink));
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) {
|
||||
// We have two sockets with NVLink and a slower link in between (typically QPI).
|
||||
// Tree is likely going to work better but it needs at least 2 channels.
|
||||
// Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels.
|
||||
if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2;
|
||||
}
|
||||
|
||||
struct ncclTopoGraph tmpGraph;
|
||||
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
|
||||
|
||||
|
||||
+84
-42
@@ -24,11 +24,11 @@
|
||||
|
||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "C2C", "PCI", "", "", "", "SYS", "NET" };
|
||||
// Printable names for the PATH_* path types; intended to be indexed by those
// constants, so it needs one entry for every value from PATH_LOC (0) through
// PATH_DIS (10). "NET" occupies the PATH_NET (9) slot: without it PATH_NET
// would print as "DIS" and PATH_DIS would index past the end of the table.
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||
#else
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||
#endif
|
||||
|
||||
/******************************************************************/
|
||||
@@ -51,7 +51,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
|
||||
static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) {
|
||||
*cpu = NULL;
|
||||
if (node->type == CPU) {
|
||||
*cpu = node;
|
||||
@@ -60,9 +60,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
|
||||
for (int l=0; l<node->nlinks; l++) {
|
||||
// Go up the PCI tree to find the CPU. Follow only PCI switches.
|
||||
if (node->links[l].type == LINK_PCI
|
||||
&& (node->links[l].remNode->type == PCI
|
||||
|| node->links[l].remNode->type == CPU)) {
|
||||
NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
|
||||
&& node->links[l].remNode != from
|
||||
&& (node->links[l].remNode->type == PCI
|
||||
|| node->links[l].remNode->type == CPU)) {
|
||||
NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node));
|
||||
}
|
||||
if (*cpu != NULL) return ncclSuccess;
|
||||
}
|
||||
@@ -83,13 +84,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
|
||||
*bw =
|
||||
cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW :
|
||||
cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW :
|
||||
cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW :
|
||||
BDW_QPI_BW;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
|
||||
*bw = AMD_BW;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
|
||||
*bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -534,19 +539,23 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
|
||||
int familyId, modelId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
|
||||
cpu->cpu.model =
|
||||
(familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP :
|
||||
(familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP :
|
||||
(familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL :
|
||||
NCCL_TOPO_CPU_MODEL_INTEL_BDW;
|
||||
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
int familyId, modelId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
|
||||
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG;
|
||||
}
|
||||
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
|
||||
int familyId, modelId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
// Treat "Milan" also as "Rome"
|
||||
cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? NCCL_TOPO_CPU_TYPE_ROME : NCCL_TOPO_CPU_TYPE_ZEN;
|
||||
cpu->cpu.model = ((familyId == 143 && modelId >= 49) || familyId == 175) ? NCCL_TOPO_CPU_MODEL_AMD_ROME : NCCL_TOPO_CPU_MODEL_AMD_ZEN;
|
||||
}
|
||||
}
|
||||
for (int s=0; s<xmlCpu->nSubs; s++) {
|
||||
@@ -595,7 +604,7 @@ ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSystem* sy
|
||||
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
|
||||
} else if (targetType == CPU) {
|
||||
// NVL connection to the local CPU
|
||||
NCCLCHECK(findLocalCpu(gpu, &remote));
|
||||
NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
|
||||
} else {
|
||||
if (system->nodes[NVS].count == 0) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
|
||||
@@ -647,7 +656,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId)));
|
||||
} else if (targetType == CPU) {
|
||||
// NVL connection to the local CPU
|
||||
NCCLCHECK(findLocalCpu(gpu, &remote));
|
||||
NCCLCHECK(findLocalCpu(gpu, &remote, NULL));
|
||||
} else {
|
||||
if (system->nodes[NVS].count == 0) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
|
||||
@@ -725,10 +734,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
|
||||
NCCLCHECK(xmlGetAttrInt(node, "bw", &bw));
|
||||
double c2cBw = (bw*count)/1000.0;
|
||||
struct ncclTopoNode* cpu = NULL;
|
||||
NCCLCHECK(findLocalCpu(gpu, &cpu));
|
||||
NCCLCHECK(findLocalCpu(gpu, &cpu, NULL));
|
||||
if (cpu == NULL) return ncclSuccess;
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw));
|
||||
NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw));
|
||||
} else {
|
||||
if (strcmp(node->name, "cpu") == 0) {
|
||||
NCCLCHECK(ncclGetSystemId(system, node, &systemId));
|
||||
@@ -1048,26 +1057,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*
|
||||
// Trigger the merge, then get the new device's properties
|
||||
int vDevIndex = 0;
|
||||
ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
|
||||
if (ret == ncclInvalidUsage) {
|
||||
WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC");
|
||||
NCCLCHECK(ret);
|
||||
if (ret != ncclSuccess) {
|
||||
INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
|
||||
vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
|
||||
char* ncStr;
|
||||
NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1));
|
||||
strcpy(ncStr, str);
|
||||
char* semi_token;
|
||||
char* semi = strtok_r(str, ";", &semi_token);
|
||||
char* semi = strtok_r(ncStr, ";", &semi_token);
|
||||
while (semi) {
|
||||
TRACE(NCCL_NET, "Fusing %s", semi);
|
||||
struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC];
|
||||
int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC);
|
||||
if (nUserIfs == 0) {
|
||||
INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.",
|
||||
str, semi);
|
||||
ncStr, semi);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1081,26 +1095,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str,
|
||||
if (vProps.ndevs != nUserIfs) {
|
||||
WARN("TOPO/NET : Only matched %d devices, %d requested from %s",
|
||||
vProps.ndevs, nUserIfs, semi);
|
||||
return ncclInvalidUsage;
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
|
||||
WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
|
||||
return ncclInvalidUsage;
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice));
|
||||
|
||||
// Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
|
||||
for (int i = 0; i < vProps.ndevs; i++) {
|
||||
placedDevs[vProps.devs[i]] = 1;
|
||||
ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
|
||||
if (ret == ncclSuccess) {
|
||||
// Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
|
||||
for (int i = 0; i < vProps.ndevs; i++) {
|
||||
placedDevs[vProps.devs[i]] = 1;
|
||||
}
|
||||
} else {
|
||||
WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi);
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
semi = strtok_r(NULL, ";", &semi_token);;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
free(ncStr);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
@@ -1148,7 +1173,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
|
||||
}
|
||||
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out);
|
||||
ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
|
||||
|
||||
// Merging failed.
|
||||
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS)
|
||||
// Set i to 0 to restart the automatic merging process and ensure all are placed
|
||||
if (ret != ncclSuccess) {
|
||||
INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search.");
|
||||
placedDevs[i] = 0;
|
||||
TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i);
|
||||
for (int k = 1; k < vProps.ndevs; k++) {
|
||||
int dev = vProps.devs[k];
|
||||
placedDevs[dev] = 0;
|
||||
paths[i*nPhysDevs + dev] = PATH_DIS;
|
||||
paths[dev*nPhysDevs + i] = PATH_DIS;
|
||||
TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i);
|
||||
}
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1212,16 +1254,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_
|
||||
// By default, don't merge any devices
|
||||
int mergeLevel;
|
||||
mergeLevel = PATH_PORT;
|
||||
char* mergeLevelEnv;
|
||||
mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL");
|
||||
if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
|
||||
char* forceMerge;
|
||||
forceMerge = getenv("NCCL_NET_FORCE_MERGE");
|
||||
NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
|
||||
memset(placedDevs, 0, sizeof(int)*physicalDevs);
|
||||
{ // Avoids warnings related to jumping to "out"
|
||||
const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
|
||||
if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
|
||||
const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE");
|
||||
NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
|
||||
memset(placedDevs, 0, sizeof(int)*physicalDevs);
|
||||
|
||||
if (forceMerge) {
|
||||
NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
if (forceMerge) {
|
||||
NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
}
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
|
||||
|
||||
+21
-17
@@ -21,9 +21,11 @@
|
||||
#define SM86_NVLINK_BW 12.0
|
||||
#define SM100_NVLINK_BW 40.0
|
||||
#define PCI_BW 12.0 // PCI Gen3 x16
|
||||
#define QPI_BW 6.0
|
||||
#define AMD_BW 16.0
|
||||
#define BDW_QPI_BW 6.0
|
||||
#define SKL_QPI_BW 10.0
|
||||
#define SRP_QPI_BW 22.0
|
||||
#define ERP_QPI_BW 40.0
|
||||
#define ZPI_BW 6.0
|
||||
#define YONGFENG_ZPI_BW 9.0
|
||||
#define P9_BW 32.0
|
||||
@@ -51,12 +53,13 @@ extern const char* topoNodeTypeStr[];
|
||||
#define LINK_LOC 0
|
||||
#define LINK_NVL 1
|
||||
// Skipping 2 for PATH_NVB
|
||||
#define LINK_PCI 3
|
||||
// Skipping 4 for PATH_PXB
|
||||
// Skipping 5 for PATH_PXN
|
||||
// Skipping 6 for PATH_PHB
|
||||
#define LINK_SYS 7
|
||||
#define LINK_NET 8
|
||||
#define LINK_C2C 3
|
||||
#define LINK_PCI 4
|
||||
// Skipping 5 for PATH_PXB
|
||||
// Skipping 6 for PATH_PXN
|
||||
// Skipping 7 for PATH_PHB
|
||||
#define LINK_SYS 8
|
||||
#define LINK_NET 9
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
// Local (myself)
|
||||
@@ -68,29 +71,32 @@ extern const char* topoLinkTypeStr[];
|
||||
// Connection through NVLink using an intermediate GPU
|
||||
#define PATH_NVB 2
|
||||
|
||||
// Connection through C2C
|
||||
#define PATH_C2C 3
|
||||
|
||||
// Connection traversing at most a single PCIe bridge
|
||||
#define PATH_PIX 3
|
||||
#define PATH_PIX 4
|
||||
|
||||
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
|
||||
#define PATH_PXB 4
|
||||
#define PATH_PXB 5
|
||||
|
||||
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
|
||||
#define PATH_PXN 5
|
||||
#define PATH_PXN 6
|
||||
|
||||
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
||||
#define PATH_PHB 6
|
||||
#define PATH_PHB 7
|
||||
|
||||
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
|
||||
#define PATH_SYS 7
|
||||
#define PATH_SYS 8
|
||||
|
||||
// Connection through the network
|
||||
#define PATH_NET 8
|
||||
#define PATH_NET 9
|
||||
|
||||
// New type of path which should precede PATH_PIX
|
||||
#define PATH_PORT PATH_NVL
|
||||
|
||||
// Disconnected
|
||||
#define PATH_DIS 9
|
||||
#define PATH_DIS 10
|
||||
extern const char* topoPathTypeStr[];
|
||||
|
||||
struct ncclTopoNode;
|
||||
@@ -110,9 +116,6 @@ struct ncclTopoLinkList {
|
||||
int type;
|
||||
};
|
||||
|
||||
#define NCCL_TOPO_CPU_INTEL_BDW 1
|
||||
#define NCCL_TOPO_CPU_INTEL_SKL 2
|
||||
|
||||
#define NCCL_TOPO_UNDEF (-1)
|
||||
|
||||
#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
|
||||
@@ -212,6 +215,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem*
|
||||
ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank);
|
||||
ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min);
|
||||
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
|
||||
ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink);
|
||||
|
||||
#define NCCL_TOPO_XML_MAX_NODES 8192
|
||||
#define NCCL_GRAPH_XML_MAX_NODES 8192
|
||||
|
||||
@@ -382,6 +382,7 @@ static const double perChMaxTreeBws[][3] = {
|
||||
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
|
||||
static int ncclPatEnable(struct ncclComm* comm) {
|
||||
int patEnable = ncclParamPatEnable();
|
||||
if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics
|
||||
if (patEnable != 2) return patEnable;
|
||||
if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node
|
||||
if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload
|
||||
@@ -486,7 +487,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
|
||||
if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
|
||||
if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
|
||||
if (a == NCCL_ALGO_PAT) busBw *= .75;
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
|
||||
+36
-28
@@ -221,7 +221,6 @@ fail:
|
||||
|
||||
static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* cliqueComm0 = head->intraComm0;
|
||||
struct ncclComm* cliqueHead = head;
|
||||
struct ncclComm* cliqueNextHead;
|
||||
bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
|
||||
@@ -237,7 +236,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
|
||||
comm = comm->groupNext;
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueComm0);
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
|
||||
cliqueNextHead = comm;
|
||||
|
||||
if (capturingYes && capturingNo) {
|
||||
@@ -454,38 +453,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
|
||||
/* Connect channels at runtime if cumem is supported */
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
struct ncclComm* cliqueHead = groupCommHeadMain;
|
||||
struct ncclComm* comm = NULL;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
|
||||
ncclIntruQueueConstruct(&asyncCollJobs);
|
||||
do {
|
||||
bool needConnect = false;
|
||||
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
|
||||
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
// We need to preconnect connections for collectives clique by clique to avoid
|
||||
// race condition for split shared comms which can connect the same connections
|
||||
// at the same time.
|
||||
comm = cliqueHead;
|
||||
do {
|
||||
bool needConnect = false;
|
||||
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
|
||||
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
|
||||
// CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
|
||||
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
|
||||
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
|
||||
|
||||
if (comm->cuMemSupport && needConnect) {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclCollPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
|
||||
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
|
||||
if (comm->cuMemSupport && needConnect) {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclCollPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
|
||||
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
|
||||
}
|
||||
comm = comm->groupNext;
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
|
||||
// connect
|
||||
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
|
||||
while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
comm = comm->groupNext;
|
||||
} while (comm);
|
||||
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
|
||||
while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
cliqueHead = comm;
|
||||
} while (cliqueHead != nullptr);
|
||||
|
||||
// done with all buffer allocation, start registration and enqueue
|
||||
comm = groupCommHeadMain;
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#define NCCL_BITOPS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if !__NVCC__
|
||||
#ifndef __host__
|
||||
@@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
return u32fpDecode(x, 3);
|
||||
}
|
||||
|
||||
// Fold `size` bytes starting at `bytes` into the two-word accumulator `acc`.
//
// The hash isn't just a function of the bytes but also where the bytes are split
// into different calls to eatHash(): the size of each call is XORed into acc[0]
// before any byte is consumed, so eating "ab" then "c" differs from eating "abc".
//
// acc   : in/out accumulator, two 64-bit words; updated in place.
// bytes : start of the input; only read, never written.
// size  : number of bytes to consume. size == 0 leaves acc unchanged
//         (acc[0] ^= 0 is a no-op and the loop body never runs).
inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
  char const* ptr = (char const*)bytes;
  acc[0] ^= size;
  while (size != 0) {
    // Mix the accumulator bits.
    acc[0] += acc[1];
    acc[1] ^= acc[0];
    acc[0] ^= acc[0] >> 31;
    acc[0] *= 0x9de62bbc8cef3ce3;
    acc[1] ^= acc[1] >> 32;
    acc[1] *= 0x485cd6311b599e79;
    // Read in a chunk of input. x starts at 0 and memcpy overwrites only its
    // first chunkSize bytes, so a short final chunk is zero-padded. The bytes
    // land in x in native byte order.
    size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t);
    uint64_t x = 0;
    memcpy(&x, ptr, chunkSize);
    ptr += chunkSize;
    size -= chunkSize;
    // Add to accumulator.
    acc[0] += x;
  }
}
|
||||
|
||||
template<typename T>
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
|
||||
eatHash(acc, (const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
// Collapse the two-word accumulator produced by eatHash() into one 64-bit hash.
//
// acc is only read, so the same accumulator can be digested and then extended
// with further eatHash() calls. Returns the final mixed value; an all-zero
// accumulator digests to 0.
inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
  uint64_t h = acc[0];
  // First round: xor-shift then multiply to spread acc[0]'s bits.
  h ^= h >> 31;
  h *= 0xbac3bd562846de6b;
  // Bring in the second word, then a second xor-shift/multiply round.
  h += acc[1];
  h ^= h >> 32;
  h *= 0x995a187a14e7b445;
  return h;
}
|
||||
|
||||
// One-shot hash of `size` bytes at `bytes`: seeds a fresh accumulator with
// {1, 1}, eats the whole buffer in a single eatHash() call, and digests it.
// Because eatHash() mixes in the size of each call, this equals a manual
// eatHash()/digestHash() sequence only when the input is fed in one piece.
inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
  uint64_t acc[2] = {1, 1};
  eatHash(acc, bytes, size);
  return digestHash(acc);
}
|
||||
template<typename T>
|
||||
inline __host__ __device__ uint64_t getHash(const T* bytes) {
|
||||
return getHash((const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
+239
-211
@@ -12,6 +12,7 @@
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "device.h"
|
||||
|
||||
#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
||||
|
||||
// CHUNKSIZE must be a multiple of SLICESIZE
|
||||
@@ -396,6 +397,42 @@ public:
|
||||
~RingBCAlgorithm() {}
|
||||
};
|
||||
|
||||
#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
|
||||
// #include <cuda/atomic>
|
||||
#endif
|
||||
|
||||
// Need a power of two to ensure it divides by parallelFactor (which is also a power of two)
|
||||
#define NCCL_PAT_NWORKERS 512
|
||||
|
||||
// Bit flags stored in ncclPatStep::flags by the PAT getNextOp() routines:
//   PatUsed    - always set once a step entry has been filled in.
//   PatSkipped - additionally set when the computed step was marked as skipped.
static constexpr int PatUsed = 0x1,
                     PatSkipped = 0x2;
|
||||
|
||||
// One step of a PAT schedule, as filled in by PatRSAlgorithm::getNextOp() and
// PatAGAlgorithm::getNextOp() in this file.
//   recvDim / sendDim       - dimension index for the receive / send side;
//                             getNextOp() stores -1 when that side is unused.
//   recvOffset / sendOffset - element offset of the form (slot % postFreq) * nelem,
//                             or -1 when that side is unused.
//   stepOffset              - step offset computed by the algorithm for this op.
//   postRecv / postSend     - 0/1 values set by getNextOp().
//   nelem                   - copy of the algorithm's current element count.
//   last                    - 0 by default; getNextOp() sets 1 or 2 once the end
//                             of the [offset, end) range is reached.
//   flags                   - PatUsed, optionally OR'ed with PatSkipped; it is the
//                             final field getNextOp() writes.
//   inpIx / outIx           - input / output element indices (rank * count + offset
//                             style values, or the plain chunk offset).
struct ncclPatStep {
  int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags;
  size_t inpIx, outIx;
};
|
||||
|
||||
// Per-peer connection state used by the PAT algorithms; ncclPatShmem keeps one
// entry per dimension for each direction (sendDims[] / recvDims[]).
// NOTE(review): the field meanings below are inferred from names only, since no
// code in this part of the file reads or writes them. Confirm against the
// device primitives before relying on them.
struct ncclPatPeer {
  uint64_t step;                  // presumably the current step counter for this peer
  struct ncclConnInfo* conn;      // connection descriptor for this peer
  struct ncclConnFifo* connFifo;  // FIFO associated with the connection
  void* buff;                     // data buffer for this peer
  uint64_t *headPtr;              // presumably points at the connection's head counter
  uint64_t *tailPtr;              // presumably points at the connection's tail counter
  uint64_t stepCache;             // presumably a cached copy of the remote step
  long long int accSize;          // accumulated size; units not visible here
  int connStepSize;               // step size of the connection; units not visible here
};
|
||||
|
||||
#define NCCL_SHMEM_PAT_STEPS 32
|
||||
struct ncclPatShmem {
|
||||
struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS];
|
||||
int parallelFactor;
|
||||
long long int localAccSize;
|
||||
struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks
|
||||
struct ncclPatPeer recvDims[32];
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class PatRSAlgorithm{
|
||||
size_t offset;
|
||||
@@ -408,18 +445,17 @@ class PatRSAlgorithm{
|
||||
int nrPow2;
|
||||
int postFreq;
|
||||
int lastA;
|
||||
|
||||
int parallelFactor;
|
||||
int aggFactor;
|
||||
int as; // aggregated steps
|
||||
int a; // step inside aggregated step
|
||||
int sendSkipped; // number of skipped steps during aggregation
|
||||
int recvSkipped; // number of skipped steps during aggregation
|
||||
int phase2recv; // receive offset for phase 2
|
||||
int stepOffset;
|
||||
int aggDelta;
|
||||
int scale;
|
||||
int phase;
|
||||
|
||||
__device__ __host__ int min(int a, int b) {
|
||||
__device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
|
||||
return (a<b)?a:b;
|
||||
}
|
||||
|
||||
@@ -447,16 +483,16 @@ class PatRSAlgorithm{
|
||||
|
||||
__device__ __host__ void resetA() {
|
||||
a = 0;
|
||||
sendSkipped = recvSkipped = 0;
|
||||
sendSkipped = stepOffset = 0;
|
||||
lastA = aggFactor;
|
||||
if (phase >= 2) lastA /= 2*scale;
|
||||
if (phase == 4) lastA = 1;
|
||||
}
|
||||
|
||||
__device__ __host__ void reset() {
|
||||
nelem = getNelem();
|
||||
phase = 0;
|
||||
scale = 1;
|
||||
phase2recv = 0;
|
||||
as = aggDelta - 1;
|
||||
resetA();
|
||||
}
|
||||
@@ -479,8 +515,9 @@ class PatRSAlgorithm{
|
||||
}
|
||||
|
||||
public:
|
||||
__device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
__device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
||||
parallelFactor = maxParallelFactor;
|
||||
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
||||
|
||||
aggFactor = 1;
|
||||
@@ -490,6 +527,7 @@ public:
|
||||
aggDelta /= 2;
|
||||
}
|
||||
postFreq = aggFactor;
|
||||
if (postFreq < parallelFactor) parallelFactor = postFreq;
|
||||
int d = stepDepth;
|
||||
while (d > 1 && aggFactor < nranks/2) {
|
||||
d /= 2;
|
||||
@@ -500,160 +538,151 @@ public:
|
||||
reset();
|
||||
}
|
||||
|
||||
__device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
|
||||
restart:
|
||||
last = 0;
|
||||
nelemOut = nelem;
|
||||
outIx = offset;
|
||||
// Returns the parallel factor chosen by the constructor: the caller-supplied
// maxParallelFactor, lowered to postFreq when postFreq is smaller.
__device__ __host__ int getParallelFactor() {
  return parallelFactor;
}
|
||||
|
||||
__device__ __host__ void getNextOp(struct ncclPatStep* ps) {
|
||||
ps->last = 0;
|
||||
ps->nelem = nelem;
|
||||
ps->outIx = offset;
|
||||
ps->stepOffset = stepOffset;
|
||||
int skip = 0;
|
||||
//printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
|
||||
if (phase == 0) {
|
||||
if (a >= lastA) {
|
||||
skip = 1;
|
||||
} else if (phase == 0) {
|
||||
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
int sendDataRank = (rank + s) % nranks;
|
||||
inpIx = sendDataRank * count + offset;
|
||||
recvDim = -1;
|
||||
sendDim = 0;
|
||||
outIx = 0;
|
||||
recvOffset = -1;
|
||||
sendOffset = ((a - sendSkipped)%postFreq) * nelem;
|
||||
sendStepOffset = 0;
|
||||
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
postSend = 1;
|
||||
ps->inpIx = sendDataRank * count + offset;
|
||||
ps->recvDim = -1;
|
||||
ps->sendDim = 0;
|
||||
ps->outIx = 0;
|
||||
ps->recvOffset = -1;
|
||||
ps->sendOffset = (a%postFreq) * nelem;
|
||||
if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
ps->postSend = 1;
|
||||
} else {
|
||||
postSend = 0;
|
||||
ps->postSend = 0;
|
||||
}
|
||||
postRecv = 0;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
ps->postRecv = 0;
|
||||
} else if (phase == 1) {
|
||||
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = firstBitSet(s, nrPow2);
|
||||
sendOffset = ((a - sendSkipped)%postFreq)*nelem;
|
||||
recvOffset = ((a - recvSkipped)%postFreq)*nelem;
|
||||
postSend = 0;
|
||||
if (recvDim == 0) {
|
||||
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
|
||||
sendStepOffset = 0;
|
||||
ps->recvDim = firstBitSet(s, nrPow2);
|
||||
ps->sendOffset = (a%postFreq)*nelem;
|
||||
ps->recvOffset = (a%postFreq)*nelem;
|
||||
ps->postSend = 0;
|
||||
if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1;
|
||||
if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
ps->postRecv = 1;
|
||||
} else {
|
||||
sendStepOffset = (a - sendSkipped)/postFreq;
|
||||
ps->postRecv = 0;
|
||||
}
|
||||
if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
postRecv = 1;
|
||||
} else {
|
||||
postRecv = 0;
|
||||
}
|
||||
s -= (1<<recvDim);
|
||||
s -= (1<<ps->recvDim);
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (sendDim == -1) {
|
||||
sendOffset = -1;
|
||||
sendStepOffset = 0;
|
||||
} else if (as - (1<<recvDim) == 0) {
|
||||
if (newPeer(a, aggFactor)) sendSkipped = a;
|
||||
ps->inpIx = recvDataRank * count + offset;
|
||||
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (ps->sendDim == -1) {
|
||||
ps->sendOffset = -1;
|
||||
} else if (as - (1<<ps->recvDim) == 0) {
|
||||
if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
|
||||
int foffset = a - sendSkipped;
|
||||
sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
|
||||
sendOffset = (foffset%postFreq)*nelem;
|
||||
ps->sendOffset = (foffset%postFreq)*nelem;
|
||||
}
|
||||
int recvDim = ps->recvDim;
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
ps->recvDim = -1;
|
||||
ps->recvOffset = -1;
|
||||
ps->postRecv = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (skip || recvDim == -1) recvSkipped++;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
as--;
|
||||
phase = as % 2 == 1 ? 0 : 1;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++;
|
||||
} else if (phase == 2) {
|
||||
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
|
||||
postRecv = 0;
|
||||
ps->postRecv = 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = 0;
|
||||
postSend = a == lastA-1 ? 1 : 0;
|
||||
ps->recvDim = 0;
|
||||
ps->postSend = a == lastA-1 ? 1 : 0;
|
||||
s -= 1;
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
ps->recvDim = -1;
|
||||
ps->recvOffset = -1;
|
||||
skip = 0;
|
||||
} else if (!skip) {
|
||||
int foffset = phase2recv;
|
||||
phase2recv++;
|
||||
postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
recvOffset = (foffset%postFreq) * nelem;
|
||||
int foffset = a + aggFactor - aggFactor/scale;
|
||||
ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
ps->recvOffset = (foffset%postFreq) * nelem;
|
||||
}
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
int foffset = a - sendSkipped;
|
||||
postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
sendStepOffset = 0;
|
||||
sendOffset = (foffset%postFreq) * nelem;
|
||||
if (skip || sendDim == -1) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
phase = 3;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
ps->inpIx = recvDataRank * count + offset;
|
||||
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
int foffset = a;
|
||||
ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
ps->sendOffset = (foffset%postFreq) * nelem;
|
||||
} else if (phase == 3) {
|
||||
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
|
||||
postRecv = a == lastA-1 ? 1 : 0;
|
||||
ps->postRecv = a == lastA-1 ? 1 : 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = firstBitSet(s, nrPow2);
|
||||
postSend = 0;
|
||||
s -= (1<<recvDim);
|
||||
int foffset = a - recvSkipped;
|
||||
postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
|
||||
recvOffset = (foffset%postFreq) * nelem;
|
||||
ps->recvDim = firstBitSet(s, nrPow2);
|
||||
ps->postSend = 0;
|
||||
s -= (1<<ps->recvDim);
|
||||
int foffset = a;
|
||||
ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
|
||||
ps->recvOffset = (foffset%postFreq) * nelem;
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
ps->inpIx = recvDataRank * count + offset;
|
||||
ps->sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
ps->recvDim = -1;
|
||||
ps->recvOffset = -1;
|
||||
ps->postRecv = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
|
||||
if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; }
|
||||
foffset = a - sendSkipped;
|
||||
sendStepOffset = foffset / postFreq; // Accumulate on next steps
|
||||
sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
|
||||
if (skip || recvDim == -1) recvSkipped++;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
scale *= 2;
|
||||
phase = scale < aggFactor ? 2 : 4;
|
||||
if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++;
|
||||
ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
|
||||
} else if (phase == 4) {
|
||||
ps->recvDim = 0;
|
||||
ps->sendDim = -1;
|
||||
ps->inpIx = rank * count + offset;
|
||||
ps->recvOffset = ((aggFactor-1)%postFreq) * nelem;
|
||||
ps->sendOffset = -1;
|
||||
ps->postRecv = 1;
|
||||
ps->postSend = 0;
|
||||
offset += chunkCount;
|
||||
}
|
||||
a++;
|
||||
if (a >= lastA && a >= parallelFactor) {
|
||||
int p = phase;
|
||||
if (p == 1) as--;
|
||||
if (p == 3) scale *= 2;
|
||||
phase =
|
||||
p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 :
|
||||
p == 1 ? as % 2 == 1 ? 0 : 1 :
|
||||
p == 2 ? 3 :
|
||||
p == 3 ? scale < aggFactor ? 2 : 4 :
|
||||
5;
|
||||
if (p == 4) {
|
||||
if (offset >= end) {
|
||||
ps->last = 2;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
} else {
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 4) {
|
||||
recvDim = 0;
|
||||
sendDim = -1;
|
||||
inpIx = rank * count + offset;
|
||||
recvOffset = (phase2recv%postFreq) * nelem;
|
||||
sendStepOffset = 0;
|
||||
sendOffset = -1;
|
||||
postRecv = 1;
|
||||
postSend = 0;
|
||||
offset += chunkCount;
|
||||
if (offset >= end) {
|
||||
last = 1;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
return;
|
||||
} else if (phase == 4 && offset >= end) {
|
||||
ps->last = 1;
|
||||
}
|
||||
goto restart;
|
||||
int flags = PatUsed | (skip ? PatSkipped : 0);
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
|
||||
a.store(flags, cuda::memory_order_release);
|
||||
#else
|
||||
ps->flags = flags;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -669,14 +698,12 @@ class PatAGAlgorithm{
|
||||
int nrPow2;
|
||||
int postFreq;
|
||||
int lastA;
|
||||
|
||||
int parallelFactor;
|
||||
int aggFactor;
|
||||
int as; // aggregated steps
|
||||
int a; // step inside aggregated step
|
||||
int aggDelta;
|
||||
|
||||
int scale;
|
||||
|
||||
int phase;
|
||||
|
||||
// AS computation
|
||||
@@ -685,7 +712,7 @@ class PatAGAlgorithm{
|
||||
int bitCount[32];
|
||||
int bitZeroStep[32];
|
||||
|
||||
__device__ __host__ int min(int a, int b) {
|
||||
__device__ __host__ ssize_t min(ssize_t a, ssize_t b) {
|
||||
return (a<b)?a:b;
|
||||
}
|
||||
|
||||
@@ -752,8 +779,9 @@ class PatAGAlgorithm{
|
||||
|
||||
|
||||
public:
|
||||
__device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
__device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
||||
parallelFactor = maxParallelFactor;
|
||||
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
||||
|
||||
aggFactor = 1;
|
||||
@@ -763,120 +791,120 @@ public:
|
||||
aggDelta /= 2;
|
||||
}
|
||||
postFreq = aggFactor;
|
||||
if (postFreq < parallelFactor) parallelFactor = postFreq;
|
||||
int d = stepDepth;
|
||||
while (d > 1 && aggFactor < nranks/2) {
|
||||
d /= 2;
|
||||
aggFactor *= 2;
|
||||
aggDelta /= 2;
|
||||
}
|
||||
//printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);
|
||||
|
||||
asDim = log2Up(aggDelta);
|
||||
reset();
|
||||
}
|
||||
|
||||
__device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
|
||||
restart:
|
||||
//printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
|
||||
last = 0;
|
||||
nelemOut = nelem;
|
||||
inpIx = offset;
|
||||
// Returns the parallel factor chosen by the constructor: the caller-supplied
// maxParallelFactor, lowered to postFreq when postFreq is smaller.
__device__ __host__ int getParallelFactor() {
  return parallelFactor;
}
|
||||
|
||||
__device__ __host__ void getNextOp(struct ncclPatStep* ps) {
|
||||
ps->last = 0;
|
||||
ps->nelem = nelem;
|
||||
ps->inpIx = offset;
|
||||
int skip = 0;
|
||||
if (phase == 0) {
|
||||
if (a >= lastA) {
|
||||
skip = 1;
|
||||
} else if (phase == 0) {
|
||||
int s = a*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
|
||||
int recvDataRank = (rank + s) % nranks;
|
||||
outIx = recvDataRank * count + offset;
|
||||
sendDim = -1;
|
||||
recvDim = 0;
|
||||
inpIx = 0;
|
||||
sendOffset = -1;
|
||||
recvOffset = (a % postFreq) * nelem;
|
||||
recvStepOffset = 0;
|
||||
postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
postSend = 0;
|
||||
a++;
|
||||
if (nextSkip) {
|
||||
as = nextAs();
|
||||
if (as == aggDelta/2) {
|
||||
offset += chunkCount;
|
||||
if (offset >= end) {
|
||||
last = 1;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
return;
|
||||
}
|
||||
phase = 1;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
ps->outIx = recvDataRank * count + offset;
|
||||
ps->sendDim = -1;
|
||||
ps->recvDim = 0;
|
||||
ps->inpIx = 0;
|
||||
ps->sendOffset = -1;
|
||||
ps->recvOffset = (a % postFreq) * nelem;
|
||||
ps->stepOffset = 0;
|
||||
ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
ps->postSend = 0;
|
||||
} else if (phase == 1) {
|
||||
int s = a*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<sendDim);
|
||||
ps->sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<ps->sendDim);
|
||||
int sendDataRank = (rank + nranks + s) % nranks;
|
||||
outIx = sendDataRank * count + offset;
|
||||
recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
sendOffset = recvOffset = (a % postFreq) * nelem;
|
||||
postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
|
||||
recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
|
||||
if (recvDim == -1) {
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
} else if (as - (1<<sendDim) == 0) {
|
||||
int foffset = (a*aggDelta) >> (recvDim+1);
|
||||
recvOffset = (foffset%postFreq)*nelem;
|
||||
postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
|
||||
recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
|
||||
ps->outIx = sendDataRank * count + offset;
|
||||
ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem;
|
||||
ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
|
||||
ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq;
|
||||
if (ps->recvDim == -1) {
|
||||
ps->recvOffset = -1;
|
||||
ps->postRecv = 0;
|
||||
} else if (as - (1<<ps->sendDim) == 0) {
|
||||
int foffset = (a*aggDelta) >> (ps->recvDim+1);
|
||||
ps->recvOffset = (foffset%postFreq)*nelem;
|
||||
ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<ps->recvDim) >= nranks) ? 1 : 0;
|
||||
ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq;
|
||||
}
|
||||
if (s < nranks && sendDim == 0 && skip) {
|
||||
if (s < nranks && ps->sendDim == 0 && skip) {
|
||||
// Don't forget to receive at least once even if we don't send afterwards
|
||||
sendDim = -1;
|
||||
sendOffset = -1;
|
||||
postSend = 0;
|
||||
ps->sendDim = -1;
|
||||
ps->sendOffset = -1;
|
||||
ps->postSend = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (++a == lastA) {
|
||||
if (as % 2 == 1) {
|
||||
phase = 0;
|
||||
} else {
|
||||
as = nextAs();
|
||||
}
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 2) {
|
||||
int s = (2*a+1)*scale*aggDelta;
|
||||
postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
|
||||
postRecv = 0;
|
||||
ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
|
||||
ps->postRecv = 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<sendDim);
|
||||
sendOffset = (a%postFreq) * nelem;
|
||||
recvStepOffset = a / postFreq;
|
||||
ps->sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<ps->sendDim);
|
||||
ps->sendOffset = (a%postFreq) * nelem;
|
||||
ps->stepOffset = a / postFreq;
|
||||
int sendDataRank = (rank + nranks + s) % nranks;
|
||||
outIx = sendDataRank * count + offset;
|
||||
recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (recvDim == -1) {
|
||||
recvOffset = -1;
|
||||
ps->outIx = sendDataRank * count + offset;
|
||||
ps->recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (ps->recvDim == -1) {
|
||||
ps->recvOffset = -1;
|
||||
} else {
|
||||
s -= (1<<recvDim);
|
||||
int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
|
||||
recvOffset = (foffset%postFreq)*nelem;
|
||||
recvStepOffset = foffset / postFreq;
|
||||
s -= (1<<ps->recvDim);
|
||||
int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1);
|
||||
ps->recvOffset = (foffset%postFreq)*nelem;
|
||||
ps->stepOffset = foffset / postFreq;
|
||||
}
|
||||
if (++a == lastA) {
|
||||
scale /= 2;
|
||||
phase = scale ? 2 : 1;
|
||||
}
|
||||
a++;
|
||||
if (a >= lastA && a >= parallelFactor) {
|
||||
int p = phase;
|
||||
if (p == 2) scale /= 2;
|
||||
phase =
|
||||
p == 2 ? scale ? 2 : 1 :
|
||||
p == 1 ? as % 2 == 1 ? 0 : 1 :
|
||||
1;
|
||||
if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs();
|
||||
if (p == 0 && as == aggDelta/2) {
|
||||
offset += chunkCount;
|
||||
if (offset >= end) {
|
||||
ps->last = 2;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
} else {
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) {
|
||||
ps->last = 1;
|
||||
}
|
||||
goto restart;
|
||||
int flags = PatUsed | (skip ? PatSkipped : 0);
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
cuda::atomic_ref<int, cuda::thread_scope_block> a(ps->flags);
|
||||
a.store(flags, cuda::memory_order_release);
|
||||
#else
|
||||
ps->flags = flags;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -139,6 +139,9 @@ struct ncclSharedResources {
|
||||
int* tpRankToLocalRank;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
|
||||
int persistentRefs;
|
||||
cudaEvent_t launchEvent, scratchEvent;
|
||||
|
||||
/* proxy related shared res */
|
||||
struct ncclProxyState* proxyState;
|
||||
@@ -437,6 +440,7 @@ struct ncclComm {
|
||||
// List of destructors to run when comm is destructed
|
||||
struct ncclDestructor* destructorHead;
|
||||
|
||||
struct ncclCudaContext* context;
|
||||
struct ncclSharedResources* sharedRes;
|
||||
/* map to top parent ranks. */
|
||||
int* topParentRanks;
|
||||
@@ -449,6 +453,7 @@ struct ncclComm {
|
||||
|
||||
int netPluginLoaded;
|
||||
ncclNet_t* ncclNet;
|
||||
int ncclNetVer;
|
||||
ncclNetDeviceType netDeviceType;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
void* bootstrap;
|
||||
@@ -456,6 +461,7 @@ struct ncclComm {
|
||||
struct channelMasks* connectSend;
|
||||
struct channelMasks* connectRecv;
|
||||
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
|
||||
int maxTreePattern;
|
||||
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
|
||||
bool runtimeConn; // if dynamic connection is supported
|
||||
bool directMode;
|
||||
@@ -603,8 +609,7 @@ struct ncclComm {
|
||||
struct ncclComm* groupNext;
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int persistentRefs; // number of persistent plan-lists capturing this comm
|
||||
int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
|
||||
int localPersistentRefs; // number of persistent plan-lists capturing this comm
|
||||
struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;
|
||||
|
||||
struct ncclKernelPlanner planner;
|
||||
@@ -669,6 +674,7 @@ struct ncclComm {
|
||||
// Profiler plugin
|
||||
void* profilerContext;
|
||||
uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
|
||||
struct ncclProfilerProxy profiler;
|
||||
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
|
||||
@@ -163,6 +163,7 @@ struct ncclProxyConnector {
|
||||
|
||||
struct ncclConnector {
|
||||
int connected;
|
||||
int hasSeen;
|
||||
struct ncclProxyConnector proxyConn;
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources;
|
||||
@@ -256,6 +257,8 @@ struct alignas(16) ncclDevWorkP2p {
|
||||
uint8_t sendNetReg:1, recvNetReg:1;
|
||||
uint8_t sendIpcReg:1, recvIpcReg:1;
|
||||
|
||||
uint8_t profilerEnabled:1;
|
||||
|
||||
uint8_t sendConnIndex:2, recvConnIndex:2;
|
||||
};
|
||||
|
||||
@@ -304,7 +307,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
uint32_t nWarps:8;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1;
|
||||
uint32_t root:30, connIndex:2;
|
||||
uint16_t pivotA2ANumBiRings;
|
||||
uint16_t pivotA2ANumBiRings:15, profilerEnabled:1;
|
||||
void* recvbuff;
|
||||
void* sendbuff;
|
||||
uintptr_t sendbuffOffset;
|
||||
@@ -498,6 +501,7 @@ struct alignas(16) ncclDevChannel {
|
||||
struct ncclTree binTree;
|
||||
struct ncclNvls nvls;
|
||||
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
|
||||
uint64_t workCounter;
|
||||
};
|
||||
|
||||
struct ncclDevComm {
|
||||
@@ -523,6 +527,10 @@ struct ncclDevComm {
|
||||
|
||||
int* rankToLocalRank;
|
||||
|
||||
// Profiler counters
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
uint64_t* cpuTimestamp;
|
||||
@@ -621,7 +629,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int
|
||||
|
||||
// Returns the collective unroll value selected from cudaArch (defaults to
// NCCL_CUDA_ARCH).
// NOTE(review): the body holds two consecutive return statements. As written,
// only the first one executes and the second (which special-cases
// cudaArch == 1200) is unreachable. This looks like the old and new sides of a
// diff interleaved -- confirm which return is the intended one.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
// Our collective unroll should move to the same bytes&insns model as NVLS.
|
||||
return cudaArch >= 800 ? 8 : 4;
|
||||
return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
|
||||
|
||||
+19
-10
@@ -37,17 +37,24 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr);
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush);
|
||||
// Mode values for the gdrMode out-parameter of ncclTopoCheckGdr (declared
// right after this enum).
// NOTE(review): the meaning of each mode is not documented here; the names
// suggest disabled / default / PCI-specific handling -- confirm against the
// implementation.
enum ncclTopoGdrMode {
|
||||
ncclTopoGdrModeDisable = 0,
|
||||
ncclTopoGdrModeDefault = 1,
|
||||
ncclTopoGdrModePci = 2,
|
||||
// Equals the number of modes listed above; presumably a count sentinel -- confirm.
ncclTopoGdrModeNum = 3
|
||||
};
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode);
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush);
|
||||
ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
|
||||
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
|
||||
// Find CPU affinity
|
||||
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
|
||||
|
||||
@@ -59,11 +66,13 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#define NCCL_TOPO_CPU_VENDOR_AMD 2
|
||||
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
|
||||
#define NCCL_TOPO_CPU_VENDOR_MIXED 4
|
||||
#define NCCL_TOPO_CPU_TYPE_BDW 1
|
||||
#define NCCL_TOPO_CPU_TYPE_SKL 2
|
||||
#define NCCL_TOPO_CPU_TYPE_ZEN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_ROME 4
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1
|
||||
#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2
|
||||
#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3
|
||||
#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4
|
||||
#define NCCL_TOPO_CPU_MODEL_AMD_ZEN 5
|
||||
#define NCCL_TOPO_CPU_MODEL_AMD_ROME 6
|
||||
#define NCCL_TOPO_CPU_MODEL_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
|
||||
@@ -108,6 +108,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
struct ncclComm** pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
||||
pp = &(*pp)->groupNext;
|
||||
|
||||
// didn't find its clique, we need to insert it with ascending order based on commHash
|
||||
if (*pp == nullptr) {
|
||||
pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
|
||||
}
|
||||
comm->groupNext = *pp;
|
||||
*pp = comm;
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
|
||||
@@ -1,610 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NET_H_
|
||||
#define NCCL_NET_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "net_device.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||
//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties
|
||||
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
|
||||
#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
|
||||
|
||||
#define NCCL_PTR_HOST 0x1
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
#define NCCL_PTR_DMABUF 0x4
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
// Max number of ncclNet objects which can live in the same process
|
||||
#define NCCL_NET_MAX_PLUGINS 3
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9
|
||||
|
||||
// v9 virtual-device description: a list of device indices.
// NOTE(review): ndevs is presumably the number of valid entries in devs -- confirm.
typedef struct {
|
||||
int ndevs;
|
||||
// Holds at most NCCL_NET_MAX_DEVS_PER_NIC_V9 entries.
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
|
||||
} ncclNetVDeviceProps_v9_t;
|
||||
typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t;
|
||||
|
||||
// Per-device properties in the v9 layout; this is the struct passed to the
// getProperties entry points of ncclNet_v9_t and ncclCollNet_v9_t.
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int forceFlush; // Force a flush on receives
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
// Embedded device-index list (ndevs + devs array) for this device.
ncclNetVDeviceProps_v9_t vProps;
|
||||
size_t maxP2pBytes; // Max transfer size for point-to-point operations
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
} ncclNetProperties_v9_t;
|
||||
typedef ncclNetProperties_v9_t ncclNetProperties_t;
|
||||
|
||||
// Function-pointer table for a point-to-point network implementation, v9
// layout. Every entry point returns ncclResult_t. In this header the type is
// aliased to ncclNet_t right after the struct.
// NOTE(review): isend/irecv take size_t sizes in this version, while iflush
// and test still take int sizes -- confirm this asymmetry is intentional.
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
|
||||
// Create a virtual NIC given the specified properties, which can be accessed at device index d
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
|
||||
} ncclNet_v9_t;
|
||||
|
||||
typedef ncclNet_v9_t ncclNet_t;
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9
|
||||
|
||||
// One (mhandle, address, size) triple in the v9 layout; arrays of these are
// passed as recvParts/sendParts to the iallgather/ireducescatter entry points
// of ncclCollNet_v9_t. size is a size_t in this version.
typedef struct {
|
||||
void* mhandle;
|
||||
void* address;
|
||||
size_t size;
|
||||
} ncclNetSGE_v9_t;
|
||||
|
||||
// Function-pointer table for a collective network implementation, v9 layout.
// Every entry point returns ncclResult_t. In this header the type is aliased
// to ncclCollNet_t right after the struct.
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// NOTE(review): iallgather and ireducescatter carry no description here; the
|
||||
// semantics of bytesPerRank/windowOffset/windowBytes must be confirmed elsewhere.
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Create a virtual NIC given the specified properties, which can be accessed at device index d
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props);
|
||||
} ncclCollNet_v9_t;
|
||||
|
||||
typedef ncclCollNet_v9_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9
|
||||
|
||||
// Per-device properties in the v8 layout; this is the struct passed to the
// getProperties entry points of ncclNet_v8_t and ncclCollNet_v8_t.
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
// Function-pointer table for a point-to-point network implementation, v8
// layout. Every entry point returns ncclResult_t. In this version isend takes
// an int size and irecv takes int* sizes.
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
// One (mhandle, address, size) triple in the v8 layout; arrays of these are
// passed as recvParts/sendParts to the iallgather/ireducescatter entry points
// of ncclCollNet_v8_t. size is a uint32_t in this version.
typedef struct {
|
||||
void* mhandle;
|
||||
void* address;
|
||||
uint32_t size;
|
||||
} ncclNetSGE_v8_t;
|
||||
|
||||
// Function-pointer table for a collective network implementation, v8 layout.
// Every entry point returns ncclResult_t. In this version iallreduce takes an
// int count.
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// NOTE(review): iallgather and ireducescatter carry no description here; the
|
||||
// semantics of bytesPerRank/windowOffset/windowBytes must be confirmed elsewhere.
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v8_t;
|
||||
|
||||
// Per-device properties in the v7 layout; this is the struct passed to the
// getProperties entry points of ncclNet_v7_t and ncclCollNet_v7_t.
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
// Function-pointer table for a point-to-point network implementation, v7
// layout. Every entry point returns ncclResult_t. In this version regMr and
// isend take an int size (regMrDmaBuf takes size_t), and connect/accept use
// ncclNetDeviceHandle_v7_t.
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v7_t;
|
||||
|
||||
// Function-pointer table for a collective network implementation, v7 layout.
// Every entry point returns ncclResult_t. This version exposes iallreduce as
// its only collective operation, and regMr takes an int size.
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8
|
||||
|
||||
// v6 struct for backwards compatibility
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
} ncclNetProperties_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v6_t;
|
||||
|
||||
// v5 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v5_t;
|
||||
|
||||
// v5 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v5_t;
|
||||
|
||||
// context passed from RCCL lib to n/w plugin
|
||||
typedef struct {
|
||||
// channel id
|
||||
uint32_t chId;
|
||||
} ncclNet_ctxt_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -1,235 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
size_t trafficBytes;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
} ncclProfilerEventState_v2_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v2_t ncclProfiler_t;
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
uint8_t func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
uint8_t datatype;
|
||||
uint32_t op;
|
||||
size_t trafficBytes;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
uint8_t algo;
|
||||
uint8_t proto;
|
||||
int isCollnet;
|
||||
int isNvls;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint8_t func;
|
||||
void* buff;
|
||||
uint8_t datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -1,149 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_TUNER_H_
|
||||
#define NCCL_TUNER_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
// - regBuff: can register user buffer
|
||||
//
|
||||
// Outputs:
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int regBuff, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v4_t;
|
||||
|
||||
typedef ncclTuner_v4_t ncclTuner_t;
|
||||
|
||||
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
//
|
||||
// Outputs:
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v3_t;
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type , e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
// - nvlsSupport: whether NVLink SHARP supports this type
|
||||
// - numPipeOps: number of operations in the group
|
||||
//
|
||||
// Outputs:
|
||||
// - algorithm: selected algorithm to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algorithm, int* protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
#endif
|
||||
@@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
|
||||
int ncclNetVersion(struct ncclComm* comm);
|
||||
|
||||
// Test whether the current GPU supports GPU Direct RDMA.
|
||||
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
|
||||
|
||||
@@ -26,6 +26,7 @@ typedef struct {
|
||||
|
||||
typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
|
||||
typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
|
||||
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t;
|
||||
typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
|
||||
typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -37,9 +37,10 @@
|
||||
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommSplit 18
|
||||
#define NVTX_SID_CommFinalize 19
|
||||
// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!
|
||||
|
||||
// Define static schema ID for the reduction operation.
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
|
||||
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NET_H_
|
||||
#define NCCL_NET_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "net_device.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||
//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties
|
||||
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L)
|
||||
#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1
|
||||
|
||||
#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two.
|
||||
#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collnet plugins when size was not dynamically queried
|
||||
|
||||
#define NCCL_PTR_HOST 0x1
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
#define NCCL_PTR_DMABUF 0x4
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
// Max number of ncclNet objects which can live in the same process
|
||||
#define NCCL_NET_MAX_PLUGINS 3
|
||||
|
||||
// NCCL core profiler callback for network defined events instrumentation
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(
    void** eHandle,
    int type,
    void* pHandle,
    int64_t pluginId,
    void* extData);
|
||||
|
||||
#include "net/net_v10.h"
|
||||
#include "net/net_v9.h"
|
||||
#include "net/net_v8.h"
|
||||
#include "net/net_v7.h"
|
||||
#include "net/net_v6.h"
|
||||
|
||||
typedef ncclNet_v10_t ncclNet_t;
|
||||
typedef ncclCollNet_v10_t ncclCollNet_t;
|
||||
typedef ncclNetSGE_v10_t ncclNetSGE_t;
|
||||
typedef ncclNetProperties_v10_t ncclNetProperties_t;
|
||||
typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t;
|
||||
typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t;
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10
|
||||
|
||||
#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10
|
||||
|
||||
// Context passed from the RCCL library to the network plugin.
typedef struct {
  // channel id
  uint32_t chId;
} ncclNet_ctxt_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,69 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
|
||||
// Profiler event types; each one is a distinct bit so they can be OR-ed into a mask.
enum {
  ncclProfileGroup     = (1 << 0),  // group event type
  ncclProfileColl      = (1 << 1),  // host collective call event type
  ncclProfileP2p       = (1 << 2),  // host point-to-point call event type
  ncclProfileProxyOp   = (1 << 3),  // proxy operation event type
  ncclProfileProxyStep = (1 << 4),  // proxy step event type
  ncclProfileProxyCtrl = (1 << 5),  // proxy control event type
  ncclProfileKernelCh  = (1 << 6),  // kernel channel event type
  ncclProfileNetPlugin = (1 << 7),  // network plugin-defined events
};
|
||||
|
||||
// Profiler event states. Values are implicit and sequential from 0, so the
// order of the enumerators must not change.
typedef enum {
  ncclProfilerProxyOpSendPosted,
  ncclProfilerProxyOpSendRemFifoWait,
  ncclProfilerProxyOpSendTransmitted,
  ncclProfilerProxyOpSendDone,
  ncclProfilerProxyOpRecvPosted,
  ncclProfilerProxyOpRecvReceived,
  ncclProfilerProxyOpRecvTransmitted,
  ncclProfilerProxyOpRecvDone,

  /* Legacy proxy profiler states */
  ncclProfilerProxyStepSendGPUWait,
  ncclProfilerProxyStepSendWait,
  ncclProfilerProxyStepRecvWait,
  ncclProfilerProxyStepRecvFlushWait,
  ncclProfilerProxyStepRecvGPUWait,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle,
  ncclProfilerProxyCtrlActive,
  ncclProfilerProxyCtrlSleep,
  ncclProfilerProxyCtrlWakeup,
  ncclProfilerProxyCtrlAppend,
  ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_t;

// The v1, v2 and v3 profiler interfaces all share the same state enum.
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
|
||||
#include <cstdint>
|
||||
#include "profiler/profiler_v3.h"
|
||||
#include "profiler/profiler_v2.h"
|
||||
#include "profiler/profiler_v1.h"
|
||||
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
// Bit layout implied by the masks below: the low NCCL_PROFILER_NET_VER_BITS
// bits are the version field, the remaining high bits are the type field.
#define NCCL_PROFILER_NET_VER_BITS  (16)
#define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)
#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS)

typedef enum {
  NCCL_PROFILER_NET_TYPE_IB   = (1U << NCCL_PROFILER_NET_VER_BITS),
  NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS),
} ncclProfilerNetType;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,22 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_TUNER_H_
|
||||
#define NCCL_TUNER_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
|
||||
#include "tuner/tuner_v4.h"
|
||||
#include "tuner/tuner_v3.h"
|
||||
#include "tuner/tuner_v2.h"
|
||||
|
||||
// Unversioned tuner type maps to the v4 definition.
typedef ncclTuner_v4_t ncclTuner_t;

#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,158 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NET_V10_H_
|
||||
#define NET_V10_H_
|
||||
|
||||
// Capacity of the devs[] array in ncclNetVDeviceProps_v10_t.
#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4

typedef struct {
  int ndevs;
  int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10];
} ncclNetVDeviceProps_v10_t;
|
||||
|
||||
// Parenthesized so the negative literal stays a single operand after expansion.
#define NCCL_NET_TRAFFIC_CLASS_UNDEF (-1)

typedef struct {
  // Plugin-specific TC value
  int trafficClass;
} ncclNetCommConfig_v10_t;
|
||||
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int forceFlush; // Force a flush on receives
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
ncclNetVDeviceProps_v10_t vProps;
|
||||
size_t maxP2pBytes; // Max transfer size for point-to-point operations
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
} ncclNetProperties_v10_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
|
||||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
|
||||
// what index this new vNIC exists at
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
|
||||
} ncclNet_v10_t;
|
||||
|
||||
// Memory handle, address and size of one buffer part; passed as recvParts /
// sendParts to the v10 collnet iallgather / ireducescatter calls.
typedef struct {
  void* mhandle;
  void* address;
  size_t size;
} ncclNetSGE_v10_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Create a virtual NIC given the specified properties, which can be accessed at device index d
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props);
|
||||
} ncclCollNet_v10_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,113 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_V6_H_
|
||||
#define NET_V6_H_
|
||||
|
||||
#define NCCL_NET_MAX_REQUESTS_V6 8

// v6 struct for backwards compatibility
typedef struct {
  char* name;      // Used mostly for logging.
  char* pciPath;   // Path to the PCI device in /sys.
  uint64_t guid;   // Unique identifier for the NIC chip. Important for
                   // cards with multiple PCI functions (Physical or virtual).
  int ptrSupport;  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
  int speed;       // Port speed in Mbps.
  int port;        // Port number.
  float latency;   // Network latency
  int maxComms;    // Maximum number of comms we can create
  int maxRecvs;    // Maximum number of grouped receives.
} ncclNetProperties_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v6_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v6_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,120 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_V7_H_
|
||||
#define NET_V7_H_
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v7_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v7_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v7_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,134 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_V8_H_
|
||||
#define NET_V8_H_
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
} ncclNetProperties_v8_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
} ncclNet_v8_t;
|
||||
|
||||
typedef struct {
|
||||
void* mhandle;
|
||||
void* address;
|
||||
uint32_t size;
|
||||
} ncclNetSGE_v8_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v8_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,152 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_V9_H_
|
||||
#define NET_V9_H_
|
||||
|
||||
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4
|
||||
|
||||
typedef struct {
|
||||
int ndevs;
|
||||
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9];
|
||||
} ncclNetVDeviceProps_v9_t;
|
||||
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int regIsGlobal; // regMr is not tied to a particular comm
|
||||
int forceFlush; // Force a flush on receives
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
ncclNetDeviceType netDeviceType; // Network offload type
|
||||
int netDeviceVersion; // Version number for network offload
|
||||
ncclNetVDeviceProps_v9_t vProps;
|
||||
size_t maxP2pBytes; // Max transfer size for point-to-point operations
|
||||
size_t maxCollBytes; // Max transfer size for collective operations
|
||||
} ncclNetProperties_v9_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If sizes is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code
|
||||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
|
||||
|
||||
// Notify the plugin that a recv has completed by the device
|
||||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
|
||||
|
||||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller
|
||||
// what index this new vNIC exists at
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
|
||||
} ncclNet_v9_t;
|
||||
|
||||
typedef struct {
|
||||
void* mhandle;
|
||||
void* address;
|
||||
size_t size;
|
||||
} ncclNetSGE_v9_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
void* sendMhandle, void** request);
|
||||
ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData,
|
||||
size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp,
|
||||
void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
|
||||
// Create a virtual NIC given the specified properties, which can be accessed at device index d
|
||||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props);
|
||||
} ncclCollNet_v9_t;
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,18 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PLUGIN_H_
|
||||
#define NCCL_PLUGIN_H_
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
void* ncclOpenNetPluginLib(const char* name);
|
||||
void* ncclOpenTunerPluginLib(const char* name);
|
||||
void* ncclOpenProfilerPluginLib(const char* name);
|
||||
void* ncclGetNetPluginLib(void);
|
||||
ncclResult_t ncclClosePluginLib(void* handle);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_IB_H_
|
||||
#define NET_IB_H_
|
||||
|
||||
#include "nccl_profiler.h"
|
||||
#include "net_ib_v1.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,34 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_IB_V1_H_
|
||||
#define NET_IB_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_IB_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileQp = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int device; // network device id
|
||||
uint64_t wr_id; // work request id
|
||||
int opcode; // ibv opcode
|
||||
int qpNum; // QP number
|
||||
size_t length; // work request data length
|
||||
} qp;
|
||||
};
|
||||
} ncclProfilerNetIbDescr_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_SOCKET_H_
|
||||
#define NET_SOCKET_H_
|
||||
|
||||
#include "nccl_profiler.h"
|
||||
#include "net_socket_v1.h"
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,32 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NET_SOCKET_V1_H_
|
||||
#define NET_SOCKET_V1_H_
|
||||
|
||||
#define NCCL_PROFILER_NET_SOCKET_VER 1
|
||||
|
||||
enum {
|
||||
ncclProfileSocket = (1 << 0),
|
||||
};
|
||||
|
||||
// The data structure version is encoded in the plugin identifier bitmask and
|
||||
// passed to NCCL core through the profiler callback. NCCL copies the plugin
|
||||
// identifier in the event descriptor before calling the profiler startEvent
|
||||
// function. The profiler should inspect the plugin id to find out the source
|
||||
// plugin as well as the version of the event struct
|
||||
typedef struct {
|
||||
uint8_t type; // event type (plugin defined)
|
||||
union {
|
||||
struct {
|
||||
int fd;
|
||||
int op;
|
||||
size_t length;
|
||||
} sock;
|
||||
};
|
||||
} ncclProfilerNetSockDescr_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,107 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V1_H_
|
||||
#define PROFILER_V1_H_
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
uint8_t func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
uint8_t datatype;
|
||||
uint32_t op;
|
||||
size_t trafficBytes;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
uint8_t algo;
|
||||
uint8_t proto;
|
||||
int isCollnet;
|
||||
int isNvls;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint8_t func;
|
||||
void* buff;
|
||||
uint8_t datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v1_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,104 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V2_H_
|
||||
#define PROFILER_V2_H_
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
size_t trafficBytes;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v2_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v2_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v2_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,112 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V3_H_
|
||||
#define PROFILER_V3_H_
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
uint8_t channelId;
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
int64_t id;
|
||||
void* data;
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,53 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TUNER_V2_H_
|
||||
#define TUNER_V2_H_
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type, e.g., allreduce, allgather, ...
|
||||
// - nBytes: collective size in bytes
|
||||
// - collNetSupport: whether collnet supports this type
|
||||
// - nvlsSupport: whether NVLink SHARP supports this type
|
||||
// - numPipeOps: number of operations in the group
|
||||
//
|
||||
// Outputs:
|
||||
// - algorithm: selected algorithm to be used for the given collective
|
||||
// - protocol: selected protocol to be used for the given collective
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algorithm, int* protocol, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v2_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,55 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TUNER_V3_H_
|
||||
#define TUNER_V3_H_
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type, e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
//
|
||||
// Outputs:
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v3_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,56 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TUNER_V4_H_
|
||||
#define TUNER_V4_H_
|
||||
|
||||
// API to be implemented by external tuner
|
||||
typedef struct {
|
||||
// Name of the tuner
|
||||
const char* name;
|
||||
|
||||
// Initializes tuner states.
|
||||
// Inputs:
|
||||
// - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
|
||||
// - nNodes: number of nodes in current communicator.
|
||||
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
|
||||
// Outputs:
|
||||
// - context: tuner context object
|
||||
ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
|
||||
|
||||
// Gets info (algo, protocol, number of ctas and threads) for a given collective.
|
||||
// Inputs:
|
||||
// - context: tuner context object
|
||||
// - collType: collective type, e.g., allreduce, allgather…
|
||||
// - nBytes: collective size in bytes
|
||||
// - numPipeOps: number of operations in the group
|
||||
// - numAlgo: number of algorithms in collCostTable
|
||||
// - numProto: number of protocols in collCostTable
|
||||
// - regBuff: can register user buffer
|
||||
//
|
||||
// Outputs:
|
||||
// - nChannels: number of channels (hence SMs) to be used.
|
||||
//
|
||||
// InOut:
|
||||
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
|
||||
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
|
||||
//
|
||||
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
|
||||
// default tuning for the given collective.
|
||||
// Also, the plugin is allowed to not set any output, or set only the
|
||||
// algorithm and protocol, but not only the algorithm or only the protocol.
|
||||
// Unset fields will be set automatically by NCCL.
|
||||
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
|
||||
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
|
||||
int regBuff, int* nChannels);
|
||||
|
||||
// Terminates the plugin and cleans up any resources that the plugin allocated.
|
||||
// context: tuner context object
|
||||
ncclResult_t (*destroy)(void* context);
|
||||
} ncclTuner_v4_t;
|
||||
|
||||
#endif
|
||||
@@ -17,6 +17,18 @@ struct ncclTaskP2p;
|
||||
struct ncclInfo;
|
||||
struct ncclComm;
|
||||
struct ncclProxyOp;
|
||||
struct ncclProxyConnector;
|
||||
|
||||
struct ncclProfilerProxy {
|
||||
bool initialized;
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
uint64_t workCounter[MAXCHANNELS]; // host work counter
|
||||
struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
|
||||
struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
|
||||
};
|
||||
|
||||
extern int ncclProfilerEventMask;
|
||||
|
||||
// Plugin Init/Finalize Wrappers
|
||||
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
|
||||
@@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args,
|
||||
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
|
||||
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
|
||||
|
||||
// Kernel Channel Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
|
||||
// Record Event Wrappers
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
|
||||
@@ -51,5 +67,10 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n
|
||||
|
||||
// Profiler utility functions
|
||||
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
|
||||
bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op);
|
||||
bool ncclProfilerPluginLoaded(void);
|
||||
|
||||
// Profiler callback for network plugin
|
||||
ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -34,7 +34,8 @@ typedef enum : uint8_t {
|
||||
ncclPatternPatUp,
|
||||
ncclPatternPatDown,
|
||||
ncclPatternSend,
|
||||
ncclPatternRecv
|
||||
ncclPatternRecv,
|
||||
ncclPatternProfiler,
|
||||
} ncclPattern_t;
|
||||
|
||||
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
|
||||
@@ -93,12 +94,19 @@ struct ncclProxyOp {
|
||||
struct ncclTaskP2p* p2p;
|
||||
} task;
|
||||
|
||||
// Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment.
|
||||
// Always 'true' for collective operations. Grouped p2p operations are fused into one <send, recv> pair in the GPU kernel,
|
||||
// meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this
|
||||
// reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done
|
||||
// by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue.
|
||||
bool incWorkCounter;
|
||||
int eActivationMask;
|
||||
void* taskEventHandle;
|
||||
int rank;
|
||||
int peer;
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
uint64_t workCounter;
|
||||
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
@@ -135,12 +143,15 @@ struct ncclProxySubArgs {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
int rank;
|
||||
uint64_t profilerSteps;
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
void* taskEventHandle;
|
||||
void* opEventHandle;
|
||||
void* kernelEventHandle;
|
||||
void* stepEventHandles[NCCL_STEPS];
|
||||
size_t transSize;
|
||||
uint64_t workCounter;
|
||||
|
||||
void* recvRequestsCache[NCCL_STEPS];
|
||||
int recvRequestsSubCount;
|
||||
|
||||
@@ -15,6 +15,8 @@ struct rasRankInit {
|
||||
pid_t pid;
|
||||
int cudaDev;
|
||||
int nvmlDev;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
};
|
||||
|
||||
ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank);
|
||||
|
||||
@@ -42,7 +42,7 @@ struct ncclReg {
|
||||
uintptr_t baseAddr;
|
||||
size_t baseSize;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regSize;
|
||||
size_t regUCSize, regMCSize;
|
||||
int dev;
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
|
||||
@@ -14,7 +14,6 @@ struct shmCuIpc {
|
||||
CUmemFabricHandle handle;
|
||||
CUmemGenericAllocationHandle data;
|
||||
};
|
||||
int tpProxyRank;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
};
|
||||
@@ -30,8 +29,8 @@ struct shmIpcDesc {
|
||||
|
||||
typedef struct shmIpcDesc ncclShmIpcDesc_t;
|
||||
|
||||
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
|
||||
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
|
||||
ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
|
||||
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
|
||||
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
|
||||
ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
|
||||
ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
|
||||
ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
|
||||
ncclResult_t ncclSocketClose(struct ncclSocket* sock);
|
||||
ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
|
||||
#endif
|
||||
|
||||
@@ -10,13 +10,24 @@
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// ncclCudaContext: wraps a CUDA context with per-context state.
|
||||
struct ncclCudaContext;
|
||||
|
||||
// Get a ncclCudaContext to track the currently active CUDA context.
|
||||
ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out);
|
||||
// Drop reference.
|
||||
void ncclCudaContextDrop(struct ncclCudaContext* cxt);
|
||||
|
||||
/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
|
||||
* easily.
|
||||
*/
|
||||
struct ncclCudaGraph {
|
||||
#if ROCM_VERSION >= 60100
|
||||
cudaStream_t origin;
|
||||
cudaGraph_t graph;
|
||||
unsigned long long graphId;
|
||||
#endif
|
||||
@@ -25,6 +36,7 @@ struct ncclCudaGraph {
|
||||
inline struct ncclCudaGraph ncclCudaGraphNone() {
|
||||
struct ncclCudaGraph tmp;
|
||||
#if ROCM_VERSION >= 60100
|
||||
tmp.origin = nullptr;
|
||||
tmp.graph = nullptr;
|
||||
tmp.graphId = ULLONG_MAX;
|
||||
#endif
|
||||
@@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() {
|
||||
|
||||
inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
return graph.graph != nullptr;
|
||||
return graph.graphId != ULLONG_MAX;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@@ -57,84 +69,69 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
|
||||
* streams unfit for the use of serializing access to a persistent resource.
|
||||
* Strong streams have been introduced to address this need.
|
||||
*
|
||||
* - All updates to a strong stream must be enclosed by an Acquire/Release pair.
|
||||
* All updates to a strong stream must be enclosed by an Acquire/Release pair.
|
||||
*
|
||||
* - The Acquire, Release, and all updates take a ncclCudaGraph parameter
|
||||
* indicating the currently capturing graph (or none). This parameter must be
|
||||
* the same for the entire sequence of {Acquire; ...; Release}.
|
||||
* Acquire retrieves a "work" stream (cudaStream_t) which may be used to add
|
||||
* work.
|
||||
*
|
||||
* - An {Acquire; ...; Release} sequence must not be concurrent with any
|
||||
* other operations against the strong stream including graph launches which
|
||||
* reference this stream.
|
||||
* Release publishes the work stream's work into the strong stream. The Release
|
||||
* must be issued by the same thread that did the Acquire.
|
||||
*/
|
||||
struct ncclStrongStream;
|
||||
|
||||
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
|
||||
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
|
||||
|
||||
// Acquire-fence the strong stream.
|
||||
// Acquire the strong stream. Upon return `*workStream` will be usable to add work.
|
||||
// `concurrent` indicates if other threads may be using the strong stream.
|
||||
ncclResult_t ncclStrongStreamAcquire(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
|
||||
);
|
||||
|
||||
// Acquire-fence the strong stream assuming no graph is capturing. This permits
|
||||
// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA
|
||||
// calls. Strong stream still must be released via:
|
||||
// ncclStrongStreamRelease(ncclCudaGraphNone(), ss);
|
||||
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
|
||||
|
||||
// Release-fence of the strong stream.
|
||||
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
|
||||
|
||||
// Add a host launch to the stream.
|
||||
ncclResult_t ncclStrongStreamLaunchHost(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
|
||||
cudaHostFn_t fn, void* arg
|
||||
);
|
||||
// Add a kernel launch to the stream.
|
||||
ncclResult_t ncclStrongStreamLaunchKernel(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
|
||||
void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
|
||||
// Get the workStream for an already acquired strong stream.
|
||||
// `concurrent` indicates if other threads may be using the strong stream.
|
||||
ncclResult_t ncclStrongStreamAcquiredWorkStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream
|
||||
);
|
||||
|
||||
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
|
||||
// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
|
||||
// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
|
||||
// implementation to induce fewer graph dependencies.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
|
||||
);
|
||||
// `b` must be capturing within `graph`.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
|
||||
);
|
||||
// `a` must be capturing within `graph`.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
|
||||
// Release of the strong stream.
|
||||
// `concurrent` indicates if other threads may be using the strong stream.
|
||||
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent);
|
||||
|
||||
ncclResult_t ncclStreamWaitStream(
|
||||
cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent
|
||||
);
|
||||
|
||||
// Like cudaStreamWaitEvent except `e` must be strictly ahead of everything in `s`.
|
||||
ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e);
|
||||
|
||||
// Synchronization does not need the strong stream to be acquired.
|
||||
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclStrongStreamGraph; // internal to ncclStrongStream
|
||||
struct ncclStrongStreamCapture; // internal to ncclStrongStream
|
||||
|
||||
struct ncclStrongStream {
|
||||
// Used when not graph capturing.
|
||||
cudaStream_t cudaStream;
|
||||
// The stream to use for non-captured work.
|
||||
cudaStream_t liveStream;
|
||||
void* liveAcquiredBy;
|
||||
#if ROCM_VERSION >= 60100
|
||||
// Whether this stream has ever appeared in a graph capture.
|
||||
bool everCaptured;
|
||||
pthread_mutex_t lock;
|
||||
struct ncclStrongStreamCapture* captureHead;
|
||||
// The event used to establish order between graphs and streams. During acquire
|
||||
// this event is waited on, during release it is recorded to.
|
||||
cudaEvent_t serialEvent;
|
||||
// Whether this stream has ever appeared in a graph capture.
|
||||
bool everCaptured;
|
||||
// Tracks whether serialEvent needs to be recorded to upon Release().
|
||||
bool serialEventNeedsRecord;
|
||||
struct ncclStrongStreamGraph* graphHead;
|
||||
#else
|
||||
cudaEvent_t scratchEvent;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct ncclCudaContext {
|
||||
struct ncclCudaContext* next;
|
||||
CUcontext hcontext;
|
||||
int refCount;
|
||||
struct ncclStrongStream launchOrder;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#define TRANSPORT_SHM 1
|
||||
#define TRANSPORT_NET 2
|
||||
#define TRANSPORT_COLLNET 3
|
||||
#define TRANSPORT_PROFILER 4
|
||||
|
||||
#include "proxy.h"
|
||||
#include "comm.h"
|
||||
@@ -27,6 +28,7 @@ extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
extern struct ncclTransport netTransport;
|
||||
extern struct ncclTransport collNetTransport;
|
||||
extern struct ncclTransport profilerTransport;
|
||||
|
||||
extern struct ncclTransport* ncclTransports[];
|
||||
// Forward declarations
|
||||
@@ -50,8 +52,10 @@ struct ncclNvlsSharedRes {
|
||||
CUmulticastObjectProp signalProp;
|
||||
CUmemAccessDesc accessDesc;
|
||||
int dev;
|
||||
size_t buffSize;
|
||||
size_t creditSize;
|
||||
size_t creditUCSize;
|
||||
size_t creditMCSize;
|
||||
size_t buffUCSize;
|
||||
size_t buffMCSize;
|
||||
CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
|
||||
CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
|
||||
char* mcBuff; // Multicast NVLS buffer address
|
||||
@@ -108,7 +112,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
|
||||
ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
|
||||
ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
|
||||
ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize);
|
||||
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
|
||||
+76
-73
@@ -87,17 +87,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1);
|
||||
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
|
||||
static ncclResult_t commReclaim(ncclComm_t comm);
|
||||
|
||||
static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||
char const *bytes = (char const*)&id;
|
||||
uint64_t h = 0xdeadbeef;
|
||||
for(int i=0; i < (int)sizeof(ncclUniqueId); i++) {
|
||||
h ^= h >> 32;
|
||||
h *= 0x8db3db47fa2994ad;
|
||||
h += bytes[i];
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
//RCCL runtime param to set Unroll Factor
|
||||
RCCL_PARAM(UnrollFactor, "UNROLL_FACTOR", 0);
|
||||
|
||||
@@ -131,7 +120,7 @@ ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
size_t std::hash<ncclUniqueId>::operator ()(const ncclUniqueId& uniqueId) const noexcept {
|
||||
return (size_t)hashUniqueId(uniqueId);
|
||||
return (size_t)getHash(uniqueId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
}
|
||||
|
||||
bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b) {
|
||||
@@ -237,7 +226,7 @@ ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
|
||||
// copy to avoid alignment mismatch
|
||||
memcpy(out, &handle, sizeof(handle));
|
||||
Recorder::instance().record(rrGetUniqueId, -1, -1, out);
|
||||
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
|
||||
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -485,6 +474,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->sharedRes->tpRankToLocalRank);
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream));
|
||||
CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent));
|
||||
CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent));
|
||||
NCCLCHECK(ncclProxyDestroy(comm));
|
||||
free(comm->sharedRes);
|
||||
}
|
||||
@@ -524,6 +515,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
NCCLCHECK(ncclProfilerPluginFinalize(comm));
|
||||
NCCLCHECK(ncclNetFinalize(comm));
|
||||
NCCLCHECK(ncclNetPluginUnload(comm));
|
||||
|
||||
ncclCudaContextDrop(comm->context);
|
||||
|
||||
free(comm);
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -570,17 +564,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) {
|
||||
ncclGroupJobAbort(comm->groupJob);
|
||||
} else {
|
||||
NCCLCHECK(ncclCommGetAsyncError(comm, &ret));
|
||||
if (ret != ncclSuccess) {
|
||||
/* if ret is not ncclInProgress, we just keep it. */
|
||||
if (ret == ncclInProgress) {
|
||||
WARN("Attempt to use communicator before the previous operation returned ncclSuccess");
|
||||
if (ret == ncclInProgress) ret = ncclInvalidArgument;
|
||||
ret = ncclInvalidArgument;
|
||||
goto exit;
|
||||
}
|
||||
/* if there is linked group job, we should complete it. */
|
||||
if (comm->groupJob) {
|
||||
NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
|
||||
comm->groupJob = NULL;
|
||||
}
|
||||
/* if ret is not ncclInProgress, we just keep it. */
|
||||
}
|
||||
|
||||
exit:
|
||||
@@ -625,6 +614,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->lastStream = nullptr;
|
||||
CUDACHECK(cudaGetDevice(&comm->cudaDev));
|
||||
|
||||
NCCLCHECK(ncclCudaContextTrack(&comm->context));
|
||||
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
char busId[]="0000:00:00.0";
|
||||
NCCLCHECK(int64ToBusId(comm->busId, busId));
|
||||
@@ -688,6 +679,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
|
||||
NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
|
||||
CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming));
|
||||
CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming));
|
||||
comm->sharedRes = sharedRes;
|
||||
sharedRes->refCount = 1;
|
||||
} else {
|
||||
@@ -730,13 +723,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
struct ncclDevCommAndChannels *devCommAndChans = NULL;
|
||||
struct ncclNvmlCCStatus ccStatus;
|
||||
bool ccEnable = false;
|
||||
cudaStream_t deviceStream;
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, devCommAndChans);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail);
|
||||
comm->devComm = &devCommAndChans->comm;
|
||||
tmpCommAndChans.comm.rank = comm->rank;
|
||||
tmpCommAndChans.comm.nRanks = nRanks;
|
||||
@@ -759,12 +753,22 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
if (ccEnable) {
|
||||
comm->workFifoBytes = 0;
|
||||
} else {
|
||||
comm->workFifoBytes = ncclParamWorkFifoBytes();
|
||||
if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
|
||||
WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes);
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
int64_t workFifoBytesParam = ncclParamWorkFifoBytes();
|
||||
if (workFifoBytesParam == -1) {
|
||||
if (comm->MNNVL && (comm->compCap >= 100)) {
|
||||
// WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL
|
||||
INFO(NCCL_INIT, "Disabling work fifo");
|
||||
comm->workFifoBytes = 0;
|
||||
} else {
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
}
|
||||
} else {
|
||||
if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) {
|
||||
WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam);
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
}
|
||||
comm->workFifoBytes = std::min<uint64_t>(workFifoBytesParam, 1ul<<30);
|
||||
}
|
||||
comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30);
|
||||
}
|
||||
#else
|
||||
comm->workFifoBytes = ncclParamWorkFifoBytes();
|
||||
@@ -797,10 +801,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->workFifoConsumedLeast = 0;
|
||||
tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed;
|
||||
|
||||
// Alloc profiler counters for the kernel
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail);
|
||||
tmpCommAndChans.comm.workStarted = comm->profiler.workStarted;
|
||||
tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted;
|
||||
ncclCommPushCudaHostFree(comm, comm->profiler.workStarted);
|
||||
ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted);
|
||||
|
||||
if (comm->collNetDenseToUserRank != nullptr) {
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail);
|
||||
}
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
@@ -814,7 +826,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls;
|
||||
|
||||
if (comm->channels[c].ring.userRanks != nullptr) {
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -839,10 +851,10 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
tmpCommAndChans.comm.faults = comm->faults;
|
||||
#endif
|
||||
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail);
|
||||
exit:
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false));
|
||||
NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream));
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
@@ -1507,6 +1519,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
|
||||
graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
|
||||
}
|
||||
comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
|
||||
}
|
||||
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
|
||||
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
|
||||
@@ -1932,12 +1945,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
timers[TIMER_INIT_ALLOC] = clockNano();
|
||||
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
|
||||
// obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex),
|
||||
// add unique split counter and the color
|
||||
ncclUniqueId tmpId;
|
||||
memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
|
||||
snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color);
|
||||
comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
// child hash obtained from (parent hash, split count, color)
|
||||
uint64_t hacc[2] = {1, 1};
|
||||
eatHash(hacc, &job->parent->commHash);
|
||||
eatHash(hacc, &job->splitCount);
|
||||
eatHash(hacc, &job->color);
|
||||
comm->commHash = digestHash(hacc);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
|
||||
@@ -1950,8 +1963,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
|
||||
// obtain a unique hash using the first commId
|
||||
comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
|
||||
commIdHash = hashUniqueId(job->commId[0]);
|
||||
comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
|
||||
@@ -1991,12 +2003,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];
|
||||
if (comm->localRank == 0 && !mapContainsId) {
|
||||
NCCLCHECKGOTO(mscclpp_ncclGetUniqueId(&mscclppUniqueId), res, fail);
|
||||
TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(mscclppUniqueId));
|
||||
TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(mscclppUniqueId.internal, NCCL_UNIQUE_ID_BYTES));
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &mscclppUniqueId, sizeof(mscclppUniqueId)), res, fail);
|
||||
unsigned long long mscclppUniqueIdHash; (void)mscclppUniqueIdHash;
|
||||
TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)hashUniqueId(mscclppUniqueId)));
|
||||
TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)getHash(mscclppUniqueId.internal, NCCL_UNIQUE_ID_BYTES)));
|
||||
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(*job->commId);
|
||||
|
||||
comm->mscclpp_threshold = rcclParamMscclppThreshold();
|
||||
@@ -2228,6 +2240,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d");
|
||||
|
||||
/* assign config to communicator */
|
||||
comm->config.blocking = internalConfigPtr->blocking;
|
||||
@@ -2236,6 +2249,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
comm->config.maxCTAs = internalConfigPtr->maxCTAs;
|
||||
comm->config.netName = internalConfigPtr->netName;
|
||||
comm->config.splitShare = internalConfigPtr->splitShare;
|
||||
comm->config.trafficClass = internalConfigPtr->trafficClass;
|
||||
|
||||
NCCLCHECKGOTO(envConfigOverride(comm), ret, fail);
|
||||
|
||||
@@ -2260,6 +2274,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
|
||||
const char* commIdEnv = NULL;
|
||||
ncclComm_t comm = NULL;
|
||||
struct ncclCommInitRankAsyncJob* job = NULL;
|
||||
bool launchedJob = false;
|
||||
// first call ncclInit, this will setup the environment
|
||||
NCCLCHECKGOTO(ncclInit(), res, fail);
|
||||
|
||||
@@ -2313,6 +2328,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
|
||||
// start the bootstrap root before bootstrapping, use only the first handle
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
|
||||
}
|
||||
launchedJob = true;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
|
||||
|
||||
exit:
|
||||
@@ -2321,7 +2337,7 @@ exit:
|
||||
NCCLCHECK(Recorder::instance().record(rrCommInitDev, nranks, myrank, commId, comm, cudaDev));
|
||||
return ncclGroupErrCheck(res);
|
||||
fail:
|
||||
if (job) ncclCommInitJobFree(job);
|
||||
if (job && !launchedJob) ncclCommInitJobFree(job);
|
||||
if (comm) {
|
||||
free(comm->abortFlag);
|
||||
if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
|
||||
@@ -2520,7 +2536,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
|
||||
// And keep polling until all graphs referencing us die.
|
||||
while (comm->persistentRefs != 0) {
|
||||
while (comm->localPersistentRefs != 0) {
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
|
||||
@@ -2613,7 +2629,6 @@ exit:
|
||||
}
|
||||
return ret;
|
||||
fail:
|
||||
free(job);
|
||||
if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
|
||||
goto exit;
|
||||
}
|
||||
@@ -2896,6 +2911,11 @@ ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncErro
|
||||
|
||||
*asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
|
||||
if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
|
||||
/* if there is linked group job, we should complete it. */
|
||||
if (*asyncError == ncclSuccess && comm->groupJob) {
|
||||
NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
|
||||
comm->groupJob = NULL;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -2949,16 +2969,13 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
|
||||
#if CUDART_VERSION >= 12010
|
||||
size_t memGran = 0;
|
||||
size_t mcGran = 0;
|
||||
CUdevice currentDev;
|
||||
CUmemAllocationProp memprop = {};
|
||||
CUmulticastObjectProp mcprop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
int cudaDev;
|
||||
int flag;
|
||||
int dcnt;
|
||||
int mcSupport = 0;
|
||||
|
||||
if (ptr == NULL || size == 0) goto fallback;
|
||||
|
||||
@@ -2968,6 +2985,7 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
|
||||
if (ncclCuMemEnable()) {
|
||||
size_t handleSize = size;
|
||||
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
// Query device to see if FABRIC handle support is available
|
||||
flag = 0;
|
||||
@@ -2983,40 +3001,25 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
CUDACHECK(cudaGetDeviceCount(&dcnt));
|
||||
|
||||
if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
|
||||
if (mcSupport) {
|
||||
/* mc property */
|
||||
mcprop.size = size;
|
||||
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
|
||||
mcprop.numDevices = dcnt;
|
||||
mcprop.handleTypes = requestedHandleTypes;
|
||||
mcprop.flags = 0;
|
||||
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
|
||||
/* only size needs to be aligned to mcGran */
|
||||
ALIGN_SIZE(size, mcGran);
|
||||
} else {
|
||||
ALIGN_SIZE(size, memGran);
|
||||
}
|
||||
ALIGN_SIZE(handleSize, memGran);
|
||||
|
||||
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
|
||||
/* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
|
||||
CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
|
||||
CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
|
||||
if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
|
||||
requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
|
||||
CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
|
||||
}
|
||||
} else {
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
|
||||
CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
|
||||
}
|
||||
/* Reserve a virtual address range */
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
|
||||
/* Map the virtual address range to the physical allocation */
|
||||
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
|
||||
CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
|
||||
/* Now allow RW access to the newly mapped memory */
|
||||
for (int i = 0; i < dcnt; ++i) {
|
||||
int p2p = 0;
|
||||
@@ -3024,7 +3027,7 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = i;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
|
||||
}
|
||||
if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "alloc.h"
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "param.h"
|
||||
@@ -67,6 +68,36 @@ int ncclCuMemHostEnable() {
|
||||
ncclCumemHostEnable = paramValue;
|
||||
else
|
||||
ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0;
|
||||
if (ncclCumemHostEnable) {
|
||||
// Verify that host allocations actually work. Docker in particular is known to disable "get_mempolicy",
|
||||
// causing such allocations to fail (this can be fixed by invoking Docker with "--cap-add SYS_NICE").
|
||||
int cudaDev;
|
||||
CUdevice currentDev;
|
||||
int cpuNumaNodeId = -1;
|
||||
CUmemAllocationProp prop = {};
|
||||
size_t granularity = 0;
|
||||
size_t size;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuDeviceGet(&currentDev, cudaDev));
|
||||
CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
|
||||
if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.requestedHandleTypes = ncclCuMemHandleType;
|
||||
prop.location.id = cpuNumaNodeId;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
size = 1;
|
||||
ALIGN_SIZE(size, granularity);
|
||||
if (CUPFN(cuMemCreate(&handle, size, &prop, 0)) != CUDA_SUCCESS) {
|
||||
INFO(NCCL_INIT, "cuMem host allocations do not appear to be working; falling back to a /dev/shm/ based "
|
||||
"implementation. This could be due to the container runtime disabling NUMA support. "
|
||||
"To disable this warning, set NCCL_CUMEM_HOST_ENABLE=0");
|
||||
ncclCumemHostEnable = 0;
|
||||
} else {
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclCumemHostEnable;
|
||||
error:
|
||||
|
||||
@@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
} control_un;
|
||||
|
||||
struct cmsghdr *cmptr;
|
||||
char dummy_buffer[1];
|
||||
char dummy_buffer[1] = {'\0'};
|
||||
struct sockaddr_un cliaddr;
|
||||
|
||||
// Construct client address to send this shareable handle to
|
||||
@@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
|
||||
|
||||
if (sendFd != -1) {
|
||||
memset(&control_un, '\0', sizeof(control_un));
|
||||
msg.msg_control = control_un.control;
|
||||
msg.msg_controllen = sizeof(control_un.control);
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) {
|
||||
size_t n = 0;
|
||||
ssize_t read;
|
||||
while ((read = getline(&line, &n, file)) != -1) {
|
||||
if (line[0] == '#') continue;
|
||||
if (line[read-1] == '\n') line[read-1] = '\0';
|
||||
int s=0; // Env Var Size
|
||||
while (line[s] != '\0' && line[s] != '=') s++;
|
||||
|
||||
@@ -176,6 +176,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA
|
||||
strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
|
||||
// Store the IP address
|
||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||
memset(addrs+found, '\0', sizeof(*addrs));
|
||||
memcpy(addrs+found, interface->ifa_addr, salen);
|
||||
found++;
|
||||
}
|
||||
@@ -919,9 +920,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
|
||||
ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
|
||||
if (sock != NULL) {
|
||||
if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) {
|
||||
if (wait) {
|
||||
char data;
|
||||
int closed = 0;
|
||||
do {
|
||||
int offset = 0;
|
||||
if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break;
|
||||
} while (closed == 0);
|
||||
}
|
||||
/* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected
|
||||
* by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
|
||||
* the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
|
||||
|
||||
+239
-270
@@ -9,28 +9,61 @@
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
// Tracks the chain of graph nodes for a given graph captured identified by
|
||||
// its graph id. This state has to live for as long as captured work is being
|
||||
// submitted. CUDA doesn't have mechanism to inform us when the user ends capture
|
||||
// so the best we can do is get notified when the graph is destroyed.
|
||||
struct ncclStrongStreamGraph {
|
||||
struct ncclStrongStreamGraph* next;
|
||||
// Atomically exchanged to false by both the main thread or the graph destructor
|
||||
// callback. The last to arrive deletes the node.
|
||||
bool alive;
|
||||
// Tracks the captured work a given graph captured identified by its graph id.
|
||||
struct ncclStrongStreamCapture {
|
||||
struct ncclStrongStreamCapture* next;
|
||||
cudaGraph_t graph;
|
||||
unsigned long long graphId;
|
||||
// For each graph we track the "tip" of the chain of graph nodes. A linear
|
||||
// chain would always have just one node at its tip, but since we have to merge
|
||||
// in chains from other streams (via ncclStrongStreamWaitStream) some spots
|
||||
// in the chain can be wider than a single node and thus need a list, so we
|
||||
// maintain a dynamically sized array of tip nodes.
|
||||
int tipCount, tipCapacity;
|
||||
cudaGraphNode_t* tipNodes;
|
||||
cudaStream_t captureStream;
|
||||
cudaGraphNode_t lastRecord;
|
||||
void* acquiredBy;
|
||||
};
|
||||
|
||||
static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) {
|
||||
free(g->tipNodes);
|
||||
free(g);
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static ncclCudaContext* cxtListHead = nullptr;
|
||||
static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
CUcontext hcontext;
|
||||
cuCtxGetCurrent(&hcontext);
|
||||
|
||||
pthread_mutex_lock(&cxtListLock);
|
||||
struct ncclCudaContext* p = cxtListHead;
|
||||
while (1) {
|
||||
if (p == nullptr) {
|
||||
p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext));
|
||||
p->refCount = 1;
|
||||
p->hcontext = hcontext;
|
||||
p->next = cxtListHead;
|
||||
cxtListHead = p;
|
||||
NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave);
|
||||
break;
|
||||
}
|
||||
if (p->hcontext == hcontext) {
|
||||
p->refCount += 1;
|
||||
break;
|
||||
}
|
||||
p = p->next;
|
||||
}
|
||||
leave:
|
||||
pthread_mutex_unlock(&cxtListLock);
|
||||
*out = p;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void ncclCudaContextDrop(struct ncclCudaContext* cxt) {
|
||||
pthread_mutex_lock(&cxtListLock);
|
||||
if (0 == --cxt->refCount) {
|
||||
struct ncclCudaContext** pp = &cxtListHead;
|
||||
while (*pp != cxt) pp = &(*pp)->next;
|
||||
*pp = cxt->next; // remove from list
|
||||
// Destroy resources held in cxt
|
||||
ncclStrongStreamDestruct(&cxt->launchOrder);
|
||||
free(cxt);
|
||||
}
|
||||
pthread_mutex_unlock(&cxtListLock);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -40,13 +73,14 @@ ncclResult_t ncclCudaGetCapturingGraph(
|
||||
) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
hipStreamCaptureStatus status;
|
||||
unsigned long long gid;
|
||||
CUDACHECK(hipStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
|
||||
CUDACHECK(hipStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr));
|
||||
if (status != hipStreamCaptureStatusActive) {
|
||||
graph->origin = nullptr;
|
||||
graph->graph = nullptr;
|
||||
gid = ULLONG_MAX;
|
||||
graph->graphId = ULLONG_MAX;
|
||||
} else {
|
||||
graph->origin = stream;
|
||||
}
|
||||
graph->graphId = gid;
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -68,315 +102,250 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) {
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, cudaStreamNonBlocking));
|
||||
#if ROCM_VERSION >= 60100
|
||||
CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming));
|
||||
ss->everCaptured = false;
|
||||
ss->serialEventNeedsRecord = false;
|
||||
ss->graphHead = nullptr;
|
||||
#else
|
||||
CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming));
|
||||
ss->captureHead = nullptr;
|
||||
pthread_mutex_init(&ss->lock, nullptr);
|
||||
CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming));
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void graphDestructor(void* arg) {
|
||||
struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg;
|
||||
if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) {
|
||||
// Last to arrive deletes list node.
|
||||
ncclStrongStreamGraphDelete(g);
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) {
|
||||
CUDACHECK(cudaStreamDestroy(ss->cudaStream));
|
||||
CUDACHECK(cudaStreamDestroy(ss->liveStream));
|
||||
#if ROCM_VERSION >= 60100
|
||||
CUDACHECK(cudaEventDestroy(ss->serialEvent));
|
||||
// Delete list of per-graph chains.
|
||||
struct ncclStrongStreamGraph* g = ss->graphHead;
|
||||
while (g != nullptr) {
|
||||
struct ncclStrongStreamGraph* next = g->next;
|
||||
if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) {
|
||||
// Last to arrive deletes list node.
|
||||
ncclStrongStreamGraphDelete(g);
|
||||
}
|
||||
g = next;
|
||||
struct ncclStrongStreamCapture* cap = ss->captureHead;
|
||||
while (cap) {
|
||||
struct ncclStrongStreamCapture* next = cap->next;
|
||||
CUDACHECK(cudaStreamDestroy(cap->captureStream));
|
||||
free(cap);
|
||||
cap = next;
|
||||
}
|
||||
#else
|
||||
CUDACHECK(cudaEventDestroy(ss->scratchEvent));
|
||||
CUDACHECK(cudaEventDestroy(ss->serialEvent));
|
||||
pthread_mutex_destroy(&ss->lock);
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 0)
|
||||
NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1);
|
||||
constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device.";
|
||||
|
||||
static void ensureTips(struct ncclStrongStreamGraph* g, int n) {
|
||||
if (g->tipCapacity < n) {
|
||||
g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t));
|
||||
g->tipCapacity = n;
|
||||
}
|
||||
}
|
||||
static __thread char threadIdMarker;
|
||||
static void* localThreadId() { return &threadIdMarker; }
|
||||
|
||||
ncclResult_t ncclStrongStreamAcquire(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent,
|
||||
cudaStream_t* workStream
|
||||
) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (graph.graph == nullptr) {
|
||||
if (mixing && ss->everCaptured) {
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
|
||||
ss->serialEventNeedsRecord = false;
|
||||
if (graph.graphId == ULLONG_MAX) {
|
||||
*workStream = ss->liveStream;
|
||||
ss->liveAcquiredBy = localThreadId();
|
||||
if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) {
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0));
|
||||
}
|
||||
} else {
|
||||
ss->everCaptured = true;
|
||||
// Find the current graph in our list of graphs if it exists.
|
||||
struct ncclStrongStreamGraph** pg = &ss->graphHead;
|
||||
struct ncclStrongStreamGraph* g;
|
||||
while (*pg != nullptr) {
|
||||
g = *pg;
|
||||
if (g->graphId == graph.graphId) {
|
||||
// Move to front of list so that operations after acquire don't have to search the list.
|
||||
*pg = g->next;
|
||||
g->next = ss->graphHead;
|
||||
ss->graphHead = g;
|
||||
bool firstCapture = !ss->everCaptured;
|
||||
__atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED);
|
||||
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (concurrent) pthread_mutex_lock(&ss->lock);
|
||||
|
||||
// Look for capture in our list of active captures.
|
||||
struct ncclStrongStreamCapture** pcap = &ss->captureHead;
|
||||
struct ncclStrongStreamCapture* cap;
|
||||
struct ncclStrongStreamCapture* spare = nullptr;
|
||||
while (*pcap != nullptr) {
|
||||
cap = *pcap;
|
||||
if (cap->graphId == graph.graphId) { // Capture node already exists.
|
||||
*workStream = cap->captureStream;
|
||||
cap->acquiredBy = localThreadId();
|
||||
if (concurrent) pthread_mutex_unlock(&ss->lock);
|
||||
return ncclSuccess;
|
||||
} else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) {
|
||||
// Unrelated graph that has been destroyed. Remove and delete.
|
||||
*pg = g->next;
|
||||
ncclStrongStreamGraphDelete(g);
|
||||
} else {
|
||||
pg = &g->next;
|
||||
cudaStreamCaptureStatus status;
|
||||
CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock);
|
||||
if (status == cudaStreamCaptureStatusActive) {
|
||||
pcap = &cap->next; // Active capture doesn't match, on to next.
|
||||
} else { // Capture no longer active
|
||||
*pcap = cap->next; // Remove from current list
|
||||
if (spare == nullptr) { // Keep one spare to reuse below.
|
||||
spare = cap;
|
||||
} else {
|
||||
cudaStreamDestroy(cap->captureStream);
|
||||
free(cap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This is a new graph so add to the list.
|
||||
g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph));
|
||||
g->graphId = graph.graphId;
|
||||
g->tipNodes = nullptr;
|
||||
g->tipCapacity = 0;
|
||||
g->tipCount = 0;
|
||||
g->next = ss->graphHead;
|
||||
ss->graphHead = g;
|
||||
g->alive = true;
|
||||
NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g));
|
||||
|
||||
if (mixing && ss->serialEventNeedsRecord) {
|
||||
// Can only be here if previous release was for uncaptured work that
|
||||
// elided updating the event because no capture had yet occurred.
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
|
||||
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream));
|
||||
// No matching capture, need a new entry.
|
||||
cap = spare;
|
||||
if (cap == nullptr) {
|
||||
cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture));
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
|
||||
}
|
||||
ss->serialEventNeedsRecord = false;
|
||||
cap->graphId = graph.graphId;
|
||||
cap->lastRecord = nullptr;
|
||||
cap->acquiredBy = localThreadId();
|
||||
// Push to capturing list.
|
||||
cap->next = ss->captureHead;
|
||||
ss->captureHead = cap;
|
||||
|
||||
// First node in the chain must be a wait on the serialEvent.
|
||||
do_unlock:
|
||||
if (concurrent) pthread_mutex_unlock(&ss->lock);
|
||||
if (ret != ncclSuccess) return ret;
|
||||
|
||||
*workStream = cap->captureStream;
|
||||
|
||||
// Bring captureStream into the graph but without any dependencies.
|
||||
cudaEvent_t scratch;
|
||||
CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming));
|
||||
CUDACHECK(cudaEventRecord(scratch, graph.origin));
|
||||
CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0));
|
||||
CUDACHECK(cudaEventDestroy(scratch));
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies));
|
||||
|
||||
if (mixing && firstCapture) {
|
||||
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream));
|
||||
}
|
||||
if (mixing) {
|
||||
ensureTips(g, 1);
|
||||
CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent));
|
||||
g->tipCount = 1;
|
||||
} else {
|
||||
g->tipCount = 0;
|
||||
// First dependency is to wait on serialEvent
|
||||
CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, 0));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
|
||||
ncclResult_t ncclStrongStreamAcquiredWorkStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent,
|
||||
cudaStream_t* workStream
|
||||
) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (mixing && ss->everCaptured) {
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
|
||||
if (graph.graphId == ULLONG_MAX) {
|
||||
*workStream = ss->liveStream;
|
||||
} else {
|
||||
if (concurrent) pthread_mutex_lock(&ss->lock);
|
||||
struct ncclStrongStreamCapture* cap = ss->captureHead;
|
||||
while (cap->graphId != graph.graphId) cap = cap->next;
|
||||
*workStream = cap->captureStream;
|
||||
if (concurrent) pthread_mutex_unlock(&ss->lock);
|
||||
}
|
||||
ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream.
|
||||
#else
|
||||
*workStream = ss->liveStream
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) {
|
||||
if (g == nullptr || g->graphId != id) {
|
||||
WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
|
||||
ncclResult_t ncclStrongStreamRelease(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent
|
||||
) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
bool mixing = ncclParamGraphMixingSupport();
|
||||
if (mixing && ss->serialEventNeedsRecord) {
|
||||
if (graph.graph == nullptr) {
|
||||
if (ss->everCaptured) {
|
||||
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream));
|
||||
ss->serialEventNeedsRecord = false;
|
||||
if (mixing) {
|
||||
if (graph.graphId == ULLONG_MAX) {
|
||||
if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) {
|
||||
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream));
|
||||
}
|
||||
if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
|
||||
WARN("%s", launchRaceFatalMsg);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
} else {
|
||||
struct ncclStrongStreamGraph* g = ss->graphHead;
|
||||
NCCLCHECK(checkGraphId(g, graph.graphId));
|
||||
ensureTips(g, 1);
|
||||
CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent));
|
||||
g->tipCount = 1;
|
||||
ss->serialEventNeedsRecord = false;
|
||||
if (concurrent) pthread_mutex_lock(&ss->lock);
|
||||
struct ncclStrongStreamCapture* cap = ss->captureHead;
|
||||
while (cap->graphId != graph.graphId) cap = cap->next;
|
||||
if (concurrent) pthread_mutex_unlock(&ss->lock);
|
||||
|
||||
// Add event record node with dependencies added further down.
|
||||
cudaGraphNode_t recordNode;
|
||||
CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent));
|
||||
|
||||
// Make this record order after previous record on this stream.
|
||||
if (cap->lastRecord != nullptr) {
|
||||
CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
|
||||
}
|
||||
cap->lastRecord = recordNode;
|
||||
|
||||
// Get current nodes from work stream so we can add them as dependencies.
|
||||
cudaStreamCaptureStatus status;
|
||||
cudaGraphNode_t const* nodes;
|
||||
size_t count = 0;
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count);
|
||||
|
||||
#if CUDART_VERSION >= 12030
|
||||
if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
|
||||
cudaGraphEdgeData const* edges;
|
||||
CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count));
|
||||
for (int i=0; i < (int)count; i++) {
|
||||
CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1));
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (false) {}
|
||||
#endif
|
||||
else {
|
||||
CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
|
||||
for (int i=0; i < (int)count; i++) {
|
||||
CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1));
|
||||
}
|
||||
}
|
||||
|
||||
if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) {
|
||||
WARN("%s", launchRaceFatalMsg);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclStrongStreamLaunchHost(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg
|
||||
) {
|
||||
#if ROCM_VERSION >= 60100
|
||||
if (graph.graph == nullptr) {
|
||||
CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg));
|
||||
} else {
|
||||
cudaHostNodeParams p;
|
||||
p.fn = fn;
|
||||
p.userData = arg;
|
||||
struct ncclStrongStreamGraph* g = ss->graphHead;
|
||||
NCCLCHECK(checkGraphId(g, graph.graphId));
|
||||
ensureTips(g, 1);
|
||||
CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p));
|
||||
g->tipCount = 1;
|
||||
}
|
||||
ss->serialEventNeedsRecord = true;
|
||||
#else
|
||||
CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg));
|
||||
#endif
|
||||
// Make stream `a` wait for the work currently enqueued on stream `b`.
//
// `scratchEvent` is recorded on `b` and then waited on by `a`; its previous
// state is overwritten. Returns ncclSuccess, or whatever CUDACHECK returns
// when one of the runtime calls fails.
ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent)
{
  // Mark the current tail of `b` ...
  CUDACHECK(cudaEventRecord(scratchEvent, b));
  // ... and order everything submitted to `a` from now on after that mark.
  CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0));

  return ncclSuccess;
}
|
||||
|
||||
// Launch kernel `fn` on the strong stream `ss`.
//
// With ROCM_VERSION >= 60100:
//   - no graph (graph.graph == nullptr): plain cudaLaunchKernel on ss->cudaStream;
//   - graph present: a kernel node is added to graph.graph that depends on the
//     stream's current tip nodes, and that node becomes the single tip.
//   In both cases ss->serialEventNeedsRecord is set afterwards.
// With older ROCm the kernel is always launched directly on ss->cudaStream.
//
// Returns ncclSuccess, or the error produced by CUDACHECK/NCCLCHECK.
ncclResult_t ncclStrongStreamLaunchKernel(
    struct ncclCudaGraph graph, struct ncclStrongStream* ss,
    void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
  ) {
#if ROCM_VERSION >= 60100
  if (graph.graph != nullptr) {
    struct ncclStrongStreamGraph* sg = ss->graphHead;
    NCCLCHECK(checkGraphId(sg, graph.graphId));

    cudaKernelNodeParams nodeParams;
    nodeParams.func = fn;
    nodeParams.kernelParams = args;
    nodeParams.extra = nullptr;
    nodeParams.gridDim = grid;
    nodeParams.blockDim = block;
    nodeParams.sharedMemBytes = sharedMemBytes;

    // presumably guarantees room for at least one tip node -- confirm in ensureTips
    ensureTips(sg, 1);
    // The new node depends on every current tip and is written into slot 0.
    CUDACHECK(cudaGraphAddKernelNode(&sg->tipNodes[0], graph.graph, sg->tipNodes, sg->tipCount, &nodeParams));
    sg->tipCount = 1;
  } else {
    CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream));
  }
  ss->serialEventNeedsRecord = true;
#else
  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream));
#endif
  return ncclSuccess;
}
|
||||
ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e) {
|
||||
if (g.graphId == ULLONG_MAX) {
|
||||
CUDACHECK(cudaStreamWaitEvent(s, e, 0));
|
||||
} else {
|
||||
cudaStream_t tmp;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&tmp, cudaStreamNonBlocking));
|
||||
CUDACHECK(cudaStreamWaitEvent(tmp, e, 0));
|
||||
|
||||
// Merge node list `b` into list `a` but don't add duplicates.
|
||||
static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) {
|
||||
int an = a->tipCount;
|
||||
ensureTips(a, an + bn);
|
||||
for (int bi=0; bi < bn; bi++) {
|
||||
for (int ai=0; ai < an; ai++) {
|
||||
if (a->tipNodes[ai] == bNodes[bi]) goto next_b;
|
||||
cudaStreamCaptureStatus status;
|
||||
cudaGraphNode_t const* nodes;
|
||||
size_t count = 0;
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count);
|
||||
|
||||
#if CUDART_VERSION >= 12030
|
||||
if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
|
||||
cudaGraphEdgeData const* edges;
|
||||
CUDACHECK(cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, &edges, &count));
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, edges, count, cudaStreamSetCaptureDependencies));
|
||||
}
|
||||
a->tipNodes[a->tipCount++] = bNodes[bi];
|
||||
next_b:;
|
||||
#else
|
||||
if (false) {}
|
||||
#endif
|
||||
else {
|
||||
CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies));
|
||||
}
|
||||
|
||||
CUDACHECK(cudaStreamDestroy(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
// Make strong stream `a` wait for strong stream `b`.
//
// With ROCM_VERSION >= 60100:
//   - no graph: b->serialEvent is (re)recorded on b->cudaStream if it is flagged
//     as stale, then a->cudaStream waits on it;
//   - graph present: b's tip nodes are merged into a's tip list; when
//     `b_subsumes_a` is set, a's tip list is emptied first.
//   Afterwards a->serialEventNeedsRecord is set.
// With older ROCm, b->scratchEvent is recorded on b->cudaStream and waited on
// by a->cudaStream.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b,
    bool b_subsumes_a
  ) {
#if ROCM_VERSION >= 60100
  if (graph.graph != nullptr) {
    struct ncclStrongStreamGraph* waiterGraph = a->graphHead;
    NCCLCHECK(checkGraphId(waiterGraph, graph.graphId));
    struct ncclStrongStreamGraph* sourceGraph = b->graphHead;
    NCCLCHECK(checkGraphId(sourceGraph, graph.graphId));

    if (b_subsumes_a) {
      waiterGraph->tipCount = 0;
    }
    mergeTips(waiterGraph, sourceGraph->tipNodes, sourceGraph->tipCount);
  } else {
    if (b->serialEventNeedsRecord) {
      // The flag is cleared before the record call, as in the original ordering.
      b->serialEventNeedsRecord = false;
      CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream));
    }
    CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0));
  }
  a->serialEventNeedsRecord = true;
#else
  CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
  CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0));
#endif
  return ncclSuccess;
}
|
||||
|
||||
// Make strong stream `a` wait for the plain stream `b`.
//
// With ROCM_VERSION >= 60100:
//   - no graph: a->serialEvent is recorded on `b` and waited on by a->cudaStream;
//   - graph present: `b` must be actively captured by the graph identified by
//     graph.graphId (otherwise ncclInvalidUsage); its capture dependency nodes
//     are merged into a's tip list, which is emptied first when `b_subsumes_a`.
//   Afterwards a->serialEventNeedsRecord is set.
// With older ROCm, a->scratchEvent is recorded on `b` and waited on by
// a->cudaStream.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b,
    bool b_subsumes_a
  ) {
#if ROCM_VERSION >= 60100
  if (graph.graph != nullptr) {
    cudaStreamCaptureStatus captureState;
    unsigned long long capturingGraphId;
    cudaGraphNode_t const* depNodes;
    size_t depCount = 0;
    CUDACHECK(hipStreamGetCaptureInfo_v2(b, &captureState, &capturingGraphId, nullptr, &depNodes, &depCount));

    const bool capturedByExpectedGraph =
        captureState == cudaStreamCaptureStatusActive && graph.graphId == capturingGraphId;
    if (!capturedByExpectedGraph) {
      WARN("Stream is not being captured by the expected graph.");
      return ncclInvalidUsage;
    }

    struct ncclStrongStreamGraph* waiterGraph = a->graphHead;
    NCCLCHECK(checkGraphId(waiterGraph, graph.graphId));
    if (b_subsumes_a) {
      waiterGraph->tipCount = 0;
    }
    mergeTips(waiterGraph, depNodes, depCount);
  } else {
    // Borrowing a->serialEvent to mark `b` is fine: serialEventNeedsRecord is
    // set below, so the event is not trusted again until it is re-recorded.
    CUDACHECK(cudaEventRecord(a->serialEvent, b));
    CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0));
  }
  a->serialEventNeedsRecord = true;
#else
  CUDACHECK(cudaEventRecord(a->scratchEvent, b));
  CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0));
#endif
  return ncclSuccess;
}
|
||||
|
||||
// Make the plain stream `a` wait for strong stream `b`.
//
// With ROCM_VERSION >= 60100:
//   - no graph: b->serialEvent is (re)recorded on b->cudaStream if it is flagged
//     as stale, then `a` waits on it;
//   - graph present: b's tip nodes become capture dependencies of `a`, replacing
//     the existing ones when `b_subsumes_a` is set and being added otherwise.
// With older ROCm, b->scratchEvent is recorded on b->cudaStream and waited on
// by `a`.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b,
    bool b_subsumes_a
  ) {
#if ROCM_VERSION >= 60100
  if (graph.graph != nullptr) {
    struct ncclStrongStreamGraph* sourceGraph = b->graphHead;
    NCCLCHECK(checkGraphId(sourceGraph, graph.graphId));
    CUDACHECK(hipStreamUpdateCaptureDependencies(
        a, sourceGraph->tipNodes, sourceGraph->tipCount,
        b_subsumes_a ? cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies));
  } else {
    if (b->serialEventNeedsRecord) {
      // The flag is cleared before the record call, as in the original ordering.
      b->serialEventNeedsRecord = false;
      CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream));
    }
    CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0));
  }
#else
  CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
  CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0));
#endif
  return ncclSuccess;
}
|
||||
|
||||
// Block the calling thread until the streams owned by `ss` have drained.
//
// With ROCM_VERSION >= 60100 both ss->cudaStream and ss->liveStream are first
// made to wait on ss->serialEvent, and ss->serialEventNeedsRecord is cleared.
// Then both streams are synchronized.
//
// NOTE(review): this listing references both `cudaStream` and `liveStream`;
// confirm against the real file that both members are intended here.
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss)
{
#if ROCM_VERSION >= 60100
  CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0));
  ss->serialEventNeedsRecord = false;
  CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0));
#endif

  CUDACHECK(cudaStreamSynchronize(ss->cudaStream));
  CUDACHECK(cudaStreamSynchronize(ss->liveStream));

  return ncclSuccess;
}
|
||||
|
||||
@@ -1,267 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "checks.h"
|
||||
#include "debug.h"
|
||||
#include "tuner.h"
|
||||
|
||||
pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int tunerPluginRefCount;
|
||||
static void* tunerPluginLib = nullptr;
|
||||
static ncclTuner_v4_t* tunerSymbol = nullptr;
|
||||
static ncclTuner_v3_t* ncclTuner_v3 = nullptr;
|
||||
static ncclTuner_v2_t* ncclTuner_v2 = nullptr;
|
||||
static ncclTuner_v4_t ncclTuner_v2_as_v4;
|
||||
static ncclTuner_v4_t ncclTuner_v3_as_v4;
|
||||
|
||||
static int hasNvlsSupport(float** collCostTable) {
|
||||
// Requirements for support of different algorithms:
|
||||
//
|
||||
// - NVLS intra-node: nvlsSupport
|
||||
// - NVLS intra+inter-node: collNetSupport
|
||||
// - NVLSTree intra-node: always disabled
|
||||
// - NVLSTree inter-node: nvlsSupport
|
||||
// - Collnet* inter-node: collNetSupport
|
||||
//
|
||||
// nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1
|
||||
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
|
||||
return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0;
|
||||
}
|
||||
|
||||
static int hasCollNetSupport(float** collCostTable) {
|
||||
float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
|
||||
return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1;
|
||||
}
|
||||
|
||||
// v4 getCollInfo entry point backed by a v3 tuner plugin.
//
// Forwards every argument except `regBuff` (which the v3 call does not take)
// to ncclTuner_v3->getCollInfo. Returns ncclSuccess, or the error NCCLCHECK
// propagates from the plugin.
static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(
    void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps,
    float** collCostTable, int numAlgo, int numProto,
    int regBuff __attribute__((unused)), int* nChannels) {
  NCCLCHECK(ncclTuner_v3->getCollInfo(
      context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels));
  return ncclSuccess;
}
|
||||
|
||||
// v4 init entry point backed by a v3 tuner plugin.
//
// Calls the v3 plugin's init and, once that succeeds, fills in the remaining
// members of the ncclTuner_v3_as_v4 adapter (getCollInfo, destroy, name).
// If the plugin's init fails, NCCLCHECK returns its error and the adapter is
// left untouched.
static ncclResult_t ncclTuner_v3_as_v4_init(
    size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
  NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context));

  ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo;
  ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy;
  ncclTuner_v3_as_v4.name = ncclTuner_v3->name;

  return ncclSuccess;
}
|
||||
|
||||
// v4 getCollInfo entry point backed by a v2 tuner plugin.
//
// The v2 plugin is asked for an (algorithm, protocol) pair, with the
// collNet/NVLS support flags derived from the cost table. If the pair it
// returns is within range and its table entry is not NCCL_ALGO_PROTO_IGNORE,
// that entry's cost is set to 0 so the pair is preferred by later selection.
// `numAlgo`, `numProto` and `regBuff` are not used.
static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(
    void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable,
    int numAlgo __attribute__((unused)), int numProto __attribute__((unused)),
    int regBuff __attribute__((unused)), int* nChannels) {
  int chosenAlgo = NCCL_ALGO_UNDEF;
  int chosenProto = NCCL_PROTO_UNDEF;
  const int nvls = hasNvlsSupport(collCostTable);
  const int collNet = hasCollNetSupport(collCostTable);

  NCCLCHECK(ncclTuner_v2->getCollInfo(
      context, collType, nBytes, collNet, nvls, numPipeOps, &chosenAlgo, &chosenProto, nChannels));

  // Nothing to do unless the plugin named a valid table cell.
  if (chosenAlgo < 0 || chosenAlgo >= NCCL_NUM_ALGORITHMS) {
    return ncclSuccess;
  }
  if (chosenProto < 0 || chosenProto >= NCCL_NUM_PROTOCOLS) {
    return ncclSuccess;
  }

  float (*costs)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  if (costs[chosenAlgo][chosenProto] != NCCL_ALGO_PROTO_IGNORE) {
    // Zero cost makes this algorithm/protocol the one selected later on.
    costs[chosenAlgo][chosenProto] = 0.0;
  }
  return ncclSuccess;
}
|
||||
|
||||
// v4 init entry point backed by a v2 tuner plugin.
//
// Calls the v2 plugin's init and, once that succeeds, fills in the remaining
// members of the ncclTuner_v2_as_v4 adapter (getCollInfo, destroy, name).
// If the plugin's init fails, NCCLCHECK returns its error and the adapter is
// left untouched.
static ncclResult_t ncclTuner_v2_as_v4_init(
    size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
  NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context));

  ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo;
  ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy;
  ncclTuner_v2_as_v4.name = ncclTuner_v2->name;

  return ncclSuccess;
}
|
||||
|
||||
// Maximum number of characters (excluding the terminating NUL) stored in the
// error-string buffers used by the plugin loader.
#define MAX_STR_LEN 255

// Try to dlopen() the library called `name`.
//
// name    Library name or path. nullptr or "" yields nullptr with *err == 0.
//         The exact name "STATIC_PLUGIN" (case-insensitive) opens the running
//         program itself via dlopen(NULL).
// err     Out: 0, or ENOENT when the loader reports that `name` itself could
//         not be found.
// errStr  Out: on failure receives the dlerror() text, truncated to
//         MAX_STR_LEN characters; must have room for MAX_STR_LEN + 1 bytes.
//         Left untouched on success.
//
// Returns the dlopen() handle, or nullptr on failure.
static void* tryOpenLib(const char* name, int* err, char* errStr) {
  *err = 0;
  if (nullptr == name || strlen(name) == 0) {
    return nullptr;
  }

  // Require a full-length match: comparing only strlen(name) characters would
  // also accept any proper prefix such as "S" or "static".
  static const char staticPluginAlias[] = "STATIC_PLUGIN";
  const size_t aliasLen = sizeof(staticPluginAlias) - 1;
  if (strlen(name) == aliasLen && strncasecmp(name, staticPluginAlias, aliasLen) == 0) {
    name = nullptr;
  }

  void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
  if (nullptr == handle) {
    // dlerror() is allowed to return NULL; never hand that to strncpy().
    const char* reason = dlerror();
    strncpy(errStr, reason != nullptr ? reason : "", MAX_STR_LEN);
    errStr[MAX_STR_LEN] = '\0';
    // `name` is NULL on the STATIC_PLUGIN path, so guard it before strstr().
    if (name != nullptr && strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
      *err = ENOENT;
    }
  }
  return handle;
}
|
||||
|
||||
// Record the outcome of a failed tryOpenLib() call.
//
// openErr      Error code produced by tryOpenLib().
// openErrStr   Error text produced by tryOpenLib().
// nameList     Write cursor into the "could not find" list.
// nameListLen  In/out: bytes still available at `nameList`.
// name         Library name that was tried.
//
// When openErr is ENOENT, " <name>" is appended at `nameList` and the cursor
// and remaining length are advanced by what was actually stored, so the cursor
// never leaves the buffer and the remaining length never drops below 1 once
// the buffer is full. Any other error is only logged through INFO.
//
// Returns the (possibly advanced) cursor.
static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
  if (openErr == ENOENT) {
    // No space at all: leave the cursor and counter alone.
    if (*nameListLen <= 0) {
      return nameList;
    }
    int wanted = snprintf(nameList, *nameListLen, " %s", name);
    if (wanted < 0) {
      return nameList;
    }
    // snprintf() returns the untruncated length; only count stored characters.
    int stored = (wanted < *nameListLen) ? wanted : (*nameListLen - 1);
    nameList += stored;
    *nameListLen -= stored;
    return nameList;
  }
  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr);
  return nameList;
}
|
||||
|
||||
static void* openTunerPluginLib(char* couldNotFindNames, int len) {
|
||||
int openErr;
|
||||
void *pluginLib;
|
||||
char tunerPluginLibName[PATH_MAX];
|
||||
char openErrStr[MAX_STR_LEN + 1] = { 0 };
|
||||
const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN");
|
||||
if (envTunerPluginName && strlen(envTunerPluginName)) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName);
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName);
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "librccl-tuner-%s.so", envTunerPluginName);
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
} else {
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "librccl-tuner.so");
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
}
|
||||
|
||||
const char *envNetPluginName = getenv("NCCL_NET_PLUGIN");
|
||||
if (envNetPluginName && strlen(envNetPluginName)) {
|
||||
// Users are allowed to pack tuner into the net plugin
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName);
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "librccl-net-%s.so", envNetPluginName);
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
} else {
|
||||
snprintf(tunerPluginLibName, PATH_MAX, "librccl-net.so");
|
||||
pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName);
|
||||
}
|
||||
tunerPluginLibName[0] = '\0';
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
enum {
|
||||
tunerPluginLoadFailed = -1,
|
||||
tunerPluginLoadReady = 0,
|
||||
tunerPluginLoadSuccess = 1,
|
||||
};
|
||||
|
||||
#define MAX_PLUGIN_LOAD 4
|
||||
|
||||
static int status = tunerPluginLoadReady;
|
||||
|
||||
// Attach the external tuner plugin to `comm`, loading it on first use.
//
// comm->tuner is set to the plugin's v4 interface (native, or a v3/v2 plugin
// wrapped by the adapters in this file), or left nullptr when no usable plugin
// exists. Every communicator that receives the plugin takes one reference on
// it and is flagged through comm->tunerPluginLoaded, so that
// ncclTunerPluginUnload() releases exactly the references taken here.
//
// A failed load is remembered in `status`; later calls return immediately.
// Always returns ncclSuccess: a missing plugin is not an error.
ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) {
  // Initialize to nullptr by default if plugin tuner cannot be loaded.
  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
  comm->tuner = nullptr;
  // Cheap early-out before taking the lock; re-checked under the lock below.
  if (tunerPluginLoadFailed == status) {
    return ncclSuccess;
  }

  pthread_mutex_lock(&tunerPluginLock);
  if (tunerPluginLoadFailed == status) {
    goto exit;
  }

  if (tunerPluginLoadSuccess == status) {
    comm->tuner = tunerSymbol;
    ++tunerPluginRefCount;
    // Flag this communicator as a reference holder. Without the flag,
    // ncclTunerPluginUnload() would never drop the reference taken above and
    // the library would stay open forever.
    comm->tunerPluginLoaded = 1;
    goto exit;
  }

  tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
  if (nullptr == tunerPluginLib) {
    if (strlen(couldNotFindNames)) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames);
    } else {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin.");
    }
    goto fail;
  }

  // Prefer the newest interface and fall back to older ones through adapters.
  tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4");
  if (tunerSymbol == nullptr) {
    INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol.");
    ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3");
    if (ncclTuner_v3 == nullptr) {
      INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol.");
      ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2");
      if (ncclTuner_v2 == nullptr) {
        INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead.");
        dlclose(tunerPluginLib);
        goto fail;
      } else {
        // The adapter's remaining members are filled in by its init function.
        ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init;
        ncclTuner_v2_as_v4.name = ncclTuner_v2->name;
        tunerSymbol = &ncclTuner_v2_as_v4;
      }
    } else {
      // The adapter's remaining members are filled in by its init function.
      ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init;
      ncclTuner_v3_as_v4.name = ncclTuner_v3->name;
      tunerSymbol = &ncclTuner_v3_as_v4;
    }
  }

  INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name);
  comm->tuner = tunerSymbol;
  ++tunerPluginRefCount;
  status = tunerPluginLoadSuccess;
  comm->tunerPluginLoaded = 1;

exit:
  pthread_mutex_unlock(&tunerPluginLock);
  return ncclSuccess;
fail:
  tunerPluginLib = nullptr;
  status = tunerPluginLoadFailed;
  goto exit;
}
|
||||
|
||||
// Drop `comm`'s reference on the tuner plugin.
//
// The reference count is decremented only when comm->tunerPluginLoaded is set.
// When it reaches zero the library is closed, the cached library handle and
// symbol are cleared, comm->tuner and comm->tunerPluginLoaded are reset, and
// `status` returns to tunerPluginLoadReady so a later load starts afresh.
// Always returns ncclSuccess.
ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm)
{
  pthread_mutex_lock(&tunerPluginLock);

  bool lastReference = false;
  if (comm->tunerPluginLoaded) {
    tunerPluginRefCount -= 1;
    lastReference = (tunerPluginRefCount == 0);
  }

  if (lastReference) {
    // Log while tunerSymbol is still valid.
    INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name);
    dlclose(tunerPluginLib);

    tunerPluginLib = nullptr;
    tunerSymbol = nullptr;
    status = tunerPluginLoadReady;

    comm->tuner = nullptr;
    comm->tunerPluginLoaded = 0;
  }

  pthread_mutex_unlock(&tunerPluginLock);
  return ncclSuccess;
}
|
||||
@@ -82,6 +82,7 @@ typedef struct ncclConfig_v21700 {
|
||||
int maxCTAs; /*!< Maximum number of cooperative thread arrays (blocks) */
|
||||
const char *netName; /*!< Force NCCL to use a specific network */
|
||||
int splitShare; /*!< Allow communicators to share resources */
|
||||
int trafficClass; /*!< Traffic class*/
|
||||
} ncclConfig_t;
|
||||
|
||||
/* Config initializer must be assigned to initialize config structure when it is created.
|
||||
@@ -95,7 +96,8 @@ typedef struct ncclConfig_v21700 {
|
||||
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
|
||||
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
|
||||
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
|
||||
}
|
||||
/*! @} */
|
||||
|
||||
|
||||
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Ссылка в новой задаче
Block a user