Merge remote-tracking branch 'nccl/master' into develop
Αυτή η υποβολή περιλαμβάνεται σε:
+3
-1
@@ -2,12 +2,14 @@
|
||||
|
||||
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
|
||||
|
||||
## Unreleased - RCCL 2.26.6 for ROCm 7.1.0
|
||||
## Unreleased - RCCL 2.27.3 for ROCm 7.1.0
|
||||
|
||||
### Added
|
||||
|
||||
### Changed
|
||||
|
||||
* The MSCCL++ feature is now disabled by default. The `--disable-mscclpp` build flag is replaced with `--enable-mscclpp` in the `rccl/install.sh` script.
|
||||
* Compatibility with NCCL 2.27.3
|
||||
|
||||
### Resolved issues
|
||||
|
||||
|
||||
+42
-6
@@ -140,8 +140,8 @@ endif()
|
||||
# Set CMAKE flags
|
||||
#==================================================================================================
|
||||
set(CMAKE_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "")
|
||||
set(CMAKE_CXX_STANDARD 14) # We use C++14 features, this will add compile option: -std=c++14
|
||||
set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++14 instead, which has some issues.
|
||||
set(CMAKE_CXX_STANDARD 17) # We use C++17 features, this will add compile option: -std=c++17
|
||||
set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++17 instead, which has some issues.
|
||||
if(ROCM_PATH)
|
||||
list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA
|
||||
${ROCM_PATH}
|
||||
@@ -425,6 +425,7 @@ configure_file(src/nccl.h.in ${PROJECT_BINARY_DIR}/include/nccl.h) # Used b
|
||||
#==================================================================================================
|
||||
# E.g: find src -type f \( -name "*.cc" -o -name "*.h" -o -name "*.hpp" \) | sort
|
||||
set(SRC_FILES
|
||||
src/allocator.cc
|
||||
src/bootstrap.cc
|
||||
src/channel.cc
|
||||
src/collectives.cc
|
||||
@@ -437,6 +438,7 @@ set(SRC_FILES
|
||||
src/msccl.cc
|
||||
src/proxy.cc
|
||||
src/rccl_wrap.cc
|
||||
src/symmetric.cc
|
||||
src/transport.cc
|
||||
src/device/all_gather.h
|
||||
src/device/all_reduce.h
|
||||
@@ -458,6 +460,11 @@ set(SRC_FILES
|
||||
src/device/onerank.cu
|
||||
src/device/network/unpack/unpack_defs.h
|
||||
src/device/network/unpack/unpack.h
|
||||
src/device/symmetric/all_gather.cuh
|
||||
src/device/symmetric/all_reduce.cuh
|
||||
src/device/symmetric/kernel.cuh
|
||||
src/device/symmetric/primitives.cuh
|
||||
src/device/symmetric/reduce_scatter.cuh
|
||||
src/graph/connect.cc
|
||||
src/graph/paths.cc
|
||||
src/graph/rings.cc
|
||||
@@ -472,6 +479,7 @@ set(SRC_FILES
|
||||
src/graph/xml.cc
|
||||
src/graph/xml.h
|
||||
src/include/alloc.h
|
||||
src/include/allocator.h
|
||||
src/include/alt_rsmi.h
|
||||
src/include/archinfo.h
|
||||
src/include/api_trace.h
|
||||
@@ -516,6 +524,7 @@ set(SRC_FILES
|
||||
src/include/rccl_common.h
|
||||
src/include/rccl_vars.h
|
||||
src/include/register.h
|
||||
src/include/register_inline.h
|
||||
src/include/rccl_float8.h
|
||||
src/include/rocm_smi_wrap.h
|
||||
src/include/rocmwrap.h
|
||||
@@ -526,11 +535,15 @@ set(SRC_FILES
|
||||
src/include/signals.h
|
||||
src/include/socket.h
|
||||
src/include/strongstream.h
|
||||
src/include/symmetric.h
|
||||
src/include/timer.h
|
||||
src/include/transport.h
|
||||
src/include/trees.h
|
||||
src/include/tuner.h
|
||||
src/include/utils.h
|
||||
src/include/mlx5/mlx5dvcore.h
|
||||
src/include/mlx5/mlx5dvsymbols.h
|
||||
src/include/mlx5/mlx5dvwrap.h
|
||||
src/include/msccl/msccl_lifecycle.h
|
||||
src/include/msccl/msccl_parser.h
|
||||
src/include/msccl/msccl_scheduler.h
|
||||
@@ -591,6 +604,7 @@ set(SRC_FILES
|
||||
src/include/plugin/profiler/profiler_v1.h
|
||||
src/include/plugin/profiler/profiler_v2.h
|
||||
src/include/plugin/profiler/profiler_v3.h
|
||||
src/include/plugin/profiler/profiler_v4.h
|
||||
src/include/plugin/tuner/tuner_v2.h
|
||||
src/include/plugin/tuner/tuner_v3.h
|
||||
src/include/plugin/tuner/tuner_v4.h
|
||||
@@ -604,6 +618,8 @@ set(SRC_FILES
|
||||
src/misc/ibvsymbols.cc
|
||||
src/misc/ibvwrap.cc
|
||||
src/misc/ipcsocket.cc
|
||||
src/misc/mlx5dvsymbols.cc
|
||||
src/misc/mlx5dvwrap.cc
|
||||
src/misc/npkit.cc
|
||||
# src/misc/nvmlwrap.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
@@ -634,6 +650,7 @@ set(SRC_FILES
|
||||
src/plugin/profiler/profiler_v1.cc
|
||||
src/plugin/profiler/profiler_v2.cc
|
||||
src/plugin/profiler/profiler_v3.cc
|
||||
src/plugin/profiler/profiler_v4.cc
|
||||
src/plugin/tuner/tuner_v2.cc
|
||||
src/plugin/tuner/tuner_v3.cc
|
||||
src/plugin/tuner/tuner_v4.cc
|
||||
@@ -706,6 +723,7 @@ foreach(SRC_FILE ${SRC_FILES})
|
||||
add_file_unique(HIP_SOURCES ${HIP_FILE})
|
||||
|
||||
# Convert .cu files to .cpp so that they get processed properly
|
||||
string(REPLACE "\.cuh" "\.h" HIP_FILE ${HIP_FILE})
|
||||
string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
|
||||
list(APPEND HIP_SOURCES ${HIP_FILE})
|
||||
|
||||
@@ -826,8 +844,13 @@ if (NOT Python3_FOUND)
|
||||
endif()
|
||||
|
||||
set(GEN_DIR "${HIPIFY_DIR}/gensrc")
|
||||
set(GEN_SYM_DIR "${GEN_DIR}/symmetric")
|
||||
|
||||
# Execute the python script to generate required files
|
||||
if(ONLY_FUNCS)
|
||||
message(WARNING "Using ONLY_FUNCS = ${ONLY_FUNCS}. Not meant for release builds.")
|
||||
endif()
|
||||
|
||||
# Execute the python script to generate required collective functions
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/generate.py ${GEN_DIR} ${IFC_ENABLED} ${COLLTRACE} ${ENABLE_MSCCL_KERNEL} ${BUILD_LOCAL_GPU_TARGET_ONLY} ${ONLY_FUNCS}
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
@@ -839,8 +862,20 @@ if (gen_py_result)
|
||||
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/generate.py failed")
|
||||
endif()
|
||||
|
||||
# Execute the python script to generate required symmetric memory kernels
|
||||
execute_process(
|
||||
COMMAND ${Python3_EXECUTABLE} ${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py ${GEN_SYM_DIR}
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
|
||||
RESULT_VARIABLE gen_sym_py_result
|
||||
ERROR_VARIABLE gen_sym_py_error
|
||||
)
|
||||
if (gen_sym_py_result)
|
||||
message(SEND_ERROR "Error: ${gen_sym_py_error}")
|
||||
message(FATAL_ERROR "${CMAKE_SOURCE_DIR}/src/device/symmetric/generate.py failed")
|
||||
endif()
|
||||
|
||||
# Find the generated files in the output directory
|
||||
file(GLOB GENERATED_FILES "${GEN_DIR}/*")
|
||||
file(GLOB_RECURSE GENERATED_FILES "${GEN_DIR}/*")
|
||||
|
||||
# Append all found generated files to the list
|
||||
foreach(file ${GENERATED_FILES})
|
||||
@@ -876,10 +911,11 @@ endif()
|
||||
## Set RCCL include directories
|
||||
target_include_directories(rccl PRIVATE ${PROJECT_BINARY_DIR}/include) # for generated rccl.h header
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src) # for hipfied headers
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/device/network/unpack)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/mlx5)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/src/include/plugin)
|
||||
target_include_directories(rccl PRIVATE ${HIPIFY_DIR}/gensrc)
|
||||
target_include_directories(rccl PRIVATE ${HSA_INCLUDE_PATH})
|
||||
target_include_directories(rccl PRIVATE ${ROCM_SMI_INCLUDE_DIR})
|
||||
|
||||
@@ -7,9 +7,15 @@
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop };
|
||||
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
#include "net_device.h"
|
||||
#include "common.h"
|
||||
|
||||
#define NCCL_NET_HANDLE_MAXSIZE 128
|
||||
#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB
|
||||
@@ -23,8 +23,6 @@
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData);
|
||||
|
||||
#include "net_v10.h"
|
||||
#include "net_v9.h"
|
||||
#include "net_v8.h"
|
||||
|
||||
@@ -49,9 +49,9 @@ of newer ones.
|
||||
The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
|
||||
from old API versions. It also provides error codes in `err.h`.
|
||||
|
||||
# API (v3)
|
||||
# API (v4)
|
||||
|
||||
Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections.
|
||||
Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections.
|
||||
|
||||
```
|
||||
typedef struct {
|
||||
@@ -60,9 +60,15 @@ typedef struct {
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
@@ -70,7 +76,7 @@ typedef struct {
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr);
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
@@ -82,13 +88,13 @@ typedef struct {
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs);
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
} ncclProfiler_v4_t;
|
||||
```
|
||||
|
||||
## Error codes
|
||||
@@ -147,8 +153,6 @@ typedef struct {
|
||||
int rank; // rank that generated the event
|
||||
union {
|
||||
struct { // collective events metadata
|
||||
const char* name; // string containing name of the communicator
|
||||
uint64_t commHash; // unique hash/id for the communicator
|
||||
uint64_t seqNumber; // sequence number of this collective operation in the communicator
|
||||
const char* func; // string containing name of the collective
|
||||
void const* sendBuff; // address of send buffer
|
||||
@@ -156,20 +160,19 @@ typedef struct {
|
||||
size_t count; // data count
|
||||
int root; // root rank
|
||||
const char* datatype; // string containing the name of the datatype
|
||||
uint8_t nMaxChannels; // max number of channels for this collective
|
||||
uint8_t nChannels; // number of channels for this collective
|
||||
uint8_t nWarps; // number of GPU warps for this collective
|
||||
const char* algo; // string containing name of the algorithm for this collective
|
||||
const char* proto; // string containing name of the protocol for this collective
|
||||
} coll;
|
||||
|
||||
struct { // point-to-point events metadata
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer; // peer rank for this point-to-point
|
||||
uint8_t nChannels; // number of channels for this p2p
|
||||
} p2p;
|
||||
|
||||
struct { // proxyOp events metadata
|
||||
@@ -178,7 +181,7 @@ typedef struct {
|
||||
int peer; // peer rank
|
||||
int nSteps; // number of network transfers/steps required by the `ncclProxyOp`
|
||||
int chunkSize; // chunk size for this `ncclProxyOp`
|
||||
int isSend; // set to 1 for sends and 0 for recvs
|
||||
int isSend; // type of network operation
|
||||
} proxyOp;
|
||||
|
||||
struct { // proxyStep events metadata
|
||||
@@ -187,6 +190,7 @@ typedef struct {
|
||||
|
||||
struct {
|
||||
uint8_t channelId; // id of the channel used by the kernel
|
||||
uint64_t ptimer; // kernel supplied timestamp
|
||||
} kernelCh;
|
||||
|
||||
struct {
|
||||
@@ -194,7 +198,7 @@ typedef struct {
|
||||
void* data; // pointer to network plugin defined event
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v3_t;
|
||||
} ncclProfilerEventDescr_v4_t;
|
||||
```
|
||||
|
||||
NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
|
||||
@@ -212,45 +216,57 @@ handle after `eventStop` is undefined behavior.
|
||||
Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`,
|
||||
`ncclProfileP2p`, cannot be updated through calls to `recordEventState`.
|
||||
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through
|
||||
calls to `recordEventState`.
|
||||
`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and
|
||||
`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`.
|
||||
|
||||
The state of proxy generated events can be updated, along with event attributes, using
|
||||
`recordEventState`. These events can go through several states during their lifecycle.
|
||||
The list of supported states for the proxy-defined events is reported below.
|
||||
The state of these events can be updated, along with event attributes, using `recordEventState`.
|
||||
These events can go through several states during their lifecycle.
|
||||
|
||||
The list of supported states for the updatable events is reported below.
|
||||
|
||||
```
|
||||
typedef enum {
|
||||
// ncclProfileProxyOp event states
|
||||
ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step
|
||||
ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank
|
||||
ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank
|
||||
ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step
|
||||
ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step
|
||||
ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank
|
||||
ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step
|
||||
ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU
|
||||
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
|
||||
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
|
||||
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
|
||||
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
|
||||
ncclProfilerProxyOpInProgress_v4 = 19,// state marks transition of proxy op to progress
|
||||
|
||||
// ncclProfileProxyStep event states
|
||||
ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step
|
||||
ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step
|
||||
ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step
|
||||
ncclProfilerProxyStepSendGPUWait = 8, // state marks the waiting of send data from GPU for given network transfer/step
|
||||
ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step
|
||||
ncclProfilerProxyStepSendWait = 9, // state marks the waiting of send data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvWait = 10,// state marks the waiting of recv data from network for given network transfer/step
|
||||
ncclProfilerProxyStepRecvFlushWait = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step
|
||||
ncclProfilerProxyStepRecvGPUWait = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step
|
||||
|
||||
// ncclProfileProxyCtrl event states
|
||||
ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle
|
||||
ncclProfilerProxyCtrlActive, // state marks proxy progress thread active
|
||||
ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping
|
||||
ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end
|
||||
} ncclProfilerEventState_v3_t;
|
||||
ncclProfilerProxyCtrlIdle = 13,// state marks proxy progress thread idle
|
||||
ncclProfilerProxyCtrlActive = 14,// state marks proxy progress thread active
|
||||
ncclProfilerProxyCtrlSleep = 15,// state marks proxy progress thread sleeping
|
||||
ncclProfilerProxyCtrlWakeup = 16,// state marks proxy progress thread waking up
|
||||
ncclProfilerProxyCtrlAppend = 17,// state marks append of new network work item begin
|
||||
ncclProfilerProxyCtrlAppendEnd = 18,// state marks append of new network work item end
|
||||
|
||||
// ncclProfileNetPlugin event states
|
||||
ncclProfilerNetPluginUpdate = 21,// state marks update of network defined event
|
||||
|
||||
// ncclProfileKernelCh event states
|
||||
ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update
|
||||
} ncclProfilerEventState_v4_t;
|
||||
```
|
||||
|
||||
`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
|
||||
network requests for the GPU kernel. ProxyOp events are generated for every active channel and
|
||||
provide a summary of the activity of the proxy progress thread for that channel.
|
||||
provide a summary of the activity of the proxy progress thread for that channel. Most of the
|
||||
states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting
|
||||
with version 4 of the profiler interface these states have been deprecated. The same level of
|
||||
information can still be obtained through the `ncclProfileProxyStep` events.
|
||||
|
||||
`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
|
||||
network requests for the GPU kernel. ProxyStep events describe individual network transfer in
|
||||
@@ -348,15 +364,22 @@ reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported
|
||||
|
||||
```
|
||||
typedef union {
|
||||
struct { // attributes to update for ncclProfileProxyOp events
|
||||
size_t transSize; // data transferred thus far
|
||||
int steps; // network transfer/steps processed thus far
|
||||
} proxyOp;
|
||||
struct { // attributes to update for ncclProfileProxyStep events
|
||||
size_t transSize; // transfer size field for this proxy step
|
||||
} proxyStep;
|
||||
|
||||
struct { // attributes to update for ncclProfileProxyCtrl
|
||||
struct { // attributes to update for ncclProfileProxyCtrl events
|
||||
int appendedProxyOps; // number of appended proxy ops thus far
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v3_t;
|
||||
|
||||
struct { // attributes to update for ncclProfileNetPlugin events
|
||||
void* data; // network plugin opaque update data field
|
||||
} netPlugin;
|
||||
|
||||
struct { // attribute to update for ncclProfileKernelCh events
|
||||
uint64_t pTimer; // timestamp provided by the NCCL kernel
|
||||
} kernelCh;
|
||||
} ncclProfilerEventStateArgs_v4_t;
|
||||
```
|
||||
|
||||
The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
|
||||
@@ -396,12 +419,12 @@ ProxyCtrl event
|
||||
## Profiling of collective and p2p operations
|
||||
|
||||
The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
|
||||
collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature
|
||||
collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature
|
||||
of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit
|
||||
precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
|
||||
figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
|
||||
the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if
|
||||
these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent`
|
||||
the profiler that the collective has been enqueued. The profiler can leverage proxy and/or kernel event information, if
|
||||
these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent`
|
||||
call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This
|
||||
can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent`
|
||||
increment and decrement the reference counter, respectively.
|
||||
@@ -425,8 +448,14 @@ enqueue can be time stamped by the profiler (at start and stop) to reconstruct t
|
||||
collective. However, this time only represents the launch time of the collective and not the actual
|
||||
execution time. To reconstruct the execution time more accurately proxy and kernel events are provided.
|
||||
|
||||
With version 3 of the profiler interface network activity is no longer required to do intra-node profiling.
|
||||
Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress
|
||||
thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If
|
||||
the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of
|
||||
accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is
|
||||
delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events.
|
||||
delayed, a similar loss of accuracy can be encountered.
|
||||
|
||||
To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements.
|
||||
Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one
|
||||
for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin so that it can
|
||||
convert them to CPU time domain.
|
||||
|
||||
@@ -15,24 +15,6 @@
|
||||
#define MAX_CHANNELS 32
|
||||
#define MAX_STEPS 16
|
||||
#define MAX_OPS 16 // Up to 64K ranks for PAT
|
||||
|
||||
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
|
||||
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
|
||||
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
|
||||
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)
|
||||
|
||||
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
|
||||
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
|
||||
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
|
||||
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)
|
||||
|
||||
#define PROXY_OP_SEND_STATE_IDX(state) (state - PROXY_OP_SEND_STATE_OFFSET)
|
||||
#define PROXY_OP_RECV_STATE_IDX(state) (state - PROXY_OP_RECV_STATE_OFFSET)
|
||||
#define PROXY_STEP_SEND_STATE_IDX(state) (state - PROXY_STEP_SEND_STATE_OFFSET)
|
||||
#define PROXY_STEP_RECV_STATE_IDX(state) (state - PROXY_STEP_RECV_STATE_OFFSET)
|
||||
|
||||
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
|
||||
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
|
||||
#define MAX_EVENTS_PER_REQ (8)
|
||||
|
||||
struct proxyOp;
|
||||
@@ -68,13 +50,24 @@ struct kernelCh {
|
||||
struct taskEventBase* parent;
|
||||
double startTs;
|
||||
double stopTs;
|
||||
uint64_t startGpuClk;
|
||||
uint64_t stopGpuClk;
|
||||
};
|
||||
|
||||
#define PROXY_STEP_SEND_GPU_WAIT 0
|
||||
#define PROXY_STEP_SEND_PEER_WAIT 1
|
||||
#define PROXY_STEP_SEND_WAIT 2
|
||||
#define PROXY_STEP_RECV_WAIT 0
|
||||
#define PROXY_STEP_RECV_FLUSH_WAIT 1
|
||||
#define PROXY_STEP_RECV_GPU_WAIT 2
|
||||
#define PROXY_STEP_MAX_STATES 3
|
||||
|
||||
struct proxyStep {
|
||||
uint8_t type; // type of event: network transfer
|
||||
int state;
|
||||
int step; // network transfer id in given channel
|
||||
int isSend; // send/recv channel operation
|
||||
double timestamp[MAX_PROXY_STEP_STATES];
|
||||
double timestamp[PROXY_STEP_MAX_STATES];
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct proxyOp* parent;
|
||||
@@ -92,11 +85,8 @@ struct proxyOp {
|
||||
int chunkSize; // chunk size for this proxy operation
|
||||
int isSend; // send/recv channel operation
|
||||
size_t transSize; // transfer data size for this proxy operation
|
||||
struct {
|
||||
int steps; // completed steps for this proxy operation state
|
||||
double timestamp;
|
||||
} states[MAX_PROXY_OP_STATES];
|
||||
double startTs;
|
||||
double progrTs; // In progress state transition
|
||||
double stopTs;
|
||||
int stepCount; // last processed network operation for this proxy operation
|
||||
struct proxyStep step[MAX_STEPS]; // array of network transfer events
|
||||
@@ -119,8 +109,6 @@ struct proxyCtrl {
|
||||
struct taskEventBase {
|
||||
uint8_t type; // event type: collective/p2p
|
||||
int rank; // rank of the operation in NCCL communicator
|
||||
const char* name; // FIXME: unused
|
||||
uint64_t commHash; // communicator identifier
|
||||
const char* func; // ncclFunc*
|
||||
int refCount; // number of references for this operation
|
||||
struct group* parent; // parent event group
|
||||
@@ -137,12 +125,11 @@ struct collective {
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nChannels;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
int nWarps;
|
||||
struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events
|
||||
struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events
|
||||
struct proxyOp op[MAX_CHANNELS][2*MAX_OPS];
|
||||
int nProxyOps[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
@@ -154,6 +141,7 @@ struct p2p {
|
||||
size_t count;
|
||||
const char* datatype;
|
||||
int peer;
|
||||
uint8_t nChannels;
|
||||
struct proxyOp op[MAX_CHANNELS];
|
||||
struct kernelCh kernel[MAX_CHANNELS];
|
||||
};
|
||||
@@ -172,6 +160,11 @@ struct group {
|
||||
|
||||
// arrays for different event objects
|
||||
struct context {
|
||||
const char* commName;
|
||||
uint64_t commHash;
|
||||
int nranks;
|
||||
int rank;
|
||||
|
||||
int groupPoolSize;
|
||||
int groupPoolBase;
|
||||
int groupPoolIndex;
|
||||
|
||||
@@ -25,42 +25,52 @@ enum {
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
|
||||
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
|
||||
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
|
||||
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
|
||||
ncclProfilerProxyOpInProgress_v4 = 19,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
ncclProfilerProxyStepSendGPUWait = 8,
|
||||
ncclProfilerProxyStepSendPeerWait_v4 = 20,
|
||||
ncclProfilerProxyStepSendWait = 9,
|
||||
ncclProfilerProxyStepRecvWait = 10,
|
||||
ncclProfilerProxyStepRecvFlushWait = 11,
|
||||
ncclProfilerProxyStepRecvGPUWait = 12,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
ncclProfilerProxyCtrlIdle = 13,
|
||||
ncclProfilerProxyCtrlActive = 14,
|
||||
ncclProfilerProxyCtrlSleep = 15,
|
||||
ncclProfilerProxyCtrlWakeup = 16,
|
||||
ncclProfilerProxyCtrlAppend = 17,
|
||||
ncclProfilerProxyCtrlAppendEnd = 18,
|
||||
|
||||
/* Network defined events states */
|
||||
ncclProfilerNetPluginUpdate = 21,
|
||||
|
||||
/* Kernel event states */
|
||||
ncclProfilerKernelChStop = 22,
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
|
||||
|
||||
#include "profiler_v4.h"
|
||||
#include "profiler_v3.h"
|
||||
#include "profiler_v2.h"
|
||||
#include "profiler_v1.h"
|
||||
#include "profiler_net.h"
|
||||
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v4_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#endif // end include guard
|
||||
|
||||
@@ -111,9 +111,4 @@ typedef struct {
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v3_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V4_H_
|
||||
#define PROFILER_V4_H_
|
||||
|
||||
// Event descriptor (v4) handed to the profiler plugin's startEvent callback.
// The common fields come first; the anonymous union carries the per-event-type payload.
// NOTE(review): `type` presumably selects which union member is valid — confirm against the caller.
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
uint64_t seqNumber;
|
||||
const char* func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
const char* datatype;
|
||||
uint8_t nChannels;
|
||||
uint8_t nWarps;
|
||||
const char* algo;
|
||||
const char* proto;
|
||||
} coll; // payload read by the example plugin for ncclProfileColl events
|
||||
|
||||
struct {
|
||||
const char* func;
|
||||
void* buff;
|
||||
const char* datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
uint8_t nChannels;
|
||||
} p2p; // payload read by the example plugin for ncclProfileP2p events
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend; // the example plugin treats non-zero as send and zero as recv
|
||||
} proxyOp; // payload read by the example plugin for ncclProfileProxyOp events
|
||||
|
||||
struct {
|
||||
int step; // copied by the example plugin into its proxy step event
|
||||
} proxyStep; // payload read by the example plugin for ncclProfileProxyStep events
|
||||
|
||||
struct {
|
||||
uint8_t channelId; // the example plugin uses this to index its per-channel kernel events
|
||||
uint64_t pTimer; // start timestamp from GPU globaltimer
|
||||
} kernelCh; // payload read by the example plugin for ncclProfileKernelCh events
|
||||
|
||||
struct {
|
||||
int64_t id; // NOTE(review): presumably identifies the network plugin event — confirm
|
||||
void* data; // NOTE(review): presumably an opaque plugin-defined payload — confirm
|
||||
} netPlugin;
|
||||
};
|
||||
} ncclProfilerEventDescr_v4_t;
|
||||
|
||||
// Optional arguments (v4) passed to recordEventState alongside the state transition.
// Which member is meaningful depends on the event type and the state being recorded.
typedef union {
|
||||
struct {
|
||||
size_t transSize; // the example plugin adds this to the parent proxyOp's transSize on ProxyStepSendWait and ProxyStepRecvFlushWait (presumably bytes — confirm)
|
||||
} proxyStep;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps; // read by the example plugin when the state is ncclProfilerProxyCtrlAppendEnd
|
||||
} proxyCtrl;
|
||||
|
||||
struct {
|
||||
void* data; // NOTE(review): presumably an opaque payload for network plugin state updates — confirm
|
||||
} netPlugin;
|
||||
|
||||
struct {
|
||||
uint64_t pTimer; // stored by the example plugin as the stop GPU clock when the state is ncclProfilerKernelChStop
|
||||
} kernelCh;
|
||||
} ncclProfilerEventStateArgs_v4_t;
|
||||
|
||||
// Profiler plugin interface, version 4: a plugin name plus the callbacks documented below.
typedef struct {
|
||||
const char* name; // plugin name; the example plugin sets "Example-profiler"
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// NOTE(review): passed as void**, so presumably filled in by the plugin — confirm
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v4_t;
|
||||
|
||||
#endif
|
||||
@@ -38,6 +38,9 @@ static int detachPoolIndex;
|
||||
static int detachPoolDone;
|
||||
static struct proxyOp* detachPool;
|
||||
|
||||
ncclDebugLogger_t logFn;
|
||||
#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
|
||||
static double freq = -1;
|
||||
__hidden void calibrate() {
|
||||
struct timeval tv;
|
||||
@@ -60,7 +63,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pid_t pid;
|
||||
static int* eActivationMaskPtr;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
pthread_mutex_lock(&lock);
|
||||
if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
|
||||
// first thread initializes event mask, environment and detach pool
|
||||
@@ -106,6 +109,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask)
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
ctx->commName = commName;
|
||||
ctx->commHash = commHash;
|
||||
ctx->nranks = nranks;
|
||||
ctx->rank = rank;
|
||||
logFn = logfn;
|
||||
INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank);
|
||||
|
||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||
if (ctx->groupPool == NULL) goto fail;
|
||||
|
||||
@@ -142,17 +152,16 @@ fail:
|
||||
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
|
||||
FILE* fh = NULL;
|
||||
char filename[PATH_MAX] = { 0 };
|
||||
char hostname[64] = { 0 };
|
||||
gethostname(hostname, 64);
|
||||
struct context* ctx = (struct context *)context;
|
||||
const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
|
||||
if (dump) {
|
||||
sprintf(filename, "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid));
|
||||
sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank);
|
||||
fh = fopen(filename, "w");
|
||||
fprintf(fh, "[\n");
|
||||
}
|
||||
INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank);
|
||||
|
||||
// print last N groups/collectives/p2ps
|
||||
struct context* ctx = (struct context *)context;
|
||||
int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
|
||||
int end = ctx->groupPoolIndex;
|
||||
for (int i = start; i < end; i++) {
|
||||
@@ -243,8 +252,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
|
||||
event->base.type = ncclProfileColl;
|
||||
event->base.rank = eDescr->rank;
|
||||
event->base.name = eDescr->coll.name;
|
||||
event->base.commHash = eDescr->coll.commHash;
|
||||
event->base.func = eDescr->coll.func;
|
||||
event->base.startTs = gettime() - startTime;
|
||||
event->base.parent = parent;
|
||||
@@ -254,7 +261,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->count = eDescr->coll.count;
|
||||
event->root = eDescr->coll.root;
|
||||
event->datatype = eDescr->coll.datatype;
|
||||
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
event->nChannels = eDescr->coll.nChannels;
|
||||
event->nWarps = eDescr->coll.nWarps;
|
||||
event->algo = eDescr->coll.algo;
|
||||
event->proto = eDescr->coll.proto;
|
||||
@@ -281,8 +288,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
|
||||
event->base.type = ncclProfileP2p;
|
||||
event->base.rank = eDescr->rank;
|
||||
event->base.name = eDescr->p2p.name;
|
||||
event->base.commHash = eDescr->p2p.commHash;
|
||||
event->base.func = eDescr->p2p.func;
|
||||
event->base.next = parent->eventHead;
|
||||
event->base.startTs = gettime() - startTime;
|
||||
@@ -291,6 +296,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->count = eDescr->p2p.count;
|
||||
event->datatype = eDescr->p2p.datatype;
|
||||
event->peer = eDescr->p2p.peer;
|
||||
event->nChannels = eDescr->p2p.nChannels;
|
||||
*eHandle = event;
|
||||
// increment the group ref counter so the event will stay open
|
||||
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
|
||||
@@ -331,6 +337,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->parent = NULL;
|
||||
event->stepCount = 0;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "PxnProxyOpStart");
|
||||
return ncclSuccess;
|
||||
@@ -339,9 +346,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
if (eventBase->type == ncclProfileColl) {
|
||||
struct collective* parent = (struct collective *)eDescr->parentObj;
|
||||
int channelId = eDescr->proxyOp.channelId;
|
||||
struct proxyOp* event = (eDescr->proxyOp.isSend) ?
|
||||
&parent->send[channelId][parent->nProxyOps[channelId]++] :
|
||||
&parent->recv[channelId][parent->nProxyOps[channelId]++];
|
||||
struct proxyOp* event = &parent->op[channelId][parent->nProxyOps[channelId]++];
|
||||
|
||||
event->type = ncclProfileProxyOp;
|
||||
event->channelId = channelId;
|
||||
@@ -353,6 +358,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->stepCount = 0;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
@@ -370,6 +376,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->stepCount = 0;
|
||||
*eHandle = event;
|
||||
__atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
@@ -382,9 +389,10 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
int s = parent->stepCount++ % MAX_STEPS;
|
||||
struct proxyStep* event = &parent->step[s];
|
||||
event->type = ncclProfileProxyStep;
|
||||
event->state = 0;
|
||||
event->step = eDescr->proxyStep.step;
|
||||
event->isSend = parent->isSend;
|
||||
event->parent = parent;
|
||||
event->isSend = parent->isSend;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->nNetEvents = 0;
|
||||
*eHandle = event;
|
||||
@@ -397,6 +405,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->startGpuClk = eDescr->kernelCh.pTimer;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
@@ -407,6 +416,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n
|
||||
struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId];
|
||||
event->type = ncclProfileKernelCh;
|
||||
event->channelId = eDescr->kernelCh.channelId;
|
||||
event->startGpuClk = eDescr->kernelCh.pTimer;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
@@ -563,29 +573,57 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile
|
||||
// the event handle might be null if we run out of events
|
||||
if (eHandle == NULL) return ncclSuccess;
|
||||
|
||||
debugEvent(eHandle, "RecordEventState");
|
||||
uint8_t type = *(uint8_t *)eHandle;
|
||||
if (type == ncclProfileProxyOp) {
|
||||
struct proxyOp* event = (struct proxyOp *)eHandle;
|
||||
int steps = event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps;
|
||||
if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == steps) return ncclSuccess;
|
||||
event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps = eStateArgs->proxyOp.steps;
|
||||
event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].timestamp = gettime() - startTime;
|
||||
event->transSize = eStateArgs->proxyOp.transSize;
|
||||
if (eState == ncclProfilerProxyOpInProgress_v4) {
|
||||
event->progrTs = gettime() - startTime;
|
||||
}
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* event = (struct proxyStep *)eHandle;
|
||||
event->timestamp[event->isSend ? PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState)] = gettime() - startTime;
|
||||
struct proxyOp* parent = event->parent;
|
||||
switch (eState) {
|
||||
case ncclProfilerProxyStepSendGPUWait:
|
||||
event->timestamp[PROXY_STEP_SEND_GPU_WAIT] = gettime() - startTime;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendPeerWait_v4:
|
||||
// do not update step event if in SendPeerWait
|
||||
if (event->state == ncclProfilerProxyStepSendPeerWait_v4) break;
|
||||
event->timestamp[PROXY_STEP_SEND_PEER_WAIT] = gettime() - startTime;
|
||||
event->state = ncclProfilerProxyStepSendPeerWait_v4;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendWait:
|
||||
event->timestamp[PROXY_STEP_SEND_WAIT] = gettime() - startTime;
|
||||
parent->transSize += eStateArgs->proxyStep.transSize;
|
||||
break;
|
||||
case ncclProfilerProxyStepRecvWait:
|
||||
event->timestamp[PROXY_STEP_RECV_WAIT] = gettime() - startTime;
|
||||
break;
|
||||
case ncclProfilerProxyStepRecvFlushWait:
|
||||
event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT] = gettime() - startTime;
|
||||
parent->transSize += eStateArgs->proxyStep.transSize;
|
||||
break;
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime;
|
||||
break;
|
||||
}
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
|
||||
if (eState == ncclProfilerProxyCtrlAppendEnd) {
|
||||
event->appended = eStateArgs->proxyCtrl.appendedProxyOps;
|
||||
}
|
||||
event->state = eState;
|
||||
} else if (type == ncclProfileKernelCh) {
|
||||
struct kernelCh* event = (struct kernelCh *)eHandle;
|
||||
if (eState == ncclProfilerKernelChStop) {
|
||||
event->stopGpuClk = eStateArgs->kernelCh.pTimer;
|
||||
}
|
||||
}
|
||||
debugEvent(eHandle, "RecordEventState");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclProfiler_t ncclProfiler_v3 = {
|
||||
ncclProfiler_t ncclProfiler_v4 = {
|
||||
"Example-profiler",
|
||||
exampleProfilerInit,
|
||||
exampleProfilerStartEvent,
|
||||
|
||||
@@ -27,8 +27,8 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
|
||||
|
||||
static __thread int collId;
|
||||
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
|
||||
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n",
|
||||
event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels);
|
||||
}
|
||||
|
||||
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
|
||||
@@ -38,8 +38,8 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
|
||||
|
||||
static __thread int p2pId;
|
||||
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n",
|
||||
event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n",
|
||||
event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels);
|
||||
}
|
||||
|
||||
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
|
||||
@@ -50,47 +50,43 @@ __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
|
||||
static __thread int proxyOpId;
|
||||
__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
|
||||
if (event->isSend) {
|
||||
int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
|
||||
int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
|
||||
int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
|
||||
int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
|
||||
"Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
|
||||
"ScheduleSend", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"ScheduleSend", proxyOpId, getpid(), 1, event->progrTs);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
|
||||
"ProgressSend", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
|
||||
} else {
|
||||
int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
|
||||
int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
|
||||
int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
|
||||
int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
|
||||
"Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
|
||||
"ScheduleRecv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"ScheduleRecv", proxyOpId, getpid(), 1, event->progrTs);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n",
|
||||
"ProgressRecv", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize);
|
||||
}
|
||||
}
|
||||
|
||||
__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs);
|
||||
event->isSend ? "ProgressSend" : "ProgressRecv", proxyOpId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int proxyStepId;
|
||||
__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_GPU_WAIT], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]);
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step);
|
||||
"SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
||||
"SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_WAIT]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_WAIT], event->step);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_WAIT], event->step);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,13 +96,13 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step);
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]);
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step);
|
||||
"RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
@@ -115,8 +111,8 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) {
|
||||
static __thread int kernelId;
|
||||
__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) {
|
||||
if (event->type != ncclProfileKernelCh) return;
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"StartGpuClk\": %lu, \"StopGpuClk\": %lu}},\n",
|
||||
"KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId, event->startGpuClk, event->stopGpuClk);
|
||||
}
|
||||
|
||||
__hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) {
|
||||
@@ -134,6 +130,8 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
str = "Sleep";
|
||||
} else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
|
||||
str = "Append";
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
if (event->state == ncclProfilerProxyCtrlAppendEnd) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
|
||||
@@ -188,9 +186,8 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
|
||||
fprintf(fh, " parent = %p\n", event->base.parent);
|
||||
for (int j = 0; j < MAX_OPS; j++) {
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
|
||||
for (int j = 0; j < 2*MAX_OPS; j++) {
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) if (event->op[i][j].type == ncclProfileProxyOp) fprintf(fh, " op[%d] = %p\n", i, &event->op[i]);
|
||||
}
|
||||
fprintf(fh, " startTs = %f\n", event->base.startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->base.stopTs);
|
||||
@@ -207,17 +204,18 @@ void debugEvent(void* eHandle, const char* tag) {
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
struct proxyOp* event = (struct proxyOp *)eHandle;
|
||||
fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
|
||||
fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv");
|
||||
fprintf(fh, " channel = %d\n", event->channelId);
|
||||
fprintf(fh, " parent = %p\n", event->parent);
|
||||
fprintf(fh, " rank = %d\n", event->rank);
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " progrTs = %f\n", event->progrTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
fprintf(fh, "}\n");
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* event = (struct proxyStep *)eHandle;
|
||||
fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag);
|
||||
fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
|
||||
fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv");
|
||||
fprintf(fh, " parent = %p\n", event->parent);
|
||||
fprintf(fh, " startTs = %f\n", event->startTs);
|
||||
fprintf(fh, " stopTs = %f\n", event->stopTs);
|
||||
@@ -260,8 +258,7 @@ void printEvent(FILE* fh, void* handle) {
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printKernelChEventHeader(fh, &c->kernel[i]);
|
||||
for (int j = 0; j < c->nProxyOps[i]; j++) {
|
||||
printEvent(fh, &c->send[i][j]);
|
||||
printEvent(fh, &c->recv[i][j]);
|
||||
printEvent(fh, &c->op[i][j]);
|
||||
}
|
||||
printKernelChEventTrailer(fh, &c->kernel[i]);
|
||||
}
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
#ifndef PRINT_EVENT_H_
|
||||
#define PRINT_EVENT_H_
|
||||
|
||||
#include "nccl/common.h"
|
||||
extern ncclDebugLogger_t logFn;
|
||||
|
||||
void debugEvent(void* eHandle, const char* tag);
|
||||
void printEvent(FILE* fh, void* handle);
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ PROFAPI ?= 1
|
||||
NVTX ?= 1
|
||||
RDMA_CORE ?= 0
|
||||
NET_PROFILER ?= 0
|
||||
MLX5DV ?= 0
|
||||
MAX_EXT_NET_PLUGINS ?= 0
|
||||
|
||||
NVCC = $(CUDA_HOME)/bin/nvcc
|
||||
|
||||
@@ -49,8 +51,10 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
|
||||
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
|
||||
CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
|
||||
|
||||
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
||||
# Prior to SM75 is deprecated from CUDA13.0 onwards
|
||||
NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
|
||||
# Include Blackwell support if we're using CUDA12.8 or above
|
||||
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
|
||||
@@ -66,14 +70,21 @@ else
|
||||
endif
|
||||
$(info NVCC_GENCODE is ${NVCC_GENCODE})
|
||||
|
||||
# CUDA 13.0 requires c++17
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
||||
CXXSTD ?= -std=c++17
|
||||
else
|
||||
CXXSTD ?= -std=c++11
|
||||
endif
|
||||
|
||||
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
|
||||
-Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
|
||||
-I $(CUDA_INC) \
|
||||
-Wall -Wno-unused-function -Wno-sign-compare $(CXXSTD) -Wvla \
|
||||
-I $(CUDA_INC) -I $(CUDA_INC)/cccl \
|
||||
$(CXXFLAGS)
|
||||
# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
|
||||
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
|
||||
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all
|
||||
# Use addprefix so that we can specify more than one path
|
||||
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
||||
|
||||
@@ -136,9 +147,17 @@ CXXFLAGS += -DPROFAPI
|
||||
endif
|
||||
|
||||
ifneq ($(RDMA_CORE), 0)
|
||||
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
|
||||
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 -libverbs
|
||||
endif
|
||||
|
||||
ifneq ($(MLX5DV), 0)
|
||||
CXXFLAGS += -DNCCL_BUILD_MLX5DV=1 -lmlx5
|
||||
endif
|
||||
|
||||
ifneq ($(NET_PROFILER), 0)
|
||||
CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1
|
||||
endif
|
||||
|
||||
ifneq ($(MAX_EXT_NET_PLUGINS), 0)
|
||||
CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS)
|
||||
endif
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 26
|
||||
NCCL_PATCH := 6
|
||||
NCCL_MINOR := 27
|
||||
NCCL_PATCH := 3
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+1
-1
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
|
||||
INCEXPORTS := nccl.h
|
||||
LIBSRCFILES := \
|
||||
bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
|
||||
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \
|
||||
init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \
|
||||
$(wildcard graph/*.cc) \
|
||||
$(wildcard misc/*.cc) \
|
||||
$(wildcard transport/*.cc) \
|
||||
|
||||
@@ -0,0 +1,198 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
// Allocates `size` bytes of device memory on the current device and stores the
// address in `*ptr`.
//
// When built with ROCM_VERSION >= 70000 and ncclCuMemEnable() is true, the
// memory is created with cuMemCreate(), mapped into a freshly reserved virtual
// address range, and made read/write accessible to the current device and to
// every device for which cudaDeviceCanAccessPeer() reports peer access.
// In every other case (including ptr == NULL or size == 0) the request is
// forwarded to cudaMalloc().
//
//   ptr  - out: receives the device address.
//   size - number of bytes requested; the cuMem path rounds it up to the
//          recommended allocation granularity.
//
// Returns ncclSuccess, or the error produced by the failing CUDA/driver call.
// NOTE(review): CUDACHECK/CUCHECK appear to return immediately on error, so a
// failure after cuMemCreate() would leave `handle` (and possibly the reserved
// address range) unreleased -- confirm against the macro definitions.
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;

#if ROCM_VERSION >= 70000
  size_t memGran = 0;
  CUdevice currentDev;
  CUmemAllocationProp memprop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle = (CUmemGenericAllocationHandle)-1;
  int cudaDev;
  int flag;
  int dcnt;

  if (ptr == NULL || size == 0) goto fallback;

  // if (rocmLibraryInit() != ncclSuccess) goto fallback;
  rocmLibraryInit();

  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));

  if (ncclCuMemEnable()) {
    size_t handleSize = size;
    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    // Query device to see if FABRIC handle support is available
    flag = 0;
    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
    if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
    memprop.location.id = currentDev;
    // Query device to see if RDMA support is available
    // (the query itself is disabled below, so `flag` stays 0 and
    // gpuDirectRDMACapable is never set on this path)
    flag = 0;
    // CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
    CUDACHECK(cudaGetDeviceCount(&dcnt));
    ALIGN_SIZE(handleSize, memGran);

    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
      /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
      CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
      if (err == CUDA_ERROR_NOT_SUPPORTED) {
        requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
        memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
        /* Allocate the physical memory on the device */
        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
      } else if (err != CUDA_SUCCESS) {
        // Catch and report any error from above: the call is repeated through
        // CUCHECK so that the failure is reported by the checking macro.
        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
      }
    } else {
      /* Allocate the physical memory on the device */
      CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
    }
    /* Reserve a virtual address range */
    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
    /* Map the virtual address range to the physical allocation */
    CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
    /* Now allow RW access to the newly mapped memory */
    for (int i = 0; i < dcnt; ++i) {
      int p2p = 0;
      if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, i, cudaDev) == cudaSuccess) && p2p)) {
        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        accessDesc.location.id = i;
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
      }
      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
    }
    goto exit;
  }

fallback:
#endif
  // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
  // we want CUDA to return an error to the caller.
  // coverity[var_deref_model]
  CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
|
||||
// Releases the device memory at `ptr`.
//
// The calling thread's current device is recorded on entry and restored on
// exit. When built with ROCM_VERSION >= 70000 and `ptr` is non-NULL, the device
// ordinal owning `ptr` is looked up and made current; if ncclCuMemEnable() is
// true the memory is released with ncclCuMemFree(). In every other case the
// pointer is handed to cudaFree().
//
// Returns ncclSuccess, or the error produced by the failing call.
ncclResult_t ncclMemFree_impl(void *ptr) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;
  int callerDev;

  CUDACHECK(cudaGetDevice(&callerDev));
#if ROCM_VERSION >= 70000
  CUdevice ownerDev = 0;

  // A NULL pointer skips the driver queries and goes straight to cudaFree().
  if (ptr != NULL) {
    // The return value of rocmLibraryInit() is intentionally not checked here.
    rocmLibraryInit();

    CUCHECKGOTO(cuPointerGetAttribute((void*)&ownerDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail);
    CUDACHECKGOTO(cudaSetDevice((int)ownerDev), ret, fail);
    if (ncclCuMemEnable()) {
      NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail);
      goto exit;
    }
  }
#endif
  CUDACHECKGOTO(cudaFree(ptr), ret, fail);

exit:
  // Always put the caller's device back, on success and on failure alike.
  CUDACHECK(cudaSetDevice(callerDev));
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
// This is a collective function and should be called by all ranks in the communicator
|
||||
// Creates a device allocation of at least `size` bytes and maps it at the
// communicator's symmetric allocation head.
//
//   comm      - communicator; comm->symAllocHead is first aligned to
//               `alignment` and, on success, advanced by the rounded size.
//   size      - requested bytes; rounded up to the recommended allocation
//               granularity reported by cuMemGetAllocationGranularity().
//   alignment - must have at most one bit set (power of two); it is then
//               rounded up to a multiple of NCCL_REC_PAGE_SIZE.
//   symPtr    - out: address produced by ncclIpcSymmetricMap(); set to NULL on
//               any failure.
//
// Returns ncclSuccess, ncclInvalidArgument when `alignment` is not a power of
// two, or the error of the first failing call.
//
// NOTE(review): alignment == 0 passes the power-of-two check (as it did in the
// previous bit-counting loop); confirm callers never pass 0.
// NOTE(review): memHandle is not released on the failure paths after
// cuMemCreate(); confirm who owns the handle once ncclIpcSymmetricMap() has
// been called before adding cleanup.
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) {
  ncclResult_t ret = ncclSuccess;
  void* regSymAddr = NULL;
  size_t allocSize = size;
  size_t granularity;
  CUdevice cuDev;
  CUmemAllocationProp memprop = {};
  CUmemGenericAllocationHandle memHandle;

  // alignment must be a power of 2 as an input: clearing the lowest set bit
  // must leave nothing behind. This avoids shifting a signed 1L by up to 63 bits.
  if ((alignment & (alignment - 1)) != 0) {
    WARN("rank %d alignment %zu is not power of 2", comm->rank, alignment);
    // Report the bad argument instead of falling through with ncclSuccess.
    ret = ncclInvalidArgument;
    goto fail;
  }
  // temporarily align the alignment to NCCL_REC_PAGE_SIZE
  ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE);

  CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail);
  memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  memprop.requestedHandleType = ncclCuMemHandleType;
  memprop.location.id = cuDev;
  CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
  ALIGN_SIZE(allocSize, granularity);

  CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail);
  ALIGN_SIZE(comm->symAllocHead, alignment);
  NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, &regSymAddr), ret, fail);
  NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail);
  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
  // Only advance the head once every mapping step and the barrier succeeded.
  comm->symAllocHead += allocSize;
  *symPtr = regSymAddr;

exit:
  return ret;
fail:
  *symPtr = NULL;
  goto exit;
}
|
||||
|
||||
// Tears down a symmetric allocation identified by `symPtr`.
//
// The caller's current device is saved and restored around the operation. When
// ncclCuMemEnable() is true the function switches to comm->cudaDev, retains the
// allocation handle behind `symPtr` and releases it again, queries the size of
// the address range, undoes the NVLS and IPC symmetric mappings, and finally
// releases the handle a second time. When cuMem is disabled nothing is freed.
//
// Returns ncclSuccess, or the error of the first failing call.
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) {
  ncclResult_t ret = ncclSuccess;
  CUmemGenericAllocationHandle allocHandle;
  size_t mappedBytes = 0;
  // Falls back to the communicator's device if the query below fails.
  int callerDev = comm->cudaDev;

  CUDACHECKGOTO(cudaGetDevice(&callerDev), ret, fail);
  if (!ncclCuMemEnable()) goto exit;

  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
  // Retain only to obtain the handle value; drop that extra reference right away.
  CUCHECKGOTO(cuMemRetainAllocationHandle(&allocHandle, symPtr), ret, fail);
  CUCHECKGOTO(cuMemRelease(allocHandle), ret, fail);
  CUCHECKGOTO(cuMemGetAddressRange(NULL, &mappedBytes, (CUdeviceptr)symPtr), ret, fail);
  NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, mappedBytes, symPtr), ret, fail);
  NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, mappedBytes, symPtr), ret, fail);
  // Second release, issued after both symmetric mappings have been removed.
  CUCHECKGOTO(cuMemRelease(allocHandle), ret, fail);

exit:
  CUDACHECK(cudaSetDevice(callerDev));
  return ret;
fail:
  goto exit;
}
|
||||
@@ -95,6 +95,7 @@ ncclResult_t bootstrapNetInit() {
|
||||
pthread_mutex_lock(&bootstrapNetLock);
|
||||
if (bootstrapNetInitDone == 0) {
|
||||
const char* env = ncclGetEnv("NCCL_COMM_ID");
|
||||
int nIfs = 0;
|
||||
if (env) {
|
||||
union ncclSocketAddress remoteAddr;
|
||||
if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
|
||||
@@ -102,13 +103,15 @@ ncclResult_t bootstrapNetInit() {
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
|
||||
NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE,
|
||||
&nIfs));
|
||||
if (nIfs <= 0) {
|
||||
WARN("NET/Socket : No usable listening interface found");
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
return ncclSystemError;
|
||||
}
|
||||
} else {
|
||||
int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
|
||||
NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs));
|
||||
if (nIfs <= 0) {
|
||||
WARN("Bootstrap : no socket interface found");
|
||||
pthread_mutex_unlock(&bootstrapNetLock);
|
||||
@@ -833,7 +836,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail);
|
||||
memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress));
|
||||
if (parent->config.splitShare) {
|
||||
if (parent->shareResources) {
|
||||
/* map local rank to top parent local rank. */
|
||||
for (int i = 0; i < nranks; ++i) {
|
||||
comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]];
|
||||
|
||||
@@ -147,7 +147,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) {
|
||||
int nPeers = nRanks + collnetNRanks + nvlsNRanks;
|
||||
/* channel peers are only valid when async init thread completes commAlloc() and
|
||||
* the channel is intialized with initChannel(); if either is not done, this channel
|
||||
* the channel is initialized with initChannel(); if either is not done, this channel
|
||||
* should never be free. */
|
||||
if (channel->id == -1 || channel->peers == NULL) return ncclSuccess;
|
||||
|
||||
|
||||
+42
-18
@@ -16,6 +16,8 @@
|
||||
#include <chrono>
|
||||
#include "param.h"
|
||||
|
||||
#define NCCL_DEBUG_RESET_TRIGGERED (-2)
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on
|
||||
static char ncclDebugTimestampFormat[256]; // with space for subseconds
|
||||
@@ -26,7 +28,7 @@ static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV
|
||||
static uint64_t ncclDebugMask = 0;
|
||||
FILE *ncclDebugFile = stdout;
|
||||
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static std::chrono::steady_clock::time_point ncclEpoch;
|
||||
@@ -34,11 +36,16 @@ static bool ncclWarnSetDebugInfo = false;
|
||||
|
||||
static __thread int tid = -1;
|
||||
|
||||
// This function must be called with ncclDebugLock locked!
|
||||
static void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
|
||||
const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
|
||||
int tempNcclDebugLevel = -1;
|
||||
uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask
|
||||
if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) {
|
||||
// Finish the reset initiated via ncclResetDebugInit().
|
||||
fclose(ncclDebugFile);
|
||||
ncclDebugFile = stdout;
|
||||
}
|
||||
if (nccl_debug == NULL) {
|
||||
tempNcclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
@@ -61,7 +68,7 @@ static void ncclDebugInit() {
|
||||
if (ncclDebugSubsysEnv != NULL) {
|
||||
int invert = 0;
|
||||
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
|
||||
ncclDebugMask = invert ? ~0ULL : 0ULL;
|
||||
tempNcclDebugMask = invert ? ~0ULL : 0ULL;
|
||||
char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
|
||||
char *subsys = strtok(ncclDebugSubsys, ",");
|
||||
while (subsys != NULL) {
|
||||
@@ -104,7 +111,7 @@ static void ncclDebugInit() {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
if (mask) {
|
||||
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
|
||||
if (invert) tempNcclDebugMask &= ~mask; else tempNcclDebugMask |= mask;
|
||||
}
|
||||
subsys = strtok(NULL, ",");
|
||||
}
|
||||
@@ -248,15 +255,15 @@ static void ncclDebugInit() {
|
||||
if (debugFn[0] != '\0') {
|
||||
FILE *file = fopen(debugFn, "w");
|
||||
if (file != nullptr) {
|
||||
setbuf(file, nullptr); // disable buffering
|
||||
setlinebuf(file); // disable block buffering
|
||||
ncclDebugFile = file;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ncclEpoch = std::chrono::steady_clock::now();
|
||||
ncclDebugMask = tempNcclDebugMask;
|
||||
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
/* Common logging function used by the INFO, WARN and TRACE macros
|
||||
@@ -264,19 +271,38 @@ static void ncclDebugInit() {
|
||||
* they can share the debugging mechanisms and output files
|
||||
*/
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
|
||||
bool locked = false; // Keeps track of the ncclDebugLock state.
|
||||
int gotLevel = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE);
|
||||
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
|
||||
// Save the last error (WARN) as a human readable string
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
locked = true;
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
|
||||
va_end(vargs);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
|
||||
|
||||
if (gotLevel >= 0 && (gotLevel < level || (flags & ncclDebugMask) == 0)) {
|
||||
if (locked)
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!locked) {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
locked = true;
|
||||
}
|
||||
// From this point on ncclDebugLock is always locked so we don't need to check "locked" anymore.
|
||||
if (ncclDebugLevel < 0)
|
||||
ncclDebugInit();
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) {
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
return;
|
||||
}
|
||||
|
||||
if (tid == -1) {
|
||||
tid = syscall(SYS_gettid);
|
||||
@@ -337,7 +363,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
// Add level specific formatting.
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %s:%d NCCL WARN ", cudaDev, filefunc, line);
|
||||
if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
|
||||
if (ncclWarnSetDebugInfo) __atomic_store_n(&ncclDebugLevel, NCCL_LOG_INFO, __ATOMIC_RELEASE);
|
||||
} else if (level == NCCL_LOG_INFO) {
|
||||
len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] NCCL INFO ", cudaDev);
|
||||
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
|
||||
@@ -362,19 +388,17 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
// necessary since we write bytes instead of the string.
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
NCCL_API(void, ncclResetDebugInit);
|
||||
void ncclResetDebugInit() {
|
||||
// Cleans up from a previous ncclDebugInit() and reruns.
|
||||
// Use this after changing NCCL_DEBUG and related parameters in the environment.
|
||||
__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE);
|
||||
if (ncclDebugFile != stdout) {
|
||||
fclose(ncclDebugFile);
|
||||
ncclDebugFile = stdout;
|
||||
}
|
||||
ncclDebugLevel = -1;
|
||||
ncclDebugInit();
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
// Let ncclDebugInit() know to complete the reset.
|
||||
__atomic_store_n(&ncclDebugLevel, NCCL_DEBUG_RESET_TRIGGERED, __ATOMIC_RELEASE);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
#
|
||||
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
SHELL := /usr/bin/env bash
|
||||
MAKEFLAGS += -r
|
||||
.SUFFIXES:
|
||||
.SECONDARY:
|
||||
|
||||
NCCLDIR := ../..
|
||||
include $(NCCLDIR)/makefiles/common.mk
|
||||
include $(NCCLDIR)/makefiles/version.mk
|
||||
|
||||
BUILDDIR ?= $(abspath ../../build)
|
||||
OBJDIR := $(BUILDDIR)/obj/device
|
||||
|
||||
MANIFEST := $(OBJDIR)/manifest
|
||||
DEVGLUE_OBJ := $(OBJDIR)/device_glue.o
|
||||
|
||||
INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include
|
||||
NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
|
||||
CXXFLAGS += $(INCFLAGS)
|
||||
|
||||
NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all
|
||||
NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
|
||||
|
||||
SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
|
||||
|
||||
# Compile helpers.
#   COMPILE.cu / COMPILE.cc : per-suffix command templates; $1 is the output
#                             object and $2 is the source file.
#   COMPILE                 : recipe body used as $(call COMPILE,<obj>,<src>[,<suffix>]).
#                             It prints a "Compiling" line via $(SAY), creates the
#                             output directory, then dispatches to COMPILE<suffix>,
#                             where <suffix> is $3 when given and otherwise the
#                             suffix of $2 (e.g. ".cu" or ".cc").
COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
|
||||
COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
|
||||
define COMPILE
|
||||
@$(SAY) "Compiling" $2;\
|
||||
mkdir -p $(dir $1);\
|
||||
$(call COMPILE$(or $3,$(suffix $2)),$1,$2)
|
||||
endef
|
||||
|
||||
ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1)
|
||||
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \
|
||||
-gencode=arch=compute_120a,code=sm_120a
|
||||
else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1)
|
||||
NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a
|
||||
else
|
||||
NVCC_GENCODE_LDMC_FP8 =
|
||||
endif
|
||||
|
||||
# COMPILE_SYM: recipe body used as $(call COMPILE_SYM,<obj>,<src>,<extra nvcc flags>).
# Prints a "Compiling" line via $(SAY), creates the output directory, then runs
# $(NVCC) with $(NVCUFLAGS_SYM), the caller-supplied flags in $3, and -dw on $2,
# writing the object to $1.
# NOTE(review): $3 is presumably where the generated symmetric rules pass
# per-file -gencode options such as $(NVCC_GENCODE_LDMC_FP8) -- confirm in
# symmetric/generate.py.
define COMPILE_SYM
|
||||
@$(SAY) "Compiling" $2;\
|
||||
mkdir -p $(dir $1);\
|
||||
$(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1
|
||||
endef
|
||||
|
||||
# Dependency-file helpers.
#   DEPENDS.cu / DEPENDS.cc : per-suffix commands that make the compiler emit a
#                             make-style dependency rule (-M) for source $1.
#   DEPENDS                 : recipe body used as $(call DEPENDS,<file.d>,<src>).
# Steps performed by DEPENDS (all in one bash invocation):
#   1. print a "Dependencies" line via $(SAY) and create the output directory;
#   2. capture the compiler's -M output, selected by the suffix of $2;
#   3. keep only the text after the first ':' (the prerequisite list);
#   4. drop the line-continuation backslash and tab tokens;
#   5. keep only prerequisites whose real path lies under $(NCCLDIR);
#   6. write "<file.o> <file.d>: <prerequisites>" to $1, so both the object and
#      the dependency file itself are listed as targets of those prerequisites.
DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1
|
||||
DEPENDS.cc = $(CXX) $(CXXFLAGS) -M -c $1
|
||||
define DEPENDS
|
||||
@$(SAY) "Dependencies" $2;\
|
||||
mkdir -p $(dir $1);\
|
||||
mk=$$($(call DEPENDS$(suffix $2),$2));\
|
||||
[[ $$mk =~ ^[^:]*:(.*)$$ ]];\
|
||||
files=$${BASH_REMATCH[1]};\
|
||||
files=$$(for x in $$files; do case "$$x" in '\'|$$'\t') ;; *) echo "$$x"; esac; done);\
|
||||
files=$$(for x in $$files; do [[ "$$(realpath "$$x")" == "$$(realpath "$(NCCLDIR)")"* ]] && echo "$$x"; done);\
|
||||
echo "$(patsubst %.d,%.o,$1) $1: " $$files > $1
|
||||
endef
|
||||
|
||||
all: $(MANIFEST)
|
||||
|
||||
$(OBJDIR)/gensrc: generate.py
|
||||
@mkdir -p $@
|
||||
(which python3 >/dev/null || \
|
||||
(bar='!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'; \
|
||||
printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \
|
||||
exit 1)) \
|
||||
&& ./generate.py $@ "$(ONLY_FUNCS)"
|
||||
|
||||
$(OBJDIR)/gensrc/symmetric: $(OBJDIR)/gensrc symmetric/generate.py
|
||||
@mkdir -p $@
|
||||
./symmetric/generate.py $@
|
||||
|
||||
# The trailing ";" is necessary to make this an "empty recipe":
|
||||
# https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html
|
||||
$(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ;
|
||||
|
||||
$(OBJDIR)/gensrc/symmetric/rules.mk: $(OBJDIR)/gensrc/symmetric ;
|
||||
|
||||
-include $(OBJDIR)/gensrc/rules.mk
|
||||
# "gensrc/rules.mk" populates $(LIB_OBJS_GEN)
|
||||
|
||||
-include $(OBJDIR)/gensrc/symmetric/rules.mk
|
||||
# "gensrc/symmetric/rules.mk" populates $(LIB_OBJS_SYM_GEN)
|
||||
|
||||
SRCS = common.cu onerank.cu
|
||||
|
||||
LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) $(LIB_OBJS_SYM_GEN)
|
||||
|
||||
$(OBJDIR)/%.o: % $(OBJDIR)/%.d
|
||||
$(call COMPILE,$@,$<)
|
||||
|
||||
$(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d
|
||||
$(call COMPILE,$@,$(OBJDIR)/gensrc/$*)
|
||||
|
||||
$(OBJDIR)/genobj/symmetric/%.o: $(OBJDIR)/gensrc/symmetric $(OBJDIR)/genobj/symmetric/%.d
|
||||
$(call COMPILE,$@,$(OBJDIR)/gensrc/symmetric/$*)
|
||||
|
||||
$(OBJDIR)/%.d: %
|
||||
$(call DEPENDS,$@,$<)
|
||||
|
||||
$(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/%
|
||||
$(call DEPENDS,$@,$<)
|
||||
|
||||
$(OBJDIR)/genobj/symmetric/%.d: $(OBJDIR)/gensrc/symmetric/%
|
||||
$(call DEPENDS,$@,$<)
|
||||
|
||||
$(DEVGLUE_OBJ): $(LIB_OBJS)
|
||||
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
|
||||
|
||||
$(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ)
|
||||
@echo $^ > $@
|
||||
|
||||
-include $(wildcard $(OBJDIR)/*.d)
|
||||
-include $(wildcard $(OBJDIR)/genobj/*.d)
|
||||
-include $(wildcard $(OBJDIR)/genobj/symmetric/*.d)
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm -rf $(OBJDIR)
|
||||
+206
-58
@@ -244,7 +244,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
|
||||
while (1) {
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
int* poll = &ps->flags;
|
||||
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) {
|
||||
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) {
|
||||
pollCount++ ;// Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
|
||||
}
|
||||
patAlgo.getNextOp(ps);
|
||||
@@ -272,7 +272,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
|
||||
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
|
||||
int* poll = &ps->flags;
|
||||
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0){
|
||||
pollCount++; // Wait for compute thread
|
||||
pollCount++; // Wait for compute thread
|
||||
}
|
||||
int last = ps->last;
|
||||
prims.patCopy(ps, shmem);
|
||||
@@ -286,73 +286,221 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
template<bool BcastSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclDevWorkColl* work;
|
||||
ssize_t chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int MultimemSrcs, int MultimemDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = nvls->nHeads;
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
char* inbuf = (char*)work->sendbuff;
|
||||
char* outbuf = (char*)work->recvbuff;
|
||||
ssize_t countPerRank = work->collnet.count;
|
||||
bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank * countPerRank);
|
||||
ssize_t railAllBeg = min(railGridOffset + part * chunkSize, nNodes * countPerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
int rail = 0;
|
||||
int src = 0;
|
||||
|
||||
if (BcastSendNotRecv) {
|
||||
rail = nvls->headRank;
|
||||
} else {
|
||||
if (work->regUsed) return;
|
||||
rail = 0;
|
||||
}
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
do {
|
||||
int node = railAllBeg / countPerRank;
|
||||
int railAllOffset = 0;
|
||||
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node * countPerRank;
|
||||
ssize_t railOneEnd = railOneBeg + countPerRank;
|
||||
ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg;
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset);
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail];
|
||||
ssize_t userOneBeg = rank * countPerRank + railOneOffset;
|
||||
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) || BcastSendNotRecv || work->regUsed ? 0 : 1;
|
||||
if (nSrcs != 0 && outIsDst + nDsts != 0) {
|
||||
reduceCopy<ncclCollUnroll(), USE_ACC, RedOp, T,
|
||||
/*MultimemSrcs,MinSrcs,MaxSrcs=*/MultimemSrcs, 1, 1,
|
||||
/*MultimemDsts=*/MultimemDsts, 0 + MultimemDsts + MinDsts, 1 + MaxDsts,
|
||||
/*PreOpSrcs=*/0>
|
||||
(tid, tn, 0, nullptr, false,
|
||||
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
|
||||
return (char*)srcPtrs[src] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/outIsDst + nDsts, [=]__device__(int d) -> void* {
|
||||
return d < outIsDst ? outbuf + userOneBeg
|
||||
: work->regUsed ? (char*)dstPtrs[d - outIsDst] + userOneBeg
|
||||
: (char*)dstPtrs[d - outIsDst] + railAllOffset;
|
||||
}, delta);
|
||||
}
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
rail += 1;
|
||||
src += 1;
|
||||
} while (!BcastSendNotRecv && src < nRails);
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
const ssize_t rank = ncclShmem.comm.rank;
|
||||
size_t count, gridOffset, channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
|
||||
const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
|
||||
const int tidEndGather = nThreadsGather;
|
||||
const int tidEndBcast = tidEndGather + nThreadsBcast;
|
||||
const int nThreadsNetSend = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE);
|
||||
const int nThreadsGather = work->regUsed ? roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE;
|
||||
const int nThreadsBcast = NCCL_MAX_NTHREADS - nThreadsNetSend - nThreadsGather;
|
||||
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
const int tidEndGather = nThreadsGather;
|
||||
const int tidEndNetSend = tidEndGather + nThreadsNetSend;
|
||||
const int tidEndBcast = tidEndNetSend + nThreadsBcast;
|
||||
|
||||
if (work->oneNode) {
|
||||
const ssize_t rank = ncclShmem.comm.rank;
|
||||
size_t count, gridOffset, channelCount, offset, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndGather) {
|
||||
// Gather
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
} else if (tid < tidEndBcast) {
|
||||
// Bcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
} else if (tid < tidEndBcast) {
|
||||
// Bcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
} else {
|
||||
if (tid < tidEndGather) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
|
||||
/* used as sync */
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
/* used as sync */
|
||||
prims.recv(0, 0);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
ssize_t inpOffset = gridOffset + elemOffset;
|
||||
ssize_t outOffset = inpOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSend(inpOffset, outOffset, nelem);
|
||||
}
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
}
|
||||
} else {
|
||||
/* direct allgather */
|
||||
// NVLS + IB SHARP
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
ssize_t countPerRank = work->collnet.count;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
ssize_t chunkCount = work->collnet.chunkCount;
|
||||
if (tid < tidEndGather) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
|
||||
/* used as sync */
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nThreadsGather, nvls->up, nullptr, nullptr, work->recvbuff,
|
||||
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
Scatterer</*BcastSendNotRecv=*/false> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkCount;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
|
||||
}
|
||||
} else if (tid < tidEndBcast) {
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
/* used as sync */
|
||||
prims.recv(0, 0);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
ssize_t inpOffset = gridOffset + elemOffset;
|
||||
ssize_t outOffset = inpOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directSend(inpOffset, outOffset, nelem);
|
||||
} else {
|
||||
if (work->netRegUsed) {
|
||||
using ProtoSend = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
using ProtoBcast = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 0, 1>;
|
||||
int maxSteps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount);
|
||||
int curSteps = -1;
|
||||
int postThread = tid - tidEndGather == 0 ? 1 : 0;
|
||||
// for UB, we need to control the send speed to avoid net congestion.
|
||||
// first unroll 2 steps, then unroll the rest steps when the data is received.
|
||||
if (postThread) {
|
||||
curSteps = min(2, maxSteps);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, curSteps);
|
||||
}
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, ProtoBcast, 0>
|
||||
prims(tid - tidEndGather, nThreadsNetSend + nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 2 * ProtoBcast::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkCount;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
if (postThread && curSteps < maxSteps) {
|
||||
curSteps++;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (tid < tidEndNetSend) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsNetSend, nullptr, &nvls->out, work->sendbuff, nullptr,
|
||||
/*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
ssize_t railAllBeg = railGridOffset + part * chunkCount;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
|
||||
ssize_t railOneEnd = railOneBeg + countPerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.send(beg - railOneBeg, max(ssize_t(0), end - beg));
|
||||
}
|
||||
} else {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 0, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndNetSend, nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 2 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkCount;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -367,7 +515,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
ssize_t chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int MultimemSrcs, int MultimemDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
|
||||
@@ -791,7 +791,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
}
|
||||
} else if (tid < tidEndReduce && nvls->headRank != -1) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 1>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
|
||||
+30
-42
@@ -145,7 +145,6 @@ struct ncclShmemData {
|
||||
uint16_t funcId;
|
||||
int nWorks;
|
||||
int workSize;
|
||||
uint32_t workConsumed;
|
||||
uint64_t workCounter;
|
||||
bool profilerEnabled;
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
@@ -331,7 +330,6 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
|
||||
}
|
||||
if (tid == 0) {
|
||||
ncclShmem.workSize = workSize;
|
||||
ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize;
|
||||
}
|
||||
// We deliberately replicate these div and mod calculations into the case
|
||||
// blocks above so that they get constant divisor optimizations by the compiler.
|
||||
@@ -392,6 +390,16 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a 64-bit device timestamp.
// NOTE(review): the two back ends below are different clock sources; confirm
// that callers do not assume a common unit across CUDA and HIP builds.
__device__ __forceinline__ unsigned long long int globaltimer() {
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  // Non-HIP build: read the %globaltimer special register via inline PTX.
  unsigned long long int ticks;
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(ticks));
  return ticks;
#else
  // HIP build: use the wall_clock64() device function.
  return wall_clock64();
#endif
}
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int USE_ACC, int COLL_UNROLL, int Pipeline>
|
||||
struct RunWorkColl {
|
||||
__device__ void run(int tid, int tn, struct ncclDevWorkColl* work) {
|
||||
@@ -446,40 +454,30 @@ struct RunWorkBatch {
|
||||
#define STOP 1
|
||||
#define FINI 2
|
||||
|
||||
__device__ __forceinline__ bool profilerEnabled(void) {
|
||||
// Check if any of the workItems in the batch is profiled. If so, there is an equivalent
|
||||
// profiler ProxyOp waiting for the counter update in the host thread. If this check was
|
||||
// done only for the first workItem the profiler counter for other workItems in the batch
|
||||
// could never be updated, leaving the host thread spinning forever for the counter update
|
||||
// and causing a hang.
|
||||
bool enabled = false;
|
||||
for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) {
|
||||
if (ncclShmem.workType == ncclDevWorkTypeP2p)
|
||||
enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled;
|
||||
else
|
||||
enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled;
|
||||
}
|
||||
return enabled;
|
||||
__device__ __forceinline__ bool profilerEnabled(int workItemIdx) {
|
||||
return (ncclShmem.workType == ncclDevWorkTypeP2p) ?
|
||||
((struct ncclDevWorkP2p*)ncclShmem.workStorage)[workItemIdx].profilerEnabled :
|
||||
((struct ncclDevWorkColl*)ncclShmem.workStorage)[workItemIdx].profilerEnabled;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void profiler(int action) {
|
||||
if (action == START) {
|
||||
if (threadIdx.x == 0) {
|
||||
// increment workCounter regardless of the profiler being active or not
|
||||
if (threadIdx.x == 0) {
|
||||
int idx = 0;
|
||||
uint64_t wc = ncclShmem.channel.workCounter + 1;
|
||||
if (action == START) {
|
||||
for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) {
|
||||
if (!profilerEnabled(idx++)) continue;
|
||||
ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer();
|
||||
ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
|
||||
}
|
||||
} else {
|
||||
for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) {
|
||||
if (!profilerEnabled(idx++)) continue;
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer();
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc;
|
||||
}
|
||||
ncclShmem.channel.workCounter += ncclShmem.nWorks;
|
||||
if(!profilerEnabled()) return;
|
||||
ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
}
|
||||
} else if (action == STOP) {
|
||||
if (threadIdx.x == 0 && profilerEnabled()) {
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
}
|
||||
} else { // FINI
|
||||
if (threadIdx.x == 0) {
|
||||
// store the workCounter back to vidmem regardless of the profiler being active or not
|
||||
((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
|
||||
if (!profilerEnabled()) return;
|
||||
ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter;
|
||||
if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -597,11 +595,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType, 0);
|
||||
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
|
||||
while (ncclShmem.aborted == 0) {
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
profiler(START);
|
||||
@@ -641,11 +634,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
profiler(STOP);
|
||||
loadWorkBatchToShmem(tid%WARP_SIZE, tn, args, batchIx);
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType, batchIx);
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
|
||||
|
||||
+114
-13
@@ -93,37 +93,65 @@ template<>
|
||||
union BytePack<0> {};
|
||||
template<>
|
||||
union BytePack<1> {
|
||||
uint8_t u8, native;
|
||||
uint8_t u8[1], native;
|
||||
};
|
||||
template<>
|
||||
union BytePack<2> {
|
||||
BytePack<1> half[2];
|
||||
BytePack<1> b1[2];
|
||||
uint8_t u8[2];
|
||||
uint16_t u16, native;
|
||||
uint16_t u16[1], native;
|
||||
};
|
||||
template<>
|
||||
union BytePack<4> {
|
||||
BytePack<2> half[2];
|
||||
BytePack<1> b1[4];
|
||||
BytePack<2> b2[2];
|
||||
uint8_t u8[4];
|
||||
uint16_t u16[2];
|
||||
uint32_t u32, native;
|
||||
uint32_t u32[1], native;
|
||||
|
||||
inline __device__ BytePack<4>() = default;
|
||||
inline __device__ BytePack<4>(const BytePack<4>& other) {
|
||||
*this = other;
|
||||
}
|
||||
inline __device__ BytePack<4>& operator=(const BytePack<4>& other) {
|
||||
u32[0] = other.u32[0];
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
template<>
|
||||
union BytePack<8> {
|
||||
BytePack<4> half[2];
|
||||
BytePack<1> b1[8];
|
||||
BytePack<2> b2[4];
|
||||
BytePack<4> b4[2];
|
||||
uint8_t u8[8];
|
||||
uint16_t u16[4];
|
||||
uint32_t u32[2];
|
||||
uint64_t u64, native;
|
||||
uint64_t u64[1], native;
|
||||
|
||||
inline __device__ BytePack<8>() = default;
|
||||
inline __device__ BytePack<8>(const BytePack<8>& other) {
|
||||
*this = other;
|
||||
}
|
||||
inline __device__ BytePack<8>& operator=(const BytePack<8>& other) {
|
||||
u64[0] = other.u64[0];
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
template<>
|
||||
union alignas(16) BytePack<16> {
|
||||
BytePack<8> half[2];
|
||||
BytePack<1> b1[16];
|
||||
BytePack<2> b2[8];
|
||||
BytePack<4> b4[4];
|
||||
BytePack<8> b8[2];
|
||||
uint8_t u8[16];
|
||||
uint16_t u16[8];
|
||||
uint32_t u32[4];
|
||||
uint64_t u64[2];
|
||||
ulong2 ul2, native;
|
||||
ulong2 ul2[1], native;
|
||||
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx942__) || defined(__gfx950__)
|
||||
inline __device__ BytePack<16>() = default;
|
||||
inline __device__ BytePack<16>(const BytePack<16>& other) {
|
||||
@@ -136,6 +164,30 @@ union alignas(16) BytePack<16> {
|
||||
}
|
||||
#endif
|
||||
};
|
||||
template<int Size>
|
||||
union BytePack {
|
||||
BytePack<Size/2> half[2];
|
||||
BytePack<1> b1[Size];
|
||||
BytePack<2> b2[Size/2];
|
||||
BytePack<4> b4[Size/4];
|
||||
BytePack<8> b8[Size/8];
|
||||
BytePack<16> b16[Size/16];
|
||||
uint8_t u8[Size];
|
||||
uint16_t u16[Size/2];
|
||||
uint32_t u32[Size/4];
|
||||
uint64_t u64[Size/8];
|
||||
|
||||
inline __device__ BytePack<Size>() = default;
|
||||
inline __device__ BytePack<Size>(const BytePack<Size>& other) {
|
||||
*this = other;
|
||||
}
|
||||
inline __device__ BytePack<Size>& operator=(const BytePack<Size>& other) {
|
||||
for (int i = 0; i < Size/8; i++) {
|
||||
u64[i] = other.u64[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct BytePackOf {
|
||||
@@ -343,19 +395,19 @@ __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0
|
||||
}
|
||||
template<>
|
||||
__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
|
||||
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory");
|
||||
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.native) : "memory");
|
||||
}
|
||||
template<>
|
||||
__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
|
||||
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory");
|
||||
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.native) : "memory");
|
||||
}
|
||||
template<>
|
||||
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
|
||||
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
|
||||
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.native) : "memory");
|
||||
}
|
||||
template<>
|
||||
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
|
||||
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
|
||||
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.native) : "memory");
|
||||
}
|
||||
template<>
|
||||
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
|
||||
@@ -370,6 +422,55 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size
|
||||
}
|
||||
#endif
|
||||
|
||||
// Load pack starting at index in array. Ignore elements past end (length of array).
|
||||
template<typename Pack, typename T>
|
||||
__device__ __forceinline__ Pack loadPack(T* ptr, int ix, int end) {
|
||||
constexpr int Size = sizeof(Pack);
|
||||
ptr += ix;
|
||||
int n = end - ix;
|
||||
if (alignof(T) == Size && sizeof(T) == Size) {
|
||||
return *(Pack*)ptr;
|
||||
} else if ((Size+3)/4 + 1 < Size/sizeof(T)) {
|
||||
union { Pack ans; uint32_t part[Size/4]; };
|
||||
int misalign = reinterpret_cast<uintptr_t>(ptr) % 4;
|
||||
uint32_t* down = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
|
||||
int i;
|
||||
#pragma unroll
|
||||
for (i=0; i < Size/4; i++) {
|
||||
if (i*4/sizeof(T) < 1 || i*4/sizeof(T) < n) part[i] = down[i];
|
||||
}
|
||||
uint32_t extra;
|
||||
if (misalign) extra = down[i];
|
||||
#pragma unroll
|
||||
for (i=0; i < Size/4; i++) {
|
||||
part[i] = __funnelshift_r(part[i], part[i+1], 8*misalign);
|
||||
}
|
||||
if (misalign) part[i] = __funnelshift_r(part[i], extra, 8*misalign);
|
||||
return ans;
|
||||
} else {
|
||||
union { Pack ans; BytePack<sizeof(T)> part[Size/sizeof(T)]; };
|
||||
#pragma unroll
|
||||
for (int i=0; i < Size/sizeof(T); i++) {
|
||||
if (i < 1 || i < n) part[i] = ((BytePack<sizeof(T)>*)ptr)[i];
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
}
|
||||
|
||||
// Store pack starting at index in array. Ignore elements past end (length of array).
|
||||
template<typename Pack, typename T>
|
||||
__device__ __forceinline__ void storePack(T* ptr, int ix, int end, Pack val) {
|
||||
constexpr int Size = sizeof(Pack);
|
||||
union { Pack tmp; BytePack<sizeof(T)> part[Size/sizeof(T)]; };
|
||||
tmp = val;
|
||||
ptr += ix;
|
||||
int n = end - ix;
|
||||
#pragma unroll
|
||||
for (int i=0; i < Size/sizeof(T); i++) {
|
||||
if (i < 1 || i < n) ((BytePack<sizeof(T)>*)ptr)[i] = part[i];
|
||||
}
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
|
||||
// Warp-uniform memory copy from shared address (not generic) to global memory.
|
||||
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value
|
||||
@@ -413,10 +514,10 @@ __device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
|
||||
b4[3] = ld_shared<4>(srcAddr + 3*4);
|
||||
if (srcMisalign != 0) {
|
||||
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
|
||||
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
|
||||
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
|
||||
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
|
||||
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
|
||||
b4[0].native = __funnelshift_r(b4[0].native, b4[1].native, srcMisalign*8);
|
||||
b4[1].native = __funnelshift_r(b4[1].native, b4[2].native, srcMisalign*8);
|
||||
b4[2].native = __funnelshift_r(b4[2].native, b4[3].native, srcMisalign*8);
|
||||
b4[3].native = __funnelshift_r(b4[3].native, b4_4.native, srcMisalign*8);
|
||||
}
|
||||
if (Multimem) multimem_st_global<16>(dstAddr, b16);
|
||||
else st_global<16>(dstAddr, b16);
|
||||
|
||||
@@ -155,7 +155,7 @@ private:
|
||||
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
|
||||
: (ncclShmem.groups[group].srcs + Src);
|
||||
if (flags & NetRegMode) {
|
||||
if ((flags & NetRegMode) && ((!isSendNotRecv && DirectRecv) || (isSendNotRecv && DirectSend))) {
|
||||
if (P2p) {
|
||||
ptrs[index] = NULL;
|
||||
} else {
|
||||
@@ -506,7 +506,7 @@ public:
|
||||
}
|
||||
|
||||
template<int Recv, int Send, typename Fn>
|
||||
__device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) {
|
||||
__device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag = 0, uint32_t recvDirectFlag = 0) {
|
||||
#pragma unroll 1
|
||||
for (int slice=0; slice < SlicePerChunk; slice++) {
|
||||
if (tid < nworkers) {
|
||||
@@ -530,7 +530,7 @@ public:
|
||||
} else if (flags & DirectRead) { // empty send
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
} else {
|
||||
if (flags & DirectRead) {
|
||||
@@ -541,11 +541,11 @@ public:
|
||||
else
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
}
|
||||
subBarrier();
|
||||
@@ -560,7 +560,7 @@ public:
|
||||
} else {
|
||||
nsend = fan.nsend();
|
||||
}
|
||||
fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend >
|
||||
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend, MultimemSrcs, MultimemDsts>
|
||||
(tid, nworkers, slice, stepSize * StepPerSlice,
|
||||
nrecv, ncclShmem.groups[group].srcs,
|
||||
nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag);
|
||||
@@ -1083,6 +1083,12 @@ public:
|
||||
// Thin wrapper: forwards inpIx, outIx, eltN and postOp unchanged to
// genericOp<1, 1, 1, 1, -1, -1>.
// NOTE(review): the meaning of the six template arguments is defined by
// genericOp, which is not visible here; by the method name they presumably
// select a direct receive combined with a direct send - confirm.
__device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards to genericOp<0, 1, 1, 1, -1, -1>, passing -1 as the
// first index argument and outIx as the second.
// NOTE(review): presumably -1 means "no input index" and the first template
// argument being 0 means a non-direct receive - confirm against genericOp.
__device__ __forceinline__ void recvDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 1, 1, 1, -1, -1>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards to genericOp<1, 0, 1, 1, -1, -1>, passing outIx as
// BOTH index arguments.
// NOTE(review): using outIx for the first (input) index looks intentional but
// cannot be verified from this hunk - confirm against genericOp.
__device__ __forceinline__ void directRecvSend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 1, -1, -1>(outIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards to genericOp<0, 1, 1, 1, -1, Output>, passing -1 as
// the first index argument and outIx as the second.
// NOTE(review): the last template argument is Output rather than -1, which
// presumably adds a copy into the output buffer as the name suggests - confirm
// against genericOp.
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
|
||||
+318
-128
@@ -42,18 +42,18 @@ struct IsFloatingPoint<double>: std::true_type {};
|
||||
// 3. Have constructor taking `uint64_t opArg`.
|
||||
|
||||
template<typename T>
|
||||
struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; };
|
||||
struct FuncCopy { using EltType = T; __device__ __forceinline__ FuncCopy(uint64_t opArg=0) {}; };
|
||||
template<typename T>
|
||||
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
|
||||
struct FuncSum { using EltType = T; __device__ __forceinline__ FuncSum(uint64_t opArg=0) {}; };
|
||||
template<typename T>
|
||||
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
|
||||
struct FuncProd { using EltType = T; __device__ __forceinline__ FuncProd(uint64_t opArg=0) {}; };
|
||||
|
||||
template<typename T>
|
||||
struct FuncMinMax {
|
||||
using EltType = T;
|
||||
BytePack<sizeof(T)> xormask; // only used by integers
|
||||
bool isMinNotMax; // only used by floats
|
||||
__device__ FuncMinMax(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncMinMax(uint64_t opArg=0) {
|
||||
xormask.native = opArg;
|
||||
isMinNotMax = (opArg&1)==0;
|
||||
}
|
||||
@@ -68,13 +68,13 @@ template<typename T> struct FuncSumPostDiv;
|
||||
template<typename Fn>
|
||||
struct RedOpArg { // default case: no argument
|
||||
static constexpr bool ArgUsed = false;
|
||||
__device__ static uint64_t loadArg(void *ptr) { return 0; }
|
||||
__device__ __forceinline__ static uint64_t loadArg(void *ptr) { return 0; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct RedOpArg<FuncMinMax<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
__device__ __forceinline__ static uint64_t loadArg(void *ptr) {
|
||||
union { uint64_t u64; T val; };
|
||||
u64 = 0;
|
||||
val = *(T*)ptr;
|
||||
@@ -88,6 +88,11 @@ struct RedOpArg<FuncMinMax<T>> {
|
||||
// of elements. These classes are intended to be specialized for specific
|
||||
// combinations of reduction function and pack size.
|
||||
|
||||
template<typename A, typename B, int EltPerPackA>
|
||||
struct Apply_Cast/*{
|
||||
static BytePack<EltPerPackA*sizeof(B)/sizeof(A)> cast(BytePack<EltPerPackA*sizeof(A)> a);
|
||||
}*/;
|
||||
|
||||
template<typename Fn, int EltPerPack>
|
||||
struct Apply_Reduce /*{
|
||||
static BytePack<EltPerPack*sizeof(T)> reduce(
|
||||
@@ -115,16 +120,60 @@ struct Apply_LoadMultimem/*{
|
||||
static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
|
||||
}*/;
|
||||
|
||||
|
||||
// Helpers for dealing with BytePack<0>'s
|
||||
template<typename A, typename B, int EltPerPack>
|
||||
struct Apply_Cast_MaybeEmpty: Apply_Cast<A, B, EltPerPack> {};
|
||||
template<typename A, typename B>
|
||||
struct Apply_Cast_MaybeEmpty<A, B, /*EltPerPack=*/0> {
|
||||
__device__ constexpr static BytePack<0> cast(BytePack<0> a) { return {}; }
|
||||
};
|
||||
|
||||
template<typename Fn, int EltPerPack>
|
||||
struct Apply_Reduce_MaybeEmpty: Apply_Reduce<Fn, EltPerPack> {};
|
||||
template<typename Fn>
|
||||
struct Apply_Reduce_MaybeEmpty<Fn, 0> {
|
||||
__device__ constexpr static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; }
|
||||
};
|
||||
|
||||
template<typename Fn, int EltPerPack>
|
||||
struct Apply_PreOp_MaybeEmpty: Apply_PreOp<Fn, EltPerPack> {};
|
||||
template<typename Fn>
|
||||
struct Apply_PreOp_MaybeEmpty<Fn, 0> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
__device__ constexpr static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; }
|
||||
};
|
||||
|
||||
template<typename Fn, int EltPerPack>
|
||||
struct Apply_PostOp_MaybeEmpty: Apply_PostOp<Fn, EltPerPack> {};
|
||||
template<typename Fn>
|
||||
struct Apply_PostOp_MaybeEmpty<Fn, 0> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
__device__ constexpr static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; }
|
||||
};
|
||||
|
||||
template<typename Fn, int BytePerPack>
|
||||
struct Apply_LoadMultimem_MaybeEmpty: Apply_LoadMultimem<Fn, BytePerPack> {};
|
||||
template<typename Fn>
|
||||
struct Apply_LoadMultimem_MaybeEmpty<Fn, 0> {
|
||||
__device__ constexpr static BytePack<0> load(Fn fn, uintptr_t addr) { return {}; }
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Public API for calling the trait classes. These take the data elements as a
|
||||
// pack of any type, which could be a BytePack<?> or any integral type (uint64_t,
|
||||
// uint32_t, etc.), and will return a new pack where each element has been
|
||||
// transformed appropriately.
|
||||
|
||||
// applyCast<A, B>(a): converts a pack holding elements of type A into a pack
// holding elements of type B.
//   a       - input pack; it is normalized with toPack() before conversion.
//   returns - BytePack of BytePackOf<PackA>::Size*sizeof(B)/sizeof(A) bytes.
// The element count passed to the trait is BytePackOf<PackA>::Size/sizeof(A);
// the _MaybeEmpty wrapper is used so a count of 0 is handled as well.
template<typename A, typename B, typename PackA>
|
||||
__device__ __forceinline__ BytePack<BytePackOf<PackA>::Size*sizeof(B)/sizeof(A)> applyCast(PackA a) {
|
||||
return Apply_Cast_MaybeEmpty<A, B, BytePackOf<PackA>::Size/sizeof(A)>::cast(toPack(a));
|
||||
}
|
||||
|
||||
template<typename Fn, typename Pack>
|
||||
__device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
|
||||
return fromPack<Pack>(
|
||||
Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
Apply_Reduce_MaybeEmpty<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
::reduce(fn, toPack(a), toPack(b))
|
||||
);
|
||||
}
|
||||
@@ -132,7 +181,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
|
||||
template<typename Fn, typename Pack>
|
||||
__device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
|
||||
return fromPack<Pack>(
|
||||
Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
Apply_PreOp_MaybeEmpty<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
::preOp(fn, toPack(a))
|
||||
);
|
||||
}
|
||||
@@ -140,23 +189,107 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
|
||||
template<typename Fn, typename Pack>
|
||||
__device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
|
||||
return fromPack<Pack>(
|
||||
Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
Apply_PostOp_MaybeEmpty<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
|
||||
::postOp(fn, toPack(a))
|
||||
);
|
||||
}
|
||||
|
||||
template<typename Fn, int BytePerPack>
|
||||
__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
|
||||
return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
|
||||
return Apply_LoadMultimem_MaybeEmpty<Fn, BytePerPack>::load(fn, addr);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply_Cast
|
||||
|
||||
// Generic Apply_Cast: converts a pack of EltPerPack elements of type A into a
// pack of EltPerPack elements of type B by recursing on the two halves of the
// pack with Apply_Cast<A, B, EltPerPack/2> and writing each converted half
// into the matching half of the output.
// NOTE(review): the halving presumably assumes EltPerPack is a power of two
// (integer division would drop an element otherwise) — confirm with callers.
template<typename A, typename B, int EltPerPack>
|
||||
struct Apply_Cast {
|
||||
__device__ __forceinline__ static BytePack<EltPerPack*sizeof(B)> cast(BytePack<EltPerPack*sizeof(A)> a) {
|
||||
BytePack<EltPerPack*sizeof(B)> b;
|
||||
b.half[0] = Apply_Cast<A, B, EltPerPack/2>::cast(a.half[0]);
|
||||
b.half[1] = Apply_Cast<A, B, EltPerPack/2>::cast(a.half[1]);
|
||||
return b;
|
||||
}
|
||||
};
|
||||
|
||||
// Single-element case of Apply_Cast: unpacks the bytes as an A, converts the
// value with the expression B(...), and repacks the result as sizeof(B) bytes.
template<typename A, typename B>
|
||||
struct Apply_Cast<A, B, /*EltPerPack=*/1> {
|
||||
__device__ __forceinline__ static BytePack<sizeof(B)> cast(BytePack<sizeof(A)> a) {
|
||||
return toPack(B(fromPack<A>(a)));
|
||||
}
|
||||
};
|
||||
|
||||
// Single-element __half -> float cast: uses the __half2float intrinsic instead
// of the generic B(...) conversion.
template<>
|
||||
struct Apply_Cast<__half, float, /*EltPerPack=*/1> {
|
||||
__device__ __forceinline__ static BytePack<sizeof(float)> cast(BytePack<sizeof(__half)> a) {
|
||||
return toPack(__half2float(fromPack<__half>(a)));
|
||||
}
|
||||
};
|
||||
// Single-element float -> __half cast: uses the __float2half_rn intrinsic
// instead of the generic B(...) conversion.
template<>
|
||||
struct Apply_Cast<float, __half, /*EltPerPack=*/1> {
|
||||
__device__ __forceinline__ static BytePack<sizeof(__half)> cast(BytePack<sizeof(float)> a) {
|
||||
return toPack(__float2half_rn(fromPack<float>(a)));
|
||||
}
|
||||
};
|
||||
|
||||
// Two-element __half -> float cast: reads the 2*2-byte pack as a __half2 and
// converts both lanes at once with __half22float2, yielding a 4*2-byte pack.
template<>
|
||||
struct Apply_Cast<__half, float, /*EltPerPack=*/2> {
|
||||
__device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) {
|
||||
return toPack(__half22float2(fromPack<__half2>(a)));
|
||||
}
|
||||
};
|
||||
// Two-element float -> __half cast: reads the 4*2-byte pack as a float2 and
// converts both lanes at once with __float22half2_rn, yielding a 2*2-byte pack.
template<>
|
||||
struct Apply_Cast<float, __half, /*EltPerPack=*/2> {
|
||||
__device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) {
|
||||
return toPack(__float22half2_rn(fromPack<float2>(a)));
|
||||
}
|
||||
};
|
||||
|
||||
// Two-element __nv_bfloat16 <-> float casts, compiled only when the guard
// below holds.
// NOTE(review): the guard tests CUDART_RUNTIME, whereas another guard in this
// file tests CUDART_VERSION. If CUDART_RUNTIME is not defined anywhere, the
// preprocessor evaluates it as 0 and only the __CUDA_ARCH__ >= 800 test can
// enable this section — confirm which macro was intended.
#if defined(__CUDA_BF16_TYPES_EXIST__) && (CUDART_RUNTIME >= 12000 || __CUDA_ARCH__ >= 800)
|
||||
// __nv_bfloat16 -> float: reads the pack as a __nv_bfloat162 and converts both
// lanes with __bfloat1622float2.
template<>
|
||||
struct Apply_Cast<__nv_bfloat16, float, /*EltPerPack=*/2> {
|
||||
__device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) {
|
||||
return toPack(__bfloat1622float2(fromPack<__nv_bfloat162>(a)));
|
||||
}
|
||||
};
|
||||
// float -> __nv_bfloat16: reads the pack as a float2 and converts both lanes
// with __float22bfloat162_rn.
template<>
|
||||
struct Apply_Cast<float ,__nv_bfloat16, /*EltPerPack=*/2> {
|
||||
__device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) {
|
||||
return toPack(__float22bfloat162_rn(fromPack<float2>(a)));
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
// EASY_CAST(A, B, EltPerPack, VecA, VecB) emits two full specializations:
// Apply_Cast<A, B, EltPerPack> and Apply_Cast<B, A, EltPerPack>. Each one
// reads the input pack as the source vector type (VecA or VecB), converts it
// to the other vector type with a constructor-style expression, and repacks
// the result. The macro is #undef'd right after its uses below.
#define EASY_CAST(A, B, EltPerPack, VecA, VecB) \
|
||||
template<> \
|
||||
struct Apply_Cast<A, B, EltPerPack> { \
|
||||
__device__ __forceinline__ static BytePack<sizeof(B)*EltPerPack> cast(BytePack<sizeof(A)*EltPerPack> a) { \
|
||||
return toPack(VecB(fromPack<VecA>(a))); \
|
||||
} \
|
||||
}; \
|
||||
template<> \
|
||||
struct Apply_Cast<B, A, EltPerPack> { \
|
||||
__device__ __forceinline__ static BytePack<sizeof(A)*EltPerPack> cast(BytePack<sizeof(B)*EltPerPack> b) { \
|
||||
return toPack(VecA(fromPack<VecB>(b))); \
|
||||
} \
|
||||
};
|
||||
|
||||
// FP8 (e5m2 and e4m3) <-> float casts for 2- and 4-element packs, generated
// only when __CUDA_FP8_TYPES_EXIST__ is defined.
#if defined(__CUDA_FP8_TYPES_EXIST__)
|
||||
EASY_CAST(__nv_fp8_e5m2, float, 2, __nv_fp8x2_e5m2, float2)
|
||||
EASY_CAST(__nv_fp8_e5m2, float, 4, __nv_fp8x4_e5m2, float4)
|
||||
|
||||
EASY_CAST(__nv_fp8_e4m3, float, 2, __nv_fp8x2_e4m3, float2)
|
||||
EASY_CAST(__nv_fp8_e4m3, float, 4, __nv_fp8x4_e4m3, float4)
|
||||
#endif
|
||||
#undef EASY_CAST
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply_Reduce
|
||||
|
||||
// Nonsensical base case
|
||||
template<typename Fn>
|
||||
struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
|
||||
__device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
|
||||
__device__ __forceinline__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
|
||||
return {};
|
||||
}
|
||||
};
|
||||
@@ -168,7 +301,7 @@ struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
|
||||
template<typename Fn, int EltPerPack>
|
||||
struct Apply_Reduce {
|
||||
template<int Size>
|
||||
__device__ static BytePack<Size> reduce(Fn fn, BytePack<Size> a, BytePack<Size> b) {
|
||||
__device__ __forceinline__ static BytePack<Size> reduce(Fn fn, BytePack<Size> a, BytePack<Size> b) {
|
||||
a.half[0] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[0], b.half[0]);
|
||||
a.half[1] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[1], b.half[1]);
|
||||
return a;
|
||||
@@ -178,25 +311,25 @@ struct Apply_Reduce {
|
||||
// Base case definitions (EltPerPack == 1)
|
||||
template<typename T>
|
||||
struct Apply_Reduce<FuncCopy<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<sizeof(T)> reduce(FuncCopy<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> reduce(FuncCopy<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
template<typename T>
|
||||
struct Apply_Reduce<FuncSum<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
return toPack<T>(fromPack<T>(a) + fromPack<T>(b));
|
||||
}
|
||||
};
|
||||
template<typename T>
|
||||
struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<sizeof(T)> reduce(FuncProd<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> reduce(FuncProd<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
return toPack<T>(fromPack<T>(a) * fromPack<T>(b));
|
||||
}
|
||||
};
|
||||
template<typename T>
|
||||
struct Apply_Reduce<FuncMinMax<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<sizeof(T)> reduce(FuncMinMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> reduce(FuncMinMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
|
||||
return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? a : b;
|
||||
}
|
||||
};
|
||||
@@ -204,7 +337,7 @@ struct Apply_Reduce<FuncMinMax<T>, /*EltPerPack=*/1> {
|
||||
// Optimizations for specific types and element count combinations:
|
||||
template<>
|
||||
struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
|
||||
__device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
|
||||
__device__ __forceinline__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
|
||||
constexpr uint32_t even = 0x00ff00ffu;
|
||||
uint32_t x = (a.native & even) + (b.native & even);
|
||||
uint32_t y = (a.native & ~even) + (b.native & ~even);
|
||||
@@ -240,7 +373,7 @@ struct Apply_Reduce<FuncMinMax<uint8_t>, /*EltPerPack=*/4> {
|
||||
|
||||
// template<>
|
||||
// struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
|
||||
// __device__ static BytePack<4> reduce(FuncProd<uint8_t> fn, BytePack<4> apack, BytePack<4> bpack) {
|
||||
// __device__ __forceinline__ static BytePack<4> reduce(FuncProd<uint8_t> fn, BytePack<4> apack, BytePack<4> bpack) {
|
||||
// uint32_t a = apack.native;
|
||||
// uint32_t b = bpack.native;
|
||||
// uint32_t ab0 = (a*b) & 0xffu;
|
||||
@@ -326,7 +459,7 @@ template<typename Fn, int EltPerPack>
|
||||
struct Apply_PreOp {
|
||||
static constexpr bool IsIdentity = Apply_PreOp<Fn, EltPerPack/2>::IsIdentity;
|
||||
template<int Size>
|
||||
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
|
||||
__device__ __forceinline__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
|
||||
#if __cpp_if_constexpr
|
||||
if constexpr(!IsIdentity) {
|
||||
#else
|
||||
@@ -346,7 +479,7 @@ template<typename Fn>
|
||||
struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
template<int Size>
|
||||
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
|
||||
__device__ __forceinline__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
@@ -354,7 +487,7 @@ struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
|
||||
template<typename Fn>
|
||||
struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
__device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
|
||||
__device__ __forceinline__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
|
||||
return {};
|
||||
}
|
||||
};
|
||||
@@ -367,7 +500,7 @@ template<typename Fn, int EltPerPack>
|
||||
struct Apply_PostOp {
|
||||
static constexpr bool IsIdentity = Apply_PostOp<Fn, EltPerPack/2>::IsIdentity;
|
||||
template<int Size>
|
||||
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
|
||||
__device__ __forceinline__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
|
||||
#if __cpp_if_constexpr
|
||||
if constexpr(!IsIdentity) {
|
||||
#else
|
||||
@@ -387,7 +520,7 @@ template<typename Fn>
|
||||
struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
template<int Size>
|
||||
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
|
||||
__device__ __forceinline__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
|
||||
return a;
|
||||
}
|
||||
};
|
||||
@@ -395,7 +528,7 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
|
||||
template<typename Fn>
|
||||
struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
|
||||
static constexpr bool IsIdentity = true;
|
||||
__device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
|
||||
__device__ __forceinline__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
|
||||
return {};
|
||||
}
|
||||
};
|
||||
@@ -407,7 +540,7 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
|
||||
template<typename T>
|
||||
struct RedOpArg<FuncPreMulSum<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
__device__ __forceinline__ static uint64_t loadArg(void *ptr) {
|
||||
union { uint64_t u64; T val; };
|
||||
u64 = 0;
|
||||
val = *(T*)ptr;
|
||||
@@ -420,7 +553,7 @@ template<typename T>
|
||||
struct FuncPreMulSum {
|
||||
using EltType = T;
|
||||
T scalar;
|
||||
__device__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
union { uint64_t u64; T val; };
|
||||
u64 = opArg;
|
||||
scalar = val;
|
||||
@@ -434,7 +567,7 @@ template<>
|
||||
struct FuncPreMulSum<half> {
|
||||
using EltType = half;
|
||||
half2 scalar;
|
||||
__device__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
union { uint64_t u64; __half val; };
|
||||
u64 = opArg;
|
||||
scalar.x = val;
|
||||
@@ -451,7 +584,7 @@ struct FuncPreMulSum<half> {
|
||||
using EltType = hip_bfloat16;
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
__nv_bfloat162 scalar;
|
||||
__device__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
union { uint64_t u64; __nv_bfloat16 val; };
|
||||
u64 = opArg;
|
||||
scalar.x = val;
|
||||
@@ -459,7 +592,7 @@ struct FuncPreMulSum<half> {
|
||||
}
|
||||
#else
|
||||
float scalar;
|
||||
__device__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) {
|
||||
union { uint64_t u64; hip_bfloat16 val; };
|
||||
u64 = opArg;
|
||||
scalar = (float)(val);
|
||||
@@ -474,7 +607,7 @@ struct FuncPreMulSum<half> {
|
||||
struct FuncPreMulSum<__nv_fp8_e4m3> {
|
||||
using EltType = __nv_fp8_e4m3;
|
||||
__half2 scalar2;
|
||||
__device__ FuncPreMulSum(uint64_t opArg) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg) {
|
||||
union { uint64_t u64; __nv_fp8_storage_t val; };
|
||||
u64 = opArg;
|
||||
scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3));
|
||||
@@ -486,7 +619,7 @@ struct FuncPreMulSum<half> {
|
||||
struct FuncPreMulSum<__nv_fp8_e5m2> {
|
||||
using EltType = __nv_fp8_e5m2;
|
||||
__half2 scalar2;
|
||||
__device__ FuncPreMulSum(uint64_t opArg) {
|
||||
__device__ __forceinline__ FuncPreMulSum(uint64_t opArg) {
|
||||
union { uint64_t u64; __nv_fp8_storage_t val; };
|
||||
u64 = opArg;
|
||||
scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2));
|
||||
@@ -528,7 +661,7 @@ struct FuncPreMulSum<half> {
|
||||
|
||||
template<typename T, int EltPerPack>
|
||||
struct Apply_Reduce<FuncPreMulSum<T>, EltPerPack> {
|
||||
__device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
|
||||
// FuncPreMulSum reduce dispatches to FuncSum.
|
||||
return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);
|
||||
}
|
||||
@@ -538,7 +671,7 @@ struct Apply_Reduce<FuncPreMulSum<T>, EltPerPack> {
|
||||
template<typename T>
|
||||
struct Apply_PreOp<FuncPreMulSum<T>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(T)> preOp(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> preOp(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a) {
|
||||
return toPack<T>(fromPack<T>(a) * fn.scalar);
|
||||
}
|
||||
};
|
||||
@@ -549,7 +682,7 @@ struct Apply_PreOp<FuncPreMulSum<T>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(half)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half)> a) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(half)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half)> a) {
|
||||
return toPack<half>(__hmul(fromPack<half>(a), fn.scalar.x));
|
||||
}
|
||||
};
|
||||
@@ -557,7 +690,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/2> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(half2)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half2)> a) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(half2)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half2)> a) {
|
||||
return toPack<half2>(__hmul2(fromPack<half2>(a), fn.scalar));
|
||||
}
|
||||
};
|
||||
@@ -570,7 +703,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<hip_bfloat16>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(hip_bfloat16)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(hip_bfloat16)> preOp(
|
||||
FuncPreMulSum<hip_bfloat16> fn, BytePack<sizeof(hip_bfloat16)> a
|
||||
) {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
@@ -584,7 +717,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<hip_bfloat16>, /*EltPerPack=*/2> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(__nv_bfloat162)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(__nv_bfloat162)> preOp(
|
||||
FuncPreMulSum<__nv_bfloat16> fn, BytePack<sizeof(__nv_bfloat162)> a
|
||||
) {
|
||||
return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar));
|
||||
@@ -601,7 +734,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(__nv_fp8_e4m3)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(__nv_fp8_e4m3)> preOp(
|
||||
FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8_e4m3)> a
|
||||
) {
|
||||
return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x)));
|
||||
@@ -610,7 +743,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e4m3>, /*EltPerPack=*/2> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(__nv_fp8x2_e4m3)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(__nv_fp8x2_e4m3)> preOp(
|
||||
FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack<sizeof(__nv_fp8x2_e4m3)> a
|
||||
) {
|
||||
return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2)));
|
||||
@@ -620,7 +753,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(__nv_fp8_e5m2)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(__nv_fp8_e5m2)> preOp(
|
||||
FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8_e5m2)> a
|
||||
) {
|
||||
return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x)));
|
||||
@@ -629,7 +762,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<>
|
||||
struct Apply_PreOp<FuncPreMulSum<__nv_fp8_e5m2>, /*EltPerPack=*/2> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(__nv_fp8x2_e5m2)> preOp(
|
||||
__device__ __forceinline__ static BytePack<sizeof(__nv_fp8x2_e5m2)> preOp(
|
||||
FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack<sizeof(__nv_fp8x2_e5m2)> a
|
||||
) {
|
||||
return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2)));
|
||||
@@ -666,7 +799,7 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
|
||||
template<typename T>
|
||||
struct RedOpArg<FuncSumPostDiv<T>> {
|
||||
static constexpr bool ArgUsed = true;
|
||||
__device__ static uint64_t loadArg(void *ptr) {
|
||||
__device__ __forceinline__ static uint64_t loadArg(void *ptr) {
|
||||
return *(uint64_t*)ptr;
|
||||
}
|
||||
};
|
||||
@@ -709,12 +842,12 @@ struct FuncSumPostDiv {
|
||||
uint32_t divisor:31, isSigned:1;
|
||||
UintType recip;
|
||||
|
||||
__device__ FuncSumPostDiv(uint64_t opArg=0) {
|
||||
__device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) {
|
||||
isSigned = opArg & 1;
|
||||
divisor = opArg >> 1;
|
||||
recip = Divider<UintType>::divide(UintType(-1), divisor);
|
||||
}
|
||||
__device__ T divide(T x) {
|
||||
__device__ __forceinline__ T divide(T x) {
|
||||
// x is negative iff we are in signed mode and the top bit is set
|
||||
bool xneg = isSigned && (x & ~(T(-1)>>1));
|
||||
// Compute abs(x):
|
||||
@@ -736,7 +869,7 @@ struct FuncSumPostDiv {
|
||||
template<typename T, int EltPerPack>
|
||||
struct Apply_Reduce<FuncSumPostDiv<T>, EltPerPack>:
|
||||
Apply_Reduce<FuncSum<T>, EltPerPack> {
|
||||
__device__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
|
||||
__device__ __forceinline__ static BytePack<EltPerPack*sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b) {
|
||||
// FuncSumPostDiv reduce dispatches to FuncSum.
|
||||
return Apply_Reduce<FuncSum<T>, EltPerPack>::reduce(FuncSum<T>(), a, b);
|
||||
}
|
||||
@@ -745,7 +878,7 @@ struct Apply_Reduce<FuncSumPostDiv<T>, EltPerPack>:
|
||||
template<typename T>
|
||||
struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
static constexpr bool IsIdentity = false;
|
||||
__device__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
|
||||
return toPack<T>(fn.divide(fromPack<T>(a)));
|
||||
}
|
||||
};
|
||||
@@ -753,120 +886,145 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply_LoadMultimem
|
||||
|
||||
#define SIZEOF_BytePack_field_u16 2
|
||||
#define PTX_REG_BytePack_field_u16 "h"
|
||||
#define RegCode_for_size_1 "r"
|
||||
#define RegCode_for_size_2 "h"
|
||||
#define RegCode_for_size_4 "r"
|
||||
#define RegCode_for_size_8 "l"
|
||||
|
||||
#define SIZEOF_BytePack_field_u32 4
|
||||
#define PTX_REG_BytePack_field_u32 "r"
|
||||
#define RegSize_for_size_1 4
|
||||
#define RegSize_for_size_2 2
|
||||
#define RegSize_for_size_4 4
|
||||
#define RegSize_for_size_8 8
|
||||
|
||||
#define SIZEOF_BytePack_field_u64 8
|
||||
#define PTX_REG_BytePack_field_u64 "l"
|
||||
#define PtxAcc_for_u32
|
||||
#define PtxAcc_for_s32
|
||||
#define PtxAcc_for_s64
|
||||
#define PtxAcc_for_u64
|
||||
#define PtxAcc_for_f32
|
||||
#define PtxAcc_for_f64
|
||||
#if CUDART_VERSION >= 12020
|
||||
#define PtxAcc_for_f16 ".acc::f32"
|
||||
#define PtxAcc_for_bf16 ".acc::f32"
|
||||
#define PtxAcc_for_f16x2 ".acc::f32"
|
||||
#define PtxAcc_for_bf16x2 ".acc::f32"
|
||||
#else
|
||||
#define PtxAcc_for_f16
|
||||
#define PtxAcc_for_bf16
|
||||
#define PtxAcc_for_f16x2
|
||||
#define PtxAcc_for_bf16x2
|
||||
#endif
|
||||
#define PtxAcc_for_e4m3 ".acc::f16"
|
||||
#define PtxAcc_for_e5m2 ".acc::f16"
|
||||
#define PtxAcc_for_e4m3x4 ".acc::f16"
|
||||
#define PtxAcc_for_e5m2x4 ".acc::f16"
|
||||
|
||||
#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, PackSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncSum<T>, SIZEOF_BytePack_field_##pack_field> { \
|
||||
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
|
||||
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
struct Apply_LoadMultimem<FuncSum<T>, PackSize> { \
|
||||
__device__ __forceinline__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<RegSize_for_size_##PackSize> reg; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \
|
||||
: "=" RegCode_for_size_##PackSize(reg.native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
BytePack<PackSize> ans; \
|
||||
ans.native = reg.native; \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, PackSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncMinMax<T>, SIZEOF_BytePack_field_##pack_field> { \
|
||||
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
|
||||
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
struct Apply_LoadMultimem<FuncMinMax<T>, PackSize> { \
|
||||
__device__ __forceinline__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<RegSize_for_size_##PackSize> reg; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
: "=" RegCode_for_size_##PackSize(reg.native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
} else { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
: "=" RegCode_for_size_##PackSize(reg.native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
} \
|
||||
BytePack<PackSize> ans; \
|
||||
ans.native = reg.native; \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
|
||||
#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncSum<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
|
||||
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
|
||||
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
struct Apply_LoadMultimem<FuncSum<T>, 4*(VecEltSize)> { \
|
||||
static constexpr int PackSize = 4*(VecEltSize); \
|
||||
__device__ __forceinline__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
union { BytePack<PackSize> ans; BytePack<VecEltSize> elts[4]; }; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" RegCode_for_size_##VecEltSize(elts[0].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[1].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[2].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[3].native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncMinMax<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
|
||||
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
|
||||
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
struct Apply_LoadMultimem<FuncMinMax<T>, 4*(VecEltSize)> { \
|
||||
static constexpr int PackSize = 4*(VecEltSize); \
|
||||
__device__ __forceinline__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
union { BytePack<PackSize> ans; BytePack<VecEltSize> elts[4]; }; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
: "=" RegCode_for_size_##VecEltSize(elts[0].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[1].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[2].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[3].native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
} else { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
: "=" RegCode_for_size_##VecEltSize(elts[0].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[1].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[2].native), \
|
||||
"=" RegCode_for_size_##VecEltSize(elts[3].native) \
|
||||
: "l"(addr) : "memory"); \
|
||||
} \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
|
||||
#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \
|
||||
DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(T, ptx_ty, VecEltSize) \
|
||||
DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
|
||||
__device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
union { BytePack<VecEltSize> tmp; BytePack<sizeof(T)> elts[(VecEltSize)/sizeof(T)]; }; \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \
|
||||
: "=" RegCode_for_size_##VecEltSize(tmp.native) \
|
||||
: "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \
|
||||
return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \
|
||||
} \
|
||||
};
|
||||
#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \
|
||||
#define DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(T, ptx_ty, VecEltSize) \
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \
|
||||
template<> \
|
||||
struct Apply_LoadMultimem<FuncMinMax<T>, sizeof(T)> { \
|
||||
__device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
__device__ __forceinline__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
union { BytePack<VecEltSize> tmp; BytePack<sizeof(T)> elts[(VecEltSize)/sizeof(T)]; }; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
: "=" RegCode_for_size_##VecEltSize(tmp.native) \
|
||||
: "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \
|
||||
} else { \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
: "=" RegCode_for_size_##VecEltSize(tmp.native) \
|
||||
: "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \
|
||||
} \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \
|
||||
} \
|
||||
};
|
||||
|
||||
template<typename Fn, int BytePerPack>
|
||||
struct Apply_LoadMultimem {
|
||||
__device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
|
||||
__device__ __forceinline__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
|
||||
//__trap();
|
||||
return {};
|
||||
}
|
||||
@@ -889,29 +1047,43 @@ struct Apply_LoadMultimem {
|
||||
/*multimem.ld_reduce not supported:*/ 0;
|
||||
};
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32)
|
||||
DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, 4)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32)
|
||||
DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum(int32_t, s32, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, 4)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64)
|
||||
DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64)
|
||||
DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, 8)
|
||||
DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, 8)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64)
|
||||
DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64)
|
||||
DEFINE_Apply_LoadMultimem_sum(int64_t, u64, 8)
|
||||
DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, 8)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(float, f32, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum(float, f32, 4)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4(float, f32, 4)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum(double, f64, u64)
|
||||
DEFINE_Apply_LoadMultimem_sum(double, f64, 8)
|
||||
|
||||
DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(half, f16x2, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(half, f16x2, 4)
|
||||
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(hip_bfloat16, bf16x2, u32)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(hip_bfloat16, bf16x2, u32)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(hip_bfloat16, bf16x2, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(hip_bfloat16, bf16x2, 4)
|
||||
#endif
|
||||
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4)
|
||||
#else
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(rccl_float8, e4m3x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(rccl_float8, e4m3x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(rccl_bfloat8, e5m2x4, 4)
|
||||
DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(rccl_bfloat8, e5m2x4, 4)
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
template<typename Fn>
|
||||
@@ -923,11 +1095,29 @@ struct Apply_LoadMultimem {
|
||||
#undef DEFINE_Apply_LoadMultimem
|
||||
#undef DEFINE_Apply_LoadMultimem_v4
|
||||
#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
|
||||
#undef SIZEOF_BytePack_field_u64
|
||||
#undef PTX_REG_BytePack_field_u64
|
||||
#undef SIZEOF_BytePack_field_u32
|
||||
#undef PTX_REG_BytePack_field_u32
|
||||
#undef SIZEOF_BytePack_field_u16
|
||||
#undef PTX_REG_BytePack_field_u16
|
||||
|
||||
#undef RegCode_for_size_2
|
||||
#undef RegCode_for_size_4
|
||||
#undef RegCode_for_size_8
|
||||
|
||||
#undef RegSize_for_size_1
|
||||
#undef RegSize_for_size_2
|
||||
#undef RegSize_for_size_4
|
||||
#undef RegSize_for_size_8
|
||||
|
||||
#undef PtxAcc_for_u32
|
||||
#undef PtxAcc_for_s32
|
||||
#undef PtxAcc_for_s64
|
||||
#undef PtxAcc_for_u64
|
||||
#undef PtxAcc_for_f32
|
||||
#undef PtxAcc_for_f64
|
||||
#undef PtxAcc_for_f16
|
||||
#undef PtxAcc_for_bf16
|
||||
#undef PtxAcc_for_f16x2
|
||||
#undef PtxAcc_for_bf16x2
|
||||
#undef PtxAcc_for_e4m3
|
||||
#undef PtxAcc_for_e5m2
|
||||
#undef PtxAcc_for_e4m3x4
|
||||
#undef PtxAcc_for_e5m2x4
|
||||
|
||||
#endif // REDUCE_KERNEL_H_
|
||||
|
||||
@@ -235,82 +235,206 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
template<bool ReduceSendNotRecv>
|
||||
struct Scatterer {
|
||||
struct ncclDevWorkColl* work;
|
||||
int chunkCount;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int MultimemSrcs, int MultimemDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
) {
|
||||
static_assert(SlicePerChunk == 1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts <= 1 || MaxSrcs <= 1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int nRails = nvls->nHeads;
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
void* inbuf = (void*)work->sendbuff;
|
||||
ssize_t countPerRank = work->collnet.count;
|
||||
|
||||
ssize_t railAllBeg = min(railGridOffset + part * chunkCount, nNodes * countPerRank);
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank);
|
||||
int railAllSize = railAllEnd - railAllBeg;
|
||||
int rail = nvls->headRank;
|
||||
int dst = 0;
|
||||
if (ReduceSendNotRecv) {
|
||||
if (work->regUsed) return;
|
||||
rail = 0;
|
||||
nSrcs = 1;
|
||||
} else {
|
||||
rail = nvls->headRank;
|
||||
}
|
||||
if (tid < nDsts) dstSizes[tid] = railAllSize;
|
||||
do {
|
||||
int node = railAllBeg / countPerRank;
|
||||
int railAllOffset = 0;
|
||||
while (railAllOffset < railAllSize) {
|
||||
ssize_t railOneBeg = node * countPerRank;
|
||||
ssize_t railOneEnd = railOneBeg + countPerRank;
|
||||
ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg;
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset);
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail];
|
||||
ssize_t userOneBeg = rank * countPerRank + railOneOffset;
|
||||
if (nDsts != 0) {
|
||||
reduceCopy<ncclCollUnroll(), USE_ACC, RedOp, T,
|
||||
/*MultimemSrcs=*/MultimemSrcs, 1, 1 + MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/MultimemDsts, 1, 1,
|
||||
/*PreOpSrcs=*/1>
|
||||
(tid, tn, work->redOpArg, &work->redOpArg, false,
|
||||
/*nSrcs=*/nSrcs, [=]__device__(int s) {
|
||||
return work->regUsed ? (T*)srcPtrs[s] + userOneBeg :
|
||||
!ReduceSendNotRecv ? (T*)srcPtrs[s] + railAllOffset:
|
||||
(T*)inbuf + userOneBeg;
|
||||
},
|
||||
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
|
||||
return (T*)dstPtrs[dst] + railAllOffset;
|
||||
}, delta);
|
||||
}
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
dst += 1;
|
||||
rail += 1;
|
||||
} while (ReduceSendNotRecv && dst < nRails);
|
||||
}
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
|
||||
size_t count;
|
||||
size_t gridOffset;
|
||||
size_t channelCount;
|
||||
size_t chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
|
||||
* and the rest are allocated to scatter. */
|
||||
const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
|
||||
const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
|
||||
const int tidEndScatter = nThreadsScatter;
|
||||
const int nThreadsNetRecv = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE);
|
||||
const int nThreadsScatter = work->regUsed ? roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE;
|
||||
const int nThreadsReduce = NCCL_MAX_NTHREADS - nThreadsNetRecv - nThreadsScatter;
|
||||
const int tidEndNetRecv = nThreadsNetRecv;
|
||||
const int tidEndScatter = tidEndNetRecv + nThreadsScatter;
|
||||
const int tidEndReduce = tidEndScatter + nThreadsReduce;
|
||||
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
if (work->oneNode) {
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
size_t offset;
|
||||
size_t count, gridOffset, channelCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
|
||||
if (!work->regUsed) {
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0);
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recv(offset, nelem);
|
||||
}
|
||||
}
|
||||
// coverity[overrun-call] => Coverity think prims.index can be greater than 1
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recv(offset, nelem);
|
||||
} else {
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
}
|
||||
|
||||
/* gather used as sync */
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
// Coverity complains about a possible overrun inside the method invoked below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[overrun-call:FALSE]
|
||||
prims.directRecvCopy(inpOffset, outOffset, nelem);
|
||||
}
|
||||
|
||||
/* send for sync */
|
||||
prims.send(0, 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
// multi-node
|
||||
int nNodes = ncclShmem.comm.nNodes;
|
||||
int part = ncclShmem.channelId - work->channelLo;
|
||||
ssize_t countPerRank = work->collnet.count;
|
||||
const int nChannels = work->channelHi - work->channelLo + 1;
|
||||
ssize_t chunkCount = work->collnet.chunkCount;
|
||||
if (tid < tidEndNetRecv) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
prims.scatter(0, 0, 0, 0, -1, 0);
|
||||
if (work->netRegUsed) {
|
||||
if (tid == 0) {
|
||||
int steps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(nvls->out, 0, steps);
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nThreadsNetRecv, &nvls->out, nullptr, nullptr, work->recvbuff,
|
||||
work->redOpArg, 0 * Proto::MaxGroupWidth, 0, 0);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
ssize_t railAllBeg = railGridOffset + part * chunkCount;
|
||||
ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank);
|
||||
ssize_t railOneBeg = ncclShmem.comm.node * countPerRank;
|
||||
ssize_t railOneEnd = railOneBeg + countPerRank;
|
||||
ssize_t beg = max(railAllBeg, railOneBeg);
|
||||
ssize_t end = min(railAllEnd, railOneEnd);
|
||||
prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
|
||||
/* gather used as sync */
|
||||
prims.gather(0, 0, 0, 0, -1, 0);
|
||||
} else if (tid < tidEndReduce) {
|
||||
// Reduce through NVLS
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
// Coverity complains about a possible overrun inside the method invoked below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[overrun-call:FALSE]
|
||||
prims.directRecvCopy(inpOffset, outOffset, nelem);
|
||||
} else {
|
||||
if (tid < tidEndScatter) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL>;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndNetRecv, nThreadsScatter, nullptr, nvls->up, work->sendbuff, nullptr,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
Scatterer</*ReduceSendNotRecv=*/true> scat;
|
||||
scat.work = work;
|
||||
scat.chunkCount = chunkCount;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
}
|
||||
} else if (tid < tidEndReduce) {
|
||||
using Proto = ProtoSimple<1, 1, USE_ACC, COLL_UNROLL, 1, 0>;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->out, nullptr, nullptr,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.work = work;
|
||||
scat.chunkCount = chunkCount;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
}
|
||||
}
|
||||
|
||||
/* send for sync */
|
||||
prims.send(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -324,7 +448,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
int chunkSize;
|
||||
ssize_t railGridOffset;
|
||||
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts, int MultimemSrcs, int MultimemDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
@@ -363,7 +487,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*countPerRank + railOneOffset;
|
||||
if (nDsts != 0) {
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
reduceCopy<ncclCollUnroll(), USE_ACC, RedOp, T,
|
||||
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
|
||||
/*PreOpSrcs=*/1>
|
||||
|
||||
@@ -0,0 +1,367 @@
|
||||
#include "symmetric.h"
|
||||
#include "symmetric/kernel.h"
|
||||
#include "symmetric/primitives.h"
|
||||
|
||||
template<int BytePerPack, int UnrollPacks, int UnrollPeers>
|
||||
static __device__ void bcastDeep(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
char* inputHere, char* outputRank0, bool inPlace, int nIters
|
||||
) {
|
||||
using Pack = BytePack<BytePerPack>;
|
||||
int wn = tn/WARP_SIZE;
|
||||
int w = t/WARP_SIZE;
|
||||
int lane = t%WARP_SIZE;
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack tmp[UnrollPacks];
|
||||
|
||||
nIters -= w;
|
||||
if (0 < nIters) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = inpHere[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
if (0 < nIters) {
|
||||
while (true) {
|
||||
int dr = inPlace ? 1 : 0;
|
||||
int r = rank + dr;
|
||||
if (r == nRanks) r = 0;
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int i = 0;
|
||||
partial ? i < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? i++ : (dr += UnrollPeers)) {
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && dr == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
nIters -= wn;
|
||||
if (nIters <= 0) break;
|
||||
|
||||
// Load data for next iteration.
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = inpHere[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UnrollPeers, typename T>
|
||||
static __device__ void bcastEnds(
|
||||
ncclSymPrims& prim, int tn, int t,
|
||||
T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
) {
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
BytePack<sizeof(T)>* inpHere = (BytePack<sizeof(T)>*)inputHere;
|
||||
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
|
||||
#pragma unroll 1
|
||||
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
|
||||
size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i;
|
||||
BytePack<sizeof(T)> tmp = inpHere[elt];
|
||||
int dr = inPlace ? 1 : 0;
|
||||
int r = rank + dr;
|
||||
if (r == nRanks) r = 0;
|
||||
#pragma unroll 1
|
||||
for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) {
|
||||
#pragma unroll UnrollPeers
|
||||
for (int u=0; u < UnrollPeers; u++) {
|
||||
*add4G(outRank0+elt, r*stride4G) = tmp;
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
#pragma unroll UnrollPeers
|
||||
for (int u=0; u < UnrollPeers; u++) {
|
||||
if (dr+u == nRanks) break;
|
||||
*add4G(outRank0+elt, r*stride4G) = tmp;
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __device__ void bcast(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts
|
||||
) {
|
||||
bool inPlace = (input == output);
|
||||
// Mpve to rank=0
|
||||
output = prim.peerPtr(0, output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
uint32_t nPreBytes = (128u - inputUptr)%128u;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t cursor = nPreBytes;
|
||||
|
||||
constexpr int MinWarpPerBlock = 4;
|
||||
|
||||
if ((inputUptr-outputUptr)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 1, UnrollPeers = 1;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
bcastDeep<BytePerPack, UnrollPacks, UnrollPeers>(
|
||||
prim, tn, t, waitNeeded,
|
||||
(char*)input + cursor, (char*)output + cursor, inPlace,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
|
||||
constexpr int BytePerPack = 4, UnrollPacks = 1, UnrollPeers = 1;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>(
|
||||
prim, tn, t, waitNeeded,
|
||||
(char*)input + cursor, (char*)output + cursor, inPlace,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
constexpr int UnrollPeers = 8;
|
||||
size_t nSufElts = (nBytes-cursor)/sizeof(T);
|
||||
bcastEnds<UnrollPeers>(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
}
|
||||
|
||||
// All-gather using plain stores to each peer's output buffer.
// args->nElts is used directly as a byte count (buffers are treated as char).
__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) {
  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
  int const& rank = prim.rank;

  // Threads numbered over rank.
  int warpLane = threadIdx.x%WARP_SIZE;
  int warpInBlock = threadIdx.x/WARP_SIZE;
  int warpsPerBlock = blockDim.x/WARP_SIZE;
  int bt = flattenIx(warpLane, WARP_SIZE,
                     prim.block, prim.nBlocks,
                     warpInBlock, warpsPerBlock);
  int btn = prim.nBlocks*blockDim.x;

  // Arrive only; the matching wait is performed inside bcast()
  // (waitNeeded=true).
  prim.barrierArrive(ncclCoopCta(), /*release=*/false);

  char* src = (char*)args->input;
  char* dst = (char*)args->output + rank*args->nElts;  // this rank's slot
  bcast(prim, btn, bt, /*waitNeeded=*/true, src, dst, args->nElts);

  // Closing barrier: arrive with release, then wait.
  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static __device__ void bcastMultimem(
|
||||
ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts
|
||||
) {
|
||||
// Move output to multimem
|
||||
output = prim.multimemPtr(output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
uint32_t nPreBytes = (16-inputUptr)%16;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t nSufBytes;
|
||||
|
||||
if ((inputUptr-outputUptr)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 8;
|
||||
constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uintptr_t cursor = nPreBytes;
|
||||
uint32_t nChunks = (nBytes-cursor)/BytePerChunk;
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk;
|
||||
nSufBytes = nBytes - cursorAfter;
|
||||
cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
cursor += (t%WARP_SIZE)*BytePerPack;
|
||||
int nIters = nChunks - t/WARP_SIZE;
|
||||
#pragma unroll 1
|
||||
while (0 < nIters) {
|
||||
BytePack<BytePerPack> tmp[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = *reinterpret_cast<BytePack<BytePerPack>*>(inputUptr + cursor + u*WARP_SIZE*BytePerPack);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]);
|
||||
}
|
||||
cursor += tn*UnrollPacks*BytePerPack;
|
||||
nIters -= tn/WARP_SIZE;
|
||||
}
|
||||
} else {
|
||||
nPreBytes = 0;
|
||||
nSufBytes = nBytes;
|
||||
}
|
||||
|
||||
// Get the prefix+suffix element one at a time.
|
||||
#pragma unroll 4
|
||||
for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) {
|
||||
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
|
||||
BytePack<sizeof(T)> val = *reinterpret_cast<BytePack<sizeof(T)>*>(inputUptr + cursor);
|
||||
multimem_st_global(outputUptr + cursor, val);
|
||||
cursor += tn*sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
// All-gather using multimem stores: each rank broadcasts its input into its
// own slot of the output through bcastMultimem(). args->nElts is used as a
// byte count.
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) {
  ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
  int const& rank = prim.rank;

  char* input = args->input;
  char* output = args->output;
  size_t bytes = args->nElts;

  // Round robin memory to blocks.
  int warpLane = threadIdx.x%WARP_SIZE;
  int warpInBlock = threadIdx.x/WARP_SIZE;
  int warpsPerBlock = blockDim.x/WARP_SIZE;
  int t = flattenIx(warpLane, WARP_SIZE,
                    prim.block, prim.nBlocks,
                    warpInBlock, warpsPerBlock);
  int tn = prim.nBlocks*blockDim.x;

  // Opening barrier before any store is issued.
  prim.barrierArrive(ncclCoopCta(), /*release=*/false);
  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);

  char* mySlot = output + rank*bytes;
  bcastMultimem(prim, tn, t, input, mySlot, bytes);

  // Closing barrier: arrive with release, then wait.
  prim.barrierArrive(ncclCoopCta(), /*release=*/true);
  prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
}
|
||||
|
||||
template<typename EltType>
|
||||
static __device__ void allgather_LL_body(
|
||||
ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts
|
||||
) {
|
||||
using Pack = BytePack<8>;
|
||||
constexpr int EltPerPack = 8/sizeof(EltType);
|
||||
|
||||
ncclCoopCta cta;
|
||||
int rank = prim.rank;
|
||||
int nRanks = prim.nRanks;
|
||||
constexpr int tn = ncclSymMaxThreads;
|
||||
int t = threadIdx.x;
|
||||
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
if (t < nIterPacks) {
|
||||
Pack x = loadPack<Pack>(input, t*EltPerPack, nElts);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, x);
|
||||
}
|
||||
|
||||
int tn_div_nPacks = tn/nIterPacks;
|
||||
int tn_mod_nPacks = tn%nIterPacks;
|
||||
int peer = t/nIterPacks;
|
||||
int pack = t%nIterPacks;
|
||||
#if 1
|
||||
// NOTE: Unrolling speedup on eos nranks=8 size=64K: 5.7us vs 6.7us
|
||||
constexpr int Unroll = 1;
|
||||
#pragma unroll 1
|
||||
for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) {
|
||||
Pack got[Unroll];
|
||||
prim.template recvLL<Unroll, Unroll>(i, Unroll, tn, /*&*/got);
|
||||
#pragma unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
storePack<Pack>(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
|
||||
peer += tn_div_nPacks;
|
||||
pack += tn_mod_nPacks;
|
||||
if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
|
||||
}
|
||||
}
|
||||
|
||||
int i = (nRanks*nIterPacks & -(Unroll*tn)) + t;
|
||||
int n = (nRanks*nIterPacks)/tn % Unroll;
|
||||
if (i + n*tn < nRanks*nIterPacks) n += 1;
|
||||
if (n != 0) {
|
||||
Pack got[Unroll];
|
||||
prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got);
|
||||
#pragma unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
if (u != 0 && u == n) break;
|
||||
storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]);
|
||||
peer += tn_div_nPacks;
|
||||
pack += tn_mod_nPacks;
|
||||
if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
|
||||
}
|
||||
}
|
||||
#else
|
||||
// The non-unrolled but "obviously correct" implementation for reference.
|
||||
#pragma unroll 1
|
||||
for (int i = t; i < nRanks*nIterPacks; i += tn) {
|
||||
Pack got = prim.template recvLL<Pack>(i);
|
||||
storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
|
||||
peer += tn_div_nPacks;
|
||||
pack += tn_mod_nPacks;
|
||||
if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
|
||||
}
|
||||
#endif
|
||||
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nElts -= tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
}
|
||||
|
||||
// Shared implementation of the LL all-gather kernels. Splits the buffer
// (args->nElts, used as a byte count) into 8-byte packs, assigns each block
// a contiguous range of packs, and runs allgather_LL_body() on that range.
// `multimem` adds ncclSymPrims_UseMultimem to the primitive flags.
static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) {
  ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
  using Pack = BytePack<8>;
  constexpr int BytePerPack = 8;
  int nElts = args->nElts;
  int nPacks = divUp(nElts, BytePerPack);

  // Spread packs over blocks: every block gets nPackPerBlock packs and the
  // first nPackModBlock blocks get one extra.
  uint32_t nPackPerBlock, nPackModBlock;
  idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
  int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
  int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
  int nBlockPacks = blockPackEnd - blockPackBegin;

  // Bytes in this block's range; the last pack of the buffer may be partial.
  int nBlockElts = nElts - blockPackBegin*BytePerPack;
  nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack);

  char* blockInput = args->input + blockPackBegin*BytePerPack;
  char* blockOutput = args->output + blockPackBegin*BytePerPack;

  // OR together the size and both addresses: the low 3 bits are all zero
  // only when every one of them is a multiple of 8.
  uint32_t lowBits = args->nElts;
  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
  lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
  if (__builtin_expect(lowBits%8 == 0, true)) {
    // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us
    allgather_LL_body(prim, (Pack*)blockInput, (Pack*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8);
  } else {
    // Generic byte-granular path.
    allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts);
  }
}
|
||||
|
||||
// All-gather, LL variant: forwards to the shared implementation with multimem disabled.
__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) {
  constexpr bool kUseMultimem = false;
  ncclSymRun_AllGather_LL_impl(args, kUseMultimem);
}
|
||||
|
||||
// All-gather, LLMC variant: forwards to the shared implementation with multimem enabled.
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) {
  constexpr bool kUseMultimem = true;
  ncclSymRun_AllGather_LL_impl(args, kUseMultimem);
}
|
||||
@@ -0,0 +1,432 @@
|
||||
#include "symmetric.h"
|
||||
#include "symmetric/kernel.h"
|
||||
#include "symmetric/primitives.h"
|
||||
|
||||
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
|
||||
static __device__ __forceinline__ void allreduceDeep(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, char* inputRank0, char* outputRank0, int32_t nIters
|
||||
) {
|
||||
using Pack = BytePack<BytePerPack>;
|
||||
using Acc = typename Red::EltType;
|
||||
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
|
||||
|
||||
int wn = tn/WARP_SIZE;
|
||||
int w = t/WARP_SIZE;
|
||||
int lane = t%WARP_SIZE;
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack acc0[UnrollPacks];
|
||||
|
||||
nIters -= w;
|
||||
if (0 < nIters) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
if (0 < nIters) {
|
||||
while (true) {
|
||||
AccPack acc1[UnrollPacks];
|
||||
int r = rank;
|
||||
if (++r == nRanks) r = 0;
|
||||
{ Pack tmp1[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc1[u] = applyReduce(red, applyCast<T, Acc>(acc0[u]), applyCast<T, Acc>(tmp1[u]));
|
||||
}
|
||||
}
|
||||
|
||||
if (++r == nRanks) r = 0;
|
||||
|
||||
int dr = 2;
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int i = 0;
|
||||
partial ? i < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? i++ : (dr += UnrollPeers)) {
|
||||
if (partial && dr == nRanks) break;
|
||||
|
||||
Pack tmp1[UnrollPeers][UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && ur!=0 && dr+ur == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && ur!=0 && dr+ur == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc1[u] = applyReduce(red, acc1[u], applyCast<T, Acc>(tmp1[ur][u]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast<Acc, T>(acc1[u]);
|
||||
|
||||
dr = 0;
|
||||
r = rank;
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int i = 0;
|
||||
partial ? i < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? i++ : (dr += UnrollPeers)) {
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && dr == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u];
|
||||
}
|
||||
if (++r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
nIters -= wn;
|
||||
if (nIters <= 0) break;
|
||||
|
||||
// Load data for next iteration.
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UnrollPeers, typename Red, typename T>
|
||||
static __device__ __forceinline__ void allreduceEnds(
|
||||
ncclSymPrims& prim, int tn, int t, Red red,
|
||||
T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
) {
|
||||
using Acc = typename Red::EltType;
|
||||
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
|
||||
BytePack<sizeof(T)>* outRank0 = (BytePack<sizeof(T)>*)outputRank0;
|
||||
|
||||
#pragma unroll 1
|
||||
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
|
||||
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
|
||||
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
|
||||
BytePack<sizeof(Acc)> acc1;
|
||||
BytePack<sizeof(T)> tmp[UnrollPeers];
|
||||
int dr = 1;
|
||||
int r = rank+1;
|
||||
if (nRanks == r) r = 0;
|
||||
bool first = true;
|
||||
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int j = 0;
|
||||
partial ? j < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? j++ : (dr += UnrollPeers)) {
|
||||
if (partial && dr == nRanks) break;
|
||||
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && u!=0 && dr+u == nRanks) break;
|
||||
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
if (first) {
|
||||
first = false;
|
||||
acc1 = applyCast<T, Acc>(acc0);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && u!=0 && dr+u == nRanks) break;
|
||||
acc1 = applyReduce(red, acc1, applyCast<T, Acc>(tmp[u]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
acc0 = applyCast<Acc, T>(acc1);
|
||||
dr = 0;
|
||||
r = rank;
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int j=0;
|
||||
partial ? j < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? j++ : (dr += UnrollPeers)) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && dr+u == nRanks) break;
|
||||
*add4G(outRank0+elt, r*stride4G) = acc0;
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void allreduce(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, T* input, T* output, size_t nElts
|
||||
) {
|
||||
int nRanks = prim.nRanks;
|
||||
int nBlocks = prim.nBlocks;
|
||||
// Mpve to rank=0
|
||||
input = prim.peerPtr(0, input);
|
||||
output = prim.peerPtr(0, output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
uint32_t nPreBytes = (16u - inputUptr)%16u;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t cursor = nPreBytes;
|
||||
|
||||
constexpr int MinWarpPerBlock = 4;
|
||||
|
||||
if ((inputUptr-outputUptr)%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
allreduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) {
|
||||
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
constexpr int UnrollPeers = 8;
|
||||
size_t nSufElts = (nBytes-cursor)/sizeof(T);
|
||||
allreduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
}
|
||||
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
|
||||
int /*const&*/ rank = prim.rank;
|
||||
int /*const&*/ nRanks = prim.nRanks;
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
|
||||
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
rank, nRanks,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = nRanks*prim.nBlocks*blockDim.x;
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
}
|
||||
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void allreduceMultimem(
|
||||
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
|
||||
) {
|
||||
// Mpve to multimem
|
||||
input = prim.multimemPtr(input);
|
||||
output = prim.multimemPtr(output);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
constexpr int BytePerPack = LoadMultimem_BigPackSize<Red>::BigPackSize;
|
||||
uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t nSufBytes;
|
||||
|
||||
if (alignof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) {
|
||||
constexpr int UnrollPacks = 16*8/BytePerPack;
|
||||
constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uintptr_t cursor = nPreBytes;
|
||||
int nChunks = (nBytes-cursor)/BytePerChunk;
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk;
|
||||
nSufBytes = nBytes - cursorAfter;
|
||||
cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
cursor += (t%WARP_SIZE)*BytePerPack;
|
||||
int nIters = nChunks - t/WARP_SIZE;
|
||||
#pragma unroll 1
|
||||
while (0 < nIters) {
|
||||
BytePack<BytePerPack> tmp[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = applyLoadMultimem<Red, BytePerPack>(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]);
|
||||
}
|
||||
cursor += tn*UnrollPacks*BytePerPack;
|
||||
nIters -= tn/WARP_SIZE;
|
||||
}
|
||||
} else {
|
||||
nPreBytes = 0;
|
||||
nSufBytes = nBytes;
|
||||
}
|
||||
|
||||
// Get the prefix+suffix element one at a time.
|
||||
#pragma unroll 4
|
||||
for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) {
|
||||
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
|
||||
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
|
||||
multimem_st_global(outputUptr + cursor, val);
|
||||
cursor += tn*sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
|
||||
|
||||
// Threads numbered globally such that we round robin warps by rank then block.
|
||||
int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.rank, prim.nRanks,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int gtn = prim.nRanks*prim.nBlocks*blockDim.x;
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/true);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem);
|
||||
int /*const&*/ rank = prim.rank;
|
||||
using Acc = typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type;
|
||||
Red<Acc> red(args->redOpArg);
|
||||
|
||||
using Pack = BytePack<8>;
|
||||
using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>;
|
||||
constexpr int EltPerPack = 8/sizeof(T);
|
||||
int nElts = args->nElts;
|
||||
int nPacks = divUp(nElts, EltPerPack);
|
||||
|
||||
bool packAligned = 8 <= alignof(T) || (
|
||||
args->nElts*sizeof(T) |
|
||||
(uint32_t)reinterpret_cast<uintptr_t>(args->input) |
|
||||
(uint32_t)reinterpret_cast<uintptr_t>(args->output)
|
||||
)%8 == 0;
|
||||
|
||||
uint32_t nPackPerBlock, nPackModBlock;
|
||||
idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
int begin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
|
||||
int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
|
||||
|
||||
nPacks = end - begin;
|
||||
nElts -= begin*EltPerPack;
|
||||
nElts = min(nElts, nPacks*EltPerPack);
|
||||
T* input = (T*)args->input + begin*EltPerPack;
|
||||
T* output = (T*)args->output + begin*EltPerPack;
|
||||
|
||||
ncclCoopCta cta;
|
||||
int t = threadIdx.x;
|
||||
int tn = ncclSymMaxThreads;
|
||||
|
||||
if (__builtin_expect(packAligned, true)) {
|
||||
#pragma unroll 1
|
||||
while (0 < nPacks) {
|
||||
if (t < nPacks) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>((Pack*)input, t, nPacks);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
|
||||
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
|
||||
storePack((Pack*)output, t, nPacks, out);
|
||||
}
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
} else {
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
if (t*EltPerPack < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
Pack inp = loadPack<Pack>(input, t*EltPerPack, nElts);
|
||||
prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp);
|
||||
Pack out = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
|
||||
storePack(output, t*EltPerPack, nElts, out);
|
||||
}
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nElts -= tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/false);
|
||||
}
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) {
|
||||
ncclSymRun_AllReduce_AGxLL_R_impl<Red, T>(args, /*multimem=*/true);
|
||||
}
|
||||
Εκτελέσιμο αρχείο
+247
@@ -0,0 +1,247 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import sys
|
||||
|
||||
################################################################################
|
||||
# The first command line argument is the path to the directory to generate and
|
||||
# populate.
|
||||
|
||||
# Output directory, taken from the first command line argument.
gensrc = sys.argv[1]

# Start from an empty directory: create it if missing, otherwise delete every
# entry it currently contains.
if not os.path.exists(gensrc):
  os.mkdir(gensrc)
else:
  for stale in os.listdir(gensrc):
    stale_path = os.path.join(gensrc, stale)
    os.remove(stale_path)
    #os.truncate(os.path.join(gensrc, name), 0)
|
||||
|
||||
def paste(sep, *args):
  """Join the positional string arguments with `sep` between them."""
  pieces = args
  return sep.join(pieces)
|
||||
|
||||
# Current indentation depth applied by emitln(); adjusted by the code below.
indents = 0
def emitln(f, lines):
  """Write one line (a str) or several lines (an iterable of str) to `f`.

  Each line is prefixed with the indentation for the current `indents`
  depth and terminated with a newline.
  """
  global indents
  batch = (lines,) if isinstance(lines, str) else lines
  for text in batch:
    prefix = ' '*indents
    f.write(prefix + text + '\n')
|
||||
|
||||
def indent(s):
  """Return `s` with every line prefixed by one level of indentation."""
  shifted = []
  for line in s.splitlines():
    shifted.append(' '+line)
  return '\n'.join(shifted)
|
||||
|
||||
class Rec(object):
  """Lightweight record: keyword arguments become attributes.

  Two records are equal when they carry exactly the same attribute names
  with equal values; the hash is consistent with that equality and does not
  depend on the order in which attributes were given.
  """
  def __init__(me, **kw):
    me.__dict__.update(kw)
  def __eq__(x, y):
    # Compare attribute dictionaries directly. Rec defines no __len__,
    # __iter__ or __getitem__, so comparing through len()/iteration/indexing
    # on the records themselves would raise TypeError.
    if not isinstance(y, Rec): return NotImplemented
    return x.__dict__ == y.__dict__
  def __hash__(me):
    # Sum of per-item hashes so that attribute order does not matter.
    h = 0
    for k in me.__dict__:
      h += hash((k, me.__dict__[k]))
    return h
|
||||
|
||||
################################################################################
|
||||
# Edit this region for introducing new algos etc
|
||||
|
||||
# Collectives that take a reduction operator and an element type.
reductions = ["AllReduce", "ReduceScatter"]
# Reduction operators and element types to generate kernels for.
all_reds = ["sum"]
all_tys = ["f32", "f16", "bf16", "f8e4m3", "f8e5m2"]

# Algorithms treated as NVLS by required_cuda(), per collective.
nvls_algos_by_coll = {
  "AllReduce": ["AGxLLMC_R", "RSxLDMC_AGxSTMC"],
  "ReduceScatter": ["LDMC"],
}
# Algorithms that get separate handling when combined with f8 types.
ldmc_algos = ["RSxLDMC_AGxSTMC", "LDMC"]

# Collective name -> lower-case stem used for file and header names.
coll_to_lower = {
  "AllGather": "all_gather",
  "AllReduce": "all_reduce",
  "ReduceScatter": "reduce_scatter",
}

# Reduction name -> ncclDevRedOp_t enumerator used in the generated switch.
red_to_ncclDevRedOp = {
  "sum": "ncclDevSum",
}
# Reduction name -> C++ functor template passed to ncclSymRun_*.
red_to_Func = {
  "sum": "FuncSum",
}

# Type name -> ncclDataType_t enumerator used in the generated switch.
ty_to_ncclDataType = {
  "f32": "ncclFloat32",
  "f16": "ncclFloat16",
  "bf16": "ncclBfloat16",
  "f8e4m3": "ncclFloat8e4m3",
  "f8e5m2": "ncclFloat8e5m2",
}
# Type name -> C++ element type used to instantiate the kernels.
ty_to_cxxtype = {
  "f32": "float",
  "f16": "half",
  "bf16": "hip_bfloat16",
  "f8e4m3": "rccl_float8",
  "f8e5m2": "rccl_bfloat8",
}
|
||||
|
||||
def enumerate_kernels():
  """Yield one Rec per kernel to generate, in a fixed order.

  AllGather kernels come first (no reduction or type), followed by the
  AllReduce and ReduceScatter kernels for every reduction/type pair.
  """
  ag_algos = ["LL", "ST"]
  ar_algos = ["AGxLL_R", "RSxLD_AGxST"]
  rs_algos = ["LL", "LD"]

  for algo in ag_algos:
    yield Rec(coll="AllGather", algo=algo)

  for red in all_reds:
    for ty in all_tys:
      per_coll = (("AllReduce", ar_algos), ("ReduceScatter", rs_algos))
      for coll, algos in per_coll:
        for algo in algos:
          yield Rec(coll=coll, algo=algo, red=red, ty=ty)
|
||||
|
||||
def required_cuda(k):
  """Return (cudart, arch, specific_sms) requirements for kernel `k`.

  `cudart` is the minimum runtime version (0 when unconstrained), `arch` the
  minimum architecture (None when `specific_sms` is used instead), and
  `specific_sms` an explicit list of SMs or None.
  """
  min_cudart = 0
  min_arch = 0
  sms = None

  if k.algo in nvls_algos_by_coll.get(k.coll, []):
    min_cudart = max(min_cudart, 12010)
    min_arch = 900

  # Only reduction kernels carry a `ty` attribute.
  if k.coll in reductions:
    if k.ty == "bf16":
      min_cudart = max(min_cudart, 11000)
    if k.ty.startswith("f8"):
      min_cudart = max(min_cudart, 11080)
      min_arch = 900
      if k.algo in ldmc_algos:
        # f8 with an LDMC algorithm is pinned to specific SMs rather than a
        # minimum architecture.
        min_cudart = 12070
        min_arch = None
        sms = [100, 120]

  return (min_cudart, min_arch, sms)
|
||||
|
||||
################################################################################
|
||||
|
||||
def kernel_fdep(k):
  """Return the per-collective source file name for `k`, e.g. all_reduce.cpp."""
  stem = coll_to_lower[k.coll]
  return stem + '.cpp'
|
||||
|
||||
def kernel_fname(k):
  """Return the name of the generated file that holds kernel `k`.

  Non-reduction kernels share one file per collective. Reduction kernels get
  one file per (collective, reduction, type), with the algorithm appended for
  f8 types combined with an LDMC algorithm.
  """
  stem = coll_to_lower[k.coll]
  if k.coll not in reductions:
    return stem + '.cpp'
  parts = [stem, k.red, k.ty]
  if k.algo in ldmc_algos and k.ty.startswith('f8'):
    parts.append(k.algo)
  return paste('_', *parts) + '.cpp'
|
||||
|
||||
def kernel_gencode(k):
  """Return the make variable naming the gencode flags for kernel `k`."""
  needs_ldmc_fp8 = (
    k.coll in reductions
    and k.algo in ldmc_algos
    and k.ty.startswith('f8')
  )
  return "$(NVCC_GENCODE_LDMC_FP8)" if needs_ldmc_fp8 else "$(NVCC_GENCODE)"
|
||||
|
||||
def kernel_cname(k):
  """Return the C symbol name of the __global__ function for kernel `k`."""
  parts = ["ncclSymDevKernel", k.coll, k.algo]
  if k.coll in reductions:
    # Reduction kernels are additionally distinguished by operator and type.
    parts.extend([k.red, k.ty])
  return paste("_", *parts)
|
||||
|
||||
def kernel_conds(k):
  """Return (cudart_cond, arch_cond) preprocessor conditions for kernel `k`.

  Both are None when required_cuda() reports no runtime version requirement.
  """
  cudart, arch, specific_sms = required_cuda(k)
  if cudart == 0:
    return (None, None)

  cudart_cond = "CUDART_VERSION >= %d"%cudart
  if specific_sms:
    # Leading "0" keeps the expression well-formed for any number of SMs.
    terms = ["0"]
    for sm in specific_sms:
      terms.append("NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm))
    arch_cond = " || ".join(terms)
  else:
    arch_cond = "__CUDA_ARCH__ >= %d"%arch
  return cudart_cond, arch_cond
|
||||
|
||||
def instantiate(k):
  """Return the C++ source of the __global__ wrapper for kernel `k`.

  The wrapper calls ncclSymRun_<coll>_<algo>, adding <Func, type> template
  arguments for reduction kernels.
  """
  callee = "ncclSymRun_" + k.coll+'_'+k.algo
  cname = kernel_cname(k)
  if k.coll in reductions:
    callee += "<{red}, {ty}>".format(red=red_to_Func[k.red], ty=ty_to_cxxtype[k.ty])
  template = (
    "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const *args) {{\n"
    " {callee}(args);\n"
    "}}"
  )
  return template.format(cname=cname, callee=callee)
|
||||
|
||||
def prototype(k):
  """Return the forward declaration of the __global__ function for kernel `k`."""
  cname = kernel_cname(k)
  return "__global__ void {cname}(ncclSymDevArgs const *args);".format(cname=cname)
|
||||
|
||||
################################################################################
|
||||
|
||||
def partition(vals, keyfn):
  """Group `vals` by `keyfn(value)`.

  Returns a dict mapping each key to the list of values that produced it.
  Keys appear in order of first occurrence; values keep their input order.
  """
  groups = {}
  for item in vals:
    groups.setdefault(keyfn(item), []).append(item)
  return groups
|
||||
|
||||
|
||||
# The kernel list is deterministic, so enumerate it once and reuse it.
all_kernels = list(enumerate_kernels())

# Map (file name, collective) -> kernels instantiated in that file.
kernels_by_file = partition(all_kernels, lambda k: (kernel_fname(k), k.coll))

# Add dependency only files (e.g. allreduce.cpp)
for coll in set(k.coll for k in all_kernels):
  dep_key = (coll_to_lower[coll]+'.cpp', coll)
  if dep_key not in kernels_by_file:
    kernels_by_file[dep_key] = []

# Generate each kernel instantiation file
for (fname, coll), ks in kernels_by_file.items():
  out_path = os.path.join(gensrc, fname)
  with open(out_path, "w") as f:
    print("-- Generating %s" % out_path)
    emitln(f, '#include "symmetric.h"')
    emitln(f, '#include "symmetric/kernel.h"')
    emitln(f, '#include "symmetric/{coll}.h"'.format(coll=coll_to_lower[coll]))
    for k in ks:
      emitln(f, instantiate(k))

# Generate <gensrc>/symmetric_kernels.cc: prototypes, the kernel table and the
# ncclSymGetKernelPtr() lookup function.
host_path = os.path.join(gensrc, "symmetric_kernels.cc")
with open(host_path, "w") as f:
  print("-- Generating %s" % host_path)
  emitln(f, '#include "symmetric.h"')
  emitln(f, '#include "device.h"')
  emitln(f, '')

  for k in all_kernels:
    emitln(f, prototype(k))
  emitln(f, '')

  # Null-terminated table of every kernel, plus its length.
  emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(all_kernels))
  emitln(f, 'extern void* const ncclSymKernelList[] = {')
  for k in all_kernels:
    emitln(f, '(void*){cname},'.format(cname=kernel_cname(k)))
  emitln(f, 'nullptr};')
  emitln(f, '')

  # Nested switch: kernel id -> reduction op -> data type.
  emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {')
  indents += 1
  emitln(f, 'switch (id) {')
  emitln(f, 'default: return nullptr;')
  by_coll_algo = partition(all_kernels, lambda k: (k.coll, k.algo))
  for (coll, algo), coll_algo_ks in by_coll_algo.items():
    emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':')
    indents += 1
    if len(coll_algo_ks) == 1:
      # Single kernel for this id: no reduction/type dispatch needed.
      emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';')
    else:
      emitln(f, 'switch ((ncclDevRedOp_t)red) {')
      emitln(f, 'default: return nullptr;')
      by_red = partition(coll_algo_ks, lambda k: k.red)
      for red, coll_algo_red_ks in by_red.items():
        emitln(f, 'case '+red_to_ncclDevRedOp[red]+':')
        indents += 1
        emitln(f, 'switch (ty) {')
        emitln(f, 'default: return nullptr;')
        for k in coll_algo_red_ks:
          emitln(f, 'case '+ty_to_ncclDataType[k.ty]+': return (void*)'+kernel_cname(k)+';')
        emitln(f, '}')
        indents -= 1
      emitln(f, '}')
    indents -= 1
  emitln(f, '}')
  indents -= 1
  emitln(f, '}')
|
||||
@@ -0,0 +1,27 @@
|
||||
#ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_
|
||||
#define NCCL_DEVICE_SYMMETRIC_KERNEL_H_
|
||||
|
||||
#include "symmetric.h"
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args);
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args);
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args);
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args);
|
||||
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args);
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args);
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args);
|
||||
__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args);
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args);
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args);
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args);
|
||||
#endif
|
||||
@@ -0,0 +1,477 @@
|
||||
#ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_
|
||||
#define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_
|
||||
|
||||
#include "symmetric.h"
|
||||
#include "bitops.h"
|
||||
#include "collectives.h"
|
||||
#include "op128.h"
|
||||
#include "reduce_kernel.h"
|
||||
#include "common.h"
|
||||
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
// __grid_constant__ appears to break cuda-gdb
|
||||
#define NCCL_GRID_CONSTANT __grid_constant__
|
||||
#else
|
||||
#define NCCL_GRID_CONSTANT
|
||||
#endif
|
||||
|
||||
// flattenIx(pos0, dim0, pos1, dim1, pos2, dim2, ...)
|
||||
// Given a position vector `pos` in a rectangular index space with lengths in the `dim`
|
||||
// vector, flatten that down to a linear index. The fastest moving dimension is given first.
|
||||
// Recursion terminator: an empty index space flattens to 0.
__device__ __forceinline__ int flattenIx() {
  return 0;
}

// Peel off the fastest-moving (pos, size) pair and recurse on the rest:
// result = pos + size * flattenIx(remaining pairs...).
template<typename Int0, typename Int1, typename ...Ints>
static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) {
  auto outer = flattenIx(more...);
  return pos + size*outer;
}
|
||||
|
||||
// Precomputed integer reciprocoals for denominator values 1..64 inclusive.
|
||||
// Pass these to idivFast64() for fast division on the GPU.
|
||||
// Table lookup of idivRcp64(x) for x in 1..64. Entry 0 is a placeholder that
// repeats the value for 1. `x` is not range-checked.
static __device__ uint64_t idivRcp64_upto64(int x) {
  static constexpr uint64_t table[65] = {
    /* 0.. 7*/ idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03),
               idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07),
    /* 8..15*/ idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b),
               idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f),
    /*16..23*/ idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13),
               idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17),
    /*24..31*/ idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b),
               idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f),
    /*32..39*/ idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23),
               idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27),
    /*40..47*/ idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b),
               idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f),
    /*48..55*/ idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33),
               idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37),
    /*56..63*/ idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b),
               idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f),
    /*64    */ idivRcp64(0x40)
  };
  return table[x];
}
|
||||
|
||||
// 32-bit reciprocal for x in 1..64: the upper 32 bits of the 64-bit table entry.
static __device__ uint32_t idivRcp32_upto64(int x) {
  uint64_t rcp64 = idivRcp64_upto64(x);
  return rcp64>>32;
}
|
||||
|
||||
namespace {
|
||||
struct ncclCoopCta {
|
||||
__device__ void sync() { __syncthreads(); }
|
||||
__device__ int self() { return threadIdx.x; }
|
||||
__device__ int count() { return blockDim.x; }
|
||||
};
|
||||
struct ncclCoopWarps {
|
||||
int log2_nWarps;
|
||||
__device__ void sync() {
|
||||
asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<<log2_nWarps) : "memory");
|
||||
}
|
||||
__device__ int self() { return threadIdx.x & ((32<<log2_nWarps)-1); }
|
||||
__device__ int count() { return 32<<log2_nWarps; }
|
||||
};
|
||||
struct ncclCoopWarp {
|
||||
__device__ void sync() { __syncwarp(); }
|
||||
__device__ int self() { return threadIdx.x%32; }
|
||||
__device__ int count() { return 32; }
|
||||
};
|
||||
}
|
||||
|
||||
namespace {
|
||||
static constexpr int ncclSymPrims_UseBarrier = 1;
|
||||
static constexpr int ncclSymPrims_UseLL = 2;
|
||||
static constexpr int ncclSymPrims_UseMultimem = 4;
|
||||
struct ncclSymPrims {
|
||||
int flags;
|
||||
int const &rank;
|
||||
int const &nRanks;
|
||||
uint32_t const &nRanks_rcp32;
|
||||
int block, nBlocks;
|
||||
uint32_t nBlocks_rcp32;
|
||||
uint32_t nBlocks_nWarps_rcp32;
|
||||
uint32_t nRanks_nBlocks_rcp32;
|
||||
uint32_t nWarpPerRank, nWarpPerRank_rcp32;
|
||||
struct ncclSymDevBase* const &base;
|
||||
uintptr_t offsetMc;
|
||||
|
||||
uint32_t const &stride4G;
|
||||
uint32_t barEpoch;
|
||||
uint32_t llEpoch;
|
||||
|
||||
__device__ ncclSymPrims(ncclSymDevComm const &comm, int flags):
|
||||
flags(flags),
|
||||
rank(comm.rank),
|
||||
nRanks(comm.nRanks),
|
||||
nRanks_rcp32(comm.nRanks_rcp32),
|
||||
block(blockIdx.x),
|
||||
nBlocks(gridDim.x),
|
||||
nBlocks_rcp32(idivRcp32_upto64(nBlocks)),
|
||||
nBlocks_nWarps_rcp32(imulRcp32(nBlocks, nBlocks_rcp32, blockDim.x/32, idivRcp32_upto64(blockDim.x/32))),
|
||||
nRanks_nBlocks_rcp32(imulRcp32(nRanks, nRanks_rcp32, gridDim.x, nBlocks_rcp32)),
|
||||
nWarpPerRank(idivFast32(nBlocks*blockDim.x/32, nRanks, nRanks_rcp32)),
|
||||
nWarpPerRank_rcp32(idivRcp32_upto64(nWarpPerRank)),
|
||||
base(comm.base),
|
||||
offsetMc((flags & ncclSymPrims_UseMultimem) ? (char*)comm.baseMc - (char*)base : 0x0),
|
||||
stride4G(comm.stride4G) {
|
||||
|
||||
#if CUDART_VERSION >= 12030 && __CUDA_ARCH__ >= 900
|
||||
cudaGridDependencySynchronize();
|
||||
#endif
|
||||
|
||||
if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) {
|
||||
barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block];
|
||||
}
|
||||
if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2;
|
||||
}
|
||||
__device__ ~ncclSymPrims() {
|
||||
if (threadIdx.x == 0) {
|
||||
if (flags & ncclSymPrims_UseBarrier) {
|
||||
((flags & ncclSymPrims_UseMultimem) ? base->barEpochMc : base->barEpochUc)[block] = barEpoch;
|
||||
}
|
||||
if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ T* peerPtr(int peer, T* selfPtr) {
|
||||
return add4G(selfPtr, (peer-rank)*stride4G);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ T* multimemPtr(T* selfPtr) {
|
||||
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(selfPtr) + offsetMc);
|
||||
}
|
||||
|
||||
__device__ void barrierArrive(ncclCoopCta cta, bool release) {
|
||||
cta.sync();
|
||||
#if __CUDA_ARCH__ < 700
|
||||
if (release) {
|
||||
if (cta.self() == 0) __threadfence_system();
|
||||
cta.sync();
|
||||
}
|
||||
#endif
|
||||
if (flags & ncclSymPrims_UseMultimem) {
|
||||
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
|
||||
if (cta.self() == 0) {
|
||||
uint32_t* inbox = &multimemPtr(base)->barInboxMc[block];
|
||||
if (release) {
|
||||
asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
|
||||
} else {
|
||||
asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
int r = cta.self();
|
||||
if (r != rank && r < nRanks) {
|
||||
uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank];
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
if (release) {
|
||||
asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
|
||||
} else {
|
||||
asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
|
||||
}
|
||||
#else
|
||||
if (release) {
|
||||
__atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELEASE);
|
||||
} else {
|
||||
__atomic_store_n(inbox, barEpoch + 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
// asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void barrierWait(ncclCoopCta cta, bool acquire) {
|
||||
if (flags & ncclSymPrims_UseMultimem) {
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
if (cta.self() == 0) {
|
||||
uint32_t* inbox = &base->barInboxMc[block];
|
||||
while (true) {
|
||||
uint32_t got;
|
||||
if (acquire) {
|
||||
asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
|
||||
} else {
|
||||
asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
|
||||
}
|
||||
if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break;
|
||||
}
|
||||
barEpoch += nRanks;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
int r = cta.self();
|
||||
if (r != rank && r < nRanks) {
|
||||
uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r];
|
||||
while (true) {
|
||||
uint32_t got;
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
if (acquire) {
|
||||
asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
|
||||
} else {
|
||||
asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
|
||||
}
|
||||
#else
|
||||
if (acquire) {
|
||||
got = __atomic_load_n(inbox, __ATOMIC_ACQUIRE);
|
||||
} else {
|
||||
got = __atomic_load_n(inbox, __ATOMIC_RELAXED);
|
||||
}
|
||||
// asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox));
|
||||
#endif
|
||||
if (got-(barEpoch+1) <= uint32_t(-1)>>1) break;
|
||||
}
|
||||
}
|
||||
#if __CUDA_ARCH__ < 700
|
||||
if (acquire) {
|
||||
cta.sync();
|
||||
if (cta.self() == 0) __threadfence();
|
||||
}
|
||||
#endif
|
||||
barEpoch += 1;
|
||||
}
|
||||
cta.sync();
|
||||
}
|
||||
|
||||
// Finishes the current LL epoch and advances `llEpoch`.
//
// When the epoch is one of the two largest uint32 values (>= -2u), the LL
// buffer selected for this epoch is zeroed cooperatively by the block, 16 bytes
// per thread per step. The epoch then advances by 1, except from 0xFFFFFFFF
// where it advances by 3 and therefore wraps to 2.
// NOTE(review): presumably the zeroing keeps stale flag words from matching
// after wrap-around -- confirm against ncclSymDevBase_getLLBuf.
__device__ void endLL(ncclCoopCta cta) {
  bool nearWrap = llEpoch >= -2u;
  if (__builtin_expect(nearWrap, false)) {
    cta.sync();
    uint4* epochBuf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch);
    int epochBytes = ncclSymLLEpochSize(nRanks);
    #pragma unroll 4
    for (int line=cta.self(); line*16 < epochBytes; line += cta.count()) {
      epochBuf[line] = uint4{0, 0, 0, 0};
    }
  }
  cta.sync();
  if (llEpoch == -1u) {
    llEpoch += 3;
  } else {
    llEpoch += 1;
  }
}
|
||||
|
||||
template<typename T>
|
||||
__device__ void sendLL(int peer, int slot, T val) {
|
||||
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
|
||||
tmp = val;
|
||||
uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot;
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T),8); u++) {
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = u32[u][0];
|
||||
i4[1] = llEpoch;
|
||||
i4[2] = u32[u][1];
|
||||
i4[3] = llEpoch;
|
||||
#if defined(__gfx950__)
|
||||
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
|
||||
#else
|
||||
__builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
|
||||
#endif
|
||||
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ void bcastLL(int slot, T val) {
|
||||
if (flags & ncclSymPrims_UseMultimem) {
|
||||
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
|
||||
tmp = val;
|
||||
uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot;
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T),8); u++) {
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = u32[u][0];
|
||||
i4[1] = llEpoch;
|
||||
i4[2] = u32[u][1];
|
||||
i4[3] = llEpoch;
|
||||
#if defined(__gfx950__)
|
||||
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
|
||||
#else
|
||||
__builtin_nontemporal_store(i4, (Vec*)(bufmc + ncclSymLLMaxSlots(sizeof(T))*u));
|
||||
#endif
|
||||
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
|
||||
}
|
||||
} else {
|
||||
union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
|
||||
tmp = val;
|
||||
uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot;
|
||||
int dr = 0;
|
||||
int r = rank;
|
||||
#pragma unroll 1
|
||||
for (; dr+8 <= nRanks; dr += 8) {
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < 8; ur++) {
|
||||
uint4* buf = add4G(buf0, r*stride4G);
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T),8); u++) {
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = u32[u][0];
|
||||
i4[1] = llEpoch;
|
||||
i4[2] = u32[u][1];
|
||||
i4[3] = llEpoch;
|
||||
#if defined(__gfx950__)
|
||||
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
|
||||
#else
|
||||
__builtin_nontemporal_store(i4, (Vec*)((buf + ncclSymLLMaxSlots(sizeof(T))*u)));
|
||||
#endif
|
||||
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
|
||||
}
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < 8; ur++, dr++) {
|
||||
if (dr == nRanks) break;
|
||||
uint4* buf = add4G(buf0, r*stride4G);
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T),8); u++) {
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = u32[u][0];
|
||||
i4[1] = llEpoch;
|
||||
i4[2] = u32[u][1];
|
||||
i4[3] = llEpoch;
|
||||
#if defined(__gfx950__)
|
||||
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "v"(i4));
|
||||
#else
|
||||
__builtin_nontemporal_store(i4, (Vec*)(buf + ncclSymLLMaxSlots(sizeof(T))*u));
|
||||
#endif
|
||||
// asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch));
|
||||
}
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int nSlotsMin, int nSlotsMax, typename T>
|
||||
__device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) {
|
||||
uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0;
|
||||
uint4 tmp[nSlotsMax][divUp(sizeof(T),8)];
|
||||
//int spins=0;
|
||||
while (true) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < nSlotsMax; u++) {
|
||||
if (u < nSlotsMin || u < nSlots) {
|
||||
#pragma unroll
|
||||
for (int v=0; v < divUp(sizeof(T),8); v++) {
|
||||
tmp[u][v] = *(buf + u * stride + v * ncclSymLLMaxSlots(sizeof(T)));
|
||||
// asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T))));
|
||||
}
|
||||
}
|
||||
}
|
||||
bool okAll = true;
|
||||
#pragma unroll
|
||||
for (int u=0; u < nSlotsMax; u++) {
|
||||
#pragma unroll
|
||||
for (int v=0; v < divUp(sizeof(T),8); v++) {
|
||||
if (u < nSlotsMin || u < nSlots) {
|
||||
bool ok = tmp[u][v].y == llEpoch &&
|
||||
tmp[u][v].w == llEpoch;
|
||||
okAll &= ok;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (__builtin_expect(okAll, true)) break;
|
||||
//if (spins++ == 10<<20) spins=0;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < nSlotsMax; u++) {
|
||||
if (nSlotsMin <= u && u == nSlots) break;
|
||||
union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; };
|
||||
#pragma unroll
|
||||
for (int v=0; v < divUp(sizeof(T),8); v++) {
|
||||
u32[v][0] = tmp[u][v].x;
|
||||
u32[v][1] = tmp[u][v].z;
|
||||
}
|
||||
elts[u] = val;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Pack, typename T, typename Red, int Unroll=8>
|
||||
__device__ Pack recvReduceLL(int slot, int stride, Red red) {
|
||||
using Acc = typename Red::EltType;
|
||||
using AccPack = BytePack<sizeof(Pack)*sizeof(Acc)/sizeof(T)>;
|
||||
AccPack acc;
|
||||
bool first = true;
|
||||
int r = 0;
|
||||
#pragma unroll 1
|
||||
for (; r+Unroll <= nRanks; r += Unroll) {
|
||||
Pack got[Unroll];
|
||||
this->template recvLL</*Min=*/Unroll>(slot + r*stride, Unroll, stride, got);
|
||||
AccPack acc0 = applyCast<T, Acc>(got[0]);
|
||||
acc = first ? acc0 : applyReduce(red, acc, acc0);
|
||||
first = false;
|
||||
#pragma unroll
|
||||
for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
|
||||
}
|
||||
if (r < nRanks) {
|
||||
Pack got[Unroll];
|
||||
this->template recvLL</*Min=*/1>(slot + r*stride, nRanks-r, stride, got);
|
||||
AccPack acc0 = applyCast<T, Acc>(got[0]);
|
||||
acc = first ? acc0 : applyReduce(red, acc, acc0);
|
||||
#pragma unroll
|
||||
for (int i=1; i < Unroll-1; i++) {
|
||||
if (r+i < nRanks) acc = applyReduce(red, acc, applyCast<T, Acc>(got[i]));
|
||||
}
|
||||
}
|
||||
return applyCast<Acc, T>(acc);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ T recvLL(int slot) {
|
||||
T one[1];
|
||||
this->template recvLL<1, 1, T>(slot, 1, 0, one);
|
||||
return one[0];
|
||||
}
|
||||
|
||||
template<typename Coop, typename T>
|
||||
__device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) {
|
||||
int me = coop.self();
|
||||
if (me < nSlots) {
|
||||
uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me;
|
||||
uint4 got[divUp(sizeof(T), 8)];
|
||||
//int spins=0;
|
||||
#pragma unroll 1
|
||||
while (true) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T), 8); u++) {
|
||||
got[u] = *((buf + u * ncclSymLLMaxSlots(sizeof(T))));
|
||||
// asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T))));
|
||||
}
|
||||
bool ok = true;
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T), 8); u++) {
|
||||
ok &= got[u].y == llEpoch;
|
||||
ok &= got[u].w == llEpoch;
|
||||
}
|
||||
if (__builtin_expect(ok, true)) break;
|
||||
//if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); }
|
||||
}
|
||||
union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; };
|
||||
#pragma unroll
|
||||
for (int u=0; u < divUp(sizeof(T), 8); u++) {
|
||||
u32[u][0] = got[u].x;
|
||||
u32[u][1] = got[u].z;
|
||||
}
|
||||
dst[slot0 + me] = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T, bool nvls>
|
||||
struct ncclSymAccumType { using Type = T; };
|
||||
|
||||
// Only Red's whose opArg is invariant w.r.t. the datatype can have a different
|
||||
// accumulator type. At the moment this excludes integer min/max, sumpostdiv,
|
||||
// and premulsum.
|
||||
template<> struct ncclSymAccumType<FuncSum, __half, false> { using Type = float; };
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
template<> struct ncclSymAccumType<FuncSum, __nv_bfloat16, false> { using Type = float; };
|
||||
#endif
|
||||
#if defined(__CUDA_FP8_TYPES_EXIST__)
|
||||
template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e4m3, false> { using Type = float; };
|
||||
template<> struct ncclSymAccumType<FuncSum, __nv_fp8_e5m2, false> { using Type = float; };
|
||||
#endif
|
||||
#endif
|
||||
@@ -0,0 +1,387 @@
|
||||
#include "symmetric.h"
|
||||
#include "symmetric/kernel.h"
|
||||
#include "symmetric/primitives.h"
|
||||
|
||||
template<int BytePerPack, int UnrollPacks, int UnrollPeers, typename T, typename Red>
|
||||
static __device__ void reduceDeep(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, char* inputRank0, char* outputHere, int32_t nIters
|
||||
) {
|
||||
using Pack = BytePack<BytePerPack>;
|
||||
using Acc = typename Red::EltType;
|
||||
using AccPack = BytePack<BytePerPack*sizeof(Acc)/sizeof(T)>;
|
||||
|
||||
int wn = tn/WARP_SIZE;
|
||||
int w = t/WARP_SIZE;
|
||||
int lane = t%WARP_SIZE;
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane;
|
||||
Pack acc0[UnrollPacks];
|
||||
|
||||
nIters -= w;
|
||||
if (0 < nIters) {
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
if (0 < nIters) {
|
||||
while (true) {
|
||||
AccPack acc1[UnrollPacks];
|
||||
int r = rank+1;
|
||||
if (r == nRanks) r = 0;
|
||||
{ Pack tmp1[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc1[u] = applyReduce(red, applyCast<T, Acc>(acc0[u]), applyCast<T, Acc>(tmp1[u]));
|
||||
}
|
||||
}
|
||||
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
|
||||
int dr = 2;
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int i = 0;
|
||||
partial ? i < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? i++ : (dr += UnrollPeers)) {
|
||||
if (partial && dr == nRanks) break;
|
||||
|
||||
Pack tmp1[UnrollPeers][UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && ur!=0 && dr+ur == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int ur=0; ur < UnrollPeers-partial; ur++) {
|
||||
if (partial && ur!=0 && dr+ur == nRanks) break;
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc1[u] = applyReduce(red, acc1[u], applyCast<T, Acc>(tmp1[ur][u]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast<Acc, T>(acc1[u]);
|
||||
|
||||
#pragma unroll UnrollPacks
|
||||
for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u];
|
||||
|
||||
inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE;
|
||||
nIters -= wn;
|
||||
if (nIters <= 0) break;
|
||||
|
||||
// Load data for next iteration.
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int UnrollPeers, typename Red, typename T>
|
||||
static __device__ void reduceEnds(
|
||||
ncclSymPrims& prim, int tn, int t, Red red,
|
||||
T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts
|
||||
) {
|
||||
using Acc = typename Red::EltType;
|
||||
|
||||
int const& rank = prim.rank;
|
||||
int const& nRanks = prim.nRanks;
|
||||
uint32_t const& stride4G = prim.stride4G;
|
||||
BytePack<sizeof(T)>* inpRank0 = (BytePack<sizeof(T)>*)inputRank0;
|
||||
BytePack<sizeof(T)>* outHere = (BytePack<sizeof(T)>*)outputHere;
|
||||
#pragma unroll 1
|
||||
for (size_t i = t; i < nPreElts+nSufElts; i += tn) {
|
||||
size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i;
|
||||
BytePack<sizeof(T)> acc0 = *add4G(inpRank0+elt, rank*stride4G);
|
||||
BytePack<sizeof(Acc)> acc1;
|
||||
BytePack<sizeof(T)> tmp[UnrollPeers];
|
||||
int dr = 1;
|
||||
int r = rank+1;
|
||||
if (nRanks == r) r = 0;
|
||||
bool first = true;
|
||||
|
||||
#pragma unroll 2
|
||||
for (int partial=0; partial <= 1; partial++) {
|
||||
#pragma unroll 1
|
||||
for (int j = 0;
|
||||
partial ? j < 1 : (dr + UnrollPeers <= nRanks);
|
||||
partial ? j++ : (dr += UnrollPeers)) {
|
||||
if (partial && dr == nRanks) break;
|
||||
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && u!=0 && dr+u == nRanks) break;
|
||||
tmp[u] = *add4G(inpRank0+elt, r*stride4G);
|
||||
r += 1;
|
||||
if (r == nRanks) r = 0;
|
||||
}
|
||||
if (first) {
|
||||
first = false;
|
||||
acc1 = applyCast<T, Acc>(acc0);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPeers-partial; u++) {
|
||||
if (partial && u!=0 && dr+u == nRanks) break;
|
||||
acc1 = applyReduce(red, acc1, applyCast<T, Acc>(tmp[u]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
acc0 = applyCast<Acc, T>(acc1);
|
||||
outHere[elt] = acc0;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void reduce(
|
||||
ncclSymPrims& prim, int tn, int t, bool waitNeeded,
|
||||
Red red, T* input, T* output, size_t nElts
|
||||
) {
|
||||
int nRanks = prim.nRanks;
|
||||
int nBlocks = prim.nBlocks;
|
||||
// Move input to rank=0
|
||||
input = prim.peerPtr(0, input);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
uint32_t alignment = uint32_t(inputUptr - outputUptr);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
uint32_t nPreBytes = (16u - inputUptr)%16u;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t cursor = nPreBytes;
|
||||
|
||||
constexpr int MinWarpPerBlock = 4;
|
||||
|
||||
if (alignment%16 == 0) {
|
||||
constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
reduceDeep<BytePerPack, UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeof(T) == 4 || (sizeof(T) < 4 && alignment%4 == 0)) {
|
||||
constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4;
|
||||
constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uint32_t chunks = (nBytes-cursor)/BytePerChunk;
|
||||
chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32);
|
||||
if (chunks != 0) {
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk;
|
||||
reduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>(
|
||||
prim, tn, t, waitNeeded, red,
|
||||
(char*)input + cursor, (char*)output + cursor,
|
||||
chunks*MinWarpPerBlock
|
||||
);
|
||||
cursor = cursorAfter;
|
||||
waitNeeded = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
constexpr int UnrollPeers = 8;
|
||||
size_t nSufElts = (nBytes-cursor)/sizeof(T);
|
||||
reduceEnds<UnrollPeers>(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts);
|
||||
}
|
||||
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier);
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
|
||||
|
||||
// Round robin warps over blocks.
|
||||
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int tn = prim.nBlocks*blockDim.x;
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
//prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
}
|
||||
|
||||
|
||||
template<typename Red, typename T>
|
||||
static __device__ void reduceMultimem(
|
||||
ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts
|
||||
) {
|
||||
// Mpve input to multimem
|
||||
input = prim.multimemPtr(input);
|
||||
|
||||
uintptr_t inputUptr = reinterpret_cast<uintptr_t>(input);
|
||||
uintptr_t outputUptr = reinterpret_cast<uintptr_t>(output);
|
||||
size_t nBytes = nElts*sizeof(T);
|
||||
|
||||
constexpr int BytePerPack = LoadMultimem_BigPackSize<Red>::BigPackSize;
|
||||
uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack;
|
||||
nPreBytes = min((size_t)nPreBytes, nBytes);
|
||||
uintptr_t nSufBytes;
|
||||
|
||||
if (sizeof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) {
|
||||
constexpr int UnrollPacks = 8*(16/BytePerPack);
|
||||
constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
uintptr_t cursor = nPreBytes;
|
||||
uint32_t nChunks = (nBytes-cursor)/BytePerChunk;
|
||||
uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk;
|
||||
nSufBytes = nBytes - cursorAfter;
|
||||
cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack;
|
||||
cursor += (t%WARP_SIZE)*BytePerPack;
|
||||
int nIters = nChunks - t/WARP_SIZE;
|
||||
#pragma unroll 1
|
||||
while (0 < nIters) {
|
||||
BytePack<BytePerPack> tmp[UnrollPacks];
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
tmp[u] = applyLoadMultimem<Red, BytePerPack>(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack);
|
||||
}
|
||||
#pragma unroll
|
||||
for (int u=0; u < UnrollPacks; u++) {
|
||||
*reinterpret_cast<BytePack<BytePerPack>*>(outputUptr + cursor + u*WARP_SIZE*BytePerPack) = tmp[u];
|
||||
}
|
||||
cursor += tn*UnrollPacks*BytePerPack;
|
||||
nIters -= tn/WARP_SIZE;
|
||||
}
|
||||
} else {
|
||||
nPreBytes = 0;
|
||||
nSufBytes = nBytes;
|
||||
}
|
||||
|
||||
// Get the prefix+suffix element one at a time.
|
||||
#pragma unroll 4
|
||||
for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) {
|
||||
uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes);
|
||||
BytePack<sizeof(T)> val = applyLoadMultimem<Red, sizeof(T)>(red, inputUptr + cursor);
|
||||
*reinterpret_cast<BytePack<sizeof(T)>*>(outputUptr + cursor) = val;
|
||||
cursor += tn*sizeof(T);
|
||||
}
|
||||
}
|
||||
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem);
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/true>::Type> red(args->redOpArg);
|
||||
|
||||
// Round robin warps over blocks.
|
||||
int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE,
|
||||
prim.block, prim.nBlocks,
|
||||
threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
|
||||
int tn = prim.nBlocks*blockDim.x;
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
|
||||
reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts);
|
||||
|
||||
prim.barrierArrive(ncclCoopCta(), /*release=*/false);
|
||||
prim.barrierWait(ncclCoopCta(), /*acquire=*/false);
|
||||
}
|
||||
|
||||
// T is user type, EltType is the most aligned type
|
||||
template<typename T, typename Red, typename EltType>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body(
|
||||
ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) {
|
||||
using Pack = BytePack<8>;
|
||||
constexpr int EltPerPack = 8/sizeof(EltType);
|
||||
|
||||
int nRanks = prim.nRanks;
|
||||
int rank = prim.rank;
|
||||
int t = threadIdx.x;
|
||||
int tn = ncclSymMaxThreads;
|
||||
ncclCoopCta cta;
|
||||
|
||||
#pragma unroll 1
|
||||
while (0 < nElts) {
|
||||
int nIterPacks = min(nPacks, tn);
|
||||
int tn_div_nPacks = tn/nIterPacks;
|
||||
int tn_mod_nPacks = tn%nIterPacks;
|
||||
int peer = t/nIterPacks;
|
||||
int pack = t%nIterPacks;
|
||||
|
||||
#pragma unroll 1
|
||||
for (int i = t; i < nRanks*nIterPacks; i += tn) {
|
||||
Pack got = loadPack<Pack>(input + peer*nStrideElts, pack*EltPerPack, nElts);
|
||||
prim.sendLL(peer, rank*nIterPacks + pack, got);
|
||||
peer += tn_div_nPacks;
|
||||
pack += tn_mod_nPacks;
|
||||
if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; }
|
||||
}
|
||||
|
||||
if (t < nIterPacks) {
|
||||
Pack got = prim.template recvReduceLL<Pack, T>(t, nIterPacks, red);
|
||||
storePack(output, t*EltPerPack, nElts, got);
|
||||
}
|
||||
prim.endLL(cta);
|
||||
|
||||
input += tn*EltPerPack;
|
||||
output += tn*EltPerPack;
|
||||
nElts -= tn*EltPerPack;
|
||||
nPacks -= tn;
|
||||
}
|
||||
}
|
||||
template<template<typename> typename Red, typename T>
|
||||
__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) {
|
||||
ncclSymPrims prim(args->comm, ncclSymPrims_UseLL);
|
||||
Red<typename ncclSymAccumType<Red, T, /*nvls=*/false>::Type> red(args->redOpArg);
|
||||
|
||||
using Pack = BytePack<8>;
|
||||
constexpr int EltPerPack = 8/sizeof(T);
|
||||
int nAllElts = args->nElts;
|
||||
int nAllPacks = divUp(nAllElts, EltPerPack);
|
||||
uint32_t nPackPerBlock, nPackModBlock;
|
||||
idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32);
|
||||
int blockPackBegin = prim.block*nPackPerBlock + minval<int>(prim.block, nPackModBlock);
|
||||
int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0);
|
||||
int nPacks = blockPackEnd - blockPackBegin;
|
||||
int nElts = nAllElts - blockPackBegin*EltPerPack;
|
||||
nElts = min(nElts, nPacks*EltPerPack);
|
||||
T* input = (T*)args->input + blockPackBegin*EltPerPack;
|
||||
T* output = (T*)args->output + blockPackBegin*EltPerPack;
|
||||
|
||||
uint32_t lowBits = args->nElts*sizeof(T);
|
||||
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->input);
|
||||
lowBits |= (uint32_t)reinterpret_cast<uintptr_t>(args->output);
|
||||
if (__builtin_expect(lowBits%8 == 0, true)) {
|
||||
ncclSymRun_ReduceScatter_LL_body<T>(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack);
|
||||
} else {
|
||||
ncclSymRun_ReduceScatter_LL_body<T>(prim, red, input, output, nElts, nPacks, nAllElts);
|
||||
}
|
||||
}
|
||||
+262
-144
@@ -20,8 +20,10 @@
|
||||
#include "rccl_vars.h"
|
||||
#include "profiler.h"
|
||||
#include "transport.h"
|
||||
#include "register_inline.h"
|
||||
#include "common.h"
|
||||
#include "api_trace.h"
|
||||
|
||||
#include <cstring> // std::memcpy
|
||||
#include <cinttypes> // PRIx64
|
||||
#include <cassert>
|
||||
@@ -94,35 +96,40 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
|
||||
CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
|
||||
int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
|
||||
|
||||
for (int k=0; k < KernelCount; k++) {
|
||||
void* fn = ncclKerns[k].kernelFn;
|
||||
cudaFuncAttributes attr = {0};
|
||||
if (fn == nullptr) continue;
|
||||
for (int sym=0; sym <= 1; sym++) {
|
||||
int kcount = sym==0 ? KernelCount : ncclSymKernelCount;
|
||||
for (int k=0; k < kcount; k++) {
|
||||
void* fn = sym==0 ? ncclKerns[k].kernelFn : ncclSymKernelList[k];
|
||||
cudaFuncAttributes attr = {0};
|
||||
if (fn == nullptr) continue;
|
||||
|
||||
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
|
||||
if (maxStackSize) {
|
||||
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
|
||||
ignore0:;
|
||||
}
|
||||
cudaError_t errcode = cudaFuncGetAttributes(&attr, fn);
|
||||
if (errcode == cudaErrorNoKernelImageForDevice) continue;
|
||||
CUDACHECKGOTO(errcode, result, ignore0);
|
||||
|
||||
if (carveout) {
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
|
||||
result, ignore1);
|
||||
ignore1:;
|
||||
}
|
||||
if (ncclMaxSharedMem != 0) {
|
||||
int sharedMemSize = ncclMaxSharedMem;
|
||||
if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
|
||||
WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
|
||||
return ncclSystemError;
|
||||
if (maxStackSize) {
|
||||
if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
|
||||
ignore0:;
|
||||
}
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
|
||||
result, next_kernel);
|
||||
if (carveout) {
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
|
||||
result, ignore1);
|
||||
ignore1:;
|
||||
}
|
||||
if (ncclMaxSharedMem != 0) {
|
||||
int sharedMemSize = ncclMaxSharedMem;
|
||||
if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
|
||||
WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
|
||||
return ncclSystemError;
|
||||
}
|
||||
CUDACHECKGOTO(cudaFuncSetAttribute(fn,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
|
||||
result, next_kernel);
|
||||
}
|
||||
next_kernel:;
|
||||
}
|
||||
next_kernel:;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@@ -344,8 +351,8 @@ bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck
|
||||
|
||||
ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
if (planner->isSymColl) return ncclSuccess;
|
||||
struct ncclTaskColl *task;
|
||||
|
||||
task = ncclIntruQueueHead(&planner->collTaskQueue);
|
||||
while (task != nullptr) {
|
||||
// Build a ncclDevWorkColl[Reg?] struct for each task.
|
||||
@@ -421,6 +428,38 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes];
|
||||
int fnOpTyCount = 0;
|
||||
|
||||
if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) {
|
||||
void* sendSymPtr;
|
||||
void* recvSymPtr;
|
||||
struct ncclReg* sendReg;
|
||||
struct ncclReg* recvReg;
|
||||
size_t size = task->count*ncclTypeSize(task->datatype);
|
||||
NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg));
|
||||
NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg));
|
||||
bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype);
|
||||
|
||||
if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) {
|
||||
enum ncclSymKernelId kernel;
|
||||
int nChannels, nWarps;
|
||||
float estTimeUs = 1.e18;
|
||||
NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps));
|
||||
|
||||
// We should only use symmetric kernel if it beats the asymmetric kernel. But the
|
||||
// perf model accuracy from asymmetric kernels is too inaccurate and reports too high
|
||||
// of a bandwidth. For now just always use symmetric if available.
|
||||
if (kernel != ncclSymKernelId_Count) {
|
||||
task->sendbuff = sendSymPtr;
|
||||
task->recvbuff = recvSymPtr;
|
||||
task->devFuncId = (int)kernel;
|
||||
task->nMaxChannels = nChannels;
|
||||
task->nWarps = nWarps;
|
||||
ncclIntruQueueEnqueue(&planner->collTaskQueue, task);
|
||||
planner->isSymColl = true;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Walk the size sorted tasks, binning them by (fn,op,ty).
|
||||
while (task != nullptr) {
|
||||
struct ncclTaskColl* next = task->next;
|
||||
@@ -703,6 +742,10 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
(countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count;
|
||||
|
||||
nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 1 : 0);
|
||||
|
||||
// Update number of channels propagated to the profiler
|
||||
task->nChannels = (uint8_t)nChannels;
|
||||
|
||||
// Ensure room for worst case of one new batch per channel
|
||||
if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) {
|
||||
return ncclSuccess;
|
||||
@@ -990,6 +1033,8 @@ static ncclResult_t addP2pToPlan(
|
||||
partSize = divUp(bytes[dir], nChannels[dir]);
|
||||
}
|
||||
}
|
||||
// Update number of channels propagated to the profiler
|
||||
if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir];
|
||||
}
|
||||
|
||||
struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
|
||||
@@ -1198,60 +1243,29 @@ static ncclResult_t scheduleP2pTasksToPlan(
|
||||
}
|
||||
|
||||
// Spin until its safe to increase comm->workFifoProduced to desiredProduced.
|
||||
static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) {
|
||||
bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes;
|
||||
static ncclResult_t waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) {
|
||||
bool hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes;
|
||||
uint64_t count = 0;
|
||||
int warned = 0;
|
||||
if (hasRoom) return;
|
||||
while (true) {
|
||||
// We have to poll for notifications from device.
|
||||
uint32_t* consumedLive = comm->workFifoConsumed;
|
||||
uint32_t consumed[MAXCHANNELS];
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED);
|
||||
}
|
||||
// Compiler-only fence to prevent fusion of loops to encourage dense loads.
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
if (!hasRoom) {
|
||||
while (true) {
|
||||
NCCLCHECK(ncclCommPollEventCallbacks(comm, /*waitSome=*/true));
|
||||
hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes;
|
||||
if (hasRoom) break;
|
||||
sched_yield();
|
||||
|
||||
uint32_t produced = comm->workFifoProduced;
|
||||
uint32_t consumedLeast = produced;
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
// consumedLeast is min over all non-quiesced channels
|
||||
if (consumed[c] != comm->channels[c].workFifoProduced) {
|
||||
if ((produced - consumedLeast) < (produced - consumed[c])) {
|
||||
consumedLeast = consumed[c];
|
||||
}
|
||||
/* Warn if we get stuck waiting for workFifo. */
|
||||
count++;
|
||||
if (warned == 0 && count == 100000 && comm->rank == 0) {
|
||||
warned = 1;
|
||||
WARN("Waiting for work FIFO to become available. "
|
||||
"Work fifo exhaustion can happen in large scale/high iteration count of alltoall. "
|
||||
"In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %ld).\n\n"
|
||||
"RCCL continues to retry...", comm->workFifoBytes);
|
||||
}
|
||||
}
|
||||
|
||||
// Compiler only fence to prevent fusion of loops to encourage dense stores.
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
// Advance counter on quiesced channels so they don't lag behind
|
||||
// too far where they could get lost in 32-bit wraparound.
|
||||
if (consumed[c] == comm->channels[c].workFifoProduced) {
|
||||
comm->channels[c].workFifoProduced = consumedLeast;
|
||||
__atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
comm->workFifoConsumedLeast = consumedLeast;
|
||||
|
||||
hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes;
|
||||
if (hasRoom) break;
|
||||
sched_yield();
|
||||
|
||||
/* Warn if we get stuck waiting for workFifo. */
|
||||
count++;
|
||||
if (warned == 0 && count == 100000 && comm->rank == 0) {
|
||||
warned = 1;
|
||||
WARN("Waiting for work FIFO to become available. "
|
||||
"Work fifo exhaustion can happen in large scale/high iteration count of alltoall. "
|
||||
"In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %d).\n\n"
|
||||
"RCCL continues to retry...", comm->workFifoBytes);
|
||||
}
|
||||
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
namespace {
|
||||
@@ -1265,11 +1279,14 @@ namespace {
|
||||
struct uploadWork_cleanup_t* me = (struct uploadWork_cleanup_t*)cb;
|
||||
free(me->hostBuf);
|
||||
CUDACHECK(cudaEventDestroy(me->base.event));
|
||||
free(me);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
if (plan->isSymColl) return ncclSuccess;
|
||||
|
||||
size_t workBytes = plan->workBytes;
|
||||
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
|
||||
void* fifoBufHost;
|
||||
@@ -1286,7 +1303,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
fifoBufHost = comm->workFifoBuf;
|
||||
fifoCursor = comm->workFifoProduced;
|
||||
fifoMask = comm->workFifoBytes-1;
|
||||
waitWorkFifoAvailable(comm, fifoCursor + workBytes);
|
||||
NCCLCHECK(waitWorkFifoAvailable(comm, fifoCursor + workBytes));
|
||||
plan->kernelArgs->workBuf = comm->workFifoBufDev;
|
||||
break;
|
||||
case ncclDevWorkStorageTypePersistent:
|
||||
@@ -1367,7 +1384,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup);
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, /*waitSome=*/false), result, fail);
|
||||
|
||||
finish_scope:
|
||||
if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode);
|
||||
@@ -1385,6 +1402,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
uint64_t collOpCount = comm->sharedRes->collOpCount;
|
||||
uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/};
|
||||
// Advance comm's collOpCount by number of colls in this plan.
|
||||
int hasp2p = 0;
|
||||
comm->sharedRes->collOpCount += plan->collOpCount;
|
||||
comm->collOpCount += plan->collOpCount;
|
||||
|
||||
@@ -1403,6 +1421,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
// remember last value to compute max.
|
||||
p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide
|
||||
op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId;
|
||||
hasp2p = 1;
|
||||
} else { // coll
|
||||
op->opCount = (collOpCount<<1) + oldId;
|
||||
}
|
||||
@@ -1412,9 +1431,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
op = op->enqNext;
|
||||
}
|
||||
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
// Advance channel's p2pOpCount by number of p2p's in this plan channel.
|
||||
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c];
|
||||
if (hasp2p) {
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
// Advance channel's p2pOpCount by number of p2p's in this plan channel.
|
||||
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c];
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1422,8 +1443,10 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
NCCLCHECK(ncclProfilerStartGroupEvent(plan));
|
||||
NCCLCHECK(ncclProfilerStartTaskEvents(plan));
|
||||
NCCLCHECK(uploadProxyOps(comm, plan));
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
if (ncclIntruQueueHead(&plan->proxyOpQueue)) {
|
||||
NCCLCHECK(uploadProxyOps(comm, plan));
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
}
|
||||
NCCLCHECK(ncclProfilerStopTaskEvents(plan));
|
||||
NCCLCHECK(ncclProfilerStopGroupEvent(plan));
|
||||
if (!plan->persistent) {
|
||||
@@ -1440,7 +1463,6 @@ static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
|
||||
if (result != ncclSuccess) {
|
||||
WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result));
|
||||
}
|
||||
if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1517,9 +1539,8 @@ namespace {
|
||||
static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) {
|
||||
if (ncclParamLaunchOrderImplicit()) {
|
||||
#if !defined(__HIP_PLATFORM_AMD__) || !defined(__HIPCC__)
|
||||
// Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs
|
||||
if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
|
||||
if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); }
|
||||
if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
|
||||
*mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
|
||||
#else
|
||||
*mode = ncclImplicitOrderNone;
|
||||
@@ -1549,26 +1570,53 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
|
||||
: ncclDevWorkStorageTypeFifo;
|
||||
|
||||
struct ncclKernelPlanBudget budget;
|
||||
budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
|
||||
// Non-persistent kernels fill up at most half of our fifo per kernel.
|
||||
budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;
|
||||
if (planner->isSymColl) {
|
||||
plan->workStorageType = ncclDevWorkStorageTypeArgs;
|
||||
|
||||
// Drain coll tasks first. This is essential since we partition tasks based
|
||||
// on the work budget and p2p work isn't collective. If we were to drain p2p
|
||||
// first, the place where we cut the kernel could vary by rank which would
|
||||
// cause the "shortest channel first" channel picker to have divergent results.
|
||||
if (planner->nTasksColl != 0) {
|
||||
NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
|
||||
}
|
||||
// And only drain p2p tasks once colls are depleted.
|
||||
if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
|
||||
NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
|
||||
}
|
||||
finishPlan(comm, plan);
|
||||
if (plan->workBytes != 0) {
|
||||
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
|
||||
plan->isSymColl = true;
|
||||
plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype);
|
||||
plan->threadPerBlock = task->nWarps*WARP_SIZE;
|
||||
for (int i = 0; i < MAXCHANNELS/64; i++)
|
||||
plan->channelMask.masks[i] = uint64_t(-1) >> (64-task->nMaxChannels);
|
||||
// plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels);
|
||||
|
||||
plan->kernelArgsSize = sizeof(struct ncclSymDevArgs);
|
||||
plan->kernelSymArgs = ncclMemoryStackAlloc<struct ncclSymDevArgs>(&comm->memScoped);
|
||||
plan->kernelSymArgs->comm = comm->symDevComm;
|
||||
plan->kernelSymArgs->rootRank = task->root;
|
||||
plan->kernelSymArgs->redOpArg = task->opDev.scalarArg;
|
||||
plan->kernelSymArgs->nElts = task->count;
|
||||
plan->kernelSymArgs->input = (char*)task->sendbuff;
|
||||
plan->kernelSymArgs->output = (char*)task->recvbuff;
|
||||
|
||||
planner->nTasksColl -= 1;
|
||||
ncclIntruQueueEnqueue(&planner->planQueue, plan);
|
||||
INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d",
|
||||
ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock);
|
||||
nPlans += 1;
|
||||
} else {
|
||||
struct ncclKernelPlanBudget budget;
|
||||
budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
|
||||
// Non-persistent kernels fill up at most half of our fifo per kernel.
|
||||
budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;
|
||||
|
||||
// Drain coll tasks first. This is essential since we partition tasks based
|
||||
// on the work budget and p2p work isn't collective. If we were to drain p2p
|
||||
// first, the place where we cut the kernel could vary by rank which would
|
||||
// cause the "shortest channel first" channel picker to have divergent results.
|
||||
if (planner->nTasksColl != 0) {
|
||||
NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
|
||||
}
|
||||
// And only drain p2p tasks once colls are depleted.
|
||||
if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
|
||||
NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
|
||||
}
|
||||
finishPlan(comm, plan);
|
||||
if (plan->workBytes != 0) {
|
||||
ncclIntruQueueEnqueue(&planner->planQueue, plan);
|
||||
nPlans += 1;
|
||||
}
|
||||
}
|
||||
} while (planner->nTasksColl + planner->nTasksP2p != 0);
|
||||
|
||||
@@ -1596,6 +1644,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
|
||||
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
|
||||
enum ncclImplicitOrder implicitOrder;
|
||||
cudaError_t status = cudaSuccess;
|
||||
NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure);
|
||||
|
||||
if (implicitOrder != ncclImplicitOrderNone) {
|
||||
@@ -1607,7 +1656,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure);
|
||||
}
|
||||
|
||||
if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) {
|
||||
if (!persistent && comm->sharedRes->persistentRefs) status = cudaEventQuery(comm->sharedRes->hostStream.serialEvent);
|
||||
if (persistent || ncclCudaLaunchBlocking || status == cudaErrorNotReady) {
|
||||
// We have to launch host tasks to push proxy args. We are careful to only
|
||||
// do this if necessary since host tasks impose a high performance cost in CUDA.
|
||||
bool acquired = false;
|
||||
@@ -1618,7 +1668,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
acquired = true;
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure);
|
||||
}
|
||||
if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs);
|
||||
plan->isHostCbEnq = true;
|
||||
CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure);
|
||||
}
|
||||
@@ -1653,6 +1702,8 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
|
||||
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
|
||||
#endif
|
||||
|
||||
NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0);
|
||||
|
||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
@@ -1691,7 +1742,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
|
||||
|
||||
CUlaunchConfig launchConfig = {0};
|
||||
CUlaunchAttribute launchAttrs[4] = {};
|
||||
CUlaunchAttribute launchAttrs[6] = {};
|
||||
int attrs = 0;
|
||||
/* Cooperative Group Array (CGA)
|
||||
* On sm90 and later we have an extra level of hierarchy where we
|
||||
@@ -1728,6 +1779,18 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
launchAttrs[attrs].value.launchCompletionEvent.flags = 0;
|
||||
attrs++;
|
||||
}
|
||||
if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) {
|
||||
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
|
||||
launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1;
|
||||
attrs++;
|
||||
}
|
||||
#endif
|
||||
#if CUDART_VERSION >= 13000
|
||||
if (compCap >= 90 && driverVersion >= 13000) {
|
||||
launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING;
|
||||
launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable();
|
||||
attrs++;
|
||||
}
|
||||
#endif
|
||||
launchConfig.gridDimX = grid.x;
|
||||
launchConfig.gridDimY = grid.y;
|
||||
@@ -1762,21 +1825,30 @@ do_return:
|
||||
}
|
||||
|
||||
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) {
|
||||
// We are not using the host stream for proxy ops and reclaimation submission.
|
||||
if (!plan->isHostCbEnq) {
|
||||
// we are not using the host stream for proxy ops and reclaimation submission, call
|
||||
// hostStreamPlanTask directly
|
||||
NCCLCHECK(hostStreamPlanTask(comm, plan));
|
||||
} else {
|
||||
// We are using the host stream for proxy ops and reclaimation submission.
|
||||
// Only plans with proxy ops have a callback pushed by ncclLaunchPrepare.
|
||||
// Since non-persistent plans also require reclaimation, we have to do it
|
||||
// here.
|
||||
if (!plan->persistent && !plan->hasProxyOps) {
|
||||
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct KernelFinishCallback {
|
||||
struct ncclCommEventCallback base;
|
||||
uint32_t workFifoConsumed;
|
||||
};
|
||||
ncclResult_t KernelFinishCallback_fn(
|
||||
struct ncclComm* comm, struct ncclCommEventCallback* cb
|
||||
) {
|
||||
struct KernelFinishCallback* me = (struct KernelFinishCallback*)cb;
|
||||
comm->workFifoConsumed = me->workFifoConsumed;
|
||||
CUDACHECK(cudaEventDestroy(me->base.event));
|
||||
free(me);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
if (!ncclIntruQueueEmpty(&planner->planQueue)) {
|
||||
@@ -1788,8 +1860,23 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
//cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
|
||||
cudaStream_t deviceStream, launchOrder;
|
||||
|
||||
cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
|
||||
|
||||
if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
|
||||
comm->workFifoProducedLastRecorded = comm->workFifoProduced;
|
||||
struct KernelFinishCallback* cb;
|
||||
NCCLCHECK(ncclCalloc(&cb, 1));
|
||||
cb->base.event = finishedEvent;
|
||||
cb->base.fn = KernelFinishCallback_fn;
|
||||
cb->workFifoConsumed = comm->workFifoProduced;
|
||||
ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cb->base);
|
||||
// We just stole scratchEvent so must create a new one.
|
||||
CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming));
|
||||
}
|
||||
|
||||
if (capturing || planner->numStreams != 1) {
|
||||
// CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream));
|
||||
// CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
|
||||
|
||||
// deviceStream waits on userStream[0]
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
|
||||
|
||||
@@ -1798,13 +1885,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
// on launchStream as a fast-forward. When building CUDA graphs fast forwards should
|
||||
// be handled specially so as not to create graphs with a blowup in the number of edges.
|
||||
// So we could do this:
|
||||
// CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0));
|
||||
// CUDACHECK(cudaStreamWaitEvent(deviceStream, finishedEvent, 0));
|
||||
// But instead we do:
|
||||
NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent));
|
||||
NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, finishedEvent));
|
||||
|
||||
// Each userStream[i] waits on userStream[0]
|
||||
for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
|
||||
CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0));
|
||||
CUDACHECK(cudaStreamWaitEvent(l->stream, finishedEvent, 0));
|
||||
}
|
||||
}
|
||||
enum ncclImplicitOrder implicitOrder;
|
||||
@@ -1815,7 +1902,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
|
||||
// Incorporate launch event into per-device (context) launch order.
|
||||
NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder));
|
||||
// If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution).
|
||||
CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent));
|
||||
CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : finishedEvent));
|
||||
// Release launchOrder as acquired in ncclLaunchPrepare()
|
||||
NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent));
|
||||
}
|
||||
@@ -1837,7 +1924,7 @@ static inline ncclResult_t getCollNetSupport(
|
||||
if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) {
|
||||
netOp = ncclSum;
|
||||
}
|
||||
*collNetSupport = comm->collNetSupport;
|
||||
*collNetSupport = comm->config.collnetEnable;
|
||||
switch (info->func) {
|
||||
case ncclFuncAllReduce:
|
||||
case ncclFuncReduce:
|
||||
@@ -1875,10 +1962,8 @@ static ncclResult_t updateCollCostTable(
|
||||
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
|
||||
// CollNetDirect is only supported for up to 8 local GPUs
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue;
|
||||
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
|
||||
/* now we only support single-node NVLS allgather and reducescatter */
|
||||
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue;
|
||||
/* Tree reduceScatter doesn't support scaling yet */
|
||||
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
|
||||
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
|
||||
@@ -2029,7 +2114,14 @@ rccl_static ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* info,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/
|
||||
) {
|
||||
size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count);
|
||||
size_t elementSize = ncclTypeSize(info->datatype);
|
||||
size_t nBytes = elementSize * ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count);
|
||||
struct ncclReg* regSendBuf = NULL;
|
||||
struct ncclReg* regRecvBuf = NULL;
|
||||
int regBuff;
|
||||
bool isSendValid, isRecvValid;
|
||||
size_t sendbuffSize = elementSize * ncclFuncSendCount(info->func, comm->nRanks, info->count);
|
||||
size_t recvbuffSize = elementSize * ncclFuncRecvCount(info->func, comm->nRanks, info->count);
|
||||
info->algorithm = NCCL_ALGO_UNDEF;
|
||||
info->protocol = NCCL_PROTO_UNDEF;
|
||||
int nMaxChannels = 0;
|
||||
@@ -2037,20 +2129,42 @@ rccl_static ncclResult_t getAlgoInfo(
|
||||
initCollCostTable((float **)collCostTable);
|
||||
NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable));
|
||||
if (comm->tuner != NULL) {
|
||||
size_t elementSize = ncclTypeSize(info->datatype);
|
||||
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
|
||||
size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
|
||||
struct ncclReg* regSendBuf;
|
||||
struct ncclReg* regRecvBuf;
|
||||
NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &regSendBuf));
|
||||
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &regRecvBuf));
|
||||
int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()));
|
||||
NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid));
|
||||
NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid));
|
||||
regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister());
|
||||
NCCLCHECK(comm->tuner->getCollInfo(
|
||||
comm->tunerContext, info->func, nBytes,
|
||||
numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS,
|
||||
regBuff, &nMaxChannels));
|
||||
NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo));
|
||||
} else {
|
||||
NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo));
|
||||
// NCCL_CTA_POLICY_EFFICIENCY requires user (non-symmetric) buffer registration (currently unsupported with MNNVL)
|
||||
if (comm->config.CTAPolicy == NCCL_CTA_POLICY_EFFICIENCY && ncclGetEnv("NCCL_ALGO") == NULL && ncclGetEnv("NCCL_PROTO") == NULL && !comm->MNNVL) {
|
||||
// make algorithm selection based on buffer registration
|
||||
// there can be other specialized policies for algorithms and protocols pickup in the future
|
||||
NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &regSendBuf));
|
||||
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &regRecvBuf));
|
||||
NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid));
|
||||
NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid));
|
||||
regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister());
|
||||
if (regBuff && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) {
|
||||
if ((comm->nNodes > 1 && collNetSupport && nvlsSupport) || (comm->nNodes == 1 && nvlsSupport)) {
|
||||
int recChannels;
|
||||
NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels));
|
||||
if (recChannels <= info->nMaxChannels) {
|
||||
info->algorithm = NCCL_ALGO_NVLS;
|
||||
info->protocol = NCCL_PROTO_SIMPLE;
|
||||
info->nMaxChannels = recChannels;
|
||||
info->nWarps = comm->maxThreads[info->algorithm][info->protocol] / WARP_SIZE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo));
|
||||
|
||||
info->nMaxChannels = nMaxChannels == 0 ? info->nMaxChannels : nMaxChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -2138,16 +2252,20 @@ static ncclResult_t calcCollChunking(
|
||||
while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2;
|
||||
while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
|
||||
} else if (info->algorithm == NCCL_ALGO_NVLS) {
|
||||
int maxChunkSize = comm->nvlsChunkSize;
|
||||
if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
|
||||
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
|
||||
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
|
||||
// coverity[overflow_before_widen]
|
||||
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
|
||||
if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
|
||||
if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
|
||||
if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
|
||||
if ((info->regBufType & NCCL_NVLS_REG_BUFFER) && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) {
|
||||
chunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
} else {
|
||||
int maxChunkSize = comm->nvlsChunkSize;
|
||||
if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
|
||||
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
|
||||
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
|
||||
// coverity[overflow_before_widen]
|
||||
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
|
||||
if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
|
||||
if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
|
||||
if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
|
||||
}
|
||||
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
|
||||
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
|
||||
@@ -2291,7 +2409,7 @@ static ncclResult_t calcCollChunking(
|
||||
proxyOp->reg = 0;
|
||||
}
|
||||
|
||||
if (pattern == ncclPatternCollnetDirect) {
|
||||
if (pattern == ncclPatternCollnetDirect || pattern == ncclPatternNvls) {
|
||||
proxyOp->specifics.collnetDirect.nNodes = comm->nNodes;
|
||||
proxyOp->specifics.collnetDirect.node = comm->node;
|
||||
if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) {
|
||||
@@ -2415,7 +2533,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
bool isSendNotRecv = info->coll == ncclFuncSend;
|
||||
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
|
||||
p2p->buff = (void*)info->recvbuff;
|
||||
p2p->count = info->count;
|
||||
@@ -2496,7 +2614,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
|
||||
struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
|
||||
t->func = info->coll;
|
||||
t->sendbuff = info->sendbuff;
|
||||
|
||||
@@ -456,7 +456,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
|
||||
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
|
||||
channel->nvls.headRank = headRank;
|
||||
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
|
||||
if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
|
||||
}
|
||||
if (comm->nNodes == 1) return ncclSuccess;
|
||||
|
||||
@@ -528,7 +528,7 @@ int ncclMinNchannels() {
|
||||
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
|
||||
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
|
||||
if (minNchannels > MAXCHANNELS) {
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
|
||||
minNchannels = MAXCHANNELS;
|
||||
}
|
||||
if (minNchannels < 0) minNchannels = 0;
|
||||
@@ -544,7 +544,7 @@ int ncclMaxNchannels() {
|
||||
maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
|
||||
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
|
||||
if (maxNchannels < 1) {
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels);
|
||||
maxNchannels = 1;
|
||||
}
|
||||
return maxNchannels;
|
||||
@@ -718,7 +718,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
int nNodes = comm->nNodes;
|
||||
int nChannels = comm->nChannels;
|
||||
int minHeadNum = INT_MAX;
|
||||
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
|
||||
int shared = parent && parent->nvlsSupport && parent->shareResources;
|
||||
int maxChannels;
|
||||
int minNchannels, maxNchannels;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||
@@ -839,7 +839,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);
|
||||
|
||||
// Setup CollNet
|
||||
if (comm->collNetSupport == 1) {
|
||||
if (comm->config.collnetEnable) {
|
||||
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
|
||||
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
|
||||
if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
|
||||
|
||||
+62
-35
@@ -219,7 +219,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
const char* str = ncclGetEnv(disableEnv);
|
||||
if (str) {
|
||||
int disable = strtol(str, NULL, 0);
|
||||
if (disable == 1) l = 0;
|
||||
if (disable == 1) l = PATH_LOC;
|
||||
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
|
||||
}
|
||||
}
|
||||
@@ -252,7 +252,18 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
|
||||
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
|
||||
|
||||
int ncclTopoUserP2pLevel = -1;
|
||||
static int ncclTopoUserP2pLevel = -1; // Initially "uninitialized". When initialized but unset, changes to -2.
|
||||
|
||||
// Gets the user-provided value of NCCL_P2P_LEVEL/NCCL_P2P_DISABLE. If the user did not provide any, the value
|
||||
// of the "level" argument is left unchanged.
|
||||
ncclResult_t ncclGetUserP2pLevel(int* level) {
|
||||
if (ncclTopoUserP2pLevel == -1)
|
||||
NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
|
||||
if (ncclTopoUserP2pLevel != -2)
|
||||
*level = ncclTopoUserP2pLevel;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2,
|
||||
int* p2p, int *read, int* intermediateRank) {
|
||||
int mnnvl = 0;
|
||||
@@ -280,9 +291,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
|
||||
|
||||
// Get GPUs from topology
|
||||
int g1, g2;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/true));
|
||||
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
|
||||
if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
|
||||
if (ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) == ncclInternalError) {
|
||||
// GPU not found, we can't use p2p.
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -305,12 +316,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
|
||||
int p2pLevel = PATH_SYS;
|
||||
|
||||
// User override
|
||||
if (ncclTopoUserP2pLevel == -1)
|
||||
NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
|
||||
if (ncclTopoUserP2pLevel != -2) {
|
||||
p2pLevel = ncclTopoUserP2pLevel;
|
||||
goto compare;
|
||||
}
|
||||
NCCLCHECK(ncclGetUserP2pLevel(&p2pLevel));
|
||||
|
||||
// Don't use P2P through ARM CPUs
|
||||
int arch, vendor, model;
|
||||
@@ -323,7 +329,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
|
||||
compare:
|
||||
// Compute the PCI distance and compare with the p2pLevel.
|
||||
if (path->type <= p2pLevel) *p2p = 1;
|
||||
|
||||
@@ -393,7 +398,8 @@ NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
|
||||
int ncclTopoUserGdrLevel = -1;
|
||||
const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" };
|
||||
|
||||
NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0);
|
||||
// On C2C platforms use GDRDMA on NICs which are connected to the CPUs
|
||||
NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 1);
|
||||
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) {
|
||||
*gdrMode = ncclTopoGdrModeDisable;
|
||||
@@ -402,7 +408,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
|
||||
int n, g;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true));
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
|
||||
// Check that both the NIC and GPUs support it
|
||||
@@ -459,29 +465,29 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n
|
||||
// In case of PXN, use the intermediate GPU distance instead
|
||||
int proxyRank;
|
||||
NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g, /*showWarn=*/true));
|
||||
gpu = system->nodes[GPU].nodes+g;
|
||||
distance = gpu->paths[NET][n].type;
|
||||
}
|
||||
|
||||
int c;
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||
if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) {
|
||||
// On C2C platforms we can still use GDRDMA on NICs connected to the CPUs
|
||||
INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c);
|
||||
// On C2C platforms we can still use GDRDMA on NICs connected to the CPUs
|
||||
if (ncclParamNetGdrC2c() && distance == PATH_P2C) {
|
||||
INFO(NCCL_GRAPH | NCCL_NET, "GPU %d / HCA %lx connected via C2C link", rank, netId);
|
||||
distance = PATH_C2C;
|
||||
}
|
||||
|
||||
if (distance > netGdrLevel) {
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
|
||||
INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Force PCIe mapping if path goes through PCI on a C2C system
|
||||
int c;
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||
if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci;
|
||||
else *gdrMode = ncclTopoGdrModeDefault;
|
||||
|
||||
INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]);
|
||||
INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -516,7 +522,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev,
|
||||
if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess;
|
||||
int g;
|
||||
struct ncclTopoSystem* system = comm->topo;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true));
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
*flush = 1;
|
||||
#else
|
||||
@@ -546,8 +552,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank
|
||||
*net = 1;
|
||||
// First check the current GPU-to-GPU speed.
|
||||
int g1, g2;
|
||||
if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
|
||||
ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
|
||||
if (ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/false) != ncclSuccess ||
|
||||
ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) != ncclSuccess) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -573,7 +579,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
|
||||
// Get GPU and NET
|
||||
int n, g;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true));
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
struct ncclTopoLinkList* path = gpu->paths[NET]+n;
|
||||
if (path->type == PATH_PXN) {
|
||||
@@ -666,6 +672,8 @@ static bool rcclPathOverride(struct ncclTopoSystem* system, uint64_t distance) {
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_PARAM(PxnC2c, "PXN_C2C", 0);
|
||||
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
|
||||
// Precompute paths between GPUs/NICs.
|
||||
|
||||
@@ -724,6 +732,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
}
|
||||
}
|
||||
}
|
||||
// update the GPU -> NIC path in the case of C2C + PHB
|
||||
for (int n = 0; n < system->nodes[NET].count; n++) {
|
||||
struct ncclTopoNode* netNode = system->nodes[NET].nodes + n;
|
||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes + g;
|
||||
int c;
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &c));
|
||||
if (c == -1) continue;
|
||||
if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) {
|
||||
gpuNode->paths[NET][n].type = PATH_P2C;
|
||||
netNode->paths[GPU][g].type = PATH_P2C;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Special handling of gfx94x and gfx950
|
||||
|
||||
@@ -759,15 +781,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
// PXN = PCI + NVLink.
|
||||
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
|
||||
// Only use PXN for NIC n if remote GPU p ...
|
||||
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
|
||||
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
|
||||
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
|
||||
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
|
||||
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
|
||||
// We can use that GPU as relay to communicate with that NIC.
|
||||
// Only enabling it in the GPU->NIC direction for now to favor
|
||||
// receiving locally and sending remotely (consistent with net.cc)
|
||||
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
|
||||
if (/* (1) is either connected to the NIC with PXB*/
|
||||
(peerNode->paths[NET][n].type <= PATH_PXB ||
|
||||
/* or with P2C and PxN over C2C is enabled */
|
||||
(ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) &&
|
||||
/* and (2) is connected to us through NVLink */
|
||||
peerNode->paths[GPU][g].type <= PATH_NVL &&
|
||||
/* and (3) is on the same node as us */
|
||||
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) &&
|
||||
/* and (4) has either higher bw to that NIC or avoid going through the CPU*/
|
||||
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB))
|
||||
// We can use that GPU as relay to communicate with that NIC.
|
||||
// Only enabling it in the GPU->NIC direction for now to favor
|
||||
// receiving locally and sending remotely (consistent with net.cc)
|
||||
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
|
||||
}
|
||||
}
|
||||
if (gpu->paths[NET][n].type < PATH_PHB) {
|
||||
@@ -904,7 +931,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
|
||||
int peer;
|
||||
struct ncclTopoSystem* system = comm->topo;
|
||||
struct ncclTopoLinkList* path = NULL;
|
||||
if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
|
||||
if (ncclTopoRankToIndex(system, peerRank, &peer, /*showWarn=*/false) == ncclSuccess) {
|
||||
// Same rank
|
||||
if (g == peer) {
|
||||
*nChannels = -1;
|
||||
|
||||
+16
-27
@@ -141,6 +141,7 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc
|
||||
float bw = intra ? graph->bwIntra : graph->bwInter;
|
||||
int type = intra ? graph->typeIntra : graph->typeInter;
|
||||
|
||||
if (path->type >= PATH_DIS) return ncclSuccess;
|
||||
if (mult == 1 && (path->type > type)) return ncclSuccess;
|
||||
if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
|
||||
graph->pattern == NCCL_TOPO_PATTERN_TREE ||
|
||||
@@ -332,8 +333,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
*g = i;
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (*g == -1) return ncclInternalError;
|
||||
return ncclSuccess;
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time);
|
||||
@@ -709,24 +709,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
|
||||
// Then try the most local GPUs
|
||||
float maxBw = 0;
|
||||
int minHops = 0xfffffff;
|
||||
struct ncclTopoLinkList* paths = net->paths[GPU];
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
if (paths[g].bw > maxBw) {
|
||||
maxBw = paths[g].bw;
|
||||
minHops = paths[g].count;
|
||||
} else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) {
|
||||
minHops = paths[g].count;
|
||||
}
|
||||
}
|
||||
if (maxBw >= bw) {
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
int g = (graph->nChannels+i)%system->nodes[GPU].count;
|
||||
if (paths[g].bw == maxBw && paths[g].count == minHops) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
|
||||
}
|
||||
}
|
||||
int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType));
|
||||
// if no GPUs are connected, skip this net
|
||||
if (pathType == PATH_DIS) continue;
|
||||
for (int g = 0; g < localGpuCount; ++g) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g]));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -813,6 +801,7 @@ struct kvDict kvDictLinkType[] = {
|
||||
{ "PIX", PATH_PIX },
|
||||
{ "PXB", PATH_PXB },
|
||||
{ "PXN", PATH_PXN },
|
||||
{ "P2C", PATH_P2C },
|
||||
{ "PHB", PATH_PHB },
|
||||
{ "SYS", PATH_SYS },
|
||||
{ NULL, 0 }
|
||||
@@ -980,8 +969,8 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0,
|
||||
|
||||
RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
|
||||
|
||||
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 };
|
||||
float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 };
|
||||
float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float))
|
||||
#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float))
|
||||
|
||||
@@ -1168,13 +1157,13 @@ search:
|
||||
int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra;
|
||||
if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
|
||||
tmpGraph.typeIntra += 1;
|
||||
goto search;
|
||||
if (tmpGraph.typeIntra < PATH_DIS) goto search;
|
||||
}
|
||||
tmpGraph.typeIntra = minTypeIntra;
|
||||
|
||||
if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
|
||||
tmpGraph.typeInter += 1;
|
||||
goto search;
|
||||
if (tmpGraph.typeInter < PATH_DIS) goto search;
|
||||
}
|
||||
tmpGraph.typeInter = minTypeInter;
|
||||
|
||||
@@ -1232,7 +1221,7 @@ done:
|
||||
}
|
||||
|
||||
if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) {
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
|
||||
INFO(NCCL_GRAPH, "Could not find a path for pattern %d, falling back to simple order", graph->pattern);
|
||||
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
|
||||
graph->inter[0] = graph->inter[1] = 0;
|
||||
graph->bwIntra = graph->bwInter = 0.1;
|
||||
@@ -1366,7 +1355,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
}
|
||||
if (pxnLevel == 1) {
|
||||
int g, n;
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g, /*showWarn=*/true));
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
|
||||
struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
|
||||
if (gpu->paths[NET][n].type <= PATH_PXN) {
|
||||
@@ -1378,7 +1367,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
// Check which local GPU corresponds to that NIC and see if we can use PXN.
|
||||
int n, g1, g2;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n));
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
|
||||
NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1, /*showWarn=*/true));
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2));
|
||||
if (g2 != -1) {
|
||||
struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
|
||||
|
||||
+86
-52
@@ -10,12 +10,10 @@
|
||||
#include "topo.h"
|
||||
#include "comm.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "net.h"
|
||||
#include "coll_net.h"
|
||||
#include "transport.h"
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "xml.h"
|
||||
#include "cpuset.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
@@ -24,11 +22,11 @@
|
||||
|
||||
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "C2C", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" };
|
||||
#else
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" };
|
||||
const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" };
|
||||
const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" };
|
||||
#endif
|
||||
|
||||
/******************************************************************/
|
||||
@@ -257,7 +255,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
pciSwitch->pci.device |= 0xffff;
|
||||
free(subSwIds);
|
||||
// Restart, as system->nodes[PCI].nodes has changed.
|
||||
s = 0;
|
||||
s = -1; // Will be incremented to 0 in the next loop iteration
|
||||
continue;
|
||||
fail:
|
||||
free(subSwIds);
|
||||
@@ -427,7 +425,9 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
|
||||
#define PCI_BRIDGE_DEVICE_CLASS "0x060400"
|
||||
|
||||
struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { "0x120000", GPU }, { NULL, PCI /* Default fallback value */ } };
|
||||
struct kvDict kvDictPciGen[] = {
|
||||
{ "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */
|
||||
{ "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 },
|
||||
@@ -786,6 +786,7 @@ static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrNam
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -796,6 +797,7 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -806,6 +808,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
|
||||
if (index == -1) {
|
||||
index = node->nAttrs++;
|
||||
strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -886,6 +889,17 @@ typedef struct xmlNodeStack {
|
||||
|
||||
} xmlNodeStack;
|
||||
|
||||
ncclResult_t ncclFindFirstPciParent(ncclXmlNode** parent) {
|
||||
ncclXmlNode* newParent = *parent;
|
||||
while (strcmp(newParent->name, "pci") != 0) {
|
||||
newParent = newParent->parent;
|
||||
if (newParent == nullptr) return ncclSuccess;
|
||||
if (strcmp(newParent->name, "system") == 0) return ncclSuccess;
|
||||
}
|
||||
*parent = newParent;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// 1. Find the common parent xmlNode between the given set of nodes
|
||||
ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) {
|
||||
// Track a stack of parents per-net node being merged
|
||||
@@ -984,6 +998,7 @@ ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXml
|
||||
}
|
||||
|
||||
out:
|
||||
ncclFindFirstPciParent(&common);
|
||||
*parent = common;
|
||||
free(parents);
|
||||
return ncclSuccess;
|
||||
@@ -1047,13 +1062,19 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps,
|
||||
struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps,
|
||||
struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) {
|
||||
WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Don't make vNics of size 1
|
||||
if (vProps->ndevs == 1) {
|
||||
TRACE(NCCL_GRAPH, "TOPO/NET : Skipping vNic of size 1");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Trigger the merge, then get the new device's properties
|
||||
int vDevIndex = 0;
|
||||
ncclResult_t ret = makeVDevice(&vDevIndex, vProps);
|
||||
@@ -1063,11 +1084,18 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Mark original NICs as keep="0" in the topology
|
||||
for (int i = 0; i < vProps->ndevs; i++) {
|
||||
int dev = vProps->devs[i];
|
||||
struct ncclXmlNode* netNode = physNetNodes[dev];
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 0));
|
||||
}
|
||||
|
||||
INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str);
|
||||
char* ncStr;
|
||||
@@ -1105,8 +1133,7 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char
|
||||
goto fail;
|
||||
}
|
||||
|
||||
struct ncclXmlNode* netNode;
|
||||
ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
|
||||
ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
|
||||
if (ret == ncclSuccess) {
|
||||
// Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this)
|
||||
for (int i = 0; i < vProps.ndevs; i++) {
|
||||
@@ -1128,7 +1155,7 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) {
|
||||
// Compute the path type between each device
|
||||
int* paths = NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
@@ -1172,8 +1199,7 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
struct ncclXmlNode* netNode;
|
||||
ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice);
|
||||
ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice);
|
||||
|
||||
// Merging failed.
|
||||
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS)
|
||||
@@ -1205,6 +1231,7 @@ struct kvDict nicPathKvList[] = {
|
||||
{ "PIX", PATH_PIX },
|
||||
{ "PXB", PATH_PXB },
|
||||
{ "PXN", PATH_PXN },
|
||||
{ "P2C", PATH_P2C },
|
||||
{ "PHB", PATH_PHB },
|
||||
{ "SYS", PATH_SYS },
|
||||
{ NULL, 0 }
|
||||
@@ -1226,14 +1253,19 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper
|
||||
if (path == PATH_LOC) {
|
||||
*parent = NULL;
|
||||
} else if (parent && strcmp((*parent)->name, "pci") == 0) {
|
||||
// If the common parent is PCI, we must reparent the new NIC under a made up busId
|
||||
NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
|
||||
// Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist
|
||||
const char* c;
|
||||
NCCLCHECK(xmlGetAttrStr(*parent, "class", &c));
|
||||
if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) {
|
||||
// If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid
|
||||
NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0]));
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
|
||||
ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) {
|
||||
int* placedDevs = NULL;
|
||||
struct ncclXmlNode** physNetNodes = NULL;
|
||||
if (physicalDevs == 0) return ncclSuccess;
|
||||
@@ -1257,15 +1289,15 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_
|
||||
{ // Avoids warnings related to jumping to "out"
|
||||
const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL");
|
||||
if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList);
|
||||
const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE");
|
||||
char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE");
|
||||
NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs));
|
||||
memset(placedDevs, 0, sizeof(int)*physicalDevs);
|
||||
|
||||
if (forceMerge) {
|
||||
NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
}
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out);
|
||||
|
||||
out:
|
||||
free(physNetNodes);
|
||||
@@ -1274,7 +1306,7 @@ out:
|
||||
return res;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) {
|
||||
static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) {
|
||||
for (int n = startIndex; n < endIndex; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(getProperties(n, &props));
|
||||
@@ -1293,15 +1325,17 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star
|
||||
const char* colAttr;
|
||||
NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr));
|
||||
|
||||
// If coll == 0 but the netNode is tagged as coll, don't update the keep value
|
||||
if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
int dev;
|
||||
xmlGetAttrIntDefault(netNode, "dev", &dev, -1);
|
||||
if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n);
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name);
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
|
||||
// Only set coll if it's not 0
|
||||
@@ -1317,30 +1351,22 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclTopoNetState {
|
||||
int nVirtualNics;
|
||||
int nPhysicalNics;
|
||||
const char* name;
|
||||
};
|
||||
|
||||
// Calls to network plugin APIs should be protected. This function should be called inside a per-process lock.
|
||||
static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) {
|
||||
ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) {
|
||||
int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL);
|
||||
if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics));
|
||||
// Enumerate physical devices
|
||||
NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0));
|
||||
NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport));
|
||||
if (!usePhysicalDevices) {
|
||||
if (state->nVirtualNics == -1) {
|
||||
NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics));
|
||||
NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics));
|
||||
int nDevs;
|
||||
NCCLCHECK(devices(&nDevs));
|
||||
state->nVirtualNics = nDevs - state->nPhysicalNics;
|
||||
}
|
||||
// Remove keep=1 for physical collnets
|
||||
if (state->nVirtualNics > 0) {
|
||||
NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0));
|
||||
// Populate new devices
|
||||
NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1));
|
||||
NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1388,6 +1414,15 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
// Try default XML topology location
|
||||
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail);
|
||||
}
|
||||
// Fixup the cpu's host_hashes.
|
||||
struct ncclXmlNode* node;
|
||||
// Update every cpu node's host_hash attribute since those are not
|
||||
// intended to be preserved from the XML files that have been read.
|
||||
NCCLCHECKGOTO(xmlFindTag(xml, "cpu", &node), ret, fail);
|
||||
while (node != nullptr) {
|
||||
NCCLCHECKGOTO(xmlSetAttrLong(node, "host_hash", getHostHash()), ret, fail);
|
||||
NCCLCHECKGOTO(xmlFindNextTag(xml, "cpu", node, &node), ret, fail);
|
||||
}
|
||||
if (xml->maxIndex == 0) {
|
||||
// Create top tag
|
||||
struct ncclXmlNode* top;
|
||||
@@ -1400,7 +1435,6 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail);
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail);
|
||||
if (node) {
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail);
|
||||
@@ -1417,13 +1451,13 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
state = NULL;
|
||||
if (collNetSupport(comm)) {
|
||||
NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state,
|
||||
comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state,
|
||||
comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail);
|
||||
// [RCCL] Disabled virtual devices
|
||||
NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state,
|
||||
comm->ncclNet->getProperties, nullptr /*comm->ncclNet->makeVDevice*/, comm->ncclNet->devices, comm->ncclNet->name), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state,
|
||||
comm->ncclNet->getProperties, nullptr /*comm->ncclNet->makeVDevice*/, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail);
|
||||
pthread_mutex_unlock(&netLock);
|
||||
netLockHeld = 0;
|
||||
|
||||
@@ -1487,7 +1521,7 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType,
|
||||
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType,
|
||||
int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) {
|
||||
int minType = PATH_DIS;
|
||||
float maxBw = 0;
|
||||
@@ -1540,7 +1574,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true));
|
||||
|
||||
int localNets[NCCL_TOPO_MAX_NODES];
|
||||
int localNetCount;
|
||||
@@ -1607,7 +1641,7 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
|
||||
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
|
||||
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
|
||||
int gpuIndex, cpuIndex;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex, /*showWarn=*/true));
|
||||
NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex));
|
||||
gpu = system->nodes[GPU].nodes+gpuIndex;
|
||||
cpu = system->nodes[CPU].nodes+cpuIndex;
|
||||
@@ -1619,8 +1653,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#ifdef ENABLE_TRACE
|
||||
{
|
||||
char affinityStr[sizeof(cpu_set_t)*2];
|
||||
NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
|
||||
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
|
||||
TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev,
|
||||
ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr)));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1630,8 +1664,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
#ifdef ENABLE_TRACE
|
||||
{
|
||||
char affinityStr[sizeof(cpu_set_t)*2];
|
||||
NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
|
||||
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
|
||||
TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev,
|
||||
ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr)));
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1648,8 +1682,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
|
||||
// If there is a non empty set, use it to set affinity
|
||||
if (CPU_COUNT(&finalMask)) {
|
||||
char affinityStr[sizeof(cpu_set_t)*2];
|
||||
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
|
||||
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
|
||||
INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev,
|
||||
ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr)));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+22
-8
@@ -10,6 +10,8 @@
|
||||
|
||||
#include "graph.h"
|
||||
#include "core.h"
|
||||
#include "xml.h"
|
||||
#include "net.h"
|
||||
#include "archinfo.h"
|
||||
#include <string.h>
|
||||
|
||||
@@ -57,9 +59,10 @@ extern const char* topoNodeTypeStr[];
|
||||
#define LINK_PCI 4
|
||||
// Skipping 5 for PATH_PXB
|
||||
// Skipping 6 for PATH_PXN
|
||||
// Skipping 7 for PATH_PHB
|
||||
#define LINK_SYS 8
|
||||
#define LINK_NET 9
|
||||
// Skipping 7 for PATH_P2C
|
||||
// Skipping 8 for PATH_PHB
|
||||
#define LINK_SYS 9
|
||||
#define LINK_NET 10
|
||||
extern const char* topoLinkTypeStr[];
|
||||
|
||||
// Local (myself)
|
||||
@@ -83,20 +86,23 @@ extern const char* topoLinkTypeStr[];
|
||||
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
|
||||
#define PATH_PXN 6
|
||||
|
||||
// Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC
|
||||
#define PATH_P2C 7
|
||||
|
||||
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
|
||||
#define PATH_PHB 7
|
||||
#define PATH_PHB 8
|
||||
|
||||
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
|
||||
#define PATH_SYS 8
|
||||
#define PATH_SYS 9
|
||||
|
||||
// Connection through the network
|
||||
#define PATH_NET 9
|
||||
#define PATH_NET 10
|
||||
|
||||
// New type of path which should precede PATH_PIX
|
||||
#define PATH_PORT PATH_NVL
|
||||
|
||||
// Disconnected
|
||||
#define PATH_DIS 10
|
||||
#define PATH_DIS 11
|
||||
extern const char* topoPathTypeStr[];
|
||||
|
||||
struct ncclTopoNode;
|
||||
@@ -217,6 +223,13 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int*
|
||||
ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max);
|
||||
ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink);
|
||||
|
||||
struct ncclTopoNetState {
|
||||
int nVirtualNics;
|
||||
int nPhysicalNics;
|
||||
const char* name;
|
||||
};
|
||||
ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport);
|
||||
|
||||
#define NCCL_TOPO_XML_MAX_NODES 8192
|
||||
#define NCCL_GRAPH_XML_MAX_NODES 8192
|
||||
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash);
|
||||
@@ -236,7 +249,7 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
|
||||
static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index, bool showWarn) {
|
||||
*index = -1;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
|
||||
@@ -244,6 +257,7 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
if (showWarn) WARN("ncclTopoRankToIndex could not find rank %d", rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
|
||||
+57
-34
@@ -18,13 +18,13 @@ static int getNthreads(const char* name, int env, int min, int max, int def, int
|
||||
int nt = env;
|
||||
if (nt > 0) {
|
||||
if (nt % WarpSize != 0) {
|
||||
WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
|
||||
nt = max;
|
||||
} else if (nt > max) {
|
||||
WARN("Invalid %s %d (maximum %d).", name, nt, max);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (maximum %d).", name, nt, max);
|
||||
nt = max;
|
||||
} else if (nt < min) {
|
||||
WARN("Invalid %s %d (minimum %d).", name, nt, min);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (minimum %d).", name, nt, min);
|
||||
nt = min;
|
||||
}
|
||||
} else {
|
||||
@@ -53,11 +53,14 @@ static int getNthreads(const char* name, int env, int min, int max, int def, int
|
||||
// NCCL_PROTO="^LL128;allreduce:LL128"
|
||||
// Enable everything but LL128, but only LL128 for allreduce.
|
||||
ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
char* fullStr = strdup(str);
|
||||
char* tmpFullStr;
|
||||
char* fullToken = strtok_r(fullStr, ";", &tmpFullStr);
|
||||
char* subToken = nullptr;
|
||||
char* tokStr = nullptr;
|
||||
while (fullToken) {
|
||||
char* subToken = strdup(fullToken);
|
||||
subToken = strdup(fullToken);
|
||||
char* tmpSubStr;
|
||||
char* prefix = strtok_r(subToken, ":", &tmpSubStr);
|
||||
char* elemList = strtok_r(NULL, ":", &tmpSubStr);
|
||||
@@ -67,7 +70,8 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes
|
||||
// because then all the prefixes before the prefix-less entry would be
|
||||
// overwritten.
|
||||
WARN("All entries except the first must have a prefix: \"%s\"", str);
|
||||
return ncclInvalidUsage;
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
elemList = prefix;
|
||||
prefix = NULL;
|
||||
@@ -86,7 +90,7 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes
|
||||
foundPrefix = true;
|
||||
for (int e=0; e<nelems; e++) list[p*nelems+e] = unset;
|
||||
|
||||
char* tokStr = strdup(elemList);
|
||||
tokStr = strdup(elemList);
|
||||
char* tmpStr;
|
||||
char* elem = strtok_r(tokStr, ",", &tmpStr);
|
||||
while (elem) {
|
||||
@@ -99,22 +103,32 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes
|
||||
}
|
||||
if (e==nelems) {
|
||||
WARN("Unrecognized element token \"%s\" when parsing \"%s\"", elem, str);
|
||||
return ncclInvalidUsage;
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
elem = strtok_r(NULL, ",", &tmpStr);
|
||||
}
|
||||
free(tokStr);
|
||||
tokStr = nullptr;
|
||||
}
|
||||
if (!foundPrefix) {
|
||||
WARN("Unrecognized prefix token \"%s\" when parsing \"%s\"", prefix, str);
|
||||
return ncclInvalidUsage;
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
free(subToken);
|
||||
subToken = nullptr;
|
||||
|
||||
fullToken = strtok_r(NULL, ";", &tmpFullStr);
|
||||
}
|
||||
|
||||
exit:
|
||||
free(tokStr);
|
||||
free(subToken);
|
||||
free(fullStr);
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Latencies in us, Bandwidths in GB/s
|
||||
@@ -448,6 +462,8 @@ static float getNetOverhead(struct ncclComm* comm) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
NCCL_PARAM(Ll128C2c, "LL128_C2C", 1);
|
||||
|
||||
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
|
||||
int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
@@ -521,7 +537,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
|
||||
//INFO(NCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", ncclAlgoStr[a], ncclProtoStr[p], busBw, comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);
|
||||
|
||||
if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
|
||||
if (a == NCCL_ALGO_NVLS) {
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
|
||||
} else {
|
||||
// allgather and reducescatter
|
||||
bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f);
|
||||
}
|
||||
}
|
||||
if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);
|
||||
|
||||
// Various model refinements
|
||||
@@ -543,19 +566,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
||||
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
||||
busBw = ppn * bw;
|
||||
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
||||
int nicPerNode = comm->collNetHeadsNum;
|
||||
if (coll == ncclFuncAllGather && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
if (coll == ncclFuncReduceScatter && comm->nNodes > 1) {
|
||||
if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0;
|
||||
}
|
||||
// Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly
|
||||
// interpolate the two.
|
||||
float w = (ppn-1)/(8-1);
|
||||
busBw *= w*0.85 + (1-w)*0.95;
|
||||
busBw = ppn * std::min(graphs[a]->bwIntra, graphs[a]->bwInter * 0.9f);
|
||||
} else {
|
||||
// Collnet+Direct requires all GPUs to have a local NIC to work at full speed
|
||||
float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
|
||||
@@ -564,8 +575,27 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (minCompCap >= 90) busBw *= .85;
|
||||
}
|
||||
}
|
||||
// disable collnet for allgather/reducescatter if #localranks > #heads
|
||||
// AllGather/ReduceScatter requires 1:1 GPU:NIC
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_COLLNET_DIRECT) && p == NCCL_PROTO_SIMPLE && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) && comm->nNodes > 1) {
|
||||
int nHeads = 0;
|
||||
if (coll == ncclFuncAllGather && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->iallgather)) busBw = 0.0f;
|
||||
if (coll == ncclFuncReduceScatter && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter)) busBw = 0.0f;
|
||||
if (comm->config.collnetEnable)
|
||||
nHeads = comm->collNetHeadsNum;
|
||||
else
|
||||
busBw = 0.0f;
|
||||
if (busBw > 0.0f) {
|
||||
for (int r = 0; r < comm->nRanks; r++) {
|
||||
int node = comm->rankToNode[r];
|
||||
if (comm->nodeRanks[node].localRanks > nHeads) {
|
||||
busBw = 0.0f;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
float ratio = 1.0f;
|
||||
@@ -689,7 +719,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
// Disable NVLS Tree on a single node
|
||||
if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1;
|
||||
// Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported.
|
||||
if (comm->collNetSupport == 0 &&
|
||||
if (comm->config.collnetEnable == 0 &&
|
||||
(a == NCCL_ALGO_COLLNET_DIRECT ||
|
||||
a == NCCL_ALGO_COLLNET_CHAIN ||
|
||||
(a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1;
|
||||
@@ -716,17 +746,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
#else
|
||||
// Enable LL128 by default only on Volta/Ampere/Hopper+NVLink. Other cases are not tested and may cause silent data corruption.
|
||||
pEnable = 1;
|
||||
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN));
|
||||
pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN)));
|
||||
pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
|
||||
pEnable &= (minCompCap == maxCompCap);
|
||||
switch (minCompCap) {
|
||||
case 70: pEnable &= 1; break;
|
||||
case 80: pEnable &= 1; break;
|
||||
case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break;
|
||||
case 100: pEnable &= 1; break;
|
||||
case 120: pEnable &= 1; break;
|
||||
default: pEnable &= 0; break;
|
||||
}
|
||||
pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2));
|
||||
#endif
|
||||
}
|
||||
if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
|
||||
|
||||
+29
-8
@@ -42,7 +42,13 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
|
||||
#if INT_OK
|
||||
int o = 0;
|
||||
do {
|
||||
value[o++] = c;
|
||||
value[o] = c;
|
||||
if (o == MAX_STR_LEN-1) {
|
||||
value[o] = '\0';
|
||||
WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
|
||||
return ncclInternalError;
|
||||
}
|
||||
o++;
|
||||
NCCLCHECK(xmlGetChar(file, &c));
|
||||
} while (c >= '0' && c <= '9');
|
||||
value[o] = '\0';
|
||||
@@ -54,10 +60,17 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
|
||||
#endif
|
||||
}
|
||||
int o = 0;
|
||||
char quote = c; // Remember which quote type we started with
|
||||
do {
|
||||
NCCLCHECK(xmlGetChar(file, &c));
|
||||
value[o++] = c;
|
||||
} while (c != '"');
|
||||
value[o] = c;
|
||||
if (o == MAX_STR_LEN-1) {
|
||||
value[o] = '\0';
|
||||
WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN);
|
||||
return ncclInternalError;
|
||||
}
|
||||
o++;
|
||||
} while (c != quote);
|
||||
value[o-1] = '\0';
|
||||
NCCLCHECK(xmlGetChar(file, last));
|
||||
return ncclSuccess;
|
||||
@@ -270,7 +283,7 @@ ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node
|
||||
ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) {
|
||||
FILE* file = fopen(xmlTopoFile, "w");
|
||||
if (file == NULL) {
|
||||
WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "Unable to open %s, not dumping topology.", xmlTopoFile);
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes));
|
||||
@@ -385,7 +398,7 @@ ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml
|
||||
FILE* file = fopen(xmlTopoFile, "r");
|
||||
if (file == NULL) {
|
||||
if (warn) {
|
||||
WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
|
||||
INFO(NCCL_GRAPH|NCCL_ENV, "Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -835,7 +848,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18;
|
||||
|
||||
if (maxNvLinks > 0 && nvmlDev == NULL) {
|
||||
WARN("No NVML device handle. Skipping nvlink detection.");
|
||||
INFO(NCCL_GRAPH, "No NVML device handle. Skipping nvlink detection.");
|
||||
maxNvLinks = 0;
|
||||
}
|
||||
|
||||
@@ -1052,8 +1065,16 @@ ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) {
|
||||
NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k));
|
||||
*keep += k;
|
||||
}
|
||||
if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them.
|
||||
(strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) {
|
||||
// Remove node if it has no children and no keep attribute
|
||||
if (*keep == 0 && // Trim PCI switches, CPUs with no used GPU/NIC under them, or pruned NICs
|
||||
(strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0 || strcmp(node->name, "nic") == 0 || strcmp(node->name, "net") == 0)) {
|
||||
#ifdef ENABLE_TRACE
|
||||
const char* name;
|
||||
const char* busid;
|
||||
NCCLCHECK(xmlGetAttr(node, "name", &name));
|
||||
NCCLCHECK(xmlGetAttr(node, "busid", &busid));
|
||||
TRACE(NCCL_GRAPH, "Removing node %s %s %s\n", node->name, name, busid);
|
||||
#endif
|
||||
NCCLCHECK(xmlRemoveNode(node));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +121,13 @@ static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* a
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrName, uint64_t* value) {
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
|
||||
*value = strtoull(str, NULL, 0);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) {
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
|
||||
@@ -128,7 +135,6 @@ static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrNam
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
|
||||
const char* str;
|
||||
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
|
||||
@@ -258,7 +264,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -271,7 +276,6 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -284,7 +288,6 @@ static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrNam
|
||||
node->attrs[index].key[MAX_STR_LEN] = '\0';
|
||||
}
|
||||
snprintf(node->attrs[index].value, MAX_STR_LEN, "%#lx", value);
|
||||
node->attrs[index].value[MAX_STR_LEN] = '\0';
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+245
-115
@@ -19,16 +19,14 @@
|
||||
|
||||
using namespace rccl;
|
||||
|
||||
#define GROUP_MAX_RECLAIM_STEPS 10
|
||||
|
||||
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
|
||||
__thread ncclResult_t ncclGroupError = ncclSuccess;
|
||||
__thread struct ncclComm* ncclGroupCommHead = nullptr;
|
||||
__thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum] = {nullptr};
|
||||
__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr;
|
||||
__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs;
|
||||
__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL;
|
||||
__thread struct ncclGroupJob ncclGroupJobMain;
|
||||
__thread int ncclGroupBlocking = -1; /* default mode */
|
||||
__thread bool ncclGroupJobAbortFlag = false;
|
||||
|
||||
void* ncclAsyncJobMain(void* arg);
|
||||
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
@@ -219,6 +217,66 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
struct ncclGroupSymmetricJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm* comm;
|
||||
};
|
||||
|
||||
NCCL_PARAM(WinStride, "WIN_STRIDE", -1);
|
||||
|
||||
// Async-job entry point for symmetric memory registration on one communicator.
//
// job_ must point at a ncclGroupSymmetricJob; its `comm` member selects the communicator.
//
// First use (comm->baseStride == 0):
//   - chooses comm->baseStride: the WinStride parameter when it is not -1, otherwise the
//     largest totalGlobalMem found in comm->peerInfo[0..nRanks),
//   - runs ncclIpcSymmetricInit() and ncclNvlsSymmetricInit(),
//   - allocates the internal ncclSymDevBase region, zeroes it on the shared host stream,
//   - fills in comm->symDevComm.
// Every call then drains comm->symRegTaskQueue, registering each queued buffer.
//
// Returns ncclSuccess, or the first error reported by a CUDA/NCCL call.
ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) {
  struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_;
  struct ncclComm* comm = job->comm;
  ncclResult_t ret = ncclSuccess;

  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
  if (comm->baseStride == 0) {
    cudaStream_t hostStream;
    // first time to allocate symmetric VA space.
    // calling into this function means symmetric is supported.
    struct ncclSymDevBase* symBase = NULL;
    size_t size = ncclSymDevBase::size(comm->localRanks);
    if (ncclParamWinStride() != -1) {
      comm->baseStride = ncclParamWinStride();
    } else {
      size_t maxStride = 0;
      for (int r = 0; r < comm->nRanks; ++r)
        if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem;
      comm->baseStride = maxStride;
    }
    INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30);
    NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail);
    NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail);
    comm->symAllocHead = 0;

    // Allocate symmetric memory for NCCL internal usage
    NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail);
    // The internal region is expected to be the first allocation in this rank's window.
    assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride));
    NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
    // NOTE(review): if either CUDA call below fails, the host stream acquired above is not
    // released on the failure path — confirm whether that needs handling.
    CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail);
    CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail);

    comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride);
    comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr;
    comm->symDevComm.nRanks = comm->localRanks;
    comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks);
    comm->symDevComm.rank = comm->localRank;
    comm->symDevComm.stride4G = comm->baseStride >> 32;
  }

  while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) {
    struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue);
    ncclResult_t regRet = ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle);
    // The task is already off the queue, so free it before checking the result;
    // otherwise a failed registration would leak it.
    free(task);
    NCCLCHECKGOTO(regRet, ret, fail);
  }

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* cliqueHead = head;
|
||||
@@ -235,7 +293,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
|
||||
comm = comm->groupNext;
|
||||
comm = comm->groupNext[ncclGroupTaskTypeCollective];
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
|
||||
cliqueNextHead = comm;
|
||||
|
||||
@@ -252,7 +310,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
bool moreRounds = false;
|
||||
comm = cliqueHead;
|
||||
do { // Iterate clique members.
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
struct ncclComm* next = comm->groupNext[ncclGroupTaskTypeCollective];
|
||||
if (useBarrier) {
|
||||
// Barrier reduction result tells us if this was the final round.
|
||||
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
|
||||
@@ -287,66 +345,62 @@ failure:
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline void groupResetJobState(struct ncclGroupJob* job) {
|
||||
if (job) {
|
||||
if (job->groupBlockingPtr) *job->groupBlockingPtr = -1;
|
||||
if (job->abortFlagPtr) *job->abortFlagPtr = false;
|
||||
if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess;
|
||||
if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL;
|
||||
if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL;
|
||||
memset(job, 0, sizeof(struct ncclGroupJob));
|
||||
}
|
||||
static inline void groupLocalResetJobState() {
|
||||
ncclGroupError = ncclSuccess;
|
||||
for (int type = 0; type < ncclGroupTaskTypeNum; ++type) ncclGroupCommHead[type] = NULL;
|
||||
ncclGroupCommPreconnectHead = NULL;
|
||||
ncclGroupBlocking = -1;
|
||||
ncclIntruQueueConstruct(&ncclAsyncJobs);
|
||||
return;
|
||||
}
|
||||
|
||||
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) {
|
||||
struct ncclComm* comm = *groupCommHeadPtr;
|
||||
|
||||
/* reset all thread local variables */
|
||||
*groupCommHeadPtr = NULL;
|
||||
*groupCommPreconnectHeadPtr = NULL;
|
||||
*groupErrorPtr = ncclSuccess;
|
||||
*groupBlockingPtr = -1;
|
||||
*groupJobAbortFlagPtr = false;
|
||||
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
(void) ncclGroupCommLeave(comm); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
for (int j = 0; j < MAXCHANNELS/64; j++) {
|
||||
comm->connectSend[i].masks[j] = 0UL;
|
||||
comm->connectRecv[i].masks[j] = 0UL;
|
||||
}
|
||||
}
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next>* asyncJobsPtr, ncclResult_t error) {
|
||||
struct ncclComm* comm;
|
||||
for (int type = 0; type < ncclGroupTaskTypeNum; ++type) {
|
||||
comm = groupCommHeadPtr[type];
|
||||
// reset groupCommHeadPtr[type]
|
||||
groupCommHeadPtr[type] = nullptr;
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext[type];
|
||||
(void)ncclGroupCommLeave(comm, type); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
if (type == ncclGroupTaskTypeCollective) {
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
for (int j = 0; j < MAXCHANNELS/64; j++) {
|
||||
comm->connectSend[i].masks[j] = 0UL;
|
||||
comm->connectRecv[i].masks[j] = 0UL;
|
||||
}
|
||||
}
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
|
||||
{ // Reset comm->planner to empty.
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks * sizeof(comm->planner.peers[0]));
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
|
||||
{ // Reset comm->planner to empty.
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
|
||||
if (!comm->config.blocking)
|
||||
(void)ncclCommSetAsyncError(comm, error);
|
||||
comm = next;
|
||||
}
|
||||
|
||||
if (!comm->config.blocking)
|
||||
(void) ncclCommSetAsyncError(comm, error);
|
||||
comm = next;
|
||||
}
|
||||
|
||||
/* reset everything */
|
||||
@@ -423,11 +477,10 @@ fail:
|
||||
static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
|
||||
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
|
||||
|
||||
bool *groupAbortFlag = gjob->abortFlagPtr;
|
||||
struct ncclComm **groupCommHeadMain = gjob->groupCommHead;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = gjob->groupCommPreconnectHead;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = &gjob->asyncJobs;
|
||||
bool *groupAbortFlag = &gjob->abortFlag;
|
||||
|
||||
if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommPreconnectHeadMain;
|
||||
@@ -451,9 +504,41 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
|
||||
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
|
||||
|
||||
// only loop through sym alloc and register tasks
|
||||
for (int type = ncclGroupTaskTypeSymRegister; type <= ncclGroupTaskTypeSymRegister; ++type) {
|
||||
if (groupCommHeadMain[type]) {
|
||||
struct ncclComm* cliqueHead = groupCommHeadMain[type];
|
||||
struct ncclComm* comm = NULL;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncSymJobs;
|
||||
ncclIntruQueueConstruct(&asyncSymJobs);
|
||||
do {
|
||||
comm = cliqueHead;
|
||||
do {
|
||||
struct ncclGroupSymmetricJob* job;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->base.func = ncclCommGroupRegisterSymmetric;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(&asyncSymJobs, (struct ncclAsyncJob*)job);
|
||||
comm = comm->groupNext[type];
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
|
||||
NCCLCHECKGOTO(asyncJobLaunch(&asyncSymJobs, groupAbortFlag), ret, fail);
|
||||
while (!ncclIntruQueueEmpty(&asyncSymJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncSymJobs);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
cliqueHead = comm;
|
||||
} while (cliqueHead != nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Connect channels at runtime if cumem is supported */
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* cliqueHead = groupCommHeadMain;
|
||||
if (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr) {
|
||||
struct ncclComm* cliqueHead = groupCommHeadMain[ncclGroupTaskTypeCollective];
|
||||
struct ncclComm* comm = NULL;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
|
||||
ncclIntruQueueConstruct(&asyncCollJobs);
|
||||
@@ -484,7 +569,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
|
||||
}
|
||||
comm = comm->groupNext;
|
||||
comm = comm->groupNext[ncclGroupTaskTypeCollective];
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
|
||||
// connect
|
||||
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
|
||||
@@ -496,42 +581,49 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
} while (cliqueHead != nullptr);
|
||||
|
||||
// done with all buffer allocation, start registration and enqueue
|
||||
comm = groupCommHeadMain;
|
||||
comm = groupCommHeadMain[ncclGroupTaskTypeCollective];
|
||||
do {
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail);
|
||||
comm = comm->groupNext;
|
||||
comm = comm->groupNext[ncclGroupTaskTypeCollective];
|
||||
} while (comm);
|
||||
}
|
||||
|
||||
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
|
||||
if ((!simInfo) && (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr)) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain[ncclGroupTaskTypeCollective]), ret, fail);
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
|
||||
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
|
||||
if (!job->destroyFlag && job->comm && !job->comm->config.blocking && groupCommHeadMain[ncclGroupTaskTypeCollective] == nullptr)
|
||||
(void) ncclCommSetAsyncError(job->comm, ret);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
while (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
// Poll for callbacks sent to us from other threads. Typically these free
|
||||
// resources from to our memory pools and UB
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail);
|
||||
(void) ncclGroupCommLeave(comm);
|
||||
if (!comm->config.blocking) {
|
||||
(void) ncclCommSetAsyncError(comm, ret);
|
||||
for (int type = 0; type < ncclGroupTaskTypeNum; ++type) {
|
||||
while (groupCommHeadMain[type] != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain[type];
|
||||
struct ncclComm* next = comm->groupNext[type];
|
||||
// Poll for callbacks sent to us from other threads. Typically these free
|
||||
// resources back to our memory pools and UB
|
||||
if (comm->reclaimSteps == GROUP_MAX_RECLAIM_STEPS) {
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail);
|
||||
comm->reclaimSteps = 0;
|
||||
} else {
|
||||
comm->reclaimSteps++;
|
||||
}
|
||||
(void)ncclGroupCommLeave(comm, type);
|
||||
if (!comm->config.blocking) {
|
||||
(void)ncclCommSetAsyncError(comm, ret);
|
||||
}
|
||||
groupCommHeadMain[type] = next;
|
||||
}
|
||||
groupCommHeadMain = next;
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret);
|
||||
groupCleanup(gjob->groupCommHead, &gjob->asyncJobs, ret);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -544,6 +636,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
|
||||
ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER;
|
||||
ncclSimInfo_t* internalSimInfoPtr = NULL;
|
||||
size_t realSize = 0;
|
||||
bool hasCommHead = false;
|
||||
ncclGroupJob* groupJob = NULL;
|
||||
|
||||
internalSimInfo.magic = 0;
|
||||
|
||||
@@ -573,72 +667,108 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
|
||||
internalSimInfoPtr = &internalSimInfo;
|
||||
}
|
||||
|
||||
if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
|
||||
ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
|
||||
ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead;
|
||||
ncclGroupJobMain.groupErrorPtr = &ncclGroupError;
|
||||
ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
|
||||
ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag;
|
||||
ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking;
|
||||
ncclGroupJobMain.initialized = true;
|
||||
ncclGroupJobMainPtr = &ncclGroupJobMain;
|
||||
for (int type = 0; type < ncclGroupTaskTypeNum; ++type) {
|
||||
if (ncclGroupCommHead[type]) {
|
||||
hasCommHead = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&groupJob, 1), ret, fail);
|
||||
ncclIntruQueueConstruct(&groupJob->asyncJobs);
|
||||
groupJob->groupRefCount = 0;
|
||||
groupJob->nonBlockingInit = false;
|
||||
memcpy(groupJob->groupCommHead, ncclGroupCommHead, sizeof(ncclGroupCommHead));
|
||||
groupJob->groupCommPreconnectHead = ncclGroupCommPreconnectHead;
|
||||
groupJob->groupError = ncclSuccess;
|
||||
groupJob->abortFlag = false;
|
||||
groupJob->joined = false;
|
||||
ncclIntruQueueTransfer(&groupJob->asyncJobs, &ncclAsyncJobs);
|
||||
|
||||
if (hasCommHead || !ncclIntruQueueEmpty(&groupJob->asyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
|
||||
/* make sure ncclGroupBlocking has been set. */
|
||||
assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
|
||||
if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) {
|
||||
/* nonblocking group */
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
if (!ncclIntruQueueEmpty(&groupJob->asyncJobs)) {
|
||||
ncclAsyncJob* job = ncclIntruQueueHead(&groupJob->asyncJobs);
|
||||
do {
|
||||
NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail);
|
||||
job->comm->groupJob = ncclGroupJobMainPtr;
|
||||
if (job->comm->groupJob == NULL) {
|
||||
job->comm->groupJob = groupJob;
|
||||
groupJob->groupRefCount++;
|
||||
}
|
||||
job = job->next;
|
||||
} while (job);
|
||||
}
|
||||
|
||||
if (ncclGroupCommHead) {
|
||||
ncclComm_t comm = ncclGroupCommHead;
|
||||
do {
|
||||
NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
|
||||
/* link group job to communicators. */
|
||||
comm->groupJob = ncclGroupJobMainPtr;
|
||||
comm = comm->groupNext;
|
||||
} while (comm);
|
||||
for (int type = 0; type < ncclGroupTaskTypeNum; ++type) {
|
||||
if (ncclGroupCommHead[type]) {
|
||||
ncclComm_t comm = ncclGroupCommHead[type];
|
||||
do {
|
||||
NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail);
|
||||
/* link group job to communicators. */
|
||||
if (comm->groupJob == NULL) {
|
||||
comm->groupJob = groupJob;
|
||||
groupJob->groupRefCount++;
|
||||
}
|
||||
comm = comm->groupNext[type];
|
||||
} while (comm);
|
||||
}
|
||||
}
|
||||
|
||||
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
|
||||
PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail);
|
||||
groupJob->base.func = groupLaunchNonBlocking;
|
||||
PTHREADCHECKGOTO(pthread_create(&groupJob->base.thread, NULL, ncclAsyncJobMain, (void*)&groupJob->base), "pthread_create", ret, fail);
|
||||
groupJob->nonBlockingInit = true;
|
||||
ret = ncclInProgress;
|
||||
} else {
|
||||
/* blocking group */
|
||||
int savedDev;
|
||||
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
|
||||
NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
|
||||
NCCLCHECKGOTO(groupLaunch(&groupJob->base, internalSimInfoPtr), ret, fail);
|
||||
CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail);
|
||||
if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
|
||||
groupResetJobState(ncclGroupJobMainPtr);
|
||||
free(groupJob);
|
||||
}
|
||||
}
|
||||
/* Reset the job state for the next group call. */
|
||||
groupLocalResetJobState();
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret);
|
||||
if (groupJob) {
|
||||
groupCleanup(groupJob->groupCommHead, &groupJob->asyncJobs, ret);
|
||||
free(groupJob);
|
||||
} else {
|
||||
groupCleanup(ncclGroupCommHead, &ncclAsyncJobs, ret);
|
||||
}
|
||||
groupLocalResetJobState();
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (groupJob && groupJob->initialized) {
|
||||
ret = ncclAsyncJobComplete(&groupJob->base);
|
||||
groupResetJobState(groupJob);
|
||||
if (groupJob && groupJob->nonBlockingInit) {
|
||||
if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) {
|
||||
ret = ncclAsyncJobComplete(&groupJob->base);
|
||||
}
|
||||
if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) {
|
||||
free(groupJob);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
|
||||
if (groupJob && groupJob->initialized) {
|
||||
__atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE);
|
||||
NCCLCHECK(ncclGroupJobComplete(groupJob));
|
||||
if (groupJob && groupJob->nonBlockingInit) {
|
||||
if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) {
|
||||
__atomic_store_n(&groupJob->abortFlag, true, __ATOMIC_RELAXED);
|
||||
ncclAsyncJobComplete(&groupJob->base);
|
||||
}
|
||||
if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) {
|
||||
free(groupJob);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -193,10 +193,9 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
|
||||
#define MAX_ALLOC_TRACK_NGPU 128
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#if ROCM_VERSION >= 70000
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
#include "rocmwrap.h"
|
||||
|
||||
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
|
||||
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
||||
@@ -262,7 +261,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
// CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
@@ -318,21 +317,21 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
extern int ncclCuMemEnable();
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
|
||||
WARN("CUMEM not supported prior to CUDA 11.3");
|
||||
WARN("CUMEM not supported prior to ROCm 7.0");
|
||||
return ncclInternalError;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ALLOCATOR_H_
|
||||
#define NCCL_ALLOCATOR_H_
|
||||
|
||||
// Allocates `size` bytes with the requested `alignment` from the communicator's symmetric
// memory and returns the address through `symPtr`.
// NOTE(review): described from the signature and one visible call site only; the
// definition is not in view — confirm against the implementation.
ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
|
||||
// Counterpart of ncclCommSymmetricAllocInternal.
// NOTE(review): presumably releases the symmetric allocation at `symPtr`; the definition
// is not in view — confirm against the implementation.
ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
|
||||
|
||||
#endif
|
||||
@@ -122,6 +122,10 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);
|
||||
|
||||
typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, int* excludeRanksList,
|
||||
int excludeRanksCount, ncclComm_t *newcomm,
|
||||
ncclConfig_t* config, int shrinkFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config);
|
||||
|
||||
@@ -158,6 +162,10 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,
|
||||
|
||||
typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
|
||||
typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
|
||||
|
||||
typedef struct rcclApiFuncTable
|
||||
{
|
||||
uint64_t size;
|
||||
@@ -184,6 +192,7 @@ typedef struct rcclApiFuncTable
|
||||
ncclCommFinalize_fn_t ncclCommFinalize_fn;
|
||||
ncclCommDestroy_fn_t ncclCommDestroy_fn;
|
||||
ncclCommAbort_fn_t ncclCommAbort_fn;
|
||||
ncclCommShrink_fn_t ncclCommShrink_fn;
|
||||
ncclCommSplit_fn_t ncclCommSplit_fn;
|
||||
ncclGetErrorString_fn_t ncclGetErrorString_fn;
|
||||
ncclGetLastError_fn_t ncclGetLastError_fn;
|
||||
@@ -198,6 +207,8 @@ typedef struct rcclApiFuncTable
|
||||
mscclUnloadAlgo_fn_t mscclUnloadAlgo_fn;
|
||||
ncclCommRegister_fn_t ncclCommRegister_fn;
|
||||
ncclCommDeregister_fn_t ncclCommDeregister_fn;
|
||||
ncclCommWindowRegister_fn_t ncclCommWindowRegister_fn;
|
||||
ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
|
||||
ncclAllReduceWithBias_fn_t ncclAllReduceWithBias_fn;
|
||||
|
||||
} rcclApiFuncTable;
|
||||
|
||||
+163
-23
@@ -19,6 +19,28 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// minval(a, b, ...): smallest of one or more values of the same type.
// Base case: a single value is its own minimum.
template<typename Int>
constexpr static __host__ __device__ Int minval(Int a) {
  return a;
}
// Recursive case: reduce the first two values to one, then recurse on the rest.
template<typename Int, typename ...More>
constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) {
#if __CUDA_ARCH__
  // Device compilation uses min().
  return minval(min(a, b), more...);
#else
  if (a < b) return minval(a, more...);
  return minval(b, more...);
#endif
}
|
||||
|
||||
// maxval(a, b, ...): largest of one or more values of the same type.
// Base case: a single value is its own maximum.
template<typename Int>
constexpr static __host__ __device__ Int maxval(Int a) {
  return a;
}
// Recursive case: reduce the first two values to one, then recurse on the rest.
template<typename Int, typename ...More>
constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) {
#if __CUDA_ARCH__
  // Device compilation uses max().
  return maxval(max(a, b), more...);
#else
  if (a > b) return maxval(a, more...);
  return maxval(b, more...);
#endif
}
|
||||
|
||||
#define DIVUP(x, y) \
|
||||
(((x)+(y)-1)/(y))
|
||||
|
||||
@@ -32,32 +54,150 @@
|
||||
size = ((size + (align) - 1) / (align)) * (align);
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z divUp(X x, Y y) {
|
||||
return (x+y-1)/y;
|
||||
}
|
||||
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z roundUp(X x, Y y) {
|
||||
return (x+y-1) - (x+y-1)%y;
|
||||
}
|
||||
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
||||
__host__ __device__ constexpr Z roundDown(X x, Y y) {
|
||||
static __host__ __device__ constexpr Z roundDown(X x, Y y) {
|
||||
return x - x%y;
|
||||
}
|
||||
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
static __host__ __device__ constexpr Z alignUp(X x, int a) {
|
||||
return (x + a-1) & Z(-a);
|
||||
}
|
||||
// assumes second argument is a power of 2
|
||||
template<typename X, typename Z = decltype(X()+int())>
|
||||
__host__ __device__ constexpr Z alignDown(X x, int a) {
|
||||
static __host__ __device__ constexpr Z alignDown(X x, int a) {
|
||||
return x & Z(-a);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int countOneBits(Int x) {
|
||||
constexpr __host__ __device__ bool isPow2(Int x) {
|
||||
return (x & (x-1)) == 0;
|
||||
}
|
||||
|
||||
// Adds delta4G to the second 32-bit word of `base` and returns the result.
// NOTE(review): this treats T as two 32-bit words with the upper half at index 1
// (i.e. a 64-bit value on a little-endian target, offset in 4GiB units) — confirm.
template<typename T>
static __host__ __device__ T add4G(T base, int delta4G) {
  union Words { T whole; uint32_t word[2]; };
  Words v;
  v.whole = base;
  v.word[1] += delta4G;
  return v.whole;
}
|
||||
|
||||
// Advances the second 32-bit word of `ptr` by delta4G; if the result reaches hi4G or
// beyond, it is pulled back by (hi4G - lo4G) so it wraps within [lo4G, hi4G).
// NOTE(review): assumes T is 64 bits wide with the upper half at word index 1 — confirm.
template<typename T>
static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union Words { T whole; uint32_t word[2]; };
  Words v;
  v.whole = ptr;
  v.word[1] += delta4G;
  if (v.word[1] >= hi4G) {
    v.word[1] -= hi4G - lo4G;
  }
  return v.whole;
}
|
||||
|
||||
// Moves the second 32-bit word of `ptr` back by delta4G; if the result drops below lo4G,
// it is pushed forward by (hi4G - lo4G) so it wraps within [lo4G, hi4G).
// NOTE(review): assumes T is 64 bits wide with the upper half at word index 1 — confirm.
template<typename T>
static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) {
  union Words { T whole; uint32_t word[2]; };
  Words v;
  v.whole = ptr;
  v.word[1] -= delta4G;
  if (v.word[1] < lo4G) {
    v.word[1] += hi4G - lo4G;
  }
  return v.whole;
}
|
||||
|
||||
// Reciprocal of x for the 32-bit fast-division helpers: 2^32 / x, truncated to 32 bits.
// x == 1 yields 0, which those helpers treat as "no division needed".
// x must be non-zero.
constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) {
  return static_cast<uint32_t>((uint64_t(1) << 32) / x);
}
|
||||
constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) {
|
||||
return uint64_t(-1)/x + isPow2(x);
|
||||
}
|
||||
|
||||
// Upper 32 bits of the 64-bit product a*b.
static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) {
#if __CUDA_ARCH__
  return __umulhi(a, b);
#else
  const uint64_t wide = static_cast<uint64_t>(a) * b;
  return static_cast<uint32_t>(wide >> 32);
#endif
}
|
||||
// Upper 64 bits of the 128-bit product a*b.
static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) {
#if __CUDA_ARCH__
  return __umul64hi(a, b);
#else
  const unsigned __int128 wide = static_cast<unsigned __int128>(a) * b;
  return static_cast<uint64_t>(wide >> 64);
#endif
}
|
||||
|
||||
// Produce the reciprocal of x*y given their respective reciprocals. This incurs
|
||||
// no integer division on device.
|
||||
static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
|
||||
if (xrcp == 0) return yrcp;
|
||||
if (yrcp == 0) return xrcp;
|
||||
uint32_t rcp = mul32hi(xrcp, yrcp);
|
||||
uint32_t rem = -x*y*rcp;
|
||||
if (x*y <= rem) rcp += 1;
|
||||
return rcp;
|
||||
}
|
||||
// 64-bit counterpart of imulRcp32: builds the reciprocal of x*y from the
// reciprocals of x and y without performing an integer division.
// A reciprocal of 0 stands for a divisor of 1 (see idivRcp64), in which case
// the other factor's reciprocal is already the answer.
static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint64_t xy = x*y;
  // Initial estimate: upper half of the product of both reciprocals.
  uint64_t estimate = mul64hi(xrcp, yrcp);
  // What is left of 2^64 (mod 2^64) after taking out `estimate` multiples of x*y.
  const uint64_t leftover = uint64_t(0) - xy*estimate;
  // One more multiple of x*y still fits, so the estimate was one too small.
  if (xy <= leftover) {
    estimate += 1;
  }
  return estimate;
}
|
||||
|
||||
// Fast integer division where divisor has precomputed reciprocal.
|
||||
// idivFast(x, y, idivRcp(y)) == x/y
|
||||
static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
|
||||
uint32_t q = x, r = 0;
|
||||
if (yrcp != 0) {
|
||||
q = mul32hi(x, yrcp);
|
||||
r = x - y*q;
|
||||
if (r >= y) { q += 1; r -= y; }
|
||||
}
|
||||
*quo = q;
|
||||
*rem = r;
|
||||
}
|
||||
// Fast 64-bit division by a divisor whose reciprocal was precomputed:
//   idivmodFast64(&q, &r, x, y, idivRcp64(y))  gives  q == x/y, r == x%y
// A reciprocal of 0 short-circuits to quotient x and remainder 0.
static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
  if (yrcp == 0) {
    *quo = x;
    *rem = 0;
    return;
  }
  // Estimate the quotient from the reciprocal, then derive the remainder.
  uint64_t quotient = mul64hi(x, yrcp);
  uint64_t remainder = x - y*quotient;
  // The estimate can be one too low; a single correction step fixes it.
  if (remainder >= y) {
    quotient += 1;
    remainder -= y;
  }
  *quo = quotient;
  *rem = remainder;
}
|
||||
|
||||
// Quotient-only wrapper around idivmodFast32: returns x / y, where `yrcp` is
// the precomputed reciprocal of y (see idivRcp32).
static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient, remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return quotient;
}
|
||||
// Quotient-only wrapper around idivmodFast64: returns x / y, where `yrcp` is
// the precomputed reciprocal of y (see idivRcp64).
// The result is returned as uint64_t: the quotient of a 64-bit division can
// exceed 32 bits, and the previous uint32_t return type silently dropped the
// upper half. Callers that store the result in a 32-bit variable keep their
// old (narrowed) value.
static __host__ __device__ uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return q;
}
|
||||
|
||||
// Remainder-only wrapper around idivmodFast32: returns x % y, where `yrcp` is
// the precomputed reciprocal of y (see idivRcp32).
static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient, remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return remainder;
}
|
||||
// Remainder-only wrapper around idivmodFast64: returns x % y, where `yrcp` is
// the precomputed reciprocal of y (see idivRcp64).
// The result is returned as uint64_t: the remainder can be as large as y-1,
// which does not fit in 32 bits for large divisors, and the previous uint32_t
// return type silently dropped the upper half. Callers that store the result
// in a 32-bit variable keep their old (narrowed) value.
static __host__ __device__ uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return r;
}
|
||||
|
||||
template<typename Int>
|
||||
static __host__ __device__ int countOneBits(Int x) {
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(unsigned int)) {
|
||||
return __popc((unsigned int)x);
|
||||
@@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) {
|
||||
|
||||
// Returns index of first one bit or returns -1 if mask is zero.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
static __host__ __device__ int firstOneBit(Int mask) {
|
||||
int i;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int popFirstOneBit(Int* mask) {
|
||||
static __host__ __device__ int popFirstOneBit(Int* mask) {
|
||||
Int tmp = *mask;
|
||||
*mask &= *mask-1;
|
||||
return firstOneBit(tmp);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Down(Int x) {
|
||||
static __host__ __device__ int log2Down(Int x) {
|
||||
int w, n;
|
||||
#if __CUDA_ARCH__
|
||||
if (sizeof(Int) <= sizeof(int)) {
|
||||
@@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ int log2Up(Int x) {
|
||||
static __host__ __device__ int log2Up(Int x) {
|
||||
int w, n;
|
||||
if (x != 0) x -= 1;
|
||||
#if __CUDA_ARCH__
|
||||
@@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Up(Int x) {
|
||||
static __host__ __device__ Int pow2Up(Int x) {
|
||||
return Int(1)<<log2Up(x);
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Down(Int x) {
|
||||
static __host__ __device__ Int pow2Down(Int x) {
|
||||
// True, log2Down can return -1, but we don't normally pass 0 as an argument...
|
||||
// coverity[negative_shift]
|
||||
return Int(1)<<log2Down(x);
|
||||
}
|
||||
|
||||
template<typename UInt, int nSubBits>
|
||||
inline __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
static __host__ __device__ UInt reverseSubBits(UInt x) {
|
||||
if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
|
||||
switch (8*sizeof(UInt)) {
|
||||
case 16: x = __builtin_bswap16(x); break;
|
||||
@@ -225,7 +365,7 @@ template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned lon
|
||||
|
||||
// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
static __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
using UInt = typename ncclToUnsigned<Int>::type;
|
||||
union { UInt ux; Int sx; };
|
||||
sx = x;
|
||||
@@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) {
|
||||
// has nearly the full range of uint32_t except it only keeps the top 3 bits
|
||||
// beneath the leading 1 bit and thus has a max value of 0xf0000000.
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
int log2x;
|
||||
#if __CUDA_ARCH__
|
||||
log2x = 31-__clz(x|1);
|
||||
@@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
|
||||
return exponent<<bitsPerPow2 | mantissa;
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
static __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
uint32_t exponent = x>>bitsPerPow2;
|
||||
uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? 0x8 : 0);
|
||||
if (exponent != 0) exponent -= 1;
|
||||
@@ -270,16 +410,16 @@ inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
|
||||
|
||||
constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
|
||||
|
||||
inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
|
||||
static __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
|
||||
return u32fpEncode(x, 3);
|
||||
}
|
||||
inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
static __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
return u32fpDecode(x, 3);
|
||||
}
|
||||
|
||||
// The hash isn't just a function of the bytes but also where the bytes are split
|
||||
// into different calls to eatHash().
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
static __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
|
||||
char const* ptr = (char const*)bytes;
|
||||
acc[0] ^= size;
|
||||
while (size != 0) {
|
||||
@@ -302,11 +442,11 @@ inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
|
||||
static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
|
||||
eatHash(acc, (const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
uint64_t h = acc[0];
|
||||
h ^= h >> 31;
|
||||
h *= 0xbac3bd562846de6b;
|
||||
@@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
|
||||
return h;
|
||||
}
|
||||
|
||||
inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
|
||||
static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
|
||||
uint64_t acc[2] = {1, 1};
|
||||
eatHash(acc, bytes, size);
|
||||
return digestHash(acc);
|
||||
}
|
||||
template<typename T>
|
||||
inline __host__ __device__ uint64_t getHash(const T* bytes) {
|
||||
static __host__ __device__ uint64_t getHash(const T* bytes) {
|
||||
return getHash((const void*)bytes, sizeof(T));
|
||||
}
|
||||
|
||||
|
||||
+48
-18
@@ -19,6 +19,7 @@
|
||||
#include "graph.h"
|
||||
#include "nvmlwrap.h"
|
||||
#include "profiler.h"
|
||||
#include "allocator.h"
|
||||
#include "latency_profiler/CollTrace.h"
|
||||
#include "rccl_common.h"
|
||||
#include "recorder.h"
|
||||
@@ -140,7 +141,6 @@ struct ncclSharedResources {
|
||||
int* tpRankToLocalRank;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
|
||||
int persistentRefs;
|
||||
cudaEvent_t launchEvent, scratchEvent;
|
||||
|
||||
@@ -229,6 +229,7 @@ struct ncclTaskColl {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
uint8_t nChannels;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
struct ncclTaskP2p* next;
|
||||
@@ -243,6 +244,7 @@ struct ncclTaskP2p {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
uint8_t nChannels;
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
|
||||
@@ -255,10 +257,14 @@ struct ncclKernelPlan {
|
||||
|
||||
bool persistent; // aka captured in a graph
|
||||
bool isHostCbEnq;
|
||||
bool isSymColl;
|
||||
enum ncclDevWorkStorageType workStorageType;
|
||||
bool kernelSpecialized;
|
||||
void *kernelFn;
|
||||
struct ncclDevKernelArgs* kernelArgs;
|
||||
void* kernelFn;
|
||||
union {
|
||||
struct ncclDevKernelArgs* kernelArgs;
|
||||
struct ncclSymDevArgs* kernelSymArgs;
|
||||
};
|
||||
size_t kernelArgsSize;
|
||||
struct channelMasks channelMask;
|
||||
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
|
||||
@@ -367,6 +373,7 @@ struct ncclKernelPlanner {
|
||||
struct Peer* peers/*[nRanks]*/;
|
||||
int nTasksColl, nTasksP2p;
|
||||
bool persistent;
|
||||
bool isSymColl;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
struct ncclCudaStreamList* streams;
|
||||
@@ -430,12 +437,19 @@ struct ncclPeerInfo {
|
||||
int64_t busId;
|
||||
struct ncclComm* comm;
|
||||
int cudaCompCap;
|
||||
size_t totalGlobalMem;
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
int cuMemSupport;
|
||||
int version;
|
||||
};
|
||||
|
||||
// Kinds of task lists a communicator can join within a group. The values are
// used as array indices (e.g. ncclComm::groupNext[] and ncclGroupCommHead[]),
// so they must stay dense and start at 0.
typedef enum ncclGroupTaskType {
|
||||
// Collective tasks; joining with this type also resets the comm's planner.
ncclGroupTaskTypeCollective = 0,
|
||||
// NOTE(review): presumably symmetric-memory registration tasks (see
// symRegTaskQueue in ncclComm) — confirm.
ncclGroupTaskTypeSymRegister = 1,
|
||||
// Number of task types; sizes the per-type arrays. Keep last.
ncclGroupTaskTypeNum = 2,
|
||||
} ncclGroupTaskType_t;
|
||||
|
||||
struct ncclComm {
|
||||
uint64_t startMagic;
|
||||
struct ncclMemoryStack memPermanent, memScoped;
|
||||
@@ -452,9 +466,10 @@ struct ncclComm {
|
||||
struct ncclTopoSystem* topo;
|
||||
struct ncclProxyConnector* gproxyConn;
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
|
||||
bool peerInfoValid;
|
||||
|
||||
int netPluginLoaded;
|
||||
ncclNet_t* ncclNet;
|
||||
int netPluginIndex;
|
||||
int ncclNetVer;
|
||||
ncclNetDeviceType netDeviceType;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
@@ -471,7 +486,6 @@ struct ncclComm {
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
const char* commName;
|
||||
uint64_t commHash;
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
@@ -556,6 +570,7 @@ struct ncclComm {
|
||||
|
||||
// Device side of the communicator (for cudaFree's)
|
||||
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
|
||||
struct ncclSymDevComm symDevComm;
|
||||
|
||||
uint32_t workArgsBytes; // max size of kernel args
|
||||
uint32_t workFifoBytes; // size of workFifoBuf, power of 2
|
||||
@@ -563,12 +578,10 @@ struct ncclComm {
|
||||
void* workFifoBufDev;
|
||||
void* workFifoBufGdrHandle;
|
||||
|
||||
// Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
|
||||
uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
|
||||
// Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
|
||||
uint32_t workFifoConsumedLeast;
|
||||
// Monotonic number of bytes (mod 1<<32) sent to fifo.
|
||||
uint32_t workFifoProduced;
|
||||
uint32_t workFifoProducedLastRecorded;
|
||||
uint32_t workFifoConsumed;
|
||||
|
||||
// Intra-process sync
|
||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
@@ -584,10 +597,8 @@ struct ncclComm {
|
||||
struct ncclProxyState* proxyState;
|
||||
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
|
||||
// Whether this communicator uses collNet
|
||||
int collNetSupport;
|
||||
bool isOneRPN;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
|
||||
bool intraNodeP2pSupport;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int* collNetDenseToUserRank;
|
||||
@@ -609,7 +620,7 @@ struct ncclComm {
|
||||
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
struct ncclComm* groupNext[ncclGroupTaskTypeNum];
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int localPersistentRefs; // number of persistent plan-lists capturing this comm
|
||||
@@ -631,6 +642,7 @@ struct ncclComm {
|
||||
ncclUserRedOp *userRedOps;
|
||||
|
||||
// Queue of things for the main thread to do
|
||||
int reclaimSteps;
|
||||
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
|
||||
|
||||
hipEvent_t doneEvent;
|
||||
@@ -670,6 +682,9 @@ struct ncclComm {
|
||||
// group job to support multi-thread FT
|
||||
struct ncclGroupJob *groupJob;
|
||||
|
||||
// Flag indicating if this communicator shares resources with parent or children
|
||||
bool shareResources;
|
||||
|
||||
// Tuning plugin
|
||||
int tunerPluginLoaded;
|
||||
ncclTuner_t* tuner;
|
||||
@@ -683,16 +698,25 @@ struct ncclComm {
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
int isAllNvlink;
|
||||
bool isAllDirectP2p;
|
||||
int symmetricSupport;
|
||||
bool useNetPXN;
|
||||
bool useGdr;
|
||||
int splitCount;
|
||||
|
||||
// symmetric buffer
|
||||
uint8_t* baseUCSymPtr;
|
||||
uint8_t* baseMCSymPtr;
|
||||
size_t baseStride;
|
||||
size_t symAllocHead;
|
||||
CUmemGenericAllocationHandle symMCHandle;
|
||||
struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
|
||||
|
||||
// Unroll factor for comm [RCCL]
|
||||
int unroll;
|
||||
|
||||
// custom collective
|
||||
// custom collective [RCCL]
|
||||
bool enableCustColl;
|
||||
|
||||
|
||||
uint64_t endMagic;
|
||||
};
|
||||
|
||||
@@ -724,15 +748,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
|
||||
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
while (true) {
|
||||
struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
|
||||
if (cb == nullptr) break;
|
||||
cudaError_t ok = cudaEventSynchronize(cb->event);
|
||||
if (ok == cudaErrorNotReady) break;
|
||||
cudaError_t ok;
|
||||
if (waitSome) {
|
||||
ok = cudaEventSynchronize(cb->event);
|
||||
waitSome = false;
|
||||
} else {
|
||||
ok = cudaEventQuery(cb->event);
|
||||
if (ok == cudaErrorNotReady) break;
|
||||
}
|
||||
ncclIntruQueueDequeue(&comm->eventCallbackQueue);
|
||||
if (ok == cudaSuccess) {
|
||||
NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
|
||||
|
||||
@@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Formats the CPUs set in `mask` as a comma-separated list of indices and
// inclusive ranges (for example "0-3,8,10-11") into `str`.
//   mask: CPU set to describe.
//   str:  destination buffer.
//   len:  capacity of `str` in bytes, including the terminating NUL.
// Returns `str`. An empty mask yields an empty string. If the text does not
// fit, snprintf truncates it and formatting stops. With len == 0 nothing is
// written at all.
static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
  // A zero-sized buffer cannot even hold the terminator; without this guard
  // the size arithmetic below (len-1, len-used) would wrap around.
  if (len == 0) return str;
  size_t used = 0;
  int start = -1;
  str[0] = '\0';
  // Iterate through all possible CPU bits plus one extra position so that a
  // range ending at the last CPU is still flushed.
  for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
    int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
    // Start of a new range
    if (isSet && start == -1) {
      start = cpu;
    }
    // End of a range, add comma between ranges
    if (!isSet && start != -1) {
      int written;
      if (cpu-1 == start) {
        written = snprintf(str+used, len-used, "%s%d", used ? "," : "", start);
      } else {
        written = snprintf(str+used, len-used, "%s%d-%d", used ? "," : "", start, cpu-1);
      }
      if (written < 0) {
        // Encoding error: keep what was produced so far, properly terminated.
        str[used] = '\0';
        break;
      }
      used += (size_t)written;
      // Buffer is full (or the last write was truncated): stop here.
      if (used >= len-1) break;
      start = -1;
    }
  }
  return str;
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCALL(cmd) do { \
|
||||
pfn_##cmd; \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
@@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000);
|
||||
#if CUDART_VERSION >= 11080
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060);
|
||||
#endif
|
||||
// cuMem API support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include "nccl_common.h"
|
||||
#include "bitops.h"
|
||||
#include "symmetric.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit_struct.h"
|
||||
#endif
|
||||
@@ -41,6 +42,30 @@ extern const char* funcNames[];
|
||||
#define NCCL_CUDA_ARCH 0
|
||||
#endif
|
||||
|
||||
#ifdef __CUDA_ARCH_SPECIFIC__
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__
|
||||
#elif defined(__CUDA_ARCH_HAS_FEATURE__)
|
||||
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 900
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1000
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1010
|
||||
#elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL)
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 1200
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 0
|
||||
#endif
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_SPECIFIC 0
|
||||
#endif
|
||||
|
||||
#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__
|
||||
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__
|
||||
#else
|
||||
#define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0
|
||||
#endif
|
||||
|
||||
#include "net_device.h"
|
||||
|
||||
enum ncclDevRedOp_t {
|
||||
@@ -516,6 +541,14 @@ struct alignas(16) ncclDevChannel {
|
||||
uint64_t workCounter;
|
||||
};
|
||||
|
||||
// Number of (counter, timestamp) slots held by one ncclDevProfiler.
#define MAX_PROFILER_EVENTS_PER_CHANNEL 64
|
||||
// Fixed-size buffer of profiler records. ncclDevComm::workStarted and
// ncclDevComm::workCompleted point to arrays of these, annotated as
// [MAXCHANNELS], i.e. one buffer per channel.
// NOTE(review): how a slot index is derived from an event is not visible
// here — confirm against the kernel-side writer.
struct ncclDevProfiler {
|
||||
struct {
|
||||
// Event counter value recorded in this slot.
uint64_t counter;
|
||||
// Time value recorded in this slot (clock source/units not shown here).
uint64_t timestamp;
|
||||
} data[MAX_PROFILER_EVENTS_PER_CHANNEL];
|
||||
};
|
||||
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
@@ -526,9 +559,6 @@ struct ncclDevComm {
|
||||
int isAllNvlink;
|
||||
int p2pnChannelsPerPeer;
|
||||
|
||||
// Work fifo return credits
|
||||
uint32_t* workConsumed/*[MAXCHANNELS]*/;
|
||||
|
||||
int* collNetDenseToUserRank;
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
@@ -540,8 +570,8 @@ struct ncclDevComm {
|
||||
int* rankToLocalRank;
|
||||
|
||||
// Profiler counters
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
@@ -641,7 +671,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int
|
||||
|
||||
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
// Our collective unroll should move to the same bytes&insns model as NVLS.
|
||||
return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
|
||||
return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4;
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
|
||||
@@ -672,7 +702,6 @@ extern int const ncclDevKernelCount;
|
||||
extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
|
||||
|
||||
// Table of most specialized kernel function to run given func index.
|
||||
extern int const ncclDevFuncIdCount;
|
||||
extern int const ncclDevFuncRowToId[];
|
||||
extern void* const ncclDevKernelForFunc[/*funcIndex*/];
|
||||
extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
|
||||
|
||||
@@ -51,6 +51,8 @@ int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
|
||||
|
||||
ncclResult_t ncclGetUserP2pLevel(int* level);
|
||||
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
@@ -81,7 +83,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
|
||||
ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
|
||||
|
||||
// Allows for up to 32 NICs per node on GB200-NVL72
|
||||
#define NCCL_TOPO_MAX_NODES 64
|
||||
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType);
|
||||
|
||||
// Init search. Needs to be done before calling ncclTopoCompute
|
||||
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
|
||||
+27
-41
@@ -10,9 +10,11 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
#include "allocator.h"
|
||||
#include "register.h"
|
||||
|
||||
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm, int type);
|
||||
void ncclGroupCommPreconnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
|
||||
ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
|
||||
@@ -53,13 +55,14 @@ ncclResult_t ncclAsyncLaunch(
|
||||
|
||||
struct ncclGroupJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm **groupCommHeadPtr;
|
||||
struct ncclComm **groupCommPreconnectHeadPtr;
|
||||
ncclResult_t *groupErrorPtr;
|
||||
bool *abortFlagPtr;
|
||||
int *groupBlockingPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
|
||||
bool initialized;
|
||||
int groupRefCount;
|
||||
bool nonBlockingInit;
|
||||
bool joined;
|
||||
struct ncclComm *groupCommHead[ncclGroupTaskTypeNum];
|
||||
struct ncclComm *groupCommPreconnectHead;
|
||||
ncclResult_t groupError;
|
||||
bool abortFlag;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncJobs;
|
||||
};
|
||||
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
@@ -70,27 +73,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
|
||||
|
||||
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
||||
extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum];
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
extern __thread int ncclGroupBlocking;
|
||||
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
|
||||
extern __thread struct ncclGroupJob ncclGroupJobMain;
|
||||
|
||||
static inline void groupResetJobState() {
|
||||
ncclGroupBlocking = -1;
|
||||
ncclGroupJobMainPtr = NULL;
|
||||
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
|
||||
return;
|
||||
}
|
||||
|
||||
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (job) {
|
||||
ret = ncclAsyncJobComplete(&job->base);
|
||||
groupResetJobState();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupDepth > 0) {
|
||||
@@ -100,31 +85,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
|
||||
}
|
||||
|
||||
// Add comm to this thread's group
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm, int type) {
|
||||
if (comm->groupNext[type] == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
|
||||
// the users program order yet insures siblings occur consecutively. This
|
||||
// is required by doLaunches() in "group.cc".
|
||||
struct ncclComm** pp = &ncclGroupCommHead;
|
||||
struct ncclComm** pp = &ncclGroupCommHead[type];
|
||||
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
||||
pp = &(*pp)->groupNext;
|
||||
pp = &(*pp)->groupNext[type];
|
||||
|
||||
// didn't find its clique, we need to insert it with ascending order based on commHash
|
||||
if (*pp == nullptr) {
|
||||
pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
|
||||
pp = &ncclGroupCommHead[type];
|
||||
while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type];
|
||||
}
|
||||
comm->groupNext = *pp;
|
||||
comm->groupNext[type] = *pp;
|
||||
*pp = comm;
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
// Initialize planner
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
if (type == ncclGroupTaskTypeCollective) {
|
||||
// Initialize planner
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
ncclGroupBlocking = comm->config.blocking;
|
||||
}
|
||||
|
||||
@@ -137,8 +123,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
||||
}
|
||||
|
||||
// Comm has left group
|
||||
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) {
|
||||
comm->groupNext[type] = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
#ifndef NCCL_MLX5DV_CORE_H_
|
||||
#define NCCL_MLX5DV_CORE_H_
|
||||
|
||||
/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without
|
||||
* explicit including of MLX5 direct verbs header.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include "ibvwrap.h"
|
||||
|
||||
// Local copy of an MLX5 direct verbs enum so the library can be built without
// the MLX5 direct verbs header (this header is what gets included when
// NCCL_BUILD_MLX5DV is not defined).
// NOTE(review): presumably these bits are passed as the `mlx5_access`
// argument of mlx5dv_internal_reg_dmabuf_mr — confirm.
enum mlx5dv_reg_dmabuf_access {
|
||||
MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0),
|
||||
};
|
||||
|
||||
#endif // NCCL_MLX5DV_CORE_H_
|
||||
@@ -0,0 +1,23 @@
|
||||
#ifndef NCCL_MLX5DV_SYMBOLS_H_
|
||||
#define NCCL_MLX5DV_SYMBOLS_H_
|
||||
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#else
|
||||
#include "mlx5/mlx5dvcore.h"
|
||||
#endif
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
/* MLX5 direct verbs function pointer table.
 * One entry per mlx5dv function wrapped by NCCL; the table is populated by
 * buildMlx5dvSymbols().
 */
struct ncclMlx5dvSymbols {
  /* Device capability query */
  bool (*mlx5dv_internal_is_supported)(struct ibv_device *device);

  /* Writes a sysfs path into buf (at most buf_len bytes are available) */
  int (*mlx5dv_internal_get_data_direct_sysfs_path)(
      struct ibv_context *context, char *buf, size_t buf_len);

  /* DMA-BUF support */
  struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(
      struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova,
      int fd, int access, int mlx5_access);
};
|
||||
|
||||
/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */
|
||||
ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols);
|
||||
|
||||
#endif // NCCL_MLX5DV_SYMBOLS_H_
|
||||
@@ -0,0 +1,41 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
|
||||
* Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
|
||||
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_MLX5DVWRAP_H_
|
||||
#define NCCL_MLX5DVWRAP_H_
|
||||
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#else
|
||||
#include "mlx5/mlx5dvcore.h"
|
||||
#endif
|
||||
|
||||
#include "core.h"
|
||||
#include "ibvwrap.h"
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
// Return codes used by the mlx5dv wrapper layer.
typedef enum mlx5dv_return_enum {
  MLX5DV_SUCCESS = 0,  //!< The operation was successful
} mlx5dv_return_t;
|
||||
|
||||
ncclResult_t wrap_mlx5dv_symbols(void);
|
||||
/* NCCL wrappers of MLX5 direct verbs functions */
|
||||
bool wrap_mlx5dv_is_supported(struct ibv_device *device);
|
||||
ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
|
||||
struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
|
||||
|
||||
#endif // NCCL_MLX5DVWRAP_H_
|
||||
@@ -7,6 +7,9 @@
|
||||
#ifndef NCCL_DEBUG_H_
|
||||
#define NCCL_DEBUG_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include "nccl.h"
|
||||
|
||||
typedef enum {
|
||||
NCCL_LOG_NONE = 0,
|
||||
NCCL_LOG_VERSION = 1,
|
||||
@@ -39,6 +42,16 @@ typedef enum {
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
// NCCL core profiler callback for network defined events instrumentation
|
||||
// Network-defined profiler event actions.
// NOTE(review): presumably these are the values carried by the `type`
// argument of ncclProfilerCallback_t — confirm against the callers.
enum {
  ncclProfilerNetEventStart         = 0,
  ncclProfilerNetEventStop          = 1,
  ncclProfilerNetEventUpdate        = 2,
  ncclProfilerNetEventUpdateAndStop = 3,
};
|
||||
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
|
||||
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
@@ -54,7 +67,7 @@ typedef enum {
|
||||
ncclNumFuncs = 10
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNetFinalize(struct ncclComm* comm);
|
||||
|
||||
|
||||
@@ -37,10 +37,11 @@
|
||||
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommSplit 18
|
||||
#define NVTX_SID_CommFinalize 19
|
||||
#define NVTX_SID_CommShrink 20
|
||||
// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!
|
||||
|
||||
// Define static schema ID for the reduction operation.
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 21 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
|
||||
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||
|
||||
|
||||
@@ -70,6 +70,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons
|
||||
)
|
||||
)
|
||||
|
||||
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
|
||||
(int, nranks, TYPE_INT, nccl_nvtxNranksStr),
|
||||
(int, myrank, TYPE_INT, nccl_nvtxRankStr),
|
||||
(int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
|
||||
(int, num_exclude, TYPE_INT, "num_exclude")
|
||||
)
|
||||
)
|
||||
|
||||
NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
|
||||
NCCL_NVTX_PAYLOAD_ENTRIES(
|
||||
(uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
|
||||
|
||||
@@ -29,10 +29,9 @@
|
||||
#define NCCL_NET_MAX_REQUESTS 32
|
||||
|
||||
// Max number of ncclNet objects which can live in the same process
|
||||
#define NCCL_NET_MAX_PLUGINS 3
|
||||
|
||||
// NCCL core profiler callback for network defined events instrumentation
|
||||
typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
|
||||
#ifndef NCCL_NET_MAX_PLUGINS
|
||||
#define NCCL_NET_MAX_PLUGINS 16
|
||||
#endif
|
||||
|
||||
#include "net/net_v10.h"
|
||||
#include "net/net_v9.h"
|
||||
|
||||
@@ -19,43 +19,53 @@ enum {
|
||||
};
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
ncclProfilerProxyOpSendPosted = 0, // deprecated in v4
|
||||
ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4
|
||||
ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4
|
||||
ncclProfilerProxyOpSendDone = 3, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4
|
||||
ncclProfilerProxyOpRecvDone = 7, // deprecated in v4
|
||||
ncclProfilerProxyOpInProgress_v4 = 19,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
ncclProfilerProxyStepSendGPUWait = 8,
|
||||
ncclProfilerProxyStepSendPeerWait_v4 = 20,
|
||||
ncclProfilerProxyStepSendWait = 9,
|
||||
ncclProfilerProxyStepRecvWait = 10,
|
||||
ncclProfilerProxyStepRecvFlushWait = 11,
|
||||
ncclProfilerProxyStepRecvGPUWait = 12,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
ncclProfilerProxyCtrlIdle = 13,
|
||||
ncclProfilerProxyCtrlActive = 14,
|
||||
ncclProfilerProxyCtrlSleep = 15,
|
||||
ncclProfilerProxyCtrlWakeup = 16,
|
||||
ncclProfilerProxyCtrlAppend = 17,
|
||||
ncclProfilerProxyCtrlAppendEnd = 18,
|
||||
|
||||
/* Network defined event states */
|
||||
ncclProfilerNetPluginUpdate = 21,
|
||||
|
||||
/* Kernel event states */
|
||||
ncclProfilerKernelChStop = 22,
|
||||
} ncclProfilerEventState_t;
|
||||
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
|
||||
typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;
|
||||
|
||||
#include <cstdint>
|
||||
#include "profiler/profiler_v4.h"
|
||||
#include "profiler/profiler_v3.h"
|
||||
#include "profiler/profiler_v2.h"
|
||||
#include "profiler/profiler_v1.h"
|
||||
|
||||
typedef ncclProfiler_v3_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v4_t ncclProfiler_t;
|
||||
typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;
|
||||
|
||||
#define NCCL_PROFILER_NET_VER_BITS (16)
|
||||
#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS)
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PROFILER_V4_H_
|
||||
#define PROFILER_V4_H_
|
||||
|
||||
// Event descriptor handed to the v4 profiler plugin's startEvent() callback.
// The three leading fields are common to every event; the anonymous union
// holds the payload for one specific kind of event.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // Collective operation
    struct {
      uint64_t seqNumber;
      const char* func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      const char* datatype;
      uint8_t nChannels;
      uint8_t nWarps;
      const char* algo;
      const char* proto;
    } coll;

    // Point-to-point operation
    struct {
      const char* func;
      void* buff;
      const char* datatype;
      size_t count;
      int peer;
      uint8_t nChannels;
    } p2p;

    // Proxy operation
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // Proxy step
    struct {
      int step;
    } proxyStep;

    // Kernel channel
    struct {
      uint8_t channelId;
      uint64_t pTimer;    // start timestamp from GPU globaltimer
    } kernelCh;

    // Network plugin defined event
    struct {
      int64_t id;
      void* data;
    } netPlugin;
  };
} ncclProfilerEventDescr_v4_t;
|
||||
|
||||
// Optional per-event arguments passed to the v4 recordEventState() callback.
// Each member carries the attribute update for one kind of event.
typedef union {
  struct { size_t transSize; }       proxyStep;
  struct { int appendedProxyOps; }   proxyCtrl;
  struct { void* data; }             netPlugin;
  struct { uint64_t pTimer; }        kernelCh;
} ncclProfilerEventStateArgs_v4_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// - commName : user assigned communicator name
|
||||
// - commHash : communicator id
|
||||
// - nNodes : number of nodes in communicator
|
||||
// - nranks : number of ranks in communicator
|
||||
// - rank : rank identifier in communicator
|
||||
// - logfn : logger function
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v4_t;
|
||||
|
||||
#endif
|
||||
@@ -21,8 +21,8 @@ struct ncclProxyConnector;
|
||||
|
||||
struct ncclProfilerProxy {
|
||||
bool initialized;
|
||||
uint64_t* workStarted/*[MAXCHANNELS]*/;
|
||||
uint64_t* workCompleted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
|
||||
struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
|
||||
uint64_t workCounter[MAXCHANNELS]; // host work counter
|
||||
struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
|
||||
struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
|
||||
@@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
|
||||
|
||||
// Proxy Op Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
|
||||
// Proxy Step Start/Stop Event Wrappers
|
||||
@@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
|
||||
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
|
||||
|
||||
// Kernel Channel Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start);
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop);
|
||||
|
||||
// Record Event Wrappers
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
|
||||
|
||||
|
||||
@@ -118,6 +118,13 @@ struct ncclProxyOp {
|
||||
facebook_rccl::ProxyTraceExtraInfo traceInfo;
|
||||
};
|
||||
|
||||
struct ncclProxySubArgs;
|
||||
|
||||
// Pairs a profiler step-event handle with a pointer to a proxy sub-args
// object. ncclProxySubArgs keeps one of these per step (pHandles[NCCL_STEPS]).
struct ncclProxyEventHandle {
  void* stepEventHandle;               // profiler handle for the step event
  struct ncclProxySubArgs* subArgPtr;  // associated proxy sub-args
};
|
||||
|
||||
struct ncclProxySubArgs {
|
||||
struct ncclProxyConnection* connection;
|
||||
int reg;
|
||||
@@ -150,13 +157,12 @@ struct ncclProxySubArgs {
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
int rank;
|
||||
uint64_t profilerSteps;
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
void* taskEventHandle;
|
||||
void* opEventHandle;
|
||||
void* kernelEventHandle;
|
||||
void* stepEventHandles[NCCL_STEPS];
|
||||
struct ncclProxyEventHandle pHandles[NCCL_STEPS];
|
||||
size_t transSize;
|
||||
uint64_t workCounter;
|
||||
|
||||
@@ -254,6 +260,8 @@ struct ncclProxyPeer {
|
||||
};
|
||||
|
||||
struct ncclSharedNetComms {
|
||||
int activeConnect[MAXCHANNELS];
|
||||
int activeAccept[MAXCHANNELS];
|
||||
void* sendComm[MAXCHANNELS];
|
||||
void* recvComm[MAXCHANNELS];
|
||||
int sendRefCount[MAXCHANNELS];
|
||||
|
||||
@@ -29,18 +29,24 @@ struct ncclRegNetHandles {
|
||||
struct ncclRegNetHandles* next;
|
||||
};
|
||||
|
||||
struct ncclSymRegTask {
|
||||
struct ncclSymRegTask *next;
|
||||
void* buff;
|
||||
size_t baseSize;
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
struct ncclReg* regHandle;
|
||||
size_t alignment;
|
||||
};
|
||||
|
||||
struct ncclReg {
|
||||
// common attributes
|
||||
size_t pages;
|
||||
uintptr_t begAddr, endAddr; // page aligned
|
||||
int localRefs;
|
||||
int graphRefs;
|
||||
uintptr_t addr;
|
||||
uint32_t state;
|
||||
// net reg
|
||||
struct ncclRegNetHandles* netHandleHead;
|
||||
// nvls reg
|
||||
uintptr_t baseAddr;
|
||||
size_t baseSize;
|
||||
CUdeviceptr regAddr;
|
||||
size_t regUCSize, regMCSize;
|
||||
int dev;
|
||||
@@ -52,6 +58,10 @@ struct ncclReg {
|
||||
// general ipc reg
|
||||
struct ncclPeerRegIpcAddr regIpcAddrs;
|
||||
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
|
||||
// symmetric reg
|
||||
void* baseSymPtr;
|
||||
size_t symSize;
|
||||
int winFlags;
|
||||
};
|
||||
|
||||
struct ncclRegCache {
|
||||
@@ -60,10 +70,14 @@ struct ncclRegCache {
|
||||
uintptr_t pageSize;
|
||||
};
|
||||
|
||||
// Thin wrapper holding a pointer to a registration record (struct ncclReg).
struct ncclWindow {
  struct ncclReg* handle;
};
|
||||
|
||||
ncclResult_t ncclRegCleanup(struct ncclComm* comm);
|
||||
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
|
||||
ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
|
||||
ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
|
||||
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
|
||||
ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
#ifndef NCCL_REGISTER_INLINE_H_
|
||||
#define NCCL_REGISTER_INLINE_H_
|
||||
|
||||
#include "comm.h"
|
||||
#include "register.h"
|
||||
|
||||
// Search the communicator's registration cache for an entry that covers
// [data, data + size).
//
// On return *outReg points at the covering entry, or is NULL when there is
// none. The function itself always returns ncclSuccess.
//
// The scan stops at the first slot whose begAddr lies above `data`.
// NOTE(review): this relies on the slots being ordered by ascending begAddr —
// presumably maintained by the registration code; confirm.
static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) {
  struct ncclRegCache* cache = &comm->regCache;
  *outReg = NULL;
  for (int idx = 0; idx != cache->population; idx++) {
    struct ncclReg* candidate = cache->slots[idx];
    // Every remaining slot starts beyond `data`: no match possible.
    if ((uintptr_t)data < candidate->begAddr) break;
    // `data` starts inside this slot; accept it only if the end fits as well.
    if ((uintptr_t)data + size <= candidate->endAddr) {
      *outReg = candidate;
      break;
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
// Look up the registration covering [data, data + size) and, when that
// registration has a symmetric mapping (baseSymPtr != NULL), translate `data`
// into the symmetric address range.
//
// Outputs:
//  - *symPtr: baseSymPtr plus the offset of `data` from the registration's
//             begAddr, or NULL when no symmetric registration covers `data`.
//  - *outReg: the matching registration record, or NULL in the same case.
//
// Returns ncclSuccess, or the error propagated from ncclRegFind via NCCLCHECK.
static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) {
  struct ncclReg* regRecord = NULL;
  *symPtr = NULL;
  *outReg = NULL;
  // Pass the address of regRecord so ncclRegFind can fill it in.
  NCCLCHECK(ncclRegFind(comm, data, size, &regRecord));
  if (regRecord && regRecord->baseSymPtr) {
    *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr);
    *outReg = regRecord;
  }
  return ncclSuccess;
}
|
||||
|
||||
#endif
|
||||
@@ -15,25 +15,35 @@ typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, voi
|
||||
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
|
||||
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
#define CUPFN(symbol) symbol
|
||||
#else
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
#endif
|
||||
|
||||
// Check CUDA PFN driver calls
|
||||
#define CUCHECK(cmd) do { \
|
||||
#define HSACHECK(cmd) do { \
|
||||
hsa_status_t err = pfn_##cmd; \
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
WARN("ROCr failure '%s'", errStr); \
|
||||
WARN("HIP failure '%s'", errStr); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Check CUDA PFN driver calls
|
||||
#define CUCHECK(cmd) do { \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
hsa_status_t err = pfn_##cmd; \
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
WARN("ROCr failure '%s'", errStr); \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
@@ -45,7 +55,7 @@ typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size
|
||||
if( err != HSA_STATUS_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
pfn_hsa_status_string(err, &errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d HIP failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
|
||||
@@ -69,8 +69,10 @@ struct ncclSocket {
|
||||
|
||||
const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
|
||||
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
|
||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
|
||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
|
||||
ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
|
||||
union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found);
|
||||
ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
|
||||
int* nIfs);
|
||||
|
||||
// Initialize a socket
|
||||
ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
#ifndef NCCL_DEVICE_SYMMETRIC_H_
|
||||
#define NCCL_DEVICE_SYMMETRIC_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_common.h"
|
||||
#include "bitops.h"
|
||||
|
||||
constexpr int ncclSymMaxBlocks = 64;
|
||||
constexpr int ncclSymMaxThreads = 512;
|
||||
constexpr int ncclSymLLMaxEltSize = 64;
|
||||
|
||||
// Number of LL slots for elements of `eltSize` bytes:
// ncclSymMaxThreads * ncclSymLLMaxEltSize / eltSize (integer arithmetic,
// multiplication first). With the default eltSize this is ncclSymMaxThreads.
constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) {
  return (ncclSymMaxThreads * ncclSymLLMaxEltSize) / eltSize;
}
|
||||
|
||||
// Size of one LL epoch buffer for `nRanks` ranks: twice (the LL overhead
// factor) the larger of ncclSymMaxThreads*nRanks*8 and
// ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize.
constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) {
  const int rankScaled = ncclSymMaxThreads * nRanks * 8;
  const int slotScaled = ncclSymLLMaxSlots(ncclSymLLMaxEltSize) * ncclSymLLMaxEltSize;
  return /*LL Overhead*/2 * maxval(rankScaled, slotScaled);
}
|
||||
|
||||
struct alignas(16) ncclSymDevBase {
|
||||
uint32_t llEpoch[ncclSymMaxBlocks];
|
||||
uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks];
|
||||
uint32_t barInboxMc[ncclSymMaxBlocks];
|
||||
uint32_t barInboxPerPeer[];
|
||||
|
||||
static constexpr size_t size(int nRanks) {
|
||||
return sizeof(ncclSymDevBase) +
|
||||
alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) +
|
||||
ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks);
|
||||
}
|
||||
};
|
||||
|
||||
// Return the LL buffer for `block` in the epoch selected by the low bit of
// `epoch`. Layout after the header struct: the 16-byte-aligned
// barInboxPerPeer[] region, then two epoch buffers per block.
static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) {
  const int epochBytes = ncclSymLLEpochSize(nRanks);
  // Start right behind the header struct.
  char* cursor = (char*)(base + 1);
  // Step over barInboxPerPeer[].
  cursor += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16);
  // Advance to this block's pair of epoch buffers.
  cursor += block * /*epochs=*/2 * epochBytes;
  // Select the even or odd epoch buffer.
  cursor += (epoch & 1)*epochBytes;
  return (uint4*)cursor;
}
|
||||
|
||||
struct ncclSymDevComm {
|
||||
ncclSymDevBase* base;
|
||||
ncclSymDevBase* baseMc;
|
||||
uint32_t stride4G;
|
||||
int nRanks, rank;
|
||||
uint32_t nRanks_rcp32; // idivRcp32(nRanks)
|
||||
};
|
||||
|
||||
struct alignas(16) ncclSymDevArgs {
|
||||
struct ncclSymDevComm comm;
|
||||
int rootRank;
|
||||
uint64_t redOpArg; // must be collectively uniform
|
||||
size_t nElts;
|
||||
char* input;
|
||||
char* output;
|
||||
};
|
||||
|
||||
// Identifiers of the symmetric kernels, grouped by collective.
// ncclSymKernelId_Count comes last, so it equals the number of kernel ids.
enum ncclSymKernelId {
  // AllReduce
  ncclSymKernelId_AllReduce_AGxLL_R = 0,
  ncclSymKernelId_AllReduce_AGxLLMC_R,
  ncclSymKernelId_AllReduce_RSxLD_AGxST,
  ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC,

  // AllGather
  ncclSymKernelId_AllGather_LL,
  ncclSymKernelId_AllGather_LLMC,
  ncclSymKernelId_AllGather_ST,
  ncclSymKernelId_AllGather_STMC,

  // ReduceScatter
  ncclSymKernelId_ReduceScatter_LL,
  ncclSymKernelId_ReduceScatter_LD,
  ncclSymKernelId_ReduceScatter_LDMC,

  ncclSymKernelId_Count
};
|
||||
|
||||
bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
|
||||
|
||||
ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps);
|
||||
|
||||
// Generated by src/device/symmetric/generate.py
|
||||
extern int const ncclSymKernelCount;
|
||||
extern void* const ncclSymKernelList[];
|
||||
void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
|
||||
const char* ncclSymKernelIdToString(int kernelId);
|
||||
|
||||
#endif
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include "proxy.h"
|
||||
#include "comm.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -37,7 +38,15 @@ struct ncclConnector;
|
||||
struct ncclComm;
|
||||
|
||||
// Offset applied for a connection index: nranks * NCCL_CONN_IDX_P2P_NET when
// connIndex is NCCL_CONN_IDX_P2P_NET, otherwise 0.
// Both arguments are parenthesized so that expression arguments (e.g. `a + b`)
// are evaluated as a unit rather than being split by operator precedence.
#define CHANNEL_MASK_OFFSET(nranks, connIndex) ((nranks) * ((connIndex) == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0))
|
||||
|
||||
#define CONNECT_SIZE 256
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#define NCCL_MAX_PAGE_SIZE (512L * 1024L)
|
||||
#define NCCL_REC_PAGE_SIZE (4L * 1024L)
|
||||
#else
|
||||
#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L)
|
||||
#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L)
|
||||
#endif
|
||||
struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
@@ -65,6 +74,7 @@ struct ncclNvlsSharedRes {
|
||||
char* ucBuff; // Unicast NVLS buffer address
|
||||
char* ucCredit; // Unicast NVLS credit address
|
||||
int nChannels;
|
||||
int nHeads;
|
||||
struct ncclShmemCollBuff nvlsShmem;
|
||||
void *nvlsShmemHandle;
|
||||
};
|
||||
@@ -104,7 +114,8 @@ struct ncclTransport {
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, bool* needsProxy=NULL);
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode);
|
||||
ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p);
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
@@ -139,5 +150,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz
|
||||
ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
|
||||
ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
|
||||
ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
|
||||
ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels);
|
||||
|
||||
ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr);
|
||||
ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr);
|
||||
ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr);
|
||||
ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr);
|
||||
ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -44,6 +44,12 @@ static long log2i(long n) {
|
||||
return log2Down(n);
|
||||
}
|
||||
|
||||
// Three-way integer comparator with the signature expected by qsort/bsearch.
// Both arguments point at an int. Returns -1, 0 or 1 when the first value is
// less than, equal to, or greater than the second. Uses comparisons rather
// than subtraction, so it cannot overflow.
static int compareInts(const void *a, const void *b) {
  const int lhs = *static_cast<const int*>(a);
  const int rhs = *static_cast<const int*>(b);
  if (lhs < rhs) return -1;
  if (lhs > rhs) return 1;
  return 0;
}
|
||||
|
||||
inline uint64_t clockNano() {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
|
||||
+259
-211
@@ -25,6 +25,7 @@
|
||||
#endif
|
||||
#include "tuner.h"
|
||||
#include "ras.h"
|
||||
#include "profiler.h"
|
||||
#include "mnnvl.h"
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
@@ -41,6 +42,7 @@
|
||||
#include "archinfo.h"
|
||||
#include "param.h"
|
||||
#include "nvtx_payload_schemas.h"
|
||||
#include "utils.h"
|
||||
|
||||
// [RCCL]
|
||||
#include "git_version.h"
|
||||
@@ -86,6 +88,10 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
|
||||
NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1);
|
||||
NCCL_PARAM(WinEnable, "WIN_ENABLE", 1);
|
||||
NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT);
|
||||
|
||||
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
|
||||
static ncclResult_t commReclaim(ncclComm_t comm);
|
||||
@@ -372,6 +378,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
if (comm->symmetricSupport && comm->symDevComm.base) {
|
||||
NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride));
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclRasCommFini(comm));
|
||||
|
||||
/* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will
|
||||
@@ -494,16 +504,17 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
|
||||
NCCLCHECK(ncclRegCleanup(comm));
|
||||
|
||||
if (comm->symmetricSupport) {
|
||||
NCCLCHECK(ncclNvlsSymmetricFinalize(comm));
|
||||
NCCLCHECK(ncclIpcSymmetricFinalize(comm));
|
||||
}
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
|
||||
|
||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||
NCCLCHECK(ncclProfilerPluginFinalize(comm));
|
||||
NCCLCHECK(ncclNetFinalize(comm));
|
||||
NCCLCHECK(ncclNetPluginUnload(comm));
|
||||
|
||||
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
|
||||
//ncclCudaContextDrop(comm->context);
|
||||
|
||||
free(comm);
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -580,12 +591,10 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->rank = rank;
|
||||
comm->nRanks = ndev;
|
||||
|
||||
NCCLCHECK(ncclNetPluginLoad(comm));
|
||||
NCCLCHECK(ncclNetInit(comm));
|
||||
NCCLCHECK(ncclProfilerPluginInit(comm));
|
||||
INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
|
||||
|
||||
if (parent && parent->config.splitShare) {
|
||||
if (parent && parent->shareResources) {
|
||||
if (parent->ncclNet != comm->ncclNet) {
|
||||
WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name);
|
||||
return ncclInvalidUsage;
|
||||
@@ -641,13 +650,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
#endif
|
||||
}
|
||||
|
||||
comm->collNetSupport = 0;
|
||||
memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix));
|
||||
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan);
|
||||
ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp);
|
||||
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i = 0; i < ncclGroupTaskTypeNum; i++) {
|
||||
comm->groupNext[i] = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
}
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
@@ -658,7 +668,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1;
|
||||
|
||||
if (parent == NULL || !parent->config.splitShare) {
|
||||
if (parent == NULL || !parent->shareResources) {
|
||||
struct ncclSharedResources* sharedRes = NULL;
|
||||
NCCLCHECK(ncclCalloc(&sharedRes, 1));
|
||||
/* most of attributes are assigned later in initTransportsRank(). */
|
||||
@@ -713,6 +723,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
bool ccEnable = false;
|
||||
cudaStream_t deviceStream;
|
||||
|
||||
memset(&tmpCommAndChans, '\0', sizeof(tmpCommAndChans));
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, devCommAndChans);
|
||||
@@ -741,22 +752,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
if (ccEnable) {
|
||||
comm->workFifoBytes = 0;
|
||||
} else {
|
||||
int64_t workFifoBytesParam = ncclParamWorkFifoBytes();
|
||||
if (workFifoBytesParam == -1) {
|
||||
if (comm->MNNVL && (comm->compCap >= 100)) {
|
||||
// WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL
|
||||
INFO(NCCL_INIT, "Disabling work fifo");
|
||||
comm->workFifoBytes = 0;
|
||||
} else {
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
}
|
||||
} else {
|
||||
if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) {
|
||||
WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam);
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
}
|
||||
comm->workFifoBytes = std::min<uint64_t>(workFifoBytesParam, 1ul<<30);
|
||||
comm->workFifoBytes = ncclParamWorkFifoBytes();
|
||||
if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) {
|
||||
WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes);
|
||||
comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT;
|
||||
}
|
||||
comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30);
|
||||
}
|
||||
#else
|
||||
comm->workFifoBytes = ncclParamWorkFifoBytes();
|
||||
@@ -783,11 +784,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
comm->workFifoBufDev = comm->workFifoBuf;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail);
|
||||
ncclCommPushCudaHostFree(comm, comm->workFifoConsumed);
|
||||
comm->workFifoProduced = 0;
|
||||
comm->workFifoConsumedLeast = 0;
|
||||
tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed;
|
||||
comm->workFifoProducedLastRecorded = 0;
|
||||
comm->workFifoConsumed = 0;
|
||||
|
||||
// Alloc profiler counters for the kernel
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail);
|
||||
@@ -892,6 +891,7 @@ NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1);
|
||||
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
|
||||
|
||||
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
|
||||
cudaDeviceProp prop;
|
||||
info->rank = comm->rank;
|
||||
info->cudaDev = comm->cudaDev;
|
||||
info->nvmlDev = comm->nvmlDev;
|
||||
@@ -899,6 +899,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
info->hostHash=getHostHash()+commHash;
|
||||
info->pidHash=getPidHash()+commHash;
|
||||
info->cuMemSupport = ncclCuMemEnable();
|
||||
CUDACHECK(cudaGetDeviceProperties(&prop, comm->cudaDev));
|
||||
info->totalGlobalMem = ROUNDUP(prop.totalGlobalMem, (1L << 32));
|
||||
|
||||
// Get the device MAJOR:MINOR of /dev/shm so we can use that
|
||||
// information to decide whether we can use SHM for inter-process
|
||||
@@ -1068,6 +1070,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
struct ncclTopoRanks topoRanks;
|
||||
int cpuArch;
|
||||
int cpuVendor;
|
||||
int localRanks;
|
||||
int nc;
|
||||
bool pivotA2AEnabled;
|
||||
bool ll128Enabled;
|
||||
@@ -1083,6 +1086,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int* pxnPeers = NULL;
|
||||
int *topParentLocalRanks = NULL;
|
||||
int p2pLevel = -1;
|
||||
|
||||
bool needsProxy = false;
|
||||
bool mscclNeedsProxy = needsProxy;
|
||||
@@ -1092,6 +1096,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
|
||||
NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
|
||||
__atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE);
|
||||
|
||||
comm->cuMemSupport = 1;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
@@ -1114,7 +1119,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER];
|
||||
|
||||
// Check for MNNVL support
|
||||
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) {
|
||||
NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail);
|
||||
if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) {
|
||||
NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail);
|
||||
}
|
||||
|
||||
@@ -1225,14 +1231,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
|
||||
// Determine local CollNet support
|
||||
if (collNetSupport(comm)) {
|
||||
const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
|
||||
if (collNetEnable != NULL) {
|
||||
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
|
||||
if (strcmp(collNetEnable, "1") == 0) {
|
||||
comm->collNetSupport = 1;
|
||||
}
|
||||
}
|
||||
if (!collNetSupport(comm)) {
|
||||
comm->config.collnetEnable = 0;
|
||||
}
|
||||
|
||||
// Determine local Nvls support
|
||||
@@ -1290,7 +1290,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
collNetDirectGraph->collNet = 1;
|
||||
collNetDirectGraph->minChannels = 1;
|
||||
collNetDirectGraph->maxChannels = MAXCHANNELS;
|
||||
if (comm->collNetSupport) {
|
||||
if (comm->config.collnetEnable) {
|
||||
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail);
|
||||
@@ -1523,7 +1523,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern);
|
||||
}
|
||||
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0;
|
||||
if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0;
|
||||
if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0;
|
||||
|
||||
comm->nChannels = treeGraph->nChannels = ringGraph->nChannels =
|
||||
@@ -1536,11 +1536,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
|
||||
// Determine CollNet support after all-gather now that we know nNodes and each node localRanks
|
||||
if (comm->collNetSupport == 1) {
|
||||
if (comm->config.collnetEnable == 1) {
|
||||
int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
|
||||
if (comm->nNodes < collNetNodeThreshold) {
|
||||
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
|
||||
comm->collNetSupport = 0;
|
||||
comm->config.collnetEnable = 0;
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink));
|
||||
@@ -1590,9 +1590,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
comm->topParentLocalRanks = topParentLocalRanks;
|
||||
|
||||
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail);
|
||||
// Profiler plugin context has to be initialized before proxy thread
|
||||
NCCLCHECK(ncclProfilerPluginInit(comm));
|
||||
|
||||
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
|
||||
// Launch proxy service thread, after this, the proxy calls can be used.
|
||||
if (parent && parent->config.splitShare) {
|
||||
if (parent && parent->shareResources) {
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount);
|
||||
} else {
|
||||
@@ -1662,10 +1665,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
|
||||
}
|
||||
// Setup NVLS
|
||||
// Attempt to setup NVLS, may silently fail and disable NVLS
|
||||
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
|
||||
// Check if we can setup CollNet
|
||||
if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs);
|
||||
if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs);
|
||||
} else {
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
|
||||
@@ -1689,7 +1692,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Connect PAT only for communicators with 1 GPU per node
|
||||
if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
|
||||
|
||||
// Setup NVLS
|
||||
// Attempt to setup NVLS, may silently fail and disable NVLS
|
||||
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
|
||||
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
|
||||
|
||||
@@ -1697,7 +1700,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
|
||||
|
||||
// Check if we can setup CollNet
|
||||
if (comm->collNetSupport > 0) {
|
||||
if (comm->config.collnetEnable) {
|
||||
ncclCollNetSetup(comm, parent, graphs);
|
||||
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
|
||||
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
|
||||
@@ -1770,9 +1773,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
}
|
||||
|
||||
comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
|
||||
comm->baseStride = 0;
|
||||
|
||||
// Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
|
||||
// launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
|
||||
NCCLCHECKGOTO(devCommSetup(comm), ret, fail);
|
||||
|
||||
timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT];
|
||||
|
||||
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) {
|
||||
@@ -1793,7 +1800,7 @@ exit:
|
||||
/* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can
|
||||
* attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be
|
||||
* properly cleaned up. */
|
||||
if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
|
||||
if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm);
|
||||
free(allTopoRanks);
|
||||
free(nodesTreePatterns);
|
||||
free(nodesFirstRank);
|
||||
@@ -1832,6 +1839,9 @@ struct ncclCommInitRankAsyncJob {
|
||||
struct ncclComm* parent;
|
||||
int color, key;
|
||||
int splitCount;
|
||||
// For Shrink
|
||||
int* excludeRanksList;
|
||||
int excludeRanksCount;
|
||||
// name of the function calling
|
||||
char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
|
||||
};
|
||||
@@ -1842,6 +1852,7 @@ struct ncclCommFinalizeAsyncJob {
|
||||
};
|
||||
|
||||
NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(CommShrinkShareResources, "COMM_SHRINK_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
|
||||
|
||||
typedef struct{
|
||||
int key;
|
||||
@@ -1889,6 +1900,21 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t getParentRanks(int parentRanks, int parentRank, int* excludeRanksList, int excludeRanksCount, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
|
||||
int count = 0, j = 0;
|
||||
for (int i = 0; i < parentRanks; i++) {
|
||||
// we assume excludeRanksList is sorted
|
||||
if (j < excludeRanksCount && excludeRanksList[j] == i) {
|
||||
j++;
|
||||
continue;
|
||||
}
|
||||
if (i == parentRank) *myRankRet = count;
|
||||
parentRanksRet[count++] = i;
|
||||
}
|
||||
*nRanksRet = parentRanks - excludeRanksCount;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclComm_t comm = job->comm;
|
||||
@@ -1940,9 +1966,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
|
||||
if (job->parent) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
|
||||
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
|
||||
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
|
||||
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
|
||||
if (job->excludeRanksCount) {
|
||||
NCCLCHECKGOTO(getParentRanks(job->parent->nRanks, job->parent->rank, job->excludeRanksList, job->excludeRanksCount, &job->nranks, &job->myrank, parentRanks), res, fail);
|
||||
} else {
|
||||
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
|
||||
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
|
||||
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
|
||||
}
|
||||
timers[TIMER_INIT_ALLOC] = clockNano();
|
||||
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
|
||||
@@ -2097,6 +2127,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
int minCTAsEnv;
|
||||
int maxCTAsEnv;
|
||||
int splitShareEnv;
|
||||
int collnetEnableEnv;
|
||||
int ctaPolicyEnv;
|
||||
int shrinkShareEnv;
|
||||
int nvlsCTAsEnv;
|
||||
|
||||
/* override configuration from env variable. */
|
||||
blockingEnv = ncclParamCommBlocking();
|
||||
@@ -2142,6 +2176,25 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.splitShare = splitShareEnv;
|
||||
}
|
||||
shrinkShareEnv = ncclParamCommShrinkShareResources();
|
||||
if (shrinkShareEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.shrinkShare = shrinkShareEnv;
|
||||
}
|
||||
|
||||
collnetEnableEnv = ncclParamCollnetEnable();
|
||||
if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.collnetEnable = collnetEnableEnv;
|
||||
}
|
||||
|
||||
ctaPolicyEnv = ncclParamCtaPolicy();
|
||||
if (ctaPolicyEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.CTAPolicy = ctaPolicyEnv;
|
||||
}
|
||||
|
||||
nvlsCTAsEnv = ncclParamNvlsChannels();
|
||||
if (nvlsCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
|
||||
comm->config.nvlsCTAs = nvlsCTAsEnv;
|
||||
}
|
||||
|
||||
/* cap channels if needed */
|
||||
if (comm->config.minCTAs > MAXCHANNELS) {
|
||||
@@ -2164,6 +2217,20 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
comm->config.splitShare = 0;
|
||||
}
|
||||
|
||||
if (comm->config.collnetEnable != 1 && comm->config.collnetEnable != 0) {
|
||||
INFO(NCCL_ENV, "collnetEnable %d is not a valid value 0/1, set it to 0", comm->config.collnetEnable);
|
||||
comm->config.collnetEnable = 0;
|
||||
}
|
||||
|
||||
if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) {
|
||||
INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT);
|
||||
comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT;
|
||||
}
|
||||
|
||||
if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT && comm->config.nvlsCTAs <= 0) {
|
||||
INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs);
|
||||
comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -2204,6 +2271,17 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
internalConfigPtr->maxCTAs = defaultConfig.maxCTAs;
|
||||
internalConfigPtr->netName = defaultConfig.netName;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->version < NCCL_VERSION(2, 25, 0)) {
|
||||
internalConfigPtr->trafficClass = defaultConfig.trafficClass;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->version < NCCL_VERSION(2, 27, 0)) {
|
||||
internalConfigPtr->collnetEnable = defaultConfig.collnetEnable;
|
||||
internalConfigPtr->CTAPolicy = defaultConfig.CTAPolicy;
|
||||
internalConfigPtr->shrinkShare = defaultConfig.shrinkShare;
|
||||
internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs;
|
||||
}
|
||||
}
|
||||
|
||||
/* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */
|
||||
@@ -2235,6 +2313,31 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->collnetEnable != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->collnetEnable < 0 || internalConfigPtr->collnetEnable > 1)) {
|
||||
WARN("Invalid config collnetEnable attribute value %d", internalConfigPtr->collnetEnable);
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT ||
|
||||
internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) {
|
||||
WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy);
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->shrinkShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->shrinkShare != 0 && internalConfigPtr->shrinkShare != 1) {
|
||||
WARN("Invalid config shrinkShare attribute value %d", internalConfigPtr->shrinkShare);
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (internalConfigPtr->nvlsCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlsCTAs <= 0) {
|
||||
WARN("Invalid config nvlsCTAs attribute value %d", internalConfigPtr->nvlsCTAs);
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* default config value can be tuned on different platform. */
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d");
|
||||
@@ -2243,6 +2346,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, commName, NCCL_CONFIG_UNDEF_PTR, NULL, "Comm name", "%s");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, collnetEnable, NCCL_CONFIG_UNDEF_INT, 0, "Collnet enable", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d");
|
||||
NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d");
|
||||
|
||||
/* assign config to communicator */
|
||||
comm->config.blocking = internalConfigPtr->blocking;
|
||||
@@ -2252,7 +2360,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) {
|
||||
comm->config.netName = internalConfigPtr->netName;
|
||||
comm->config.splitShare = internalConfigPtr->splitShare;
|
||||
comm->config.trafficClass = internalConfigPtr->trafficClass;
|
||||
|
||||
comm->config.commName = internalConfigPtr->commName;
|
||||
comm->config.collnetEnable = internalConfigPtr->collnetEnable;
|
||||
comm->config.CTAPolicy = internalConfigPtr->CTAPolicy;
|
||||
comm->config.shrinkShare = internalConfigPtr->shrinkShare;
|
||||
comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs;
|
||||
NCCLCHECKGOTO(envConfigOverride(comm), ret, fail);
|
||||
|
||||
exit:
|
||||
@@ -2536,7 +2648,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
|
||||
WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, true), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
|
||||
// And keep polling until all graphs referencing us die.
|
||||
while (comm->localPersistentRefs != 0) {
|
||||
@@ -2728,7 +2840,6 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
|
||||
NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
WARN("comm %p has already been destroyed", comm);
|
||||
@@ -2743,13 +2854,22 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(res);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
return res;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t setCommAbortFlags(ncclComm_t comm, int value) {
|
||||
// Set abort flags
|
||||
if (comm->childAbortFlag != nullptr) {
|
||||
__atomic_store_n(comm->childAbortFlag, value, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(comm->childAbortFlagDev, value, __ATOMIC_RELEASE);
|
||||
}
|
||||
__atomic_store_n(comm->abortFlag, value, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(comm->abortFlagDev, value, __ATOMIC_RELEASE);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
NCCLCHECK(Recorder::instance().record(rrCommAbort, comm));
|
||||
@@ -2758,14 +2878,8 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
if (comm == NULL) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
// Ask anything that might still be running on the device to quit
|
||||
if (comm->childAbortFlag != nullptr) {
|
||||
__atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE);
|
||||
}
|
||||
__atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE);
|
||||
NCCLCHECK(setCommAbortFlags(comm,1));
|
||||
comm->destroyFlag = 1;
|
||||
/* init thread must be joined before we destroy the comm,
|
||||
* and we should ignore the init error here. */
|
||||
@@ -2786,38 +2900,51 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(res);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
|
||||
static void childCommCleanupJob(void* job) {
|
||||
struct ncclCommInitRankAsyncJob* initJob = (struct ncclCommInitRankAsyncJob*)job;
|
||||
if (initJob->excludeRanksList) free(initJob->excludeRanksList);
|
||||
free(job);
|
||||
}
|
||||
|
||||
// initializing a child communicator (for both split and shrink)
|
||||
static ncclResult_t ncclCommInitChildComm(ncclComm_t comm, ncclComm_t* newcomm, bool isShrink, int flags, int color, int key, int* excludeRanksList, int excludeRanksCount,
|
||||
ncclConfig_t* config, const char* caller) {
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
struct ncclComm* childComm = NCCL_COMM_NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NVTX3_RANGE(NcclNvtxParamsCommSplit)
|
||||
|
||||
int oldDev;
|
||||
CUDACHECK(cudaGetDevice(&oldDev));
|
||||
NCCLCHECKGOTO(CommCheck(comm, caller, "comm"), res, exit);
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, caller, "newcomm"), res, exit);
|
||||
if (isShrink) {
|
||||
NCCLCHECKGOTO(PtrCheck(excludeRanksList, caller, "excludeRanksList"), res, exit);
|
||||
NCCLCHECKGOTO(excludeRanksCount > 0 ? ncclSuccess : ncclInvalidArgument, res, exit);
|
||||
// excludeRanksList may not be sorted, need to sort it
|
||||
qsort(excludeRanksList, excludeRanksCount, sizeof(int), compareInts);
|
||||
// ranks in excludeRanksList should not call into this function
|
||||
NCCLCHECKGOTO(bsearch(&comm->rank, excludeRanksList, excludeRanksCount, sizeof(int), compareInts) ? ncclInvalidArgument : ncclSuccess, res, exit);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, exit);
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, exit);
|
||||
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
|
||||
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail);
|
||||
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
|
||||
*newcomm = NCCL_COMM_NULL;
|
||||
if (color == NCCL_SPLIT_NOCOLOR) {
|
||||
if (!isShrink && color == NCCL_SPLIT_NOCOLOR) {
|
||||
INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank);
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail);
|
||||
childComm->startMagic = childComm->endMagic = NCCL_MAGIC;
|
||||
if (comm->config.splitShare) {
|
||||
|
||||
// Set the shareResource field, this is used throughout the init and must be reset every time.
|
||||
// If we shrink, we only reuse resources if we shrink in the default mode
|
||||
comm->shareResources = isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare;
|
||||
if (comm->shareResources) {
|
||||
childComm->abortFlag = comm->abortFlag;
|
||||
childComm->abortFlagDev = comm->abortFlagDev;
|
||||
childComm->abortFlagRefCount = comm->abortFlagRefCount;
|
||||
@@ -2838,20 +2965,29 @@ ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t
|
||||
NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail);
|
||||
}
|
||||
|
||||
/* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */
|
||||
childComm->initState = ncclInProgress;
|
||||
/* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */
|
||||
childComm->initState = ncclInternalError;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
|
||||
job->comm = childComm;
|
||||
job->newcomm = newcomm;
|
||||
job->parent = comm;
|
||||
job->splitCount = ++comm->splitCount;
|
||||
job->color = color;
|
||||
job->key = key;
|
||||
if (excludeRanksList) {
|
||||
// need to copy the list of ranks to exclude because the job is async
|
||||
job->excludeRanksCount = excludeRanksCount;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->excludeRanksList, excludeRanksCount), res, fail);
|
||||
memcpy(job->excludeRanksList, excludeRanksList, excludeRanksCount * sizeof(int));
|
||||
} else {
|
||||
// each split has to lead to a unique comm, so increment the splitCount
|
||||
job->splitCount = ++comm->splitCount;
|
||||
job->excludeRanksList = NULL;
|
||||
}
|
||||
job->cudaDev = comm->cudaDev;
|
||||
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
|
||||
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", caller);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, /*undo=*/NULL, /*destructor=*/childCommCleanupJob, comm), res, fail);
|
||||
|
||||
exit:
|
||||
// for loggin only, not ready for replaying
|
||||
@@ -2859,21 +2995,13 @@ exit:
|
||||
// !recording at sink
|
||||
Recorder::instance().record(rrCommSplit, color, key, (ncclUniqueId*)comm, config, *newcomm);
|
||||
(void)cudaSetDevice(oldDev);
|
||||
(void)ncclGroupErrCheck(res);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
|
||||
if (res == ncclSuccess && *newcomm) {
|
||||
NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema,
|
||||
NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));
|
||||
}
|
||||
|
||||
return res;
|
||||
fail:
|
||||
if (childComm) {
|
||||
if (!comm->config.splitShare) {
|
||||
free(childComm->abortFlag);
|
||||
if (!comm->shareResources) {
|
||||
if (childComm->abortFlag) free(childComm->abortFlag);
|
||||
if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev);
|
||||
free(childComm->abortFlagRefCount);
|
||||
if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount);
|
||||
}
|
||||
free(childComm);
|
||||
}
|
||||
@@ -2881,6 +3009,44 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommShrink, ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
|
||||
// Creates a child communicator from `comm` via ncclCommInitChildComm with
// isShrink=true, passing the exclusion list through.
//
// The call is bracketed by ncclGroupStartInternal()/ncclGroupEndInternal().
// When NCCL_SHRINK_ABORT is set in shrinkFlags, the parent's abort flags are
// raised, the shared device stream is synchronized, and the flags are cleared
// again before the child is created.
//
// Returns the first error reported by any step, or ncclSuccess.
ncclResult_t ncclCommShrink_impl(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t *newcomm, ncclConfig_t* config, int shrinkFlags) {
  NVTX3_RANGE(NcclNvtxParamsCommShrink)
  ncclResult_t status = ncclSuccess;
  NCCLCHECK(ncclGroupStartInternal());

  // Error mode: raise the abort flags, wait for in-flight kernels to finish,
  // then lower the flags again to avoid bootstrap issues.
  if ((shrinkFlags & NCCL_SHRINK_ABORT) != 0) {
    NCCLCHECKGOTO(setCommAbortFlags(comm, 1), status, exit);
    NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), status, exit);
    NCCLCHECKGOTO(setCommAbortFlags(comm, 0), status, exit);
  }

  // A shrink uses color 0 and the caller's current rank as the key.
  NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), status, exit);

  if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount));

exit:
  (void)ncclGroupErrCheck(status);
  NCCLCHECK(ncclGroupEndInternal());
  return status;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
|
||||
// Splits `comm` by (color, key) by delegating to ncclCommInitChildComm with
// isShrink=false, NCCL_SHRINK_DEFAULT, and no exclusion list.
//
// The call is bracketed by ncclGroupStartInternal()/ncclGroupEndInternal().
// Returns the first error reported by any step, or ncclSuccess.
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
  NVTX3_RANGE(NcclNvtxParamsCommSplit)

  ncclResult_t status = ncclSuccess;
  NCCLCHECK(ncclGroupStartInternal());

  NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), status, exit);

  // Attach the NVTX payload only when *newcomm is non-null.
  if (*newcomm)
    NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key));

exit:
  (void)ncclGroupErrCheck(status);
  NCCLCHECK(ncclGroupEndInternal());
  return status;
}
|
||||
|
||||
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
|
||||
const char* ncclGetErrorString_impl(ncclResult_t code) {
|
||||
Recorder::instance().record("GetErrorString");
|
||||
@@ -2964,121 +3130,3 @@ ncclResult_t ncclCommUserRank_impl(const ncclComm_t comm, int* rank) {
|
||||
*rank = comm->rank;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size);
|
||||
// Allocates `size` bytes of device memory and stores the address in *ptr.
//
// When built with CUDART_VERSION >= 12010 and ncclCuMemEnable() is true, the
// memory is created with cuMemCreate, mapped into a reserved virtual address
// range, and made read/write accessible to the current device and every peer
// for which cudaDeviceCanAccessPeer reports access. Otherwise (or when
// ptr == NULL, size == 0, or the CUDA library fails to initialize) it falls
// back to cudaMalloc.
//
// The allocation is reported to the Recorder on every path that reaches
// `exit`. A NULL `ptr` is recorded as a NULL address instead of being
// dereferenced.
//
// Returns ncclSuccess, or the error produced by the failing call.
ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t ret = ncclSuccess;
  // Address handed to the recorder; declared up front so no goto crosses its
  // initialization.
  void* recordedPtr = NULL;

#if CUDART_VERSION >= 12010
  size_t memGran = 0;
  CUdevice currentDev;
  CUmemAllocationProp memprop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int flag;
  int dcnt;

  if (ptr == NULL || size == 0) goto fallback;

  if (ncclCudaLibraryInit() != ncclSuccess) goto fallback;

  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));

  if (ncclCuMemEnable()) {
    size_t handleSize = size;
    int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    // Query device to see if FABRIC handle support is available
    flag = 0;
    (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));
    if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
    memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
    memprop.location.id = currentDev;
    // Query device to see if RDMA support is available
    flag = 0;
    CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
    if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
    CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
    CUDACHECK(cudaGetDeviceCount(&dcnt));
    // Round the request up to the allocation granularity.
    ALIGN_SIZE(handleSize, memGran);

    if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
      /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
      CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0));
      if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
        requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
        memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
        /* Allocate the physical memory on the device */
        CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
      }
    } else {
      /* Allocate the physical memory on the device */
      CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0));
    }
    /* Reserve a virtual address range */
    CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0));
    /* Map the virtual address range to the physical allocation */
    CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0));
    /* Now allow RW access to the newly mapped memory */
    for (int i = 0; i < dcnt; ++i) {
      int p2p = 0;
      if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) {
        accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        accessDesc.location.id = i;
        accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1));
      }
      if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i);
    }
    goto exit;
  }

fallback:
#endif
  // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
  // we want CUDA to return an error to the caller.
  // coverity[var_deref_model]
  CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);

exit:
  // `ptr` may be NULL here (cudaMalloc(NULL, ...) fails and jumps to `fail`),
  // so never dereference it unconditionally.
  recordedPtr = (ptr != NULL) ? *ptr : NULL;
  NCCLCHECK(Recorder::instance().record(rrMemAlloc, recordedPtr, size));
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclMemFree, void *ptr);
|
||||
// Releases memory previously obtained for `ptr`.
//
// The call is recorded first. The caller's current device is saved and
// restored on every path that reaches `exit`. With CUDART_VERSION >= 12010,
// a non-NULL pointer and an initialized CUDA library, the pointer's device
// ordinal is queried and made current; if ncclCuMemEnable() is true the memory
// is released through ncclCuMemFree. All other cases use cudaFree.
//
// Returns ncclSuccess, or the error produced by the failing call.
ncclResult_t ncclMemFree_impl(void *ptr) {
  NCCLCHECK(Recorder::instance().record(rrMemFree, ptr));
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  ncclResult_t status = ncclSuccess;
  int callerDevice;

  CUDACHECK(cudaGetDevice(&callerDevice));
#if CUDART_VERSION >= 12010
  CUdevice ownerDev = 0;

  // Nothing to look up for a NULL pointer or without the driver library.
  if (ptr == NULL || ncclCudaLibraryInit() != ncclSuccess) goto fallback;

  // Switch to the device reported for the pointer before freeing it.
  CUCHECKGOTO(cuPointerGetAttribute((void*)&ownerDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), status, fail);
  CUDACHECKGOTO(cudaSetDevice((int)ownerDev), status, fail);
  if (ncclCuMemEnable()) {
    NCCLCHECKGOTO(ncclCuMemFree(ptr), status, fail);
    goto exit;
  }

fallback:
#endif
  CUDACHECKGOTO(cudaFree(ptr), status, fail);

exit:
  // Put the caller's device back.
  CUDACHECK(cudaSetDevice(callerDevice));
  return status;
fail:
  goto exit;
}
|
||||
|
||||
+60
-16
@@ -105,6 +105,10 @@ ncclCommDestroy_impl(ncclComm_t comm);
|
||||
ncclResult_t
|
||||
ncclCommAbort_impl(ncclComm_t comm);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommShrink_impl(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t *newcomm,
|
||||
ncclConfig_t* config, int shrinkFlags);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
|
||||
ncclConfig_t* config);
|
||||
@@ -153,6 +157,12 @@ ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** han
|
||||
ncclResult_t
|
||||
ncclCommDeregister_impl(const ncclComm_t comm, void* handle);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommWindowRegister_impl(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
|
||||
ncclResult_t
|
||||
ncclCommWindowDeregister_impl(ncclComm_t comm, ncclWindow_t win);
|
||||
|
||||
ncclResult_t
|
||||
ncclAllReduceWithBias_impl(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm,
|
||||
@@ -202,25 +212,28 @@ RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommInitRankConfig_fn, 19);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommFinalize_fn, 20);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDestroy_fn, 21);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommAbort_fn, 22);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommSplit_fn, 23);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetErrorString_fn, 24);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetLastError_fn, 25);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommGetAsyncError_fn, 26);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCount_fn, 27);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCuDevice_fn, 28);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommUserRank_fn, 29);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemAlloc_fn, 30);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemFree_fn, 31);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclLoadAlgo_fn, 32);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclRunAlgo_fn, 33);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclUnloadAlgo_fn, 34);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommRegister_fn, 35);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDeregister_fn, 36);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllReduceWithBias_fn, 37);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommShrink_fn, 23);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommSplit_fn, 24);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetErrorString_fn, 25);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclGetLastError_fn, 26);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommGetAsyncError_fn, 27);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCount_fn, 28);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommCuDevice_fn, 29);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommUserRank_fn, 30);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemAlloc_fn, 31);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclMemFree_fn, 32);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclLoadAlgo_fn, 33);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclRunAlgo_fn, 34);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, mscclUnloadAlgo_fn, 35);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommRegister_fn, 36);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommDeregister_fn, 37);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommWindowRegister_fn, 38);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclCommWindowDeregister_fn, 39);
|
||||
RCCL_ASSERT_OFFSET(rcclApiFuncTable, ncclAllReduceWithBias_fn, 40);
|
||||
|
||||
#undef RCCL_ASSERT_OFFSET
|
||||
|
||||
static_assert(sizeof(rcclApiFuncTable) == compute_table_size(38),
|
||||
static_assert(sizeof(rcclApiFuncTable) == compute_table_size(41),
|
||||
"Update table major/step version and add a new offset assertion if this "
|
||||
"fails to compile");
|
||||
|
||||
@@ -254,6 +267,7 @@ RcclGetFunctionTable_impl()
|
||||
&ncclCommFinalize_impl,
|
||||
&ncclCommDestroy_impl,
|
||||
&ncclCommAbort_impl,
|
||||
&ncclCommShrink_impl,
|
||||
&ncclCommSplit_impl,
|
||||
&ncclGetErrorString_impl,
|
||||
&ncclGetLastError_impl,
|
||||
@@ -268,6 +282,8 @@ RcclGetFunctionTable_impl()
|
||||
&mscclUnloadAlgo_impl,
|
||||
&ncclCommRegister_impl,
|
||||
&ncclCommDeregister_impl,
|
||||
&ncclCommWindowRegister_impl,
|
||||
&ncclCommWindowDeregister_impl,
|
||||
&ncclAllReduceWithBias_impl };
|
||||
|
||||
#if defined(RCCL_ROCPROFILER_REGISTER) && RCCL_ROCPROFILER_REGISTER > 0
|
||||
@@ -370,6 +386,9 @@ NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommShrink, ncclComm_t comm, int* excludeRanksList, int excludeRanksCount,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key,
|
||||
ncclComm_t* newcomm, ncclConfig_t* config);
|
||||
|
||||
@@ -405,6 +424,11 @@ NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size,
|
||||
ncclWindow_t* win, int winFlags);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win);
|
||||
|
||||
ncclResult_t
|
||||
ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream)
|
||||
@@ -581,6 +605,14 @@ ncclCommAbort(ncclComm_t comm)
|
||||
return ::rccl::RcclGetFunctionTable()->ncclCommAbort_fn(comm);
|
||||
}
|
||||
|
||||
// Public entry point: forwards to the ncclCommShrink_fn slot of the RCCL
// function table and returns its result.
ncclResult_t
ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm,
               ncclConfig_t* config, int shrinkFlags)
{
    auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommShrink_fn(comm, excludeRanksList, excludeRanksCount, newcomm, config,
                                       shrinkFlags);
}
|
||||
|
||||
ncclResult_t
|
||||
ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t* newcomm,
|
||||
ncclConfig_t* config)
|
||||
@@ -672,3 +704,15 @@ ncclCommDeregister(const ncclComm_t comm, void* handle)
|
||||
{
|
||||
return ::rccl::RcclGetFunctionTable()->ncclCommDeregister_fn(comm, handle);
|
||||
}
|
||||
|
||||
// Public entry point: forwards to the ncclCommWindowRegister_fn slot of the
// RCCL function table and returns its result.
ncclResult_t
ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags)
{
    auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommWindowRegister_fn(comm, buff, size, win, winFlags);
}
|
||||
|
||||
// Public entry point: forwards to the ncclCommWindowDeregister_fn slot of the
// RCCL function table and returns its result.
ncclResult_t
ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win)
{
    auto* apiTable = ::rccl::RcclGetFunctionTable();
    return apiTable->ncclCommWindowDeregister_fn(comm, win);
}
|
||||
|
||||
+77
-68
@@ -105,53 +105,53 @@ error:
|
||||
#endif
|
||||
}
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN(cuDeviceGet, 2000);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString, 6000);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName, 6000);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN(cuLaunchKernel);
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020);
|
||||
DECLARE_CUDA_PFN(cuLaunchKernel, 4000);
|
||||
#if CUDA_VERSION >= 11080
|
||||
DECLARE_CUDA_PFN(cuLaunchKernelEx);
|
||||
DECLARE_CUDA_PFN(cuLaunchKernelEx, 11060);
|
||||
#endif
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN(cuCtxGetCurrent);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent);
|
||||
DECLARE_CUDA_PFN(cuCtxGetDevice);
|
||||
DECLARE_CUDA_PFN(cuCtxCreate, 11040);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000);
|
||||
DECLARE_CUDA_PFN(cuCtxGetDevice, 2000);
|
||||
/* cuMem API support */
|
||||
DECLARE_CUDA_PFN(cuMemAddressReserve);
|
||||
DECLARE_CUDA_PFN(cuMemAddressFree);
|
||||
DECLARE_CUDA_PFN(cuMemCreate);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity);
|
||||
DECLARE_CUDA_PFN(cuMemExportToShareableHandle);
|
||||
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle);
|
||||
DECLARE_CUDA_PFN(cuMemMap);
|
||||
DECLARE_CUDA_PFN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN(cuMemUnmap);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
|
||||
DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemAddressFree, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemCreate, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemMap, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemRelease, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000);
|
||||
DECLARE_CUDA_PFN(cuMemSetAccess, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemUnmap, 10020);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle, 10020);
|
||||
/* ncclMemAlloc/Free */
|
||||
DECLARE_CUDA_PFN(cuPointerGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000);
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* transport/collNet.cc/net.cc*/
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
DECLARE_CUDA_PFN(cuMulticastAddDevice);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindMem);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindAddr);
|
||||
DECLARE_CUDA_PFN(cuMulticastCreate);
|
||||
DECLARE_CUDA_PFN(cuMulticastGetGranularity);
|
||||
DECLARE_CUDA_PFN(cuMulticastUnbind);
|
||||
DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindMem, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastCreate, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010);
|
||||
DECLARE_CUDA_PFN(cuMulticastUnbind, 12010);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -162,8 +162,17 @@ bool ncclCudaLaunchBlocking = false;
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
|
||||
#if CUDART_VERSION >= 12000
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
#if CUDART_VERSION >= 13000
|
||||
#define LOAD_SYM(symbol, version, ignore) do { \
|
||||
cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
|
||||
res = cudaGetDriverEntryPointByVersion(#symbol, (void **) (&pfn_##symbol), version, cudaEnableDefault, &driverStatus); \
|
||||
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s version %d failed with %d status %d", #symbol, version, res, driverStatus); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
#elif CUDART_VERSION >= 12000
|
||||
#define LOAD_SYM(symbol, version, ignore) do { \
|
||||
cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
|
||||
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
|
||||
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
|
||||
@@ -172,7 +181,7 @@ bool ncclCudaLaunchBlocking = false;
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
#else
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
#define LOAD_SYM(symbol, version, ignore) do { \
|
||||
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \
|
||||
if (res != cudaSuccess) { \
|
||||
if (!ignore) { \
|
||||
@@ -188,46 +197,46 @@ static ncclResult_t cudaPfnFuncLoader(void) {
|
||||
|
||||
cudaError_t res;
|
||||
|
||||
LOAD_SYM(cuGetErrorString, 0);
|
||||
LOAD_SYM(cuGetErrorName, 0);
|
||||
LOAD_SYM(cuDeviceGet, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 1);
|
||||
LOAD_SYM(cuCtxCreate, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 1);
|
||||
LOAD_SYM(cuCtxGetCurrent, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
LOAD_SYM(cuCtxGetDevice, 1);
|
||||
LOAD_SYM(cuLaunchKernel, 1);
|
||||
LOAD_SYM(cuGetErrorString, 6000, 0);
|
||||
LOAD_SYM(cuGetErrorName, 6000, 0);
|
||||
LOAD_SYM(cuDeviceGet, 2000, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 2000, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 3020, 1);
|
||||
LOAD_SYM(cuCtxCreate, 11040, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 4000, 1);
|
||||
LOAD_SYM(cuCtxGetCurrent, 4000, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 4000, 1);
|
||||
LOAD_SYM(cuCtxGetDevice, 2000, 1);
|
||||
LOAD_SYM(cuLaunchKernel, 4000, 1);
|
||||
#if CUDA_VERSION >= 11080
|
||||
LOAD_SYM(cuLaunchKernelEx, 1);
|
||||
LOAD_SYM(cuLaunchKernelEx, 11060, 1);
|
||||
#endif
|
||||
/* cuMem API support */
|
||||
LOAD_SYM(cuMemAddressReserve, 1);
|
||||
LOAD_SYM(cuMemAddressFree, 1);
|
||||
LOAD_SYM(cuMemCreate, 1);
|
||||
LOAD_SYM(cuMemGetAllocationGranularity, 1);
|
||||
LOAD_SYM(cuMemExportToShareableHandle, 1);
|
||||
LOAD_SYM(cuMemImportFromShareableHandle, 1);
|
||||
LOAD_SYM(cuMemMap, 1);
|
||||
LOAD_SYM(cuMemRelease, 1);
|
||||
LOAD_SYM(cuMemRetainAllocationHandle, 1);
|
||||
LOAD_SYM(cuMemSetAccess, 1);
|
||||
LOAD_SYM(cuMemUnmap, 1);
|
||||
LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
|
||||
LOAD_SYM(cuMemAddressReserve, 10020, 1);
|
||||
LOAD_SYM(cuMemAddressFree, 10020, 1);
|
||||
LOAD_SYM(cuMemCreate, 10020, 1);
|
||||
LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1);
|
||||
LOAD_SYM(cuMemExportToShareableHandle, 10020, 1);
|
||||
LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1);
|
||||
LOAD_SYM(cuMemMap, 10020, 1);
|
||||
LOAD_SYM(cuMemRelease, 10020, 1);
|
||||
LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1);
|
||||
LOAD_SYM(cuMemSetAccess, 10020, 1);
|
||||
LOAD_SYM(cuMemUnmap, 10020, 1);
|
||||
LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 10020, 1);
|
||||
/* ncclMemAlloc/Free */
|
||||
LOAD_SYM(cuPointerGetAttribute, 1);
|
||||
LOAD_SYM(cuPointerGetAttribute, 4000, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support
|
||||
#endif
|
||||
#if CUDA_VERSION >= 12010
|
||||
/* NVSwitch Multicast support */
|
||||
LOAD_SYM(cuMulticastAddDevice, 1);
|
||||
LOAD_SYM(cuMulticastBindMem, 1);
|
||||
LOAD_SYM(cuMulticastBindAddr, 1);
|
||||
LOAD_SYM(cuMulticastCreate, 1);
|
||||
LOAD_SYM(cuMulticastGetGranularity, 1);
|
||||
LOAD_SYM(cuMulticastUnbind, 1);
|
||||
LOAD_SYM(cuMulticastAddDevice, 12010, 1);
|
||||
LOAD_SYM(cuMulticastBindMem, 12010, 1);
|
||||
LOAD_SYM(cuMulticastBindAddr, 12010, 1);
|
||||
LOAD_SYM(cuMulticastCreate, 12010, 1);
|
||||
LOAD_SYM(cuMulticastGetGranularity, 12010, 1);
|
||||
LOAD_SYM(cuMulticastUnbind, 12010, 1);
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -8,7 +8,11 @@
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef NCCL_BUILD_RDMA_CORE
|
||||
#include <infiniband/verbs.h>
|
||||
#else
|
||||
#include "ibvcore.h"
|
||||
#endif
|
||||
#include "ibvsymbols.h"
|
||||
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "mlx5/mlx5dvsymbols.h"
|
||||
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
/* Mlx5dv linking mode. Symbols are pointers to linked MLX5 Direct Verbs */
|
||||
|
||||
#define ASSIGN_SYM(container, symbol, name) container->name= &symbol;
|
||||
|
||||
// Linked mode: points each entry of the symbol table at the corresponding
// mlx5dv function resolved at link time. Always returns ncclSuccess.
ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
  mlx5dvSymbols->mlx5dv_internal_is_supported = &mlx5dv_is_supported;
  mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = &mlx5dv_get_data_direct_sysfs_path;
  mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = &mlx5dv_reg_dmabuf_mr;
  return ncclSuccess;
}
|
||||
|
||||
#else
|
||||
/* Mlx5dv dynamic loading mode. Symbols are loaded from shared objects. */
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include "core.h"
|
||||
|
||||
// MLX5DV Library versioning
|
||||
#define MLX5DV_VERSION "MLX5_1.8"
|
||||
|
||||
// Dynamic-loading mode: opens libmlx5.so (falling back to libmlx5.so.1) and
// fills `mlx5dvSymbols` with versioned symbols looked up via dlvsym.
//
// mlx5dv_is_supported is mandatory (MLX5DV_VERSION); the two MLX5_1.25 entry
// points are optional and are left NULL when the library does not export them.
//
// Returns ncclSuccess on success. On failure every table entry is reset to
// NULL, the library handle is closed and cleared, and ncclSystemError is
// returned.
ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) {
  static void* mlx5dvhandle = NULL;
  void* tmp;
  void** cast;

  mlx5dvhandle=dlopen("libmlx5.so", RTLD_NOW);
  if (!mlx5dvhandle) {
    mlx5dvhandle=dlopen("libmlx5.so.1", RTLD_NOW);
    if (!mlx5dvhandle) {
      INFO(NCCL_INIT, "Failed to open libmlx5.so[.1]");
      goto teardown;
    }
  }

// Load a mandatory symbol at MLX5DV_VERSION; warn and tear down if missing.
#define LOAD_SYM(handle, symbol, funcptr) do {           \
    cast = (void**)&funcptr;                             \
    tmp = dlvsym(handle, symbol, MLX5DV_VERSION);        \
    if (tmp == NULL) {                                   \
      WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), MLX5DV_VERSION);  \
      goto teardown;                                     \
    }                                                    \
    *cast = tmp;                                         \
  } while (0)

// Attempt to load a specific symbol version - fail silently
#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do {  \
    cast = (void**)&funcptr;                                     \
    *cast = dlvsym(handle, symbol, version);                     \
  } while (0)

  LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported);
  // Cherry-pick the mlx5dv_get_data_direct_sysfs_path API from MLX5 1.25
  LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_get_data_direct_sysfs_path", mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path, "MLX5_1.25");
  // Cherry-pick the mlx5dv_reg_dmabuf_mr API from MLX5 1.25
  LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_reg_dmabuf_mr", mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr, "MLX5_1.25");

  return ncclSuccess;

teardown:
  mlx5dvSymbols->mlx5dv_internal_is_supported = NULL;
  mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = NULL;
  mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = NULL;

  if (mlx5dvhandle != NULL) {
    dlclose(mlx5dvhandle);
    // The handle is static: clear it so it never refers to a closed library.
    mlx5dvhandle = NULL;
  }
  return ncclSystemError;
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,75 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "mlx5/mlx5dvwrap.h"
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef NCCL_BUILD_MLX5DV
|
||||
#include <infiniband/mlx5dv.h>
|
||||
#else
|
||||
#include "mlx5/mlx5dvcore.h"
|
||||
#endif
|
||||
#include "mlx5/mlx5dvsymbols.h"
|
||||
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
static ncclResult_t initResult;
|
||||
struct ncclMlx5dvSymbols mlx5dvSymbols;
|
||||
|
||||
// Populates the global mlx5dv symbol table exactly once (guarded by
// pthread_once) and returns the result stored by that one-time initialization.
ncclResult_t wrap_mlx5dv_symbols(void) {
  // Captureless lambda, converted to the plain function pointer pthread_once expects.
  void (*loadSymbolsOnce)(void) = []() {
    initResult = buildMlx5dvSymbols(&mlx5dvSymbols);
  };
  pthread_once(&initOnceControl, loadSymbolsOnce);
  return initResult;
}
|
||||
|
||||
/* CHECK_NOT_NULL: helper macro to check for NULL symbol.
 * Warns and returns ncclInternalError from the enclosing function when
 * container.internal_name is NULL. Wrapped in do/while(0) so it behaves as a
 * single statement (safe inside an unbraced if/else). */
#define CHECK_NOT_NULL(container, internal_name) \
  do { \
    if (container.internal_name == NULL) { \
      WARN("lib wrapper not initialized."); \
      return ncclInternalError; \
    } \
  } while (0)

/* MLX5DV_PTR_CHECK_ERRNO: complete body for a wrapper function.
 * Verifies the symbol is loaded, stores the result of container.call in
 * retval, and returns ncclSystemError (after a WARN that prints
 * strerror(errno)) when retval equals error_retval; otherwise ncclSuccess. */
#define MLX5DV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
  CHECK_NOT_NULL(container, internal_name); \
  retval = container.call; \
  if (retval == error_retval) { \
    WARN("Call to " name " failed with error %s", strerror(errno)); \
    return ncclSystemError; \
  } \
  return ncclSuccess;

/* MLX5DV_INT_CHECK_RET_ERRNO: complete body for a wrapper function.
 * Verifies the symbol is loaded, invokes container.call, and treats the int
 * return value as the error code: anything other than success_retval is
 * logged via INFO (strerror of that value) and mapped to ncclSystemError;
 * otherwise ncclSuccess. */
#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
  CHECK_NOT_NULL(container, internal_name); \
  int ret = container.call; \
  if (ret != success_retval) { \
    INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \
    return ncclSystemError; \
  } \
  return ncclSuccess;
|
||||
|
||||
bool wrap_mlx5dv_is_supported(struct ibv_device *device) {
|
||||
if (mlx5dvSymbols.mlx5dv_internal_is_supported == NULL) {
|
||||
return 0;
|
||||
}
|
||||
return mlx5dvSymbols.mlx5dv_internal_is_supported(device);
|
||||
}
|
||||
|
||||
// Calls the loaded mlx5dv_get_data_direct_sysfs_path symbol with
// (context, buf, buf_len). The macro supplies the whole body: it returns
// ncclInternalError if the symbol is not loaded, ncclSystemError if the call
// returns non-zero (logged via INFO), and ncclSuccess otherwise.
ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) {
|
||||
MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path");
|
||||
}
|
||||
|
||||
/* DMA-BUF support */
|
||||
// Calls the loaded mlx5dv_reg_dmabuf_mr symbol and stores its result in *ret.
// The macro supplies the whole body: it returns ncclInternalError if the
// symbol is not loaded, ncclSystemError if the call yields NULL (WARN prints
// strerror(errno)), and ncclSuccess otherwise.
ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) {
|
||||
MLX5DV_PTR_CHECK_ERRNO(mlx5dvSymbols, mlx5dv_internal_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access), *ret, NULL, "mlx5dv_reg_dmabuf_mr");
|
||||
}
|
||||
|
||||
struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) {
|
||||
if (mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr == NULL) {
|
||||
errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
|
||||
return NULL;
|
||||
}
|
||||
return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access);
|
||||
}
|
||||
+50
-11
@@ -10,6 +10,7 @@
|
||||
#include "rocmwrap.h"
|
||||
#include "hsa/hsa.h"
|
||||
#include "param.h"
|
||||
#include "bootstrap.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <sys/utsname.h>
|
||||
@@ -24,9 +25,6 @@ DECLARE_ROCM_PFN(hsa_init);
|
||||
DECLARE_ROCM_PFN(hsa_system_get_info);
|
||||
DECLARE_ROCM_PFN(hsa_status_string);
|
||||
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
static void *hsaLib;
|
||||
static uint16_t version_major, version_minor;
|
||||
bool ncclCudaLaunchBlocking = false;
|
||||
@@ -34,6 +32,52 @@ bool ncclCudaLaunchBlocking = false;
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
static ncclResult_t initResult;
|
||||
|
||||
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
||||
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", 0);
|
||||
NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1);
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
static int ncclCuMemSupported = 0;
|
||||
|
||||
// Determine whether CUMEM & VMM RDMA is supported on this platform
|
||||
int ncclIsCuMemSupported() {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
return 0;
|
||||
#else
|
||||
CUdevice currentDev;
|
||||
int cudaDev;
|
||||
int cudaDriverVersion;
|
||||
int flag = 0;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error);
|
||||
if (cudaDriverVersion < 12000) return 0; // Need CUDA_VISIBLE_DEVICES support
|
||||
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, error);
|
||||
if (CUPFN(cuMemCreate) == NULL) return 0;
|
||||
CUCHECKGOTO(cuDeviceGet(&currentDev, cudaDev), ret, error);
|
||||
// Query device to see if CUMEM VMM support is available
|
||||
CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error);
|
||||
if (!flag) return 0;
|
||||
|
||||
error:
|
||||
return (ret == ncclSuccess);
|
||||
#endif
|
||||
}
|
||||
|
||||
int ncclCuMemEnable() {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
return 0;
|
||||
#else
|
||||
// NCCL_CUMEM_ENABLE=-2 means auto-detect CUMEM support
|
||||
int param = ncclParamCuMemEnable();
|
||||
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
|
||||
#endif
|
||||
}
|
||||
|
||||
int ncclCuMemHostEnable() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void initOnceFunc() {
|
||||
do {
|
||||
char* val = getenv("CUDA_LAUNCH_BLOCKING");
|
||||
@@ -100,6 +144,9 @@ static void initOnceFunc() {
|
||||
//goto error;
|
||||
//}
|
||||
|
||||
// Determine whether we support the cuMem APIs or not
|
||||
ncclCuMemSupported = ncclIsCuMemSupported();
|
||||
|
||||
/* DMA-BUF support */
|
||||
//ROCm support
|
||||
if (ncclParamDmaBufEnable() == 0 ) {
|
||||
@@ -170,14 +217,6 @@ error:
|
||||
initResult = ncclSystemError;
|
||||
}
|
||||
|
||||
int ncclCuMemEnable() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ncclCuMemHostEnable() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
ncclResult_t rocmLibraryInit() {
|
||||
pthread_once(&initOnceControl, initOnceFunc);
|
||||
return initResult;
|
||||
|
||||
+101
-67
@@ -73,7 +73,8 @@ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, i
|
||||
return ncclSuccess;
|
||||
} else {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||
WARN("socketProgress: Connection closed by remote peer %s",
|
||||
ncclSocketToString(&sock->addr, line, /*numericHostForm*/0));
|
||||
return ncclRemoteError;
|
||||
}
|
||||
}
|
||||
@@ -91,17 +92,22 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s
|
||||
* Output: "IPv4/IPv6 address<port>"
|
||||
*/
|
||||
const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) {
|
||||
if (buf == NULL || addr == NULL) return NULL;
|
||||
const struct sockaddr *saddr = &addr->sa;
|
||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
|
||||
const struct sockaddr *saddr;
|
||||
char host[NI_MAXHOST], service[NI_MAXSERV];
|
||||
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
|
||||
if (buf == NULL || addr == NULL) goto fail;
|
||||
saddr = &addr->sa;
|
||||
if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail;
|
||||
/* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
|
||||
* (When not set, this will still happen in case the node's name cannot be determined.)
|
||||
*/
|
||||
int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
|
||||
(void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag);
|
||||
if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail;
|
||||
sprintf(buf, "%s<%s>", host, service);
|
||||
return buf;
|
||||
fail:
|
||||
if (buf)
|
||||
buf[0] = '\0';
|
||||
return buf;
|
||||
}
|
||||
|
||||
static uint16_t socketToPort(union ncclSocketAddress *addr) {
|
||||
@@ -125,7 +131,8 @@ static int envSocketFamily(void) {
|
||||
return family;
|
||||
}
|
||||
|
||||
static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
|
||||
static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family,
|
||||
int maxIfNameSize, int maxIfs, int* found) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
#endif
|
||||
@@ -136,10 +143,10 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA
|
||||
if (searchExact) prefixList++;
|
||||
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
|
||||
|
||||
int found = 0;
|
||||
*found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
|
||||
SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
|
||||
for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) {
|
||||
if (interface->ifa_addr == NULL) continue;
|
||||
|
||||
/* We only support IPv4 & IPv6 */
|
||||
@@ -167,23 +174,23 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA
|
||||
// Check that this interface has not already been saved
|
||||
// getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
|
||||
bool duplicate = false;
|
||||
for (int i = 0; i < found; i++) {
|
||||
for (int i = 0; i < *found; i++) {
|
||||
if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
|
||||
}
|
||||
|
||||
if (!duplicate) {
|
||||
// Store the interface name
|
||||
strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
|
||||
strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize);
|
||||
// Store the IP address
|
||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||
memset(addrs+found, '\0', sizeof(*addrs));
|
||||
memcpy(addrs+found, interface->ifa_addr, salen);
|
||||
found++;
|
||||
memset(addrs + *found, '\0', sizeof(*addrs));
|
||||
memcpy(addrs + *found, interface->ifa_addr, salen);
|
||||
(*found)++;
|
||||
}
|
||||
}
|
||||
|
||||
freeifaddrs(interfaces);
|
||||
return found;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
|
||||
@@ -224,20 +231,21 @@ static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote
|
||||
same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
|
||||
return same;
|
||||
} else {
|
||||
WARN("Net : Unsupported address family type");
|
||||
INFO(NCCL_NET, "Net : Unsupported address family type");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
|
||||
ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
|
||||
union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) {
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
#endif
|
||||
char line_a[SOCKET_NAME_MAXLEN+1];
|
||||
int found = 0;
|
||||
#endif
|
||||
*found = 0;
|
||||
struct ifaddrs *interfaces, *interface;
|
||||
getifaddrs(&interfaces);
|
||||
for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
|
||||
SYSCHECK(getifaddrs(&interfaces), "getifaddrs");
|
||||
for (interface = interfaces; interface && !*found; interface = interface->ifa_next) {
|
||||
if (interface->ifa_addr == NULL) continue;
|
||||
|
||||
/* We only support IPv4 & IPv6 */
|
||||
@@ -252,21 +260,18 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd
|
||||
|
||||
// Store the local IP address
|
||||
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
|
||||
memcpy(localAddrs+found, interface->ifa_addr, salen);
|
||||
memcpy(localAddr, interface->ifa_addr, salen);
|
||||
|
||||
// Store the interface name
|
||||
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
|
||||
strncpy(ifName, interface->ifa_name, ifNameMaxSize);
|
||||
|
||||
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a));
|
||||
found++;
|
||||
if (found == maxIfs) break;
|
||||
TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s",
|
||||
interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a));
|
||||
*found = 1;
|
||||
}
|
||||
|
||||
if (found == 0) {
|
||||
WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a));
|
||||
}
|
||||
freeifaddrs(interfaces);
|
||||
return found;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
|
||||
@@ -349,40 +354,41 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
|
||||
ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
|
||||
int* nIfs) {
|
||||
static int shownIfName = 0;
|
||||
int nIfs = 0;
|
||||
// Allow user to force the INET socket family selection
|
||||
int sock_family = envSocketFamily();
|
||||
// User specified interface
|
||||
const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME");
|
||||
*nIfs = 0;
|
||||
if (env && strlen(env) > 1) {
|
||||
INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
|
||||
// Specified by user : find or fail
|
||||
if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
|
||||
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
NCCLCHECK(findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
|
||||
} else {
|
||||
// Try to automatically pick the right one
|
||||
// Start with IB
|
||||
nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
|
||||
// else see if we can get some hint from COMM ID
|
||||
if (nIfs == 0) {
|
||||
if (*nIfs == 0) {
|
||||
const char* commId = ncclGetEnv("NCCL_COMM_ID");
|
||||
if (commId && strlen(commId) > 1) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
|
||||
// Try to find interface that is in the same subnet as the IP in comm id
|
||||
union ncclSocketAddress idAddr;
|
||||
ncclSocketGetAddrFromString(&idAddr, commId);
|
||||
nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
|
||||
NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId));
|
||||
NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs));
|
||||
}
|
||||
}
|
||||
// Then look for anything else (but not docker or lo)
|
||||
if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
|
||||
// Finally look for docker, then lo.
|
||||
if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
|
||||
if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
|
||||
if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs));
|
||||
}
|
||||
return nIfs;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
||||
@@ -444,17 +450,20 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
|
||||
/* per accept's man page, for linux sockets, the following errors might be already pending errors
|
||||
* and should be considered as EAGAIN. To avoid infinite loop in case of errors, we use the retry count*/
|
||||
if (++sock->errorRetries == ncclParamRetryCnt()) {
|
||||
WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno));
|
||||
WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno));
|
||||
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(errno));
|
||||
} else if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
WARN("socketTryAccept: Accept failed: %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1);
|
||||
NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1);
|
||||
|
||||
static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
|
||||
const int one = 1;
|
||||
/* Set socket as non-blocking if async or if we need to be able to abort */
|
||||
@@ -463,34 +472,55 @@ static ncclResult_t socketSetFlags(struct ncclSocket* sock) {
|
||||
SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl");
|
||||
SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||
SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY");
|
||||
// setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1)
|
||||
int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff();
|
||||
if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF");
|
||||
if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void socketResetAccept(struct ncclSocket* sock) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s",
|
||||
ncclSocketToString(&sock->addr, line));
|
||||
// Ignore spurious connection and accept again
|
||||
(void)close(sock->fd);
|
||||
sock->fd = -1;
|
||||
sock->state = ncclSocketStateAccepting;
|
||||
sock->finalizeCounter = 0;
|
||||
}
|
||||
|
||||
static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
|
||||
uint64_t magic;
|
||||
enum ncclSocketType type;
|
||||
int received;
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
// once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do)
|
||||
NCCLCHECK(socketSetFlags(sock));
|
||||
|
||||
if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) {
|
||||
if (sock->asyncFlag == 0) {
|
||||
received = 0;
|
||||
NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received));
|
||||
if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) {
|
||||
socketResetAccept(sock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
} else {
|
||||
int closed = 0;
|
||||
received = sock->finalizeCounter;
|
||||
NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received));
|
||||
NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed));
|
||||
sock->finalizeCounter = received;
|
||||
if (received < sizeof(magic)) return ncclSuccess;
|
||||
if (received < sizeof(magic)) {
|
||||
if (closed) {
|
||||
socketResetAccept(sock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
memcpy(&magic, sock->finalizeBuffer, sizeof(magic));
|
||||
}
|
||||
if (magic != sock->magic) {
|
||||
WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic);
|
||||
close(sock->fd);
|
||||
sock->fd = -1;
|
||||
// Ignore spurious connection and accept again
|
||||
sock->state = ncclSocketStateAccepting;
|
||||
socketResetAccept(sock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
@@ -505,7 +535,7 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
|
||||
memcpy(&type, sock->finalizeBuffer, sizeof(type));
|
||||
}
|
||||
if (type != sock->type) {
|
||||
WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type);
|
||||
WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type);
|
||||
sock->state = ncclSocketStateError;
|
||||
close(sock->fd);
|
||||
sock->fd = -1;
|
||||
@@ -537,32 +567,38 @@ cleanup:
|
||||
}
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
if (errCode == 0) {
|
||||
sock->state = ncclSocketStateConnected;
|
||||
} else if (errCode == EINPROGRESS) {
|
||||
sock->state = ncclSocketStateConnectPolling;
|
||||
} else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) {
|
||||
} else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT ||
|
||||
errCode == EHOSTUNREACH || errCode == ECONNREFUSED) {
|
||||
if (sock->customRetry == 0) {
|
||||
if (sock->errorRetries++ == ncclParamRetryCnt()) {
|
||||
sock->state = ncclSocketStateError;
|
||||
WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries);
|
||||
WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts",
|
||||
funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries);
|
||||
return ncclRemoteError;
|
||||
}
|
||||
unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut();
|
||||
INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime);
|
||||
INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec",
|
||||
funcName, ncclSocketToString(&sock->addr, line), strerror(errCode),
|
||||
sock->errorRetries, ncclParamRetryCnt(), sleepTime);
|
||||
msleep(sleepTime);
|
||||
}
|
||||
NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */
|
||||
sock->state = ncclSocketStateConnecting;
|
||||
} else {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
sock->state = ncclSocketStateError;
|
||||
WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
|
||||
WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode));
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
|
||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||
int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
|
||||
@@ -573,6 +609,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||
struct pollfd pfd;
|
||||
int timeout = 1, ret;
|
||||
socklen_t rlen = sizeof(int);
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
|
||||
memset(&pfd, 0, sizeof(struct pollfd));
|
||||
pfd.fd = sock->fd;
|
||||
@@ -582,10 +619,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||
if (ret == 0 || (ret < 0 && errno == EINTR)) {
|
||||
return ncclSuccess;
|
||||
} else if (ret < 0) {
|
||||
WARN("socketPollConnect poll() failed with error %s", strerror(errno));
|
||||
return ncclRemoteError;
|
||||
} else if (ret != 1 || (pfd.revents & POLLOUT) == 0) {
|
||||
WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events");
|
||||
WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
@@ -914,7 +948,7 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int
|
||||
ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) {
|
||||
if (sock != NULL) {
|
||||
if (sock->fd >= 0) {
|
||||
shutdown(sock->fd, how);
|
||||
SYSCHECK(shutdown(sock->fd, how), "shutdown");
|
||||
}
|
||||
sock->state = ncclSocketStateTerminating;
|
||||
}
|
||||
@@ -936,8 +970,8 @@ ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) {
|
||||
* by refcount of fd, but close() is. close() won't close a fd and send FIN packet if
|
||||
* the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful
|
||||
* connection close here. */
|
||||
shutdown(sock->fd, SHUT_RDWR);
|
||||
close(sock->fd);
|
||||
(void)shutdown(sock->fd, SHUT_RDWR);
|
||||
(void)close(sock->fd);
|
||||
}
|
||||
sock->state = ncclSocketStateClosed;
|
||||
sock->fd = -1;
|
||||
|
||||
@@ -9,6 +9,12 @@
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
#if CUDART_VERSION >= 13000
|
||||
#define cudaStreamGetCaptureInfo_v3 cudaStreamGetCaptureInfo
|
||||
#define cudaGraphAddDependencies_v2 cudaGraphAddDependencies
|
||||
#define cudaStreamUpdateCaptureDependencies_v2 cudaStreamUpdateCaptureDependencies
|
||||
#endif
|
||||
|
||||
// Tracks the captured work a given graph captured identified by its graph id.
|
||||
struct ncclStrongStreamCapture {
|
||||
struct ncclStrongStreamCapture* next;
|
||||
@@ -206,7 +212,11 @@ ncclResult_t ncclStrongStreamAcquire(
|
||||
CUDACHECK(cudaEventRecord(scratch, graph.origin));
|
||||
CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0));
|
||||
CUDACHECK(cudaEventDestroy(scratch));
|
||||
#if CUDART_VERSION >= 13000
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(cap->captureStream, nullptr, nullptr, 0, cudaStreamSetCaptureDependencies));
|
||||
#else
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies));
|
||||
#endif
|
||||
|
||||
if (mixing && firstCapture) {
|
||||
CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream));
|
||||
@@ -266,7 +276,11 @@ ncclResult_t ncclStrongStreamRelease(
|
||||
|
||||
// Make this record order after previous record on this stream.
|
||||
if (cap->lastRecord != nullptr) {
|
||||
#if CUDART_VERSION >= 13000
|
||||
CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1));
|
||||
#else
|
||||
CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1));
|
||||
#endif
|
||||
}
|
||||
cap->lastRecord = recordNode;
|
||||
|
||||
@@ -274,7 +288,11 @@ ncclResult_t ncclStrongStreamRelease(
|
||||
cudaStreamCaptureStatus status;
|
||||
cudaGraphNode_t const* nodes;
|
||||
size_t count = 0;
|
||||
#if CUDART_VERSION >= 13000
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, nullptr, &count);
|
||||
#else
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count);
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 12030
|
||||
if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
|
||||
@@ -290,7 +308,11 @@ ncclResult_t ncclStrongStreamRelease(
|
||||
else {
|
||||
CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
|
||||
for (int i=0; i < (int)count; i++) {
|
||||
#if CUDART_VERSION >= 13000
|
||||
CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, nullptr, 1));
|
||||
#else
|
||||
CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -321,7 +343,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu
|
||||
cudaStreamCaptureStatus status;
|
||||
cudaGraphNode_t const* nodes;
|
||||
size_t count = 0;
|
||||
#if CUDART_VERSION >= 13000
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, nullptr, &count);
|
||||
#else
|
||||
cudaError_t res = hipStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count);
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 12030
|
||||
if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations.
|
||||
@@ -334,7 +360,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu
|
||||
#endif
|
||||
else {
|
||||
CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/);
|
||||
#if CUDART_VERSION >= 13000
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, nullptr, count, cudaStreamSetCaptureDependencies));
|
||||
#else
|
||||
CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies));
|
||||
#endif
|
||||
}
|
||||
|
||||
CUDACHECK(cudaStreamDestroy(tmp));
|
||||
|
||||
+9
-2
@@ -11,6 +11,7 @@
|
||||
|
||||
// Determine if MNNVL support is available
|
||||
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
|
||||
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
|
||||
// MNNVL requires cuMem to be enabled
|
||||
if (!ncclCuMemEnable()) return ncclSuccess;
|
||||
|
||||
@@ -58,7 +59,12 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
|
||||
|
||||
// Allocate FABRIC handle compatible memory
|
||||
ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN);
|
||||
if (ret != ncclSuccess) return ncclSuccess;
|
||||
if (ret != ncclSuccess) {
|
||||
// Return an error if this is a MNNVL capable system but FABRIC handles are not supported
|
||||
WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX channel configuration (/dev/nvidia-caps-imex-channels). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.",
|
||||
comm->clique.size);
|
||||
return ncclSystemError;
|
||||
}
|
||||
err = cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
|
||||
if (err != CUDA_SUCCESS ||
|
||||
(err = cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC)) != CUDA_SUCCESS) {
|
||||
@@ -66,7 +72,7 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
|
||||
(void) cuGetErrorString(err, &errStr);
|
||||
NCCLCHECK(ncclCuMemFree(ptr));
|
||||
// Return an error if this is a MNNVL capable system but it's not working
|
||||
WARN("MNNVL (cliqueSize %d) is available but not supported on this system. Check the IMEX configuration.",
|
||||
WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX configuration (nvidia-imex-ctl -N). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.",
|
||||
comm->clique.size);
|
||||
return ncclSystemError;
|
||||
}
|
||||
@@ -78,5 +84,6 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
|
||||
INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d",
|
||||
comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+53
-2
@@ -34,6 +34,7 @@ extern "C" {
|
||||
/*! @brief Opaque handle to communicator
|
||||
@details A communicator contains information required to facilitate collective communications calls */
|
||||
typedef struct ncclComm* ncclComm_t;
|
||||
typedef struct ncclWindow* ncclWindow_t;
|
||||
#define NCCL_COMM_NULL NULL
|
||||
|
||||
#define NCCL_UNIQUE_ID_BYTES 128
|
||||
@@ -65,13 +66,25 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} nccl
|
||||
#define NCCL_SPLIT_NOCOLOR -1
|
||||
#define NCCL_UNDEF_FLOAT -1.0f
|
||||
|
||||
/* Window Registration flags */
|
||||
#define NCCL_WIN_DEFAULT 0x00
|
||||
#define NCCL_WIN_COLL_SYMMETRIC 0x01
|
||||
|
||||
/* NCCL performance policy */
|
||||
#define NCCL_CTA_POLICY_DEFAULT 0x00
|
||||
#define NCCL_CTA_POLICY_EFFICIENCY 0x01
|
||||
|
||||
/* ncclCommShrink flags*/
|
||||
#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
|
||||
#define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
|
||||
|
||||
/*! @defgroup rccl_config_type Communicator Configuration
|
||||
@details Structure that allows for customizing Communicator behavior via ncclCommInitRankConfig
|
||||
@{ */
|
||||
|
||||
/*! @brief Communicator configuration
|
||||
@details Users can assign value to attributes to specify the behavior of a communicator */
|
||||
typedef struct ncclConfig_v21700 {
|
||||
typedef struct ncclConfig_v22700 {
|
||||
/* attributes that users should never touch. */
|
||||
size_t size; /*!< Should not be touched */
|
||||
unsigned int magic; /*!< Should not be touched */
|
||||
@@ -84,6 +97,11 @@ typedef struct ncclConfig_v21700 {
|
||||
const char *netName; /*!< Force NCCL to use a specific network */
|
||||
int splitShare; /*!< Allow communicators to share resources */
|
||||
int trafficClass; /*!< Traffic class*/
|
||||
const char *commName; /*!< Name of the communicator*/
|
||||
int collnetEnable; /*!< Check for collnet enablement*/
|
||||
int CTAPolicy; /*!< CTA Policy*/
|
||||
int shrinkShare; /*!< Shrink size*/
|
||||
int nvlsCTAs; /*!< Number of NVLS cooperative thread arrays (blocks)*/
|
||||
} ncclConfig_t;
|
||||
|
||||
/* Config initializer must be assigned to initialize config structure when it is created.
|
||||
@@ -99,6 +117,11 @@ typedef struct ncclConfig_v21700 {
|
||||
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
|
||||
NCCL_CONFIG_UNDEF_PTR, /* commName */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
|
||||
NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
|
||||
}
|
||||
/*! @} */
|
||||
|
||||
@@ -270,7 +293,23 @@ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *new
|
||||
/*! @endcond */
|
||||
/*! @} */
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
|
||||
/*! @brief Shrink existing communicator.
|
||||
@details Ranks in excludeRanksList will be removed from the existing communicator.
|
||||
Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
|
||||
If config is NULL, the new communicator will inherit the original communicator's configuration.
|
||||
The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.
|
||||
@return Result code. See @ref rccl_result_code for more details.
|
||||
|
||||
@param[in] comm Original communicator object for this rank
|
||||
@param[in] excludeRanksList List of ranks to be excluded
|
||||
@param[in] excludeRanksCount Number of ranks to be excluded
|
||||
@param[out] newcomm Pointer to new communicator
|
||||
@param[in] config Config file for new communicator. May be NULL to inherit from comm
|
||||
@param[in] shrinkFlags Flag to adapt to various states of the parent communicator (see NCCL_SHRINK flags)*/
|
||||
ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
|
||||
ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
|
||||
|
||||
/*! @brief Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
|
||||
@details Allows to use more than one ncclUniqueId (up to one per rank),
|
||||
indicated by nId, to accelerate the init operation.
|
||||
The number of ncclUniqueIds and their order must be the same for every rank.
|
||||
@@ -376,6 +415,18 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
|
||||
/*! @endcond */
|
||||
|
||||
/* Register memory window */
|
||||
ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
/*! @endcond */
|
||||
|
||||
/* Deregister symmetric memory */
|
||||
ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
|
||||
/*! @cond include_hidden */
|
||||
ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
|
||||
/*! @endcond */
|
||||
|
||||
/*! @defgroup rccl_api_enumerations API Enumerations
|
||||
@details Enumerations used by collective communication calls
|
||||
@{ */
|
||||
|
||||
+219
-171
@@ -8,6 +8,7 @@
|
||||
#include "bootstrap.h"
|
||||
#include "checks.h"
|
||||
#include "plugin.h"
|
||||
#include "nccl_net.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
@@ -15,137 +16,100 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
|
||||
typedef ncclNet_t* getNcclNet_t(void* netPluginLib);
|
||||
typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib);
|
||||
|
||||
extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
|
||||
|
||||
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
|
||||
ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
|
||||
enum ncclNetState {
|
||||
ncclNetStateInit = 0,
|
||||
ncclNetStateEnabled = 1,
|
||||
ncclNetStateDisabled = 2
|
||||
};
|
||||
enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
extern getNcclNet_t getNcclNet_v6;
|
||||
extern getNcclNet_t getNcclNet_v7;
|
||||
extern getNcclNet_t getNcclNet_v8;
|
||||
extern getNcclNet_t getNcclNet_v9;
|
||||
extern getNcclNet_t getNcclNet_v10;
|
||||
extern getNcclCollNet_t getNcclCollNet_v6;
|
||||
extern getNcclCollNet_t getNcclCollNet_v7;
|
||||
extern getNcclCollNet_t getNcclCollNet_v8;
|
||||
extern getNcclCollNet_t getNcclCollNet_v9;
|
||||
extern getNcclCollNet_t getNcclCollNet_v10;
|
||||
|
||||
NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1);
|
||||
#define NCCL_NET_VERSION_COUNT 5
|
||||
int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6};
|
||||
getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6};
|
||||
getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6};
|
||||
|
||||
#define NCCL_NET_NUM_INTERNAL_PLUGINS 2
|
||||
|
||||
typedef enum ncclNetPluginState {
|
||||
ncclNetPluginStateDisabled = -2, // Plugin library failed to initialize
|
||||
ncclNetPluginStateLoadFailed = -1, // Plugin library failed to load
|
||||
ncclNetPluginStateLoadReady = 0, // Plugin library is ready to be loaded
|
||||
ncclNetPluginStateInitReady = 1, // Plugin library is loaded and ready to be initialized
|
||||
ncclNetPluginStateEnabled = 2, // Plugin library is loaded and initialized
|
||||
} ncclNetPluginState_t;
|
||||
|
||||
#define MAX_STR_LEN 255
|
||||
typedef struct netPluginLib {
|
||||
char name[MAX_STR_LEN]; // Name of the plugin library
|
||||
void* dlHandle; // Handle to the plugin library
|
||||
ncclNet_t* ncclNet; // Pointer to the ncclNet_t structure
|
||||
int ncclNetVer; // Version of the nccl net plugin
|
||||
ncclCollNet_t* ncclCollNet; // Pointer to the ncclCollNet_t structure
|
||||
ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin
|
||||
ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin
|
||||
int ncclNetPluginRefCount; // Reference count for the nccl net plugin
|
||||
} netPluginLib_t;
|
||||
|
||||
int pluginCount = 0;
|
||||
bool netPluginLibsInitialized = false;
|
||||
netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 };
|
||||
static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static void* netPluginLib;
|
||||
static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
static int netPluginRefCount;
|
||||
static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();}
|
||||
|
||||
enum {
|
||||
netPluginLoadFailed = -1,
|
||||
netPluginLoadReady = 0,
|
||||
netPluginLoadSuccess = 1,
|
||||
};
|
||||
|
||||
static int netPluginStatus = netPluginLoadReady;
|
||||
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
|
||||
static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce);
|
||||
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
if (netPluginLoadFailed == netPluginStatus) {
|
||||
goto exit;
|
||||
static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) {
|
||||
if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name);
|
||||
NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle));
|
||||
memset(pluginLib, 0, sizeof(netPluginLib_t));
|
||||
}
|
||||
if (netPluginLoadSuccess == netPluginStatus) {
|
||||
++netPluginRefCount;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN"));
|
||||
if (netPluginLib == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ncclNets[0] = getNcclNet_v10(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 10;
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v9 plugin
|
||||
ncclNets[0] = getNcclNet_v9(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 9;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v8 plugin
|
||||
ncclNets[0] = getNcclNet_v8(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 8;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v7 plugin
|
||||
ncclNets[0] = getNcclNet_v7(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 7;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v6 plugin
|
||||
ncclNets[0] = getNcclNet_v6(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 6;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v8(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v7(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v6(netPluginLib);
|
||||
}
|
||||
|
||||
++netPluginRefCount;
|
||||
netPluginStatus = netPluginLoadSuccess;
|
||||
comm->netPluginLoaded = 1;
|
||||
|
||||
exit:
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib));
|
||||
netPluginStatus = netPluginLoadFailed;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) {
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) {
|
||||
if (ncclNets[0]) {
|
||||
INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name);
|
||||
}
|
||||
if (ncclCollNets[0]) {
|
||||
INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name);
|
||||
}
|
||||
NCCLCHECK(ncclClosePluginLib(netPluginLib));
|
||||
netPluginLib = nullptr;
|
||||
ncclNets[0] = nullptr;
|
||||
ncclCollNets[0] = nullptr;
|
||||
netPluginStatus = netPluginLoadReady;
|
||||
comm->netPluginLoaded = 0;
|
||||
for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i)
|
||||
ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit;
|
||||
static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) {
|
||||
pluginLib->dlHandle = ncclOpenNetPluginLib(pluginLib->name);
|
||||
|
||||
if (pluginLib->dlHandle == nullptr) goto fail;
|
||||
// load ncclNet
|
||||
for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
|
||||
pluginLib->ncclNetVer = ncclNetVersion[i];
|
||||
pluginLib->ncclNet = getNcclNet[i](pluginLib->dlHandle);
|
||||
if (pluginLib->ncclNet) break;
|
||||
}
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
|
||||
// if we fail to find a net, exit
|
||||
if (pluginLib->ncclNet == nullptr) goto fail;
|
||||
|
||||
pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
|
||||
// load ncclCollNet
|
||||
for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
|
||||
pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle);
|
||||
if (pluginLib->ncclCollNet) break;
|
||||
}
|
||||
|
||||
if (pluginLib->ncclCollNet == nullptr)
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
else
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name);
|
||||
exit:
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
if (pluginLib->dlHandle) {
|
||||
NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle));
|
||||
}
|
||||
pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) {
|
||||
@@ -172,72 +136,156 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclNetStates[i] = ncclNetStateEnabled;
|
||||
// Runs the init/devices handshake for one plugin slot and records the outcome
// in the slot's state fields.
//
// pluginLib: slot whose NET (and optional CollNet) interface should be initialized.
//
// Always returns ncclSuccess; the result is reported through
// pluginLib->ncclNetPluginState / ncclCollNetPluginState:
//   - NET init() or devices() fails, reports no devices, or the NET table is
//     missing -> both states become ncclNetPluginStateDisabled.
//   - NET succeeds -> ncclNetPluginStateEnabled; CollNet is then initialized
//     independently and ends up Enabled or Disabled on its own.
// A slot whose NET state is not ncclNetPluginStateInitReady is left untouched,
// so the function never marks an uninitialized plugin as enabled and never
// dereferences a null ncclNet when logging.
static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) {
  int ndev = 0;

  // Only slots that are loaded and waiting for initialization are processed.
  if (pluginLib->ncclNetPluginState != ncclNetPluginStateInitReady) return ncclSuccess;

  // Short-circuit order matters: devices() is queried only after init() succeeded.
  if (pluginLib->ncclNet == nullptr ||
      pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess ||
      pluginLib->ncclNet->devices(&ndev) != ncclSuccess ||
      ndev <= 0) {
    // A NET failure disables the CollNet side of the same slot as well.
    pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled;
    pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
    return ncclSuccess;
  }

  pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled;
  INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name);

  // CollNet is optional: a failure here only disables CollNet, NET stays enabled.
  if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) {
    bool collOk = pluginLib->ncclCollNet->init(ncclDebugLog) == ncclSuccess &&
                  pluginLib->ncclCollNet->devices(&ndev) == ncclSuccess &&
                  ndev > 0;
    pluginLib->ncclCollNetPluginState = collOk ? ncclNetPluginStateEnabled : ncclNetPluginStateDisabled;
  }
  return ncclSuccess;
}
|
||||
|
||||
// Tries to bind the plugin stored at netPluginLibs[pluginIndex] to a communicator.
//
// comm:        communicator to receive the NET (and possibly CollNet) interface.
// pluginIndex: index into netPluginLibs; the slot's ncclNet is dereferenced, so
//              it must be non-null.
// isAssigned:  out-parameter, always written: true when the plugin was bound to
//              comm, false otherwise.
//
// The plugin is rejected when comm->config.netName is set and does not match the
// plugin name (case-insensitive), or when ncclNetCheckDeviceVersion() does not
// return ncclSuccess. A rejection is specific to this communicator, so the slot's
// state fields are left as they are; in particular a CollNet interface that failed
// to load or initialize is not promoted to "enabled" by a name mismatch.
//
// Always returns ncclSuccess.
static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) {
  netPluginLib_t* lib = &netPluginLibs[pluginIndex];
  const char* netName = comm->config.netName;

  *isAssigned = false;

  // Honor an explicit network selection made through the communicator config.
  if (netName && strcasecmp(netName, lib->ncclNet->name) != 0) return ncclSuccess;
  // Skip plugins whose device-side version check does not pass for this comm.
  if (ncclSuccess != ncclNetCheckDeviceVersion(comm, lib->ncclNet, 0)) return ncclSuccess;

  if (lib->ncclNetPluginState >= ncclNetPluginStateEnabled) {
    comm->ncclNet = lib->ncclNet;
    comm->ncclNetVer = lib->ncclNetVer;
    comm->netPluginIndex = pluginIndex;
    // One reference per communicator using this plugin slot.
    lib->ncclNetPluginRefCount++;
    *isAssigned = true;
    INFO(NCCL_INIT|NCCL_NET, "Assigned NET plugin %s to comm", lib->ncclNet->name);
    // CollNet is attached only when its own initialization succeeded.
    if (lib->ncclCollNetPluginState >= ncclNetPluginStateEnabled) {
      comm->ncclCollNet = lib->ncclCollNet;
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t ncclNetPluginDisableOtherExternal(int pluginIndex) {
|
||||
// Only if an external plugin is enabled, disable other external plugins
|
||||
if (pluginIndex >= (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) return ncclSuccess;
|
||||
char names[MAX_STR_LEN*(NCCL_NET_MAX_PLUGINS - NCCL_NET_NUM_INTERNAL_PLUGINS)] = { 0 };
|
||||
for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
|
||||
if (i != pluginIndex) {
|
||||
// Append all disabled plugin names to a string
|
||||
snprintf(names+strlen(names), sizeof(names)-strlen(names), (strlen(names) == 0) ? "%s" : ", %s", netPluginLibs[i].name);
|
||||
netPluginLibs[i].ncclNetPluginState = ncclNetPluginStateDisabled;
|
||||
}
|
||||
}
|
||||
if(strlen(names) > 0) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "Disabling external plugins: %s", names);
|
||||
}
|
||||
*state = ncclNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclCollNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclCollNetStates[i] = ncclNetStateEnabled;
|
||||
static void initPluginLibsOnceFunc() {
|
||||
char* netPluginName = nullptr;
|
||||
const char* defaultNetPlugin = "libnccl-net.so";
|
||||
const char* envNetPlugin = nullptr;
|
||||
char* envNetPluginList = nullptr;
|
||||
char* savePtr = nullptr;
|
||||
int pluginCounter = 0;
|
||||
|
||||
memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t));
|
||||
envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN");
|
||||
if (envNetPlugin) {
|
||||
envNetPluginList = strdup(envNetPlugin);
|
||||
// Iterate over list until the list is empty
|
||||
netPluginName = strtok_r(envNetPluginList, ",", &savePtr);
|
||||
while(netPluginName) {
|
||||
// We have 2 internal plugins (ib and socket)
|
||||
// So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list
|
||||
if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) {
|
||||
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1)));
|
||||
break;
|
||||
}
|
||||
// need to leave space for the name plus the terminating '\0'
|
||||
if((strlen(netPluginName)+1) <= MAX_STR_LEN) {
|
||||
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
|
||||
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
|
||||
strcpy(netPluginLibs[pluginCounter].name, netPluginName);
|
||||
pluginCounter++;
|
||||
} else {
|
||||
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN);
|
||||
}
|
||||
netPluginName = strtok_r(nullptr, ",", &savePtr);
|
||||
}
|
||||
if (envNetPluginList) free(envNetPluginList);
|
||||
} else {
|
||||
// Add default net plugin
|
||||
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
|
||||
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
|
||||
strcpy(netPluginLibs[pluginCounter++].name, defaultNetPlugin);
|
||||
}
|
||||
*state = ncclCollNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
|
||||
// Add 2 internal ib and socket plugins
|
||||
netPluginLibs[pluginCounter].ncclNet = &ncclNetIb;
|
||||
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
netPluginLibs[pluginCounter].ncclNet = &ncclNetSocket;
|
||||
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
pluginCount = pluginCounter;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm) {
|
||||
// Initialize main communication network
|
||||
const char* netName;
|
||||
bool ok = false;
|
||||
|
||||
netName = comm->config.netName;
|
||||
for (int i=0; i<3; i++) {
|
||||
if (ncclNets[i] == nullptr) continue;
|
||||
enum ncclNetState state;
|
||||
NCCLCHECK(netGetState(i, &state));
|
||||
if (state != ncclNetStateEnabled) continue;
|
||||
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
|
||||
if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
|
||||
// Mismatched device plugin version
|
||||
continue;
|
||||
bool ncclNetPluginInitialized = false;
|
||||
pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc);
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) {
|
||||
if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) {
|
||||
NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex]));
|
||||
}
|
||||
|
||||
comm->ncclNet = ncclNets[i];
|
||||
comm->ncclNetVer = ncclNetsVer[i];
|
||||
ok = true;
|
||||
|
||||
if (ncclCollNets[i]) {
|
||||
NCCLCHECK(collNetGetState(i, &state));
|
||||
if (state == ncclNetStateEnabled) {
|
||||
comm->ncclCollNet = ncclCollNets[i];
|
||||
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) {
|
||||
NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex]));
|
||||
}
|
||||
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) {
|
||||
bool isAssigned = false;
|
||||
NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned));
|
||||
if (isAssigned) {
|
||||
// If one external plugin is assigned to a comm, then disable all other external plugins
|
||||
ncclNetPluginDisableOtherExternal(pluginIndex);
|
||||
ncclNetPluginInitialized = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
WARN("Error: network %s not found.", netName ? netName : "");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
return ncclSuccess;
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
if (ncclNetPluginInitialized) return ncclSuccess;
|
||||
WARN("Failed to initialize any NET plugin");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
|
||||
comm->ncclNet = nullptr;
|
||||
comm->ncclCollNet = nullptr;
|
||||
int pluginIndex = comm->netPluginIndex;
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
netPluginLibs[pluginIndex].ncclNetPluginRefCount--;
|
||||
for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
|
||||
NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i]));
|
||||
}
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ enum ncclPluginType {
|
||||
static void *libHandles[NUM_LIBS];
|
||||
static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
|
||||
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "librccl-tuner", "librccl-profiler" };
|
||||
static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
|
||||
static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
|
||||
static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
|
||||
|
||||
static void* tryOpenLib(char* name, int* err, char* errStr) {
|
||||
@@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void appendNameToList(char* nameList, int *nameListLen, char* name) {
|
||||
snprintf(nameList, *nameListLen, " %s", name);
|
||||
nameList += strlen(name) + 1;
|
||||
*nameListLen -= strlen(name) + 1;
|
||||
static void appendNameToList(char* nameList, int *leftChars, char* name) {
|
||||
snprintf(nameList + PATH_MAX - *leftChars, *leftChars, " %s", name);
|
||||
*leftChars -= strlen(name) + 1;
|
||||
}
|
||||
|
||||
static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
||||
@@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
||||
char eNoEntNameList[PATH_MAX] = { 0 };
|
||||
|
||||
if (libName && strlen(libName)) {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
// match names that start with 'lib' and end with '.so'
|
||||
if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
|
||||
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
|
||||
@@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclClosePluginLib(void* handle) {
|
||||
bool found = false;
|
||||
for (int l=0; l<NUM_LIBS; l++) {
|
||||
if (libHandles[l] == handle) {
|
||||
libHandles[l] = nullptr;
|
||||
dlclose(handle);
|
||||
return ncclSuccess;
|
||||
if (!found) {
|
||||
if (handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclInternalError;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
extern ncclProfiler_t* getNcclProfiler_v1(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v2(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v3(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v4(void* lib);
|
||||
|
||||
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int profilerPluginRefCount;
|
||||
@@ -51,7 +52,10 @@ static ncclResult_t ncclProfilerPluginLoad(void) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
|
||||
ncclProfiler = getNcclProfiler_v4(profilerPluginLib);
|
||||
if (ncclProfiler == nullptr) {
|
||||
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
|
||||
}
|
||||
if (ncclProfiler == nullptr) {
|
||||
ncclProfiler = getNcclProfiler_v2(profilerPluginLib);
|
||||
}
|
||||
@@ -164,7 +168,7 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
|
||||
TIME_START_EVENT(init);
|
||||
ncclProfilerPluginLoad();
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask);
|
||||
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog);
|
||||
if (err) {
|
||||
WARN("Profiler init failed with error (%d). Continue without profiler.", err);
|
||||
ncclProfiler = NULL;
|
||||
@@ -241,8 +245,6 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.type = ncclProfileColl;
|
||||
eDescr.parentObj = plan->groupEventHandle;
|
||||
eDescr.rank = plan->comm->rank;
|
||||
eDescr.coll.name = plan->comm->commName;
|
||||
eDescr.coll.commHash = plan->comm->commHash;
|
||||
eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func];
|
||||
eDescr.coll.func = ncclFuncToString(ct->func);
|
||||
eDescr.coll.sendBuff = ct->sendbuff;
|
||||
@@ -250,7 +252,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.coll.count = ct->count;
|
||||
eDescr.coll.root = ct->root;
|
||||
eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
|
||||
eDescr.coll.nMaxChannels = ct->nMaxChannels;
|
||||
eDescr.coll.nChannels = ct->nChannels;
|
||||
eDescr.coll.nWarps = ct->nWarps;
|
||||
eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
|
||||
eDescr.coll.proto = ncclProtoToString(ct->protocol);
|
||||
@@ -266,7 +268,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
// gives the consistency.
|
||||
if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle &&
|
||||
(ct->eActivationMask & ncclProfileKernelCh)))
|
||||
plan->comm->seqNumber[ct->func]++;
|
||||
__atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED);
|
||||
ct = ct->next;
|
||||
}
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
@@ -279,13 +281,12 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.type = ncclProfileP2p;
|
||||
eDescr.parentObj = plan->groupEventHandle;
|
||||
eDescr.rank = plan->comm->rank;
|
||||
eDescr.p2p.name = plan->comm->commName;
|
||||
eDescr.p2p.commHash = plan->comm->commHash;
|
||||
eDescr.p2p.func = ncclFuncToString(pt->func);
|
||||
eDescr.p2p.buff = pt->buff;
|
||||
eDescr.p2p.count = pt->count;
|
||||
eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
|
||||
eDescr.p2p.peer = pt->root;
|
||||
eDescr.p2p.nChannels = pt->nChannels;
|
||||
ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
|
||||
}
|
||||
pt = pt->next;
|
||||
@@ -321,7 +322,7 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
|
||||
// made of sliceSteps steps rather than one step. In the profiler we are still
|
||||
// interested in whole network transfers though, so we account for this when
|
||||
// computing the actual network step number.
|
||||
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
ncclResult_t ncclProfilerStartProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
TIME_START_EVENT(proxyOpStart);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
@@ -335,29 +336,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
|
||||
eDescr.proxyOp.peer = sub->peer;
|
||||
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
|
||||
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
|
||||
eDescr.proxyOp.isSend = 1;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyOpStart);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
TIME_START_EVENT(proxyOpStart);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
|
||||
ncclProfilerEventDescr_t eDescr = { 0 };
|
||||
eDescr.type = ncclProfileProxyOp;
|
||||
eDescr.parentObj = sub->taskEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyOp.pid = sub->pid;
|
||||
eDescr.proxyOp.channelId = sub->channelId;
|
||||
eDescr.proxyOp.peer = sub->peer;
|
||||
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
|
||||
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
|
||||
eDescr.proxyOp.isSend = 0;
|
||||
eDescr.proxyOp.isSend = args->progress == ncclTransports[TRANSPORT_NET]->send.proxyProgress ? 1 : 0;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
@@ -387,7 +366,8 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar
|
||||
eDescr.parentObj = sub->opEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyStep.step = step_;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
|
||||
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStart);
|
||||
@@ -405,7 +385,8 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar
|
||||
eDescr.parentObj = sub->opEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyStep.step = step_;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
|
||||
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStart);
|
||||
@@ -417,9 +398,9 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, i
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
int step_ = DIVUP(stepId, args->sliceSteps);
|
||||
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
|
||||
ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]);
|
||||
sub->stepEventHandles[step_%NCCL_STEPS] = NULL;
|
||||
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
|
||||
ncclProfiler->stopEvent(sub->pHandles[step_%NCCL_STEPS].stepEventHandle);
|
||||
sub->pHandles[step_%NCCL_STEPS].stepEventHandle = NULL;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStop);
|
||||
@@ -453,7 +434,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start) {
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (sub->eActivationMask & ncclProfileKernelCh) {
|
||||
@@ -461,29 +442,31 @@ ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
eDescr.type = ncclProfileKernelCh;
|
||||
eDescr.parentObj = sub->taskEventHandle;
|
||||
eDescr.kernelCh.channelId = sub->channelId;
|
||||
eDescr.kernelCh.pTimer = start;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop) {
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (sub->kernelEventHandle) {
|
||||
ncclProfilerEventStateArgs_t a = { };
|
||||
a.kernelCh.pTimer = stop;
|
||||
ncclProfiler->recordEventState(sub->kernelEventHandle, ncclProfilerKernelChStop, &a);
|
||||
ncclProfiler->stopEvent(sub->kernelEventHandle);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, ncclProfilerEventState_t eState) {
|
||||
TIME_START_EVENT(proxyOpRecord);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
|
||||
ncclProfilerEventStateArgs_t a = { };
|
||||
a.proxyOp.steps = DIVUP(steps, args->sliceSteps);
|
||||
a.proxyOp.transSize = transSize;
|
||||
ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);
|
||||
}
|
||||
TIME_STOP_EVENT(proxyOpRecord);
|
||||
@@ -495,8 +478,10 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs*
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
|
||||
int step_ = DIVUP(stepId, args->sliceSteps);
|
||||
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
|
||||
ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0);
|
||||
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
|
||||
ncclProfilerEventStateArgs_t a = { };
|
||||
a.proxyStep.transSize = sub->transSize;
|
||||
ncclProfiler->recordEventState(sub->pHandles[step_%NCCL_STEPS].stepEventHandle, eState, &a);
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepRecord);
|
||||
@@ -549,18 +534,28 @@ bool ncclProfilerPluginLoaded(void) {
|
||||
|
||||
ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) {
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle;
|
||||
if (type == 0) { // start
|
||||
if (type == ncclProfilerNetEventStart) { // start
|
||||
struct ncclProxyEventHandle* p = (struct ncclProxyEventHandle*)pHandle;
|
||||
struct ncclProxySubArgs* sub = p->subArgPtr;
|
||||
if (sub->eActivationMask & ncclProfileNetPlugin) {
|
||||
ncclProfilerEventDescr_t eDescr = { 0 };
|
||||
eDescr.type = ncclProfileNetPlugin;
|
||||
eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS];
|
||||
eDescr.parentObj = p->stepEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.netPlugin.id = pluginId;
|
||||
eDescr.netPlugin.data = extData;
|
||||
ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr);
|
||||
}
|
||||
} else { // stop
|
||||
} else if (type == ncclProfilerNetEventStop) { // stop
|
||||
ncclProfiler->stopEvent(*eHandle);
|
||||
} else if (type == ncclProfilerNetEventUpdate) { // update
|
||||
ncclProfilerEventStateArgs_t args = { };
|
||||
args.netPlugin.data = extData;
|
||||
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
|
||||
} else { // update and stop
|
||||
ncclProfilerEventStateArgs_t args = { };
|
||||
args.netPlugin.data = extData;
|
||||
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
|
||||
ncclProfiler->stopEvent(*eHandle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,6 +53,7 @@ static uint8_t ncclStringToDatatype(const char* dt) {
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
|
||||
*eHandle = NULL;
|
||||
ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
|
||||
eDescr_v1.type = eDescr->type;
|
||||
eDescr_v1.parentObj = eDescr->parentObj;
|
||||
@@ -60,8 +61,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
switch(eDescr->type) {
|
||||
case ncclProfileGroup: break;
|
||||
case ncclProfileColl: {
|
||||
eDescr_v1.coll.name = eDescr->coll.name;
|
||||
eDescr_v1.coll.commHash = eDescr->coll.commHash;
|
||||
eDescr_v1.coll.name = nullptr; // removed in v4
|
||||
eDescr_v1.coll.commHash = 0; // removed in v4
|
||||
eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
|
||||
eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
|
||||
eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
|
||||
@@ -71,14 +72,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
|
||||
eDescr_v1.coll.op = 0; // removed in v2
|
||||
eDescr_v1.coll.trafficBytes = 0; // removed in v3
|
||||
eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
eDescr_v1.coll.nMaxChannels = eDescr->coll.nChannels;
|
||||
eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
|
||||
eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
|
||||
eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
|
||||
} break;
|
||||
case ncclProfileP2p: {
|
||||
eDescr_v1.p2p.name = eDescr->p2p.name;
|
||||
eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
|
||||
eDescr_v1.p2p.name = nullptr; // removed in v4
|
||||
eDescr_v1.p2p.commHash = 0; // removed in v4
|
||||
eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
|
||||
eDescr_v1.p2p.buff = eDescr->p2p.buff;
|
||||
eDescr_v1.p2p.count = eDescr->p2p.count;
|
||||
@@ -97,21 +98,34 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
|
||||
} break;
|
||||
case ncclProfileProxyCtrl: break;
|
||||
case ncclProfileKernelCh:
|
||||
case ncclProfileNetPlugin: {
|
||||
*eHandle = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
default:;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
|
||||
return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs);
|
||||
ncclProfilerEventStateArgs_v1_t args = { };
|
||||
switch (eState) {
|
||||
case ncclProfilerProxyCtrlIdle:
|
||||
case ncclProfilerProxyCtrlActive:
|
||||
case ncclProfilerProxyCtrlSleep:
|
||||
case ncclProfilerProxyCtrlWakeup:
|
||||
case ncclProfilerProxyCtrlAppend:
|
||||
case ncclProfilerProxyCtrlAppendEnd:
|
||||
args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendGPUWait:
|
||||
case ncclProfilerProxyStepSendWait:
|
||||
case ncclProfilerProxyStepRecvWait:
|
||||
case ncclProfilerProxyStepRecvFlushWait:
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
break;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v1->recordEventState(eHandle, eState, &args);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask));
|
||||
ncclProfiler.startEvent = ncclProfiler_startEvent;
|
||||
ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent;
|
||||
|
||||
@@ -20,8 +20,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
switch(eDescr->type) {
|
||||
case ncclProfileGroup: break;
|
||||
case ncclProfileColl: {
|
||||
eDescr_v2.coll.name = eDescr->coll.name;
|
||||
eDescr_v2.coll.commHash = eDescr->coll.commHash;
|
||||
eDescr_v2.coll.name = nullptr; // removed in v4
|
||||
eDescr_v2.coll.commHash = 0; // removed in v4
|
||||
eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber;
|
||||
eDescr_v2.coll.func = eDescr->coll.func;
|
||||
eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff;
|
||||
@@ -30,14 +30,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v2.coll.root = eDescr->coll.root;
|
||||
eDescr_v2.coll.datatype = eDescr->coll.datatype;
|
||||
eDescr_v2.coll.trafficBytes = 0; // removed in v3
|
||||
eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
eDescr_v2.coll.nMaxChannels = eDescr->coll.nChannels;
|
||||
eDescr_v2.coll.nWarps = eDescr->coll.nWarps;
|
||||
eDescr_v2.coll.algo = eDescr->coll.algo;
|
||||
eDescr_v2.coll.proto = eDescr->coll.proto;
|
||||
} break;
|
||||
case ncclProfileP2p: {
|
||||
eDescr_v2.p2p.name = eDescr->p2p.name;
|
||||
eDescr_v2.p2p.commHash = eDescr->p2p.commHash;
|
||||
eDescr_v2.p2p.name = nullptr; // removed in v4
|
||||
eDescr_v2.p2p.commHash = 0; // removed in v4
|
||||
eDescr_v2.p2p.func = eDescr->p2p.func;
|
||||
eDescr_v2.p2p.buff = eDescr->p2p.buff;
|
||||
eDescr_v2.p2p.count = eDescr->p2p.count;
|
||||
@@ -62,10 +62,28 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
|
||||
return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs);
|
||||
ncclProfilerEventStateArgs_v2_t args = { };
|
||||
switch (eState) {
|
||||
case ncclProfilerProxyCtrlIdle:
|
||||
case ncclProfilerProxyCtrlActive:
|
||||
case ncclProfilerProxyCtrlSleep:
|
||||
case ncclProfilerProxyCtrlWakeup:
|
||||
case ncclProfilerProxyCtrlAppend:
|
||||
case ncclProfilerProxyCtrlAppendEnd:
|
||||
args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendGPUWait:
|
||||
case ncclProfilerProxyStepSendWait:
|
||||
case ncclProfilerProxyStepRecvWait:
|
||||
case ncclProfilerProxyStepRecvFlushWait:
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
break;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v2->recordEventState(eHandle, eState, &args);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
|
||||
ncclProfiler.startEvent = ncclProfiler_startEvent;
|
||||
ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
|
||||
|
||||
@@ -6,14 +6,105 @@
|
||||
|
||||
#include "comm.h"
|
||||
#include "nccl_profiler.h"
|
||||
#include "checks.h"
|
||||
|
||||
static ncclProfiler_t ncclProfiler;
|
||||
static ncclProfiler_v3_t* ncclProfiler_v3;
|
||||
|
||||
// Adapter that lets a v3 profiler plugin receive events described with the
// current (newer) event descriptor layout.
//
// context  - opaque plugin context, forwarded unchanged to the plugin.
// eHandle  - out: event handle; reset to nullptr before anything else so that
//            callers see a null handle whenever the event is not forwarded.
// eDescr   - descriptor in the current layout; copied field by field into a
//            zero-initialized v3 descriptor.
//
// Returns the plugin's startEvent result, or ncclSuccess (without calling the
// plugin) when the event type is not one of the types handled below.
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = nullptr;

  ncclProfilerEventDescr_v3_t legacy = { };
  legacy.rank = eDescr->rank;
  legacy.parentObj = eDescr->parentObj;
  legacy.type = eDescr->type;

  switch (eDescr->type) {
    // These event types carry no type-specific payload.
    case ncclProfileGroup:
    case ncclProfileProxyCtrl:
      break;

    case ncclProfileColl: {
      const auto& src = eDescr->coll;
      auto& dst = legacy.coll;
      dst.name = nullptr; // removed in v4
      dst.commHash = 0; // removed in v4
      dst.seqNumber = src.seqNumber;
      dst.func = src.func;
      dst.sendBuff = src.sendBuff;
      dst.recvBuff = src.recvBuff;
      dst.count = src.count;
      dst.root = src.root;
      dst.datatype = src.datatype;
      // The v3 field is called nMaxChannels; it is fed from the current nChannels.
      dst.nMaxChannels = src.nChannels;
      dst.nWarps = src.nWarps;
      dst.algo = src.algo;
      dst.proto = src.proto;
      break;
    }

    case ncclProfileP2p: {
      const auto& src = eDescr->p2p;
      auto& dst = legacy.p2p;
      dst.name = nullptr; // removed in v4
      dst.commHash = 0; // removed in v4
      dst.func = src.func;
      dst.buff = src.buff;
      dst.count = src.count;
      dst.datatype = src.datatype;
      dst.peer = src.peer;
      break;
    }

    case ncclProfileProxyOp: {
      const auto& src = eDescr->proxyOp;
      auto& dst = legacy.proxyOp;
      dst.pid = src.pid;
      dst.channelId = src.channelId;
      dst.peer = src.peer;
      dst.nSteps = src.nSteps;
      dst.chunkSize = src.chunkSize;
      dst.isSend = src.isSend;
      break;
    }

    case ncclProfileProxyStep:
      legacy.proxyStep.step = eDescr->proxyStep.step;
      break;

    case ncclProfileKernelCh:
      // Only the channel id is copied for kernel channel events.
      legacy.kernelCh.channelId = eDescr->kernelCh.channelId;
      break;

    case ncclProfileNetPlugin:
      legacy.netPlugin.id = eDescr->netPlugin.id;
      legacy.netPlugin.data = eDescr->netPlugin.data;
      break;

    default:
      // Any other event type is silently dropped; *eHandle stays nullptr.
      return ncclSuccess;
  }

  return ncclProfiler_v3->startEvent(context, eHandle, &legacy);
}
|
||||
|
||||
// Adapter that forwards an event state transition to a v3 profiler plugin.
//
// eHandle    - event handle previously produced by startEvent.
// eState     - state being recorded.
// eStateArgs - state arguments in the current layout; only dereferenced for
//              the proxy-control states.
//
// Proxy-control states forward appendedProxyOps; the listed proxy-step wait
// states forward zero-initialized arguments. Every other state returns
// ncclSuccess without calling the plugin (presumably because a v3 plugin does
// not know those states -- confirm against the v3 plugin header).
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  bool forwardCtrlArgs = false;

  switch (eState) {
    case ncclProfilerProxyCtrlIdle:
    case ncclProfilerProxyCtrlActive:
    case ncclProfilerProxyCtrlSleep:
    case ncclProfilerProxyCtrlWakeup:
    case ncclProfilerProxyCtrlAppend:
    case ncclProfilerProxyCtrlAppendEnd:
      forwardCtrlArgs = true;
      break;

    case ncclProfilerProxyStepSendGPUWait:
    case ncclProfilerProxyStepSendWait:
    case ncclProfilerProxyStepRecvWait:
    case ncclProfilerProxyStepRecvFlushWait:
    case ncclProfilerProxyStepRecvGPUWait:
      // Forwarded with empty arguments.
      break;

    default:
      return ncclSuccess;
  }

  ncclProfilerEventStateArgs_v3_t legacyArgs = { };
  if (forwardCtrlArgs) {
    legacyArgs.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
  }
  return ncclProfiler_v3->recordEventState(eHandle, eState, &legacyArgs);
}
|
||||
|
||||
// init entry point exposed through the wrapper `ncclProfiler` table for a v3
// plugin.
//
// Only `context` and `eActivationMask` are passed to the plugin's init; the
// remaining parameters (commName, commHash, nNodes, nranks, rank, logfn) are
// accepted to match the current init signature and are not used here.
//
// After the plugin's init call, the remaining wrapper entry points are filled
// in: startEvent and recordEventState go through the translating adapters,
// while stopEvent and finalize point straight at the plugin's functions.
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  // NOTE(review): NCCLCHECK presumably returns early on a non-success result -- confirm in checks.h.
  NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask));

  // Entry points that need descriptor/argument translation.
  ncclProfiler.startEvent = ncclProfiler_startEvent;
  ncclProfiler.recordEventState = ncclProfiler_recordEventState;

  // Entry points that are assigned directly from the plugin.
  ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent;
  ncclProfiler.finalize = ncclProfiler_v3->finalize;

  return ncclSuccess;
}
|
||||
|
||||
ncclProfiler_t* getNcclProfiler_v3(void* lib) {
|
||||
ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
|
||||
if (ncclProfiler_v3) {
|
||||
ncclProfiler.name = ncclProfiler_v3->name;
|
||||
ncclProfiler.init = ncclProfiler_init;
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
|
||||
return ncclProfiler_v3;
|
||||
return &ncclProfiler;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
|
||||
return NULL;
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "nccl_profiler.h"
|
||||
#include "checks.h"
|
||||
|
||||
static ncclProfiler_v4_t* ncclProfiler_v4;
|
||||
|
||||
// Looks up the "ncclProfiler_v4" symbol in an already-opened plugin library.
//
// lib - library handle passed to dlsym.
//
// Returns the plugin's v4 profiler table when the symbol is found (it is
// returned directly, with no adapter), or NULL when the symbol is missing.
// The result of the lookup is also stored in the file-static ncclProfiler_v4.
ncclProfiler_t* getNcclProfiler_v4(void* lib) {
  ncclProfiler_v4 = static_cast<ncclProfiler_v4_t*>(dlsym(lib, "ncclProfiler_v4"));

  if (ncclProfiler_v4 == NULL) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4");
    return NULL;
  }

  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name);
  return ncclProfiler_v4;
}
|
||||
+36
-10
@@ -437,6 +437,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
args->state = ncclProxyOpReady;
|
||||
args->progress = op->connection->tcomm->proxyProgress;
|
||||
args->proxyAppendPtr = op->connection->proxyAppendPtr;
|
||||
if (args->pattern != ncclPatternProfiler) ncclProfilerStartProxyOpEvent(subIndex, args);
|
||||
args->send = op->connection->send;
|
||||
args->prevRank = op->prevRank;
|
||||
args->nextRank = op->nextRank;
|
||||
@@ -668,10 +669,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
const int rank = comm->rank, nranks = comm->nRanks;
|
||||
int *nstepsSend = NULL, *nstepsRecv = NULL;
|
||||
PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
|
||||
struct ncclPatStep ps = {0};
|
||||
NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up);
|
||||
NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up);
|
||||
|
||||
struct ncclPatStep ps;
|
||||
do {
|
||||
algo.getNextOp(&ps);
|
||||
if (ps.flags & PatSkipped) continue;
|
||||
@@ -702,10 +703,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
const int rank = comm->rank, nranks = comm->nRanks;
|
||||
int *nstepsSend = NULL, *nstepsRecv = NULL;
|
||||
PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks);
|
||||
struct ncclPatStep ps = {0};
|
||||
NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down);
|
||||
NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down);
|
||||
|
||||
struct ncclPatStep ps;
|
||||
do {
|
||||
algo.getNextOp(&ps);
|
||||
if (ps.flags & PatSkipped) continue;
|
||||
@@ -970,11 +971,13 @@ void* ncclProxyProgress(void *proxyState_) {
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
|
||||
break;
|
||||
}
|
||||
void* eHandle;
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
|
||||
if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
|
||||
ncclProfilerStopProxyCtrlEvent(eHandle);
|
||||
if ((lastIdle == 0 && idle == 1) || (lastIdle == 1 && idle == 0)) {
|
||||
void* eHandle;
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
|
||||
if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
|
||||
ncclProfilerStopProxyCtrlEvent(eHandle);
|
||||
}
|
||||
if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
|
||||
int added = 0;
|
||||
proxyOpAppendCounter = 0;
|
||||
@@ -1226,12 +1229,17 @@ error:
|
||||
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
uint64_t hipHandleVal = (uint64_t)(uintptr_t)(*(hipMemGenericAllocationHandle_t*)handle);
|
||||
|
||||
// Request the allocation of a UDS fd for the handle
|
||||
if (comm->gproxyConn[proxyRank].initialized == false) {
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error);
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, (void*)&hipHandleVal, sizeof(hipHandleVal), NULL, 0, NULL, convertedFd), ret, error);
|
||||
#else
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error);
|
||||
#endif
|
||||
|
||||
// We have now received the converted fd over UDS
|
||||
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess);
|
||||
@@ -1443,7 +1451,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
}
|
||||
|
||||
static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
#if ROCM_VERSION >= 70000
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
uint64_t hash = (uint64_t) opId;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
@@ -1459,8 +1467,14 @@ exit:
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId,
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
hipMemGenericAllocationHandle_t handle
|
||||
#else
|
||||
uint64_t handle
|
||||
#endif
|
||||
) {
|
||||
#if ROCM_VERSION >= 70000
|
||||
// cuMem API support
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
@@ -1747,6 +1761,14 @@ void* ncclProxyService(void* _args) {
|
||||
pollfds[s].fd = -1;
|
||||
npeers--;
|
||||
}
|
||||
|
||||
// Close any lingering connections after the stop condition is set
|
||||
if (stop != PROXY_RUNNING && pollfds[s].fd != -1) {
|
||||
INFO(NCCL_PROXY, "[Proxy Service %d] Force closing peer=%d fd: %d", proxyState->tpRank, s, pollfds[s].fd);
|
||||
(void)ncclSocketClose(sock);
|
||||
pollfds[s].fd = -1;
|
||||
npeers--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1776,7 +1798,11 @@ static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd
|
||||
// cuMem API support for non-UB case, and rmtFd is not used since UDS proxy thread need to export
|
||||
// fd from handle and send it back to the main thread to import the buffer. We just need to close
|
||||
// this dummy rmtFd.
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
hipMemGenericAllocationHandle_t handle = (hipMemGenericAllocationHandle_t)(uintptr_t)(*(uint64_t*)hdr.data);
|
||||
#else
|
||||
uint64_t handle = *(uint64_t*)hdr.data;
|
||||
#endif
|
||||
INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
|
||||
close(rmtFd);
|
||||
return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
|
||||
|
||||
@@ -606,6 +606,10 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL
|
||||
for (int commIdx = 0; commIdx < nNcclComms; commIdx++) {
|
||||
if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting.
|
||||
break;
|
||||
if (!__atomic_load_n(&ncclComms[commIdx]->peerInfoValid, __ATOMIC_ACQUIRE)) {
|
||||
// Critical data is not yet initialized -- ignore the communicator.
|
||||
continue;
|
||||
}
|
||||
// A process may manage multiple GPUs and thus have multiple communicators with the same commHash.
|
||||
// Comparing just the commHash is OK though within communicators that are part of the same process.
|
||||
if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) {
|
||||
@@ -651,6 +655,8 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL
|
||||
// collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms.
|
||||
for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) {
|
||||
struct ncclComm* ncclComm = ncclComms[commIdx];
|
||||
if (!__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE))
|
||||
continue;
|
||||
|
||||
comm->commId.commHash = ncclComm->commHash;
|
||||
comm->commId.hostHash = ncclComm->peerInfo->hostHash;
|
||||
@@ -663,15 +669,15 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL
|
||||
commIdx++) {
|
||||
ncclComm = ncclComms[commIdx];
|
||||
struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks;
|
||||
ncclResult_t asyncError;
|
||||
rank->commRank = ncclComm->rank;
|
||||
// rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially
|
||||
// always 0. It will increase after we send this response back to the peer we got the request from.
|
||||
rank->peerIdx = 0;
|
||||
memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts));
|
||||
rank->status.initState = ncclComm->initState;
|
||||
if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess)
|
||||
rank->status.asyncError = asyncError;
|
||||
rank->status.asyncError = __atomic_load_n(&ncclComm->asyncResult, __ATOMIC_ACQUIRE);
|
||||
if (rank->status.asyncError == ncclSuccess && ncclComm->proxyState)
|
||||
rank->status.asyncError = __atomic_load_n(&ncclComm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
|
||||
rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0);
|
||||
rank->status.destroyFlag = (ncclComm->destroyFlag != 0);
|
||||
rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0);
|
||||
@@ -680,7 +686,7 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL
|
||||
comm->nRanks++;
|
||||
} // for (commIdx)
|
||||
|
||||
if (firstNewSkipMissingIdx != -1 &&
|
||||
if (__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE) && firstNewSkipMissingIdx != -1 &&
|
||||
memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) {
|
||||
// Fill in the missingRanks array that follows the comm->ranks.
|
||||
struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks);
|
||||
|
||||
+17
-11
@@ -365,15 +365,16 @@ ncclResult_t rasNetAcceptNewSocket() {
|
||||
NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail);
|
||||
|
||||
if (sock->sock.fd != -1) {
|
||||
NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
|
||||
rasPfds[sock->pfd].fd = sock->sock.fd;
|
||||
rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also
|
||||
// helps the code tell the sides apart.
|
||||
sock->status = RAS_SOCK_CONNECTING;
|
||||
if (sock->sock.fd == -1)
|
||||
goto fail; // We'll return ncclSuccess, but we need to clean up the incomplete socket first.
|
||||
|
||||
INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine));
|
||||
}
|
||||
NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
|
||||
rasPfds[sock->pfd].fd = sock->sock.fd;
|
||||
rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also
|
||||
// helps the code tell the sides apart.
|
||||
sock->status = RAS_SOCK_CONNECTING;
|
||||
|
||||
INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine));
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
@@ -480,7 +481,10 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
|
||||
// Once we get an EOF when receiving data, we finalize the termination.
|
||||
// For not fully established sockets, we can terminate immediately as there's no useful data to extract.
|
||||
void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
|
||||
assert(sock->status != RAS_SOCK_CLOSED);
|
||||
if (sock->status == RAS_SOCK_CLOSED) {
|
||||
INFO(NCCL_RAS, "RAS socket in closed state passed for termination -- internal error?");
|
||||
// The code below can actually handle such a case gracefully.
|
||||
}
|
||||
if (sock->conn) {
|
||||
struct rasConnection* conn = sock->conn;
|
||||
// If the sock of the connection points back to us, it means that we are the current socket of this
|
||||
@@ -542,8 +546,10 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet
|
||||
} else {
|
||||
// Either the caller requested finalization or we cannot receive on it.
|
||||
(void)ncclSocketClose(&sock->sock);
|
||||
rasPfds[sock->pfd].fd = -1;
|
||||
rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
|
||||
if (sock->pfd != -1) {
|
||||
rasPfds[sock->pfd].fd = -1;
|
||||
rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0;
|
||||
}
|
||||
free(sock->recvMsg);
|
||||
freeSockEntry(sock);
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "register.h"
|
||||
#include "transport.h"
|
||||
#include "enqueue.h"
|
||||
#include "register_inline.h"
|
||||
|
||||
static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
|
||||
if (conn->connected) {
|
||||
@@ -69,32 +70,34 @@ ncclResult_t ncclRegisterCollNvlsBuffers(
|
||||
|
||||
if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) {
|
||||
if (comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
if (info->func == ncclFuncAllGather) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
} else if (info->func == ncclFuncReduceScatter) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
} else if (info->func == ncclFuncAllReduce) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
}
|
||||
|
||||
if (collnetReged == 0 && ncclParamLocalRegister()) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
|
||||
if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
|
||||
if (info->func == ncclFuncAllGather) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle);
|
||||
} else if (info->func == ncclFuncReduceScatter) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
|
||||
} else if (info->func == ncclFuncAllReduce) {
|
||||
ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle);
|
||||
if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nvlsReged) {
|
||||
*regNeedConnect = 0;
|
||||
/* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */
|
||||
if (comm->nNodes == 1) {
|
||||
if (info->func == ncclFuncReduceScatter) {
|
||||
// RS: Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5));
|
||||
}
|
||||
else {
|
||||
// AR/AG: Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4));
|
||||
}
|
||||
} else {
|
||||
// Further tweaks for Blackwell with NVLS registered buffers
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6));
|
||||
}
|
||||
int recChannels;
|
||||
NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels));
|
||||
info->nMaxChannels = recChannels;
|
||||
info->regBufType |= NCCL_NVLS_REG_BUFFER;
|
||||
}
|
||||
|
||||
@@ -196,7 +199,7 @@ ncclResult_t ncclRegisterCollBuffers(
|
||||
struct ncclChannel* channel = comm->channels;
|
||||
int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0;
|
||||
void *sendHandle, *recvHandle;
|
||||
if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) {
|
||||
if (info->func != ncclFuncReduceScatter && comm->isAllDirectP2p) {
|
||||
for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
|
||||
for (int down = 0; down < 2; ++down) {
|
||||
int peer = down ? channel->collnetDirect.down[r] : channel->collnetDirect.up[r];
|
||||
@@ -316,7 +319,7 @@ ncclResult_t ncclRegisterCollBuffers(
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nPeers > 0 && comm->intraNodeP2pSupport) {
|
||||
if (nPeers > 0 && comm->isAllDirectP2p) {
|
||||
if (comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
@@ -373,7 +376,7 @@ ncclResult_t ncclRegisterCollBuffers(
|
||||
void *sendHandle, *recvHandle;
|
||||
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
|
||||
if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit;
|
||||
if (comm->intraNodeP2pSupport) {
|
||||
if (comm->isAllDirectP2p) {
|
||||
for (int c = 0; c < comm->nChannels; ++c) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
struct ncclTree* tree = NULL;
|
||||
|
||||
+119
-21
@@ -10,6 +10,7 @@
|
||||
#include "net.h"
|
||||
#include "register.h"
|
||||
#include "transport.h"
|
||||
#include "group.h"
|
||||
#include "api_trace.h"
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
#include "mscclpp/mscclpp_nccl.h"
|
||||
@@ -17,23 +18,19 @@
|
||||
|
||||
using namespace rccl;
|
||||
|
||||
ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) {
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
uintptr_t pageSize = cache->pageSize;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
*reg = NULL;
|
||||
for (int slot=0; /*true*/; slot++) {
|
||||
if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess;
|
||||
if ((addr >= cache->slots[slot]->addr) &&
|
||||
((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
|
||||
*reg = cache->slots[slot];
|
||||
return ncclSuccess;
|
||||
// Search the communicator's registration cache for the record whose
// baseSymPtr equals `baseSymPtr`.
//   comm       - communicator whose regCache is scanned.
//   baseSymPtr - symmetric base pointer to look for.
//   handle     - out: the first matching cache slot, or NULL if none matches.
// Always returns ncclSuccess; a miss is reported through *handle == NULL.
static ncclResult_t regFindHandleFromSymAddr(struct ncclComm* comm, void* baseSymPtr, struct ncclReg** handle) {
  struct ncclRegCache* regCache = &comm->regCache;
  struct ncclReg* found = NULL;
  int idx = 0;
  // Linear scan; stop at the first slot that matches.
  while (found == NULL && idx < regCache->population) {
    struct ncclReg* candidate = regCache->slots[idx];
    if (candidate->baseSymPtr == baseSymPtr) found = candidate;
    idx++;
  }
  *handle = found;
  return ncclSuccess;
}
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) {
|
||||
if (reg && isValid) {
|
||||
@@ -49,14 +46,14 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i
|
||||
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
uintptr_t pageSize = cache->pageSize;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
uintptr_t begAddr = (uintptr_t)data & -pageSize;
|
||||
uintptr_t endAddr = ((uintptr_t)data + size + pageSize-1) & -pageSize;
|
||||
|
||||
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister"));
|
||||
INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
|
||||
|
||||
for (int slot=0; /*true*/; slot++) {
|
||||
if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) {
|
||||
if ((slot == cache->population) || (begAddr < cache->slots[slot]->begAddr)) {
|
||||
if (cache->population == cache->capacity) { // must grow cache
|
||||
cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
|
||||
NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity));
|
||||
@@ -64,15 +61,15 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i
|
||||
memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*));
|
||||
NCCLCHECK(ncclCalloc(cache->slots+slot, 1));
|
||||
struct ncclReg* regSlot = cache->slots[slot];
|
||||
regSlot->addr = addr;
|
||||
regSlot->pages = pages;
|
||||
regSlot->begAddr = begAddr;
|
||||
regSlot->endAddr = endAddr;
|
||||
if (isGraph) regSlot->graphRefs = 1;
|
||||
else regSlot->localRefs = 1;
|
||||
cache->population += 1;
|
||||
*handle = regSlot;
|
||||
goto exit;
|
||||
} else if ((addr >= cache->slots[slot]->addr) &&
|
||||
((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) {
|
||||
} else if ((cache->slots[slot]->begAddr <= begAddr) &&
|
||||
(cache->slots[slot]->endAddr >= endAddr)) {
|
||||
if (isGraph) cache->slots[slot]->graphRefs++;
|
||||
else cache->slots[slot]->localRefs++;
|
||||
*handle = cache->slots[slot];
|
||||
@@ -126,7 +123,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
for (int i = 0; i < cache->population; i++) {
|
||||
struct ncclReg* reg = cache->slots[i];
|
||||
INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages);
|
||||
INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->begAddr, (reg->endAddr-reg->begAddr)/cache->pageSize);
|
||||
NCCLCHECK(regCleanup(comm, reg));
|
||||
free(reg);
|
||||
}
|
||||
@@ -217,3 +214,104 @@ ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *hand
|
||||
NCCLCHECK(commDeregister(comm, true, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Map one allocation at the current head of the communicator's symmetric
// region and record the resulting mapping in `regHandle`.
//   comm      - communicator; comm->symAllocHead is aligned and, on success,
//               advanced by baseSize.
//   buff      - not read by this function.
//   baseSize  - size passed to the map calls and stored in regHandle->symSize.
//   alignment - alignment applied to comm->symAllocHead before mapping
//               (presumably a round-up performed by ALIGN_SIZE -- confirm
//               against the macro definition).
//   memHandle - allocation handle forwarded to ncclIpcSymmetricMap.
//   regHandle - registration record updated with baseSymPtr / symSize.
// Returns ncclSuccess, or the first error reported by a mapping or barrier
// call. On failure regHandle->baseSymPtr is NULL and regHandle->symSize is 0.
ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle) {
  ncclResult_t ret = ncclSuccess;
  void* regSymAddr = NULL;
  ALIGN_SIZE(comm->symAllocHead, alignment);
  // The extracted source had "&regSymAddr" mangled into a (R) sign; the call
  // needs the address of the local so it can return the mapped pointer.
  NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, baseSize, memHandle, &regSymAddr), ret, fail);
  NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, baseSize, regSymAddr), ret, fail);
  // Barrier across the local ranks before the head offset is advanced.
  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
  comm->symAllocHead += baseSize;
  regHandle->baseSymPtr = regSymAddr;
  regHandle->symSize = baseSize;
exit:
  return ret;
fail:
  // Leave the record in the "not symmetrically registered" state.
  regHandle->baseSymPtr = NULL;
  regHandle->symSize = 0;
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
|
||||
// Register `buff` as a window on `comm` and, when the communicator reports
// symmetric support, queue a symmetric-registration task for the current group.
//   comm     - communicator to register with.
//   buff     - pointer inside an allocation; the whole containing allocation
//              (as reported by cuMemGetAddressRange) is what gets registered.
//   size     - number of bytes of `buff` the caller intends to use.
//   win      - out: newly allocated window, or NULL when nothing was registered.
//   winFlags - stored on the registration record.
// Returns ncclInvalidArgument if `win` is NULL; otherwise the result of
// ncclGroupEndInternal(). NOTE(review): when local registration or cuMem is
// disabled, or when the alignment/size check fails, the function leaves
// *win == NULL and `ret` is still ncclSuccess going into the group end --
// this looks like a deliberate "no window" outcome; confirm against the API
// contract before turning it into an error.
ncclResult_t ncclCommWindowRegister_impl(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags) {
  ncclResult_t ret = ncclSuccess;
  CUmemGenericAllocationHandle memHandle;
  size_t baseSize;
  void* baseAddr = NULL;
  struct ncclReg* regHandle = NULL;
  int saveDev;

  // Reject a NULL output pointer up front: it is dereferenced immediately
  // below and again on the failure path.
  if (win == NULL) {
    WARN("ncclCommWindowRegister : win argument is NULL");
    return ncclInvalidArgument;
  }
  *win = NULL;

  CUDACHECK(cudaGetDevice(&saveDev));
  NCCLCHECK(ncclGroupStartInternal());
  if (!ncclParamLocalRegister() || !ncclCuMemEnable()) {
    goto exit;
  }

  NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail);

  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
  if (comm && buff && size) {
    size_t alignment = 0;
    CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)buff), ret, fail);
    // size and alignment check
    if (!((uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0 && baseSize % NCCL_REC_PAGE_SIZE == 0 && (uintptr_t)buff + size <= (uintptr_t)baseAddr + baseSize)) {
      // baseSize is a size_t, so it is printed with %zu.
      WARN("buffer %p (baseAddr %p align %d) size %zu (baseSize %zu align %d) does not satisfy symmetric registration requirements", buff, baseAddr, (uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0, size, baseSize, baseSize % NCCL_REC_PAGE_SIZE == 0);
      goto fail;
    }
    // The extracted source had "&regHandle" mangled into a (R) sign.
    NCCLCHECKGOTO(ncclRegister(comm, baseAddr, baseSize, false, (void**)&regHandle), ret, fail);
    NCCLCHECKGOTO(ncclCalloc(win, 1), ret, fail);
    (*win)->handle = regHandle;
    regHandle->winFlags = winFlags;
    if (regHandle->baseSymPtr == NULL && comm->symmetricSupport) {
      struct ncclSymRegTask* task;
      // The handle is retained only to obtain its value and released right
      // away; presumably the allocation itself keeps it alive -- TODO confirm.
      CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, baseAddr), ret, fail);
      CUCHECKGOTO(cuMemRelease(memHandle), ret, fail);
      // Allocations of at least 72 recommended pages use the maximum page
      // size as alignment, smaller ones the recommended page size.
      alignment = baseSize >= NCCL_REC_PAGE_SIZE * 72L ? NCCL_MAX_PAGE_SIZE : NCCL_REC_PAGE_SIZE;
      NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail);
      task->buff = buff;
      task->baseSize = baseSize;
      task->memHandle = memHandle;
      task->regHandle = regHandle;
      task->alignment = alignment;
      ncclIntruQueueEnqueue(&comm->symRegTaskQueue, task);
      ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister);
    }
  }

exit:
  ncclGroupErrCheck(ret);
  // Always restore the caller's device, even when closing the group reports
  // an error (the previous NCCLCHECK returned before the restore).
  ret = ncclGroupEndInternal();
  cudaSetDevice(saveDev);
  return ret;
fail:
  free(*win);
  *win = NULL;
  goto exit;
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win);
|
||||
// Tear down a window created by ncclCommWindowRegister.
//   comm - communicator the window was registered on.
//   win  - window to release; NULL is accepted and treated as a no-op.
// When the window has a registration record and both local registration and
// cuMem are enabled, any symmetric mapping is freed first and the record is
// then deregistered. The window object itself is freed only if every step
// succeeded. The device that was current on entry is restored before return.
ncclResult_t ncclCommWindowDeregister_impl(ncclComm_t comm, ncclWindow_t win) {
  ncclResult_t ret = ncclSuccess;
  int saveDev;
  CUDACHECK(cudaGetDevice(&saveDev));

  if (win != NULL) {
    struct ncclReg* reg = win->handle;
    if (reg != NULL && ncclParamLocalRegister() && ncclCuMemEnable()) {
      if (reg->baseSymPtr != NULL) {
        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, exit);
        NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, reg->symSize, reg->baseSymPtr), ret, exit);
        NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, reg->symSize, reg->baseSymPtr), ret, exit);
      }
      NCCLCHECKGOTO(commDeregister(comm, false, reg), ret, exit);
    }
    free(win);
  }

exit:
  CUDACHECK(cudaSetDevice(saveDev));
  return ret;
}
|
||||
|
||||
@@ -0,0 +1,296 @@
|
||||
#include "symmetric.h"
|
||||
#include "comm.h"
|
||||
#include "device.h"
|
||||
#include <cmath>
|
||||
|
||||
constexpr char const* kernelName[] = {
|
||||
// Must align with enum ncclSymKernelId definition in src/include/symmetric.h
|
||||
"AllReduce_AGxLL_R",
|
||||
"AllReduce_AGxLLMC_R",
|
||||
"AllReduce_RSxLD_AGxST",
|
||||
"AllReduce_RSxLDMC_AGxSTMC",
|
||||
"AllGather_LL",
|
||||
"AllGather_LLMC",
|
||||
"AllGather_ST",
|
||||
"AllGather_STMC",
|
||||
"ReduceScatter_LL",
|
||||
"ReduceScatter_LD",
|
||||
"ReduceScatter_LDMC"
|
||||
};
|
||||
|
||||
constexpr uint32_t kernelMask_STMC = 1<<ncclSymKernelId_AllGather_LLMC |
|
||||
1<<ncclSymKernelId_AllGather_STMC |
|
||||
1<<ncclSymKernelId_AllReduce_AGxLLMC_R |
|
||||
1<<ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC |
|
||||
1<<ncclSymKernelId_ReduceScatter_LDMC;
|
||||
|
||||
constexpr uint32_t kernelMask_LDMC = 1<<ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC |
|
||||
1<<ncclSymKernelId_ReduceScatter_LDMC;
|
||||
|
||||
constexpr uint32_t kernelMask_LL = 1<<ncclSymKernelId_AllReduce_AGxLL_R |
|
||||
1<<ncclSymKernelId_AllReduce_AGxLLMC_R |
|
||||
1<<ncclSymKernelId_AllGather_LL |
|
||||
1<<ncclSymKernelId_AllGather_LLMC |
|
||||
1<<ncclSymKernelId_ReduceScatter_LL;
|
||||
|
||||
constexpr uint32_t kernelMask_AG = 1<<ncclSymKernelId_AllGather_LL |
|
||||
1<<ncclSymKernelId_AllGather_LLMC |
|
||||
1<<ncclSymKernelId_AllGather_ST |
|
||||
1<<ncclSymKernelId_AllGather_STMC;
|
||||
|
||||
constexpr uint32_t kernelMask_AR = 1<<ncclSymKernelId_AllReduce_AGxLLMC_R |
|
||||
1<<ncclSymKernelId_AllReduce_AGxLL_R |
|
||||
1<<ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC |
|
||||
1<<ncclSymKernelId_AllReduce_RSxLD_AGxST;
|
||||
|
||||
constexpr uint32_t kernelMask_RS = 1<<ncclSymKernelId_ReduceScatter_LD |
|
||||
1<<ncclSymKernelId_ReduceScatter_LDMC |
|
||||
1<<ncclSymKernelId_ReduceScatter_LL;
|
||||
|
||||
// Return the set of symmetric kernels that implement collective `coll`.
// Collectives other than AllGather, AllReduce and ReduceScatter yield 0.
static uint32_t kernelMask_coll(ncclFunc_t coll) {
  if (coll == ncclFuncAllGather) return kernelMask_AG;
  if (coll == ncclFuncAllReduce) return kernelMask_AR;
  if (coll == ncclFuncReduceScatter) return kernelMask_RS;
  return 0;
}
|
||||
|
||||
static uint32_t kernelMask_user() {
|
||||
static uint32_t cache = -1u;
|
||||
uint32_t got = __atomic_load_n(&cache, __ATOMIC_RELAXED);
|
||||
if (got == -1u) {
|
||||
// TODO: Enhance this to be a pattern match. I like regex's but we also have
|
||||
// the parseList() used by NCCL_ALGO/PROTO.
|
||||
char const* name = ncclGetEnv("NCCL_SYM_KERNEL");
|
||||
if (name == nullptr || strcmp(name, "^") == 0) {
|
||||
static_assert((int)ncclSymKernelId_Count < 32, "Use more than 32 bits");
|
||||
got = (1<<(int)ncclSymKernelId_Count)-1;
|
||||
} else {
|
||||
got = 0;
|
||||
for (int k=0; k < (int)ncclSymKernelId_Count; k++) {
|
||||
if (strcmp(kernelName[k], name) == 0) {
|
||||
__atomic_store_n(&cache, 1<<k, __ATOMIC_RELAXED);
|
||||
got = 1<<k;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
__atomic_store_n(&cache, got, __ATOMIC_RELAXED);
|
||||
}
|
||||
return got;
|
||||
}
|
||||
|
||||
NCCL_PARAM(SymCTAs, "SYM_CTAS", 0)
|
||||
|
||||
// Smooth approximation of min(x, ceiling); `softness` sets how gradual the
// transition is. Evaluates to 0 at x == 0 and approaches `ceiling` as x grows.
static double softmin(double x, double ceiling, double softness) {
  const double ceilTerm = std::exp(ceiling/softness) - 1;
  const double decay = std::exp(-x/softness);
  return ceiling - softness*std::log1p(ceilTerm*decay);
}
|
||||
|
||||
// Smooth approximation of max(0, x); `softness` sets how gradual the
// transition is.
static double softplus(double x, double softness) {
  const double scaled = x/softness;
  // Once x/softness reaches 100 the function returns x itself and skips the
  // exponential altogether.
  if (100.0 <= scaled) return x;
  return softness*std::log1p(std::exp(scaled));
}
|
||||
|
||||
static double model(double busBytes, double baseLat, int nSMs, double smBw, double busMultiplier, double peakBw) {
|
||||
double bw = softmin(nSMs*smBw*busMultiplier, peakBw, smBw);
|
||||
return baseLat + softplus(busBytes/bw - 1, 1);
|
||||
}
|
||||
|
||||
// Given the kernel and bytes, return the minimum number of blocks to run on such that
|
||||
// perf is 99% of running at max blocks, and return the estimate runtime for that
|
||||
// block count.
|
||||
static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, float* timeUs, int* nBlocks) {
|
||||
constexpr double LL_BusFactor = 9; // 2X the bytes, plus some processing, plus no unrolling
|
||||
|
||||
int nRanks = comm->nRanks;
|
||||
int nMaxBlocks = ncclSymMaxBlocks;
|
||||
int nMaxBlocksNvls = divUp((comm->cudaArch < 1000 ? 16 : 32), nRanks);
|
||||
size_t busBytes; // max(bytes sent, bytes received)
|
||||
double busMultiplier = 1;
|
||||
|
||||
switch (k) {
|
||||
default:
|
||||
busBytes = size_t(1)<<50;
|
||||
break;
|
||||
|
||||
case ncclSymKernelId_AllReduce_AGxLL_R:
|
||||
busBytes = nRanks*nBytes*LL_BusFactor;
|
||||
break;
|
||||
case ncclSymKernelId_AllReduce_AGxLLMC_R:
|
||||
busBytes = nRanks*nBytes*LL_BusFactor;
|
||||
busMultiplier = 1.1; // To beat non-MC LL
|
||||
break;
|
||||
case ncclSymKernelId_AllReduce_RSxLD_AGxST:
|
||||
busBytes = 2*nBytes*(nRanks-1)/nRanks;
|
||||
break;
|
||||
case ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC:
|
||||
busBytes = nBytes/nRanks + nBytes;
|
||||
busMultiplier = nRanks;
|
||||
nMaxBlocks = nMaxBlocksNvls;
|
||||
break;
|
||||
|
||||
case ncclSymKernelId_AllGather_LL:
|
||||
busBytes = nRanks*nBytes*LL_BusFactor;
|
||||
break;
|
||||
case ncclSymKernelId_AllGather_LLMC:
|
||||
busBytes = nRanks*nBytes*LL_BusFactor;
|
||||
busMultiplier = 1.1; // To beat non-MC LL
|
||||
break;
|
||||
case ncclSymKernelId_AllGather_ST:
|
||||
busBytes = (nRanks-1)*nBytes;
|
||||
break;
|
||||
case ncclSymKernelId_AllGather_STMC:
|
||||
busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC.
|
||||
busMultiplier = 0.55*nRanks;
|
||||
nMaxBlocks = nMaxBlocksNvls;
|
||||
break;
|
||||
|
||||
case ncclSymKernelId_ReduceScatter_LL:
|
||||
busBytes = nRanks*nBytes*LL_BusFactor;
|
||||
break;
|
||||
case ncclSymKernelId_ReduceScatter_LD:
|
||||
busBytes = (nRanks-1)*nBytes;
|
||||
break;
|
||||
case ncclSymKernelId_ReduceScatter_LDMC:
|
||||
busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC.
|
||||
busMultiplier = 0.55*nRanks;
|
||||
nMaxBlocks = nMaxBlocksNvls;
|
||||
break;
|
||||
}
|
||||
|
||||
nMaxBlocks = std::min<int>(nMaxBlocks, comm->config.maxCTAs);
|
||||
int nMinBlocks = comm->config.minCTAs;
|
||||
|
||||
int nUserCTAs = std::min<int>(ncclSymMaxBlocks, ncclParamSymCTAs());
|
||||
if (nUserCTAs > 0) nMinBlocks = nMaxBlocks = nUserCTAs;
|
||||
|
||||
bool isLL = kernelMask_LL>>k & 1;
|
||||
bool isAG = kernelMask_AG>>k & 1;
|
||||
bool isAR = kernelMask_AR>>k & 1;
|
||||
constexpr double GBps = (1<<30)/1.e6;
|
||||
double baseLat, smBw, peakBw;
|
||||
if (comm->cudaArch < 1000) {
|
||||
baseLat = isLL ? 4.5 : 7.8;
|
||||
smBw = isAR ? 65*GBps : 44*GBps;
|
||||
peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps;
|
||||
} else {
|
||||
baseLat = isLL ? (isAG ? 8.5 : 11) : (isAR ? 19.5 : 13.0);
|
||||
smBw = 55*GBps;
|
||||
peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 1000*GBps : 600*GBps;
|
||||
}
|
||||
*nBlocks = nMaxBlocks;
|
||||
*timeUs = model(busBytes, baseLat, nMaxBlocks, smBw, busMultiplier, peakBw);
|
||||
// Use least number of blocks that puts us within a tolerance of peak performance.
|
||||
for (int bn = nMinBlocks; bn < nMaxBlocks; bn++) {
|
||||
double time = model(busBytes, baseLat, bn, smBw, busMultiplier, peakBw);
|
||||
if (time <= 1.025*(*timeUs)) {
|
||||
*nBlocks = bn;
|
||||
*timeUs = time;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Report whether a symmetric kernel exists for the given collective,
// reduction op and data type.
//   - AllGather: always true.
//   - AllReduce / ReduceScatter: true only for ncclDevSum on a floating-point
//     type other than ncclFloat64.
//   - any other collective: false.
bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) {
  if (coll == ncclFuncAllGather) return true;
  if (coll != ncclFuncAllReduce && coll != ncclFuncReduceScatter) return false;
  if (red != ncclDevSum) return false;

  // Floating-point types minus ncclFloat64.
  switch (ty) {
  case ncclFloat32:
  case ncclFloat16:
  case ncclBfloat16:
  case ncclFloat8e4m3:
  case ncclFloat8e5m2:
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
// Choose the symmetric kernel with the lowest modeled cost for a collective.
//   comm, coll, red, ty, nElts - describe the operation.
//   estTimeUs - out: modeled time of the chosen kernel; forced to 0 when at
//               least one candidate remains and the user mask is not the
//               "all kernels" mask.
//   kernelId  - out: chosen kernel, or ncclSymKernelId_Count when no kernel
//               is eligible.
//   nBlocks   - out: block count for the chosen kernel (999 when none).
//   nWarps    - out: always 16.
// Always returns ncclSuccess.
ncclResult_t ncclSymPickKernel(
    struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts,
    float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps
  ) {
  // Candidates: kernels for this collective that the user setting allows.
  uint32_t kmask = kernelMask_coll(coll) & kernelMask_user();

  bool hasSTMC = comm->nvlsSupport;
  bool hasLDMC = false;
  if (comm->nvlsSupport) {
    const bool sumOrMinMax = (red == ncclDevSum || red == ncclDevMinMax);
    switch (ty) {
    case ncclInt32:
    case ncclUint32:
    case ncclInt64:
    case ncclUint64:
    case ncclFloat16:
    case ncclBfloat16:
      hasLDMC = sumOrMinMax;
      break;
    case ncclFloat8e4m3:
    case ncclFloat8e5m2:
      // The 8-bit float types additionally require compCap >= 100.
      hasLDMC = sumOrMinMax && comm->compCap >= 100;
      break;
    case ncclFloat:
    case ncclDouble:
      hasLDMC = (red == ncclDevSum);
      break;
    default:
      break;
    }
  }
  if (!hasSTMC) kmask &= ~kernelMask_STMC;
  if (!hasLDMC) kmask &= ~kernelMask_LDMC;

  size_t nBytes = nElts*ncclTypeSize(ty);
  size_t nBusBytes = (coll == ncclFuncAllReduce ? 1 : comm->nRanks)*nBytes;
  // LL kernels use 32-bit ints to track element counts and indices.
  if (nBusBytes >= (size_t(2)<<30)) kmask &= ~kernelMask_LL;
  // Any kernel might use 32-bit int to track unrolled loop chunks (which are going
  // to be at least 32 bytes per chunk)
  if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0;

  // Sentinels; they are what gets returned when no candidate is left.
  ncclSymKernelId winner = ncclSymKernelId_Count;
  float winnerTime = 1.e30f;
  int winnerBlocks = 999;

  constexpr float smPenalty = .025f; // 2.5% increase in time per SM
  for (uint32_t pending = kmask; pending != 0; ) {
    ncclSymKernelId k = (ncclSymKernelId)popFirstOneBit(&pending);
    float kTime;
    int kBlocks;
    queryModel(comm, k, nBytes, &kTime, &kBlocks);
    // Compare times after charging the per-block penalty to both sides.
    if (kTime*(1.0f + smPenalty*kBlocks) < winnerTime*(1.0f + smPenalty*winnerBlocks)) {
      winner = k;
      winnerTime = kTime;
      winnerBlocks = kBlocks;
    }
  }

  float reported = 0.0f;
  if (kmask==0 || kernelMask_user() == (1<<ncclSymKernelId_Count)-1) reported = winnerTime;

  *kernelId = winner;
  *estTimeUs = reported;
  *nBlocks = winnerBlocks;
  *nWarps = 16;
  return ncclSuccess;
}
|
||||
|
||||
const char* ncclSymKernelIdToString(int kernelId) {
|
||||
if (kernelId < 0 || kernelId >= ncclSymKernelId_Count) {
|
||||
return "Unknown";
|
||||
}
|
||||
return kernelName[kernelId];
|
||||
}
|
||||
@@ -88,7 +88,7 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
|
||||
NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
|
||||
#include <sys/time.h>
|
||||
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) {
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode) {
|
||||
bool supportFlag = true;
|
||||
bool directFlag = false;
|
||||
if (comm->localRanks == 1) {
|
||||
@@ -101,8 +101,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p
|
||||
struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer];
|
||||
struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer];
|
||||
int canConnect = 0;
|
||||
NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo));
|
||||
if (!canConnect && supportFlag == true) {
|
||||
int intermediateRank = -1;
|
||||
NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, ipeerInfo->rank, jpeerInfo->rank, &canConnect, NULL, &intermediateRank));
|
||||
if (!canConnect || intermediateRank != -1) {
|
||||
supportFlag = false;
|
||||
}
|
||||
if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true;
|
||||
@@ -110,9 +111,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p
|
||||
}
|
||||
}
|
||||
}
|
||||
*intraNodeP2pSupport = supportFlag;
|
||||
*isAllDirectP2p = supportFlag;
|
||||
*directMode = directFlag;
|
||||
if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag);
|
||||
if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type isAllDirectP2p %d directMode %d", supportFlag, directFlag);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "assert.h"
|
||||
#include "bootstrap.h"
|
||||
#include "channel.h"
|
||||
#include "register_inline.h"
|
||||
|
||||
int64_t ncclParamGdrCopySyncEnable();
|
||||
int64_t ncclParamGdrCopyFlushEnable();
|
||||
@@ -1196,7 +1197,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use
|
||||
goto exit;
|
||||
} else {
|
||||
/* start register collnet buffer */
|
||||
struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize };
|
||||
struct collnetRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr };
|
||||
void* handle = NULL;
|
||||
struct ncclConnInfo* conn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn;
|
||||
|
||||
@@ -1397,7 +1398,7 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
char line[1024];
|
||||
|
||||
if (comm->collNetSupport == 0) goto exit;
|
||||
if (comm->config.collnetEnable == 0) goto exit;
|
||||
// Connect Collnet + chain
|
||||
for (int c = 0; c < comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
@@ -1429,7 +1430,7 @@ fail:
|
||||
ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
if (comm->collNetSupport == 0) goto exit;
|
||||
if (comm->config.collnetEnable == 0) goto exit;
|
||||
|
||||
// Connect intra-node CollNet + Direct
|
||||
for (int c = 0; c < comm->nChannels; c++) {
|
||||
@@ -1506,8 +1507,8 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop
|
||||
|
||||
comm->collNetHeads = headsUnique;
|
||||
comm->collNetHeadsNum = nHeadsUnique;
|
||||
if (parent && parent->collNetSupport && parent->nNodes == comm->nNodes) {
|
||||
if (!parent->config.splitShare) {
|
||||
if (parent && parent->config.collnetEnable && parent->nNodes == comm->nNodes) {
|
||||
if (!parent->shareResources) {
|
||||
collNetSetupFail = 1;
|
||||
goto fail;
|
||||
}
|
||||
@@ -1555,9 +1556,6 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop
|
||||
|
||||
NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail);
|
||||
} else {
|
||||
/* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot
|
||||
* share the sharp resource from parent, we cannot use sharp in this case. This restriction might be
|
||||
* lifted by sharp plugin/IB hardware in the future. */
|
||||
collNetSetupFail = 1;
|
||||
if (comm->rank == 0) {
|
||||
WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks);
|
||||
@@ -1637,7 +1635,7 @@ exit:
|
||||
return ret;
|
||||
fail:
|
||||
ncclTransportCollNetFree(comm);
|
||||
comm->collNetSupport = 0;
|
||||
comm->config.collnetEnable = 0;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
|
||||
+37
-38
@@ -21,6 +21,7 @@
|
||||
#include "graph.h"
|
||||
#include "graph/topo.h"
|
||||
#include "nccl_net.h"
|
||||
#include "register_inline.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
@@ -679,8 +680,6 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->netDeviceVersion = props.netDeviceVersion;
|
||||
resources->netDeviceType = props.netDeviceType;
|
||||
|
||||
resources->netDeviceVersion = props.netDeviceVersion;
|
||||
resources->netDeviceType = props.netDeviceType;
|
||||
/* point-to-point size limits*/
|
||||
resources->maxP2pBytes = props.maxP2pBytes;
|
||||
if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) {
|
||||
@@ -785,12 +784,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
|
||||
if (comms->sendComm[resources->channelId] == NULL) {
|
||||
// let only one localrank connect to a tpRemoteRank to avoid duplicate connections
|
||||
if (comms->activeConnect[resources->channelId] == 0)
|
||||
comms->activeConnect[resources->channelId] = (resources->tpLocalRank + 1);
|
||||
if (comms->sendComm[resources->channelId] == NULL
|
||||
&& comms->activeConnect[resources->channelId] == (resources->tpLocalRank + 1)) {
|
||||
if (rccl_anp) {
|
||||
ncclNetCtxt.chId = resources->channelId;
|
||||
ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, (ncclNetDeviceHandle_t **)&ncclNetCtxt);
|
||||
ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle,
|
||||
comms->sendComm + resources->channelId, (ncclNetDeviceHandle_t **)&ncclNetCtxt);
|
||||
} else {
|
||||
ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
|
||||
ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle,
|
||||
comms->sendComm + resources->channelId, &resources->netDeviceHandle);
|
||||
}
|
||||
}
|
||||
resources->netSendComm = comms->sendComm[resources->channelId];
|
||||
@@ -929,7 +934,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
|
||||
int dmabuf_fd;
|
||||
uint64_t offset;
|
||||
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
HSACHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
TRACE(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
|
||||
@@ -981,13 +986,20 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank;
|
||||
if (comms->recvComm[resources->channelId] == NULL) {
|
||||
if (rccl_anp) {
|
||||
ncclNetCtxt.chId = resources->channelId;
|
||||
ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, (ncclNetDeviceHandle_t **)&ncclNetCtxt);
|
||||
} else {
|
||||
ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle);
|
||||
}
|
||||
// reuse handle to for netdev/remote rank to avoid duplicate connections
|
||||
if (comms->activeAccept[resources->channelId] == 0)
|
||||
comms->activeAccept[resources->channelId] = (resources->tpLocalRank + 1);
|
||||
//try connecting while comm is null
|
||||
if (comms->recvComm[resources->channelId] == NULL
|
||||
&& comms->activeAccept[resources->channelId] == (resources->tpLocalRank + 1)) {
|
||||
if (rccl_anp) {
|
||||
ncclNetCtxt.chId = resources->channelId;
|
||||
ret = proxyState->ncclNet->accept(resources->netListenComm,
|
||||
comms->recvComm+resources->channelId, (ncclNetDeviceHandle_t **)&ncclNetCtxt);
|
||||
} else {
|
||||
ret = proxyState->ncclNet->accept(resources->netListenComm,
|
||||
comms->recvComm+resources->channelId, &resources->netDeviceHandle);
|
||||
}
|
||||
}
|
||||
resources->netRecvComm = comms->recvComm[resources->channelId];
|
||||
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
|
||||
@@ -1115,7 +1127,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
if (type == NCCL_PTR_CUDA && proxyState->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
|
||||
int dmabuf_fd;
|
||||
uint64_t offset;
|
||||
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
HSACHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
TRACE(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
|
||||
@@ -1243,7 +1255,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->transmitted = sub->done = 0;
|
||||
ncclProfilerStartSendProxyOpEvent(s, args);
|
||||
ncclProfilerRecordProxyOpEventState(s, args, ncclProfilerProxyOpInProgress_v4);
|
||||
facebook_rccl::addNewProxyOp(proxyState->proxyTrace, sub->traceKey,
|
||||
sub->traceInfo, facebook_rccl::ProxyOpType::SEND,
|
||||
sub->channelId, sub->nsteps, sub->nbytes, sub->peer);
|
||||
@@ -1286,7 +1298,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
} else {
|
||||
sub->posted += args->sliceSteps;
|
||||
}
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
|
||||
ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey, facebook_rccl::ProxyCounterTypes::POSTED, sub->posted);
|
||||
args->idle = 0;
|
||||
@@ -1353,22 +1364,22 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
args->hdp_flushed = *recvTail;
|
||||
*resources->curr_hdp_reg = 1;
|
||||
}
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
|
||||
ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendPeerWait_v4);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey,
|
||||
facebook_rccl::ProxyCounterTypes::KERNEL_COPY_READY, sub->reg ? 1: sub->transmitted + args->sliceSteps);
|
||||
// Data is ready, try to send.
|
||||
// Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
|
||||
// since size is a plain integer.
|
||||
// coverity[use_invalid:FALSE]
|
||||
void* phandle = &sub->pHandles[DIVUP(transmittedStepId, args->sliceSteps)%NCCL_STEPS];
|
||||
void **requestPtr = sub->requests+buffSlot;
|
||||
// for LL/LL128 protocols, completion event for write operation is not needed on the receiver side as
|
||||
// the LL flags are actively polled to detect if full data is received or not, so this hint can be used
|
||||
// by network plugin to optimize the transport for LL/LL128
|
||||
bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL));
|
||||
if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION;
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, requestPtr));
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, phandle, requestPtr));
|
||||
if (*requestPtr != NULL) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_SEND_ENTRY,
|
||||
@@ -1384,12 +1395,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
#endif
|
||||
sub->timestamp[buffSlot] = 0;
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle);
|
||||
sub->transSize += size;
|
||||
sub->transSize = size;
|
||||
sub->transmitted += args->sliceSteps;
|
||||
sub->profilerSteps++;
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
|
||||
ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey,
|
||||
facebook_rccl::ProxyCounterTypes::TRANSMITTED, sub->transmitted);
|
||||
@@ -1458,9 +1466,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
ncclProfilerStopProxyStepEvent(s, args, doneStepId);
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey,
|
||||
facebook_rccl::ProxyCounterTypes::DONE, sub->done);
|
||||
if (resources->shared == 0) {
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
*sendHead = sub->base + sub->done;
|
||||
@@ -1526,7 +1531,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
sub->posted = sub->received = sub->transmitted = sub->done = 0;
|
||||
sub->regBufferReady = 0;
|
||||
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
|
||||
ncclProfilerStartRecvProxyOpEvent(s, args);
|
||||
ncclProfilerRecordProxyOpEventState(s, args, ncclProfilerProxyOpInProgress_v4);
|
||||
facebook_rccl::addNewProxyOp(proxyState->proxyTrace, sub->traceKey, sub->traceInfo,
|
||||
facebook_rccl::ProxyOpType::RECV, sub->channelId, sub->nsteps, sub->nbytes, sub->peer);
|
||||
if (!sub->reg)
|
||||
@@ -1589,7 +1594,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
|
||||
tags[subCount] = resources->tpRemoteRank;
|
||||
mhandles[subCount] = sub->recvMhandle;
|
||||
phandles[subCount] = sub;
|
||||
phandles[subCount] = &sub->pHandles[DIVUP(postedStepId, args->sliceSteps)%NCCL_STEPS];
|
||||
subCount++;
|
||||
}
|
||||
}
|
||||
@@ -1624,8 +1629,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
#endif
|
||||
|
||||
sub->posted += args->sliceSteps;
|
||||
sub->profilerSteps++;
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
|
||||
ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace,
|
||||
sub->traceKey, facebook_rccl::ProxyCounterTypes::POSTED, sub->posted);
|
||||
@@ -1673,9 +1676,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources);
|
||||
volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo;
|
||||
connFifo[buffSlot].size = -1;
|
||||
sub->transSize += sizes[i];
|
||||
sub->transSize = sizes[i];
|
||||
sub->received += args->sliceSteps;
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
|
||||
ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, ncclProfilerProxyStepRecvFlushWait);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey, facebook_rccl::ProxyCounterTypes::RECEIVED, sub->received);
|
||||
if (step < sub->nsteps) {
|
||||
@@ -1766,7 +1768,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
int transmittedStepId = sub->transmitted;
|
||||
|
||||
sub->transmitted += args->sliceSteps;
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
|
||||
ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey, facebook_rccl::ProxyCounterTypes::TRANSMITTED, sub->transmitted);
|
||||
if (step < sub->nsteps) {
|
||||
@@ -1788,7 +1789,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclProxySubArgs* subGroup = args->subs+s;
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
int doneStepId = sub->done;
|
||||
if (sub->done == sub->nsteps) continue;
|
||||
if (sub->transmitted > sub->done) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
@@ -1805,10 +1805,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS]));
|
||||
subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
|
||||
}
|
||||
int doneStepId = sub->done;
|
||||
sub->done += args->sliceSteps;
|
||||
ncclProfilerStopProxyStepEvent(s+i, args, doneStepId);
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
|
||||
facebook_rccl::updateProxyOpCounter(proxyState->proxyTrace, sub->traceKey, facebook_rccl::ProxyCounterTypes::DONE, sub->done);
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
args->done++;
|
||||
@@ -1859,9 +1858,9 @@ static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size
|
||||
if (found) {
|
||||
*outRegBufFlag = 1;
|
||||
outHandle[p] = netHandle->handle;
|
||||
INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle);
|
||||
INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, regRecord->endAddr - regRecord->begAddr, netHandle->handle);
|
||||
} else {
|
||||
struct netRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize };
|
||||
struct netRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr };
|
||||
void* handle = NULL;
|
||||
|
||||
if (peerConn->conn.flags & NCCL_DIRECT_NIC) {
|
||||
|
||||
+197
-105
@@ -27,9 +27,11 @@
|
||||
#include <sys/utsname.h>
|
||||
|
||||
#include "ibvwrap.h"
|
||||
#include "mlx5/mlx5dvwrap.h"
|
||||
#include "graph/xml.h"
|
||||
|
||||
#define MAXNAMESIZE 64
|
||||
#define MAXSUFFIXSIZE 16
|
||||
#define MAXNAMESIZE (64 + MAXSUFFIXSIZE)
|
||||
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
|
||||
static union ncclSocketAddress ncclIbIfAddr;
|
||||
|
||||
@@ -58,6 +60,17 @@ struct ncclIbStats {
|
||||
int fatalErrorCount;
|
||||
};
|
||||
|
||||
enum ncclIbProvider {
|
||||
IB_PROVIDER_NONE = 0,
|
||||
IB_PROVIDER_MLX5 = 1,
|
||||
IB_PROVIDER_MAX = 2,
|
||||
};
|
||||
|
||||
const char* ibProviderName[] = {
|
||||
"None",
|
||||
"Mlx5",
|
||||
};
|
||||
|
||||
static int ncclNIbDevs = -1;
|
||||
struct alignas(64) ncclIbDev {
|
||||
pthread_mutex_t lock;
|
||||
@@ -80,6 +93,12 @@ struct alignas(64) ncclIbDev {
|
||||
struct ibv_port_attr portAttr;
|
||||
struct ncclIbStats stats;
|
||||
int dmaBufSupported;
|
||||
enum ncclIbProvider ibProvider;
|
||||
union {
|
||||
struct {
|
||||
int dataDirect;
|
||||
} mlx5;
|
||||
} capsProvider;
|
||||
};
|
||||
|
||||
#define MAX_IB_DEVS 32
|
||||
@@ -109,6 +128,7 @@ NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
|
||||
NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1);
|
||||
NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
|
||||
NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
|
||||
NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1);
|
||||
|
||||
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
|
||||
__atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
|
||||
@@ -454,6 +474,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
|
||||
if (p == NULL) {
|
||||
WARN("Could not find real path of %s (%s)", devName, devicePath);
|
||||
} else {
|
||||
// Merge multi-port NICs into the same PCI device
|
||||
p[strlen(p)-1] = '0';
|
||||
// Also merge virtual functions (VF) into the same device
|
||||
if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0';
|
||||
// Keep the real port aside (the ibv port is always 1 on recent cards)
|
||||
*realPort = 0;
|
||||
for (int d=0; d<ncclNIbDevs; d++) {
|
||||
@@ -498,9 +522,29 @@ static int ncclIbRelaxedOrderingCapable(void) {
|
||||
return r == ncclInternalError ? 0 : 1;
|
||||
}
|
||||
|
||||
static bool ncclMlx5dvDmaBufCapable(ibv_context *context){
|
||||
ncclResult_t res;
|
||||
int dev_fail = 0;
|
||||
|
||||
struct ibv_pd* pd;
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, context), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
|
||||
(void)wrap_direct_mlx5dv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/, 0 /* mlx5 flags*/);
|
||||
// mlx5dv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
|
||||
dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
|
||||
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
// stop the search and goto failure
|
||||
if (dev_fail) goto failure;
|
||||
return true;
|
||||
failure:
|
||||
return false;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
|
||||
if (ncclParamIbMergeNics() == 0 && props->ndevs > 1) {
|
||||
WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL.");
|
||||
INFO(NCCL_NET, "NET/IB : Skipping makeVDevice, NCCL_IB_MERGE_NICS=0");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
@@ -568,6 +612,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
|
||||
if (ncclParamIbDisable()) return ncclInternalError;
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); }
|
||||
|
||||
// Detect IB cards
|
||||
int nIbDevs = 0;
|
||||
@@ -577,9 +622,11 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
|
||||
pthread_mutex_lock(&ncclIbLock);
|
||||
wrap_ibv_fork_init();
|
||||
if (ncclNIbDevs == -1) {
|
||||
int nIpIfs = 0;
|
||||
ncclNIbDevs = 0;
|
||||
ncclNMergedIbDevs = 0;
|
||||
if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
|
||||
NCCLCHECK(ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1, &nIpIfs));
|
||||
if (nIpIfs != 1) {
|
||||
WARN("NET/IB : No IP interface found.");
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
@@ -603,6 +650,17 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
|
||||
WARN("NET/IB : Unable to open device %s", devices[d]->name);
|
||||
continue;
|
||||
}
|
||||
enum ncclIbProvider ibProvider = IB_PROVIDER_NONE;
|
||||
char dataDirectDevicePath[PATH_MAX];
|
||||
int dataDirectSupported = 0;
|
||||
if (wrap_mlx5dv_is_supported(devices[d])) {
|
||||
ibProvider = IB_PROVIDER_MLX5;
|
||||
snprintf(dataDirectDevicePath, PATH_MAX, "/sys");
|
||||
if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) {
|
||||
INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name);
|
||||
if(ncclParamIbDataDirect()) dataDirectSupported = 1;
|
||||
}
|
||||
}
|
||||
int nPorts = 0;
|
||||
struct ibv_device_attr devAttr;
|
||||
memset(&devAttr, 0, sizeof(devAttr));
|
||||
@@ -616,58 +674,69 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr
|
||||
continue;
|
||||
}
|
||||
for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
|
||||
struct ibv_port_attr portAttr;
|
||||
if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) {
|
||||
WARN("NET/IB : Unable to query port_num %d", port_num);
|
||||
continue;
|
||||
for (int dataDirect = 0; dataDirect < 1 + dataDirectSupported; ++dataDirect) {
|
||||
struct ibv_port_attr portAttr;
|
||||
if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) {
|
||||
WARN("NET/IB : Unable to query port_num %d", port_num);
|
||||
continue;
|
||||
}
|
||||
if (portAttr.state != IBV_PORT_ACTIVE) continue;
|
||||
if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
|
||||
&& portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
|
||||
|
||||
// check against user specified HCAs/ports
|
||||
if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) {
|
||||
continue;
|
||||
}
|
||||
pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
|
||||
ncclIbDevs[ncclNIbDevs].device = d;
|
||||
ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider;
|
||||
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
|
||||
ncclIbDevs[ncclNIbDevs].portAttr = portAttr;
|
||||
ncclIbDevs[ncclNIbDevs].portNum = port_num;
|
||||
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
|
||||
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
|
||||
ncclIbDevs[ncclNIbDevs].context = context;
|
||||
ncclIbDevs[ncclNIbDevs].pdRefs = 0;
|
||||
ncclIbDevs[ncclNIbDevs].pd = NULL;
|
||||
if (!dataDirect) {
|
||||
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
|
||||
NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
|
||||
}
|
||||
else {
|
||||
snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name);
|
||||
NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX));
|
||||
strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX);
|
||||
ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1;
|
||||
}
|
||||
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
||||
NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
|
||||
|
||||
// Enable ADAPTIVE_ROUTING by default on IB networks
|
||||
// But allow it to be overloaded by an env parameter
|
||||
ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
|
||||
if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
|
||||
|
||||
INFO(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
|
||||
NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
|
||||
|
||||
PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
|
||||
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
||||
PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
|
||||
|
||||
// Add this plain physical device to the list of virtual devices
|
||||
int vDev;
|
||||
ncclNetVDeviceProps_t vProps = {0};
|
||||
vProps.ndevs = 1;
|
||||
vProps.devs[0] = ncclNIbDevs;
|
||||
NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps));
|
||||
|
||||
ncclNIbDevs++;
|
||||
nPorts++;
|
||||
}
|
||||
if (portAttr.state != IBV_PORT_ACTIVE) continue;
|
||||
if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
|
||||
&& portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
|
||||
|
||||
// check against user specified HCAs/ports
|
||||
if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) {
|
||||
continue;
|
||||
}
|
||||
pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
|
||||
ncclIbDevs[ncclNIbDevs].device = d;
|
||||
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
|
||||
ncclIbDevs[ncclNIbDevs].portAttr = portAttr;
|
||||
ncclIbDevs[ncclNIbDevs].portNum = port_num;
|
||||
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
|
||||
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
|
||||
ncclIbDevs[ncclNIbDevs].context = context;
|
||||
ncclIbDevs[ncclNIbDevs].pdRefs = 0;
|
||||
ncclIbDevs[ncclNIbDevs].pd = NULL;
|
||||
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
|
||||
NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
|
||||
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
||||
NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
|
||||
|
||||
// Enable ADAPTIVE_ROUTING by default on IB networks
|
||||
// But allow it to be overloaded by an env parameter
|
||||
ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
|
||||
if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
|
||||
|
||||
TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
|
||||
NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
|
||||
|
||||
PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
|
||||
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
||||
PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
|
||||
|
||||
// Add this plain physical device to the list of virtual devices
|
||||
int vDev;
|
||||
ncclNetVDeviceProps_t vProps = {0};
|
||||
vProps.ndevs = 1;
|
||||
vProps.devs[0] = ncclNIbDevs;
|
||||
NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps));
|
||||
|
||||
ncclNIbDevs++;
|
||||
nPorts++;
|
||||
}
|
||||
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
|
||||
}
|
||||
@@ -858,6 +927,9 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) {
|
||||
props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
|
||||
}
|
||||
props->forceFlush = 0;
|
||||
if (ibDev->capsProvider.mlx5.dataDirect) {
|
||||
props->forceFlush = 1;
|
||||
}
|
||||
props->latency = 0; // Not set
|
||||
props->port = ibDev->portNum + ibDev->realPort;
|
||||
props->maxComms = ibDev->maxQp;
|
||||
@@ -974,6 +1046,7 @@ struct ncclProfilerInfo {
|
||||
int qpIndex[MAX_QPS_PER_REQ];
|
||||
int nEventHandles;
|
||||
ncclProfilerNetIbDescr_v1_t data;
|
||||
void* pHandle;
|
||||
};
|
||||
|
||||
struct ncclIbRequest {
|
||||
@@ -1397,23 +1470,27 @@ ib_recv_dev_list:
|
||||
devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id;
|
||||
|
||||
// info logging
|
||||
if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
// Print just the QPs for this dev
|
||||
if (comm->base.qps[q].devIndex == i)
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
// Print just the QPs for this dev
|
||||
if (comm->base.qps[q].devIndex == i) {
|
||||
if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
|
||||
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x",
|
||||
comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
|
||||
dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid,
|
||||
devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey);
|
||||
}
|
||||
} else { // RoCE
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
// Print just the QPs for this dev
|
||||
if (comm->base.qps[q].devIndex == i)
|
||||
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
|
||||
comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
|
||||
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex,
|
||||
devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey);
|
||||
comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
|
||||
dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid,
|
||||
devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey);
|
||||
} else { // RoCE
|
||||
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
|
||||
comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
|
||||
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu,
|
||||
(int64_t)commDev->base.gidInfo.localGidIndex,
|
||||
devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey);
|
||||
}
|
||||
// Log ECE info
|
||||
if (meta.qpInfo[q].ece_supported) {
|
||||
INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}",
|
||||
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn,
|
||||
meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer;
|
||||
@@ -1490,8 +1567,14 @@ ib_connect:
|
||||
ncclIbSendCommDev* commDev = comm->devs + devIndex;
|
||||
|
||||
struct ibv_qp* qp = comm->base.qps[q].qp;
|
||||
if (remQpInfo->ece_supported)
|
||||
if (remQpInfo->ece_supported) {
|
||||
struct ncclIbQp* nqp = comm->base.qps + q;
|
||||
int ibDevN = comm->devs[nqp->devIndex].base.ibDevN;
|
||||
struct ncclIbDev* ibDev = ncclIbDevs + ibDevN;
|
||||
INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}",
|
||||
ibDevN, ibDev->portNum, qp->qp_num, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask);
|
||||
NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
|
||||
}
|
||||
|
||||
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
|
||||
remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu);
|
||||
@@ -1499,16 +1582,6 @@ ib_connect:
|
||||
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
|
||||
}
|
||||
|
||||
if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
struct ncclIbQp* qp = comm->base.qps + q;
|
||||
int ibDevN = comm->devs[qp->devIndex].base.ibDevN;
|
||||
struct ncclIbDev* ibDev = ncclIbDevs + ibDevN;
|
||||
INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}",
|
||||
ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask);
|
||||
}
|
||||
}
|
||||
|
||||
comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs);
|
||||
|
||||
comm->base.ready = 1;
|
||||
@@ -1867,9 +1940,8 @@ ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbReque
|
||||
if (r->type == NCCL_NET_IB_REQ_UNUSED) {
|
||||
r->base = base;
|
||||
r->sock = NULL;
|
||||
r->devBases[0] = NULL;
|
||||
r->devBases[1] = NULL;
|
||||
r->events[0] = r->events[1] = 0;
|
||||
memset(r->devBases, 0, sizeof(r->devBases));
|
||||
memset(r->events, 0, sizeof(r->events));
|
||||
*req = r;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1906,7 +1978,11 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s
|
||||
if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
|
||||
if (fd != -1) {
|
||||
/* DMA-BUF support */
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
|
||||
if (!ncclIbDevs[base->ibDevN].capsProvider.mlx5.dataDirect) {
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
|
||||
} else {
|
||||
NCCLCHECKGOTO(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT), res, returning);
|
||||
}
|
||||
} else {
|
||||
if (ncclIbRelaxedOrderingEnabled) {
|
||||
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
|
||||
@@ -2014,7 +2090,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
|
||||
|
||||
NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0);
|
||||
|
||||
ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) {
|
||||
ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
struct ncclIbRequest** reqs = comm->fifoReqs[slot];
|
||||
volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
|
||||
int nreqs = slots[0].nreqs;
|
||||
@@ -2106,19 +2182,21 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl
|
||||
struct ibv_send_wr* bad_wr;
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
// QP profiling loop
|
||||
for (int r=0; r<nreqs && pHandle; r++) {
|
||||
for (int r=0; r<nreqs; r++) {
|
||||
// Store comm qpIndex for this request
|
||||
int nEventHandles = reqs[r]->pInfo[0].nEventHandles;
|
||||
reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex;
|
||||
assert(nEventHandles < MAX_QPS_PER_REQ);
|
||||
reqs[r]->pInfo[0].qpIndex[nEventHandles] = qpIndex;
|
||||
// Store info for profiler
|
||||
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
reqs[r]->pInfo[0].data.type = ncclProfileQp;
|
||||
reqs[r]->pInfo[0].data.qp.device = devIndex;
|
||||
reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id;
|
||||
reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode;
|
||||
reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num;
|
||||
reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length;
|
||||
NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data));
|
||||
void* pHandle = reqs[r]->pInfo[0].pHandle;
|
||||
NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, pHandle, pluginId, &reqs[r]->pInfo[0].data));
|
||||
reqs[r]->pInfo[0].nEventHandles++;
|
||||
}
|
||||
#endif
|
||||
@@ -2145,8 +2223,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl
|
||||
|
||||
ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) {
|
||||
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
|
||||
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
|
||||
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
|
||||
if (comm->base.ready == 0) {
|
||||
WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0");
|
||||
*request = NULL;
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
|
||||
@@ -2187,6 +2268,9 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
|
||||
req->send.size = size;
|
||||
req->send.data = data;
|
||||
req->send.offset = 0;
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
req->pInfo[0].pHandle = phandle;
|
||||
#endif
|
||||
|
||||
// Populate events
|
||||
int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps;
|
||||
@@ -2216,7 +2300,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void*
|
||||
}
|
||||
|
||||
TIME_START(0);
|
||||
NCCLCHECK(ncclIbMultiSend(comm, slot, phandle));
|
||||
NCCLCHECK(ncclIbMultiSend(comm, slot));
|
||||
|
||||
// Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks
|
||||
memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
|
||||
@@ -2314,8 +2398,11 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz
|
||||
|
||||
ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) {
|
||||
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
|
||||
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
|
||||
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
|
||||
if (comm->base.ready == 0) {
|
||||
WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
|
||||
*request = NULL;
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
@@ -2349,14 +2436,17 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
|
||||
ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
// Start a QP event for every request in the multirecv and every qp
|
||||
for (int r = 0; r < n && phandles; r++) {
|
||||
for (int r = 0; r < n; r++) {
|
||||
int nEventHandles = req->pInfo[r].nEventHandles;
|
||||
assert(nEventHandles < MAX_QPS_PER_REQ);
|
||||
req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex;
|
||||
// Store info for profiler
|
||||
int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER;
|
||||
req->pInfo[r].data.type = ncclProfileQp;
|
||||
req->pInfo[r].data.qp.device = qp->devIndex;
|
||||
req->pInfo[r].data.qp.wr_id = wr.wr_id;
|
||||
req->pInfo[r].data.qp.qpNum = qp->qp->qp_num;
|
||||
NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data));
|
||||
NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data));
|
||||
req->pInfo[r].nEventHandles++;
|
||||
}
|
||||
#endif
|
||||
@@ -2454,7 +2544,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
sizes[i] = r->recv.sizes[i];
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
for (int j = 0; j < r->pInfo[i].nEventHandles; j++) {
|
||||
NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL));
|
||||
NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -2463,7 +2553,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
sizes[0] = r->send.size;
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
for (int j = 0; j < r->pInfo[0].nEventHandles; j++) {
|
||||
NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL));
|
||||
NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -2511,20 +2601,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d",
|
||||
ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
|
||||
TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d,%d,%d}, i=%d",
|
||||
ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], req->events[2], req->events[3], i);
|
||||
#endif
|
||||
if (req && req->type == NCCL_NET_IB_REQ_SEND) {
|
||||
for (int j = 0; j < req->nreqs; j++) {
|
||||
struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
|
||||
if ((sendReq->events[i] <= 0)) {
|
||||
WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j);
|
||||
WARN("NET/IB: sendReq(%p)->events={%d,%d,%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], sendReq->events[2], sendReq->events[3], i, j);
|
||||
return ncclInternalError;
|
||||
}
|
||||
sendReq->events[i]--;
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
// Stop Qp event for sendReq
|
||||
NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL));
|
||||
int qpIndex = getReqQpIndex(sendReq, j, wc->qp_num);
|
||||
NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL));
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
@@ -2541,7 +2632,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
#ifdef NCCL_ENABLE_NET_PROFILING
|
||||
// Stop Qp event for workFifo
|
||||
for (int j = 0; j < req->nreqs; j++) {
|
||||
NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL));
|
||||
int qpIndex = getReqQpIndex(req, j, wc->qp_num);
|
||||
NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Ορισμένα αρχεία δεν εμφανίστηκαν επειδή έχουν αλλάξει πάρα πολλά αρχεία σε αυτή τη διαφορά Εμφάνιση Περισσότερων
Αναφορά σε νέο ζήτημα
Block a user