2.23.4-1
Add scalable init API
* Add new ncclCommInitRankScalable to allow for passing multiple
unique IDs to the init function.
* Spreads the load onto multiple bootstrap roots, allowing for
constant bootstrap time.
* Requires multiple ranks to create a unique ID, and the CPU-side
ID exchange code to call allgather[v] instead of broadcast.
Accelerate init bootstrap operations
* Reduce the number of calls to allgather.
* Allow roots to reply early to ranks when information is already
available.
* Add an option to use ncclNet instead of sockets to perform
bootstrap allgather operations.
Add PAT algorithms for Allgather and ReduceScatter
* Parallel Aggregated Trees, variation of Bruck algorithm.
* Logarithmic number of network steps for small sizes at scale.
* Only supports one rank per node at the moment.
Add support for registered buffers for intra-node communication.
* Allow registered user buffers to be accessed directly intra-node
* Avoids extra copies in algorithms which permit it, saving
memory bandwidth and helping with compute overlap.
Add profiler plugin API
* New plugin API for profiling
* Supports various levels of profiling, with a hierarchy.
Asynchronous graph allocation
* Make calls to cudaMalloc and cudaMemcpy during graph allocation
asynchronous.
* Significantly speeds up graph capture.
Use fatal IB asynchronous events to stop network operation
* Avoids many other error messages
* Only fatal errors are affected; potentially transient errors
(e.g. port down) do not cause an immediate stop.
Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
* P2P would cause a significant performance degradation when using
many GPUs, and therefore many interleaved data flows.
* Disable P2P through the CPU when we have 3+ GPUs per node; keep it
enabled when we only have 2 GPUs.
Improve the init logs to report the real NCCL function.
* Make the log report ncclCommInitRank or ncclCommSplit, rather than
the generic ncclCommInitRankFunc.
Add a parameter to set the location of the user configuration file.
* Add NCCL_CONF_FILE environment variable to set where the user's
configuration file resides.
Increase default IB timeout
* Increase IB timeout value from 18 to 20.
* Should help avoid fatal errors on large RoCE systems.
Add new check for nvidia peermem
* On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
present; check for /sys/module/nvidia_peermem/version instead.
Fix old performance regression when mixing small and large operations.
* Improves distribution of work on channels.
Fix crash when NUMA IDs are equal to -1.
* Can happen when a NIC is a virtual NIC, or when linux doesn't
know which NUMA node a device is attached to
* Issue NVIDIA/nccl-tests#233
Fix tree graph search when NCCL_CROSS_NIC is set to 1.
* Would force NCCL to use the balanced_tree pattern, thereby
disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
* Would also try to use alternate rings even though it was not
needed.
Compiler tweaks and fixes
* PR #1177
* PR #1228
Fix stack smash
* PR #1325
Fixes for multi-node NVLink + IB operation
Coverity fixes and comments.
[ROCm/rccl commit: 68b542363f]
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
#
|
||||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
# Location of the NCCL build tree providing the public headers
NCCL_HOME := ../../build
# Header search path: NCCL build headers, CUDA headers and the local nccl/ directory
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
# Name of the profiler plugin shared library produced by this Makefile
PLUGIN_SO := libnccl-profiler.so

# default and clean name actions, not files: keep make from being fooled
# by a file that happens to carry one of these names
.PHONY: default clean

default: $(PLUGIN_SO)

# Build all plugin sources into one position-independent shared object
$(PLUGIN_SO): plugin.c event.c print_event.c
	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

clean:
	rm -f $(PLUGIN_SO)
|
||||
@@ -0,0 +1,30 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "event.h"
|
||||
|
||||
int taskEventQueueEmpty(struct group* g) {
|
||||
return g->eventHead == NULL;
|
||||
}
|
||||
|
||||
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
|
||||
event->next = NULL;
|
||||
if (g->eventHead) g->eventTail->next = event;
|
||||
else g->eventHead = event;
|
||||
g->eventTail = event;
|
||||
}
|
||||
|
||||
struct taskEventBase* taskEventQueueHead(struct group* g) {
|
||||
return g->eventHead;
|
||||
}
|
||||
|
||||
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
|
||||
struct taskEventBase* tmp = g->eventHead;
|
||||
g->eventHead = g->eventHead->next;
|
||||
if (g->eventHead == NULL) g->eventTail = NULL;
|
||||
return tmp;
|
||||
}
|
||||
@@ -0,0 +1,167 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef EVENT_H_
|
||||
#define EVENT_H_
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#include "profiler.h"
|
||||
|
||||
// Capacities of the fixed-size event arrays declared in this header
#define MAX_CHANNELS 32
#define MAX_STEPS 16

// First profiler state of each proxy op/step state group
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)

// Number of profiler states in each group (last state - first state + 1)
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)

// Zero-based index of a state within its group. The argument is
// parenthesized so that expression arguments (ternaries, bit operations)
// are evaluated as a whole before the offset is subtracted.
#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET)
#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET)
#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)

// Array sizes large enough for either the send or the recv state group
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)

#define MAX_COMM_CLIQUES (32 * 8)
|
||||
|
||||
struct proxyOp;
|
||||
|
||||
struct proxyStep {
|
||||
uint8_t type; // type of event: network transfer
|
||||
int step; // network transfer id in given channel
|
||||
int isSend; // send/recv channel operation
|
||||
double timestamp[MAX_PROXY_STEP_STATES];
|
||||
double startTs;
|
||||
double stopTs;
|
||||
struct proxyOp* parent;
|
||||
};
|
||||
|
||||
struct proxyOp {
|
||||
uint8_t type; // type of event: proxy operation
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
pid_t pid;
|
||||
int rank;
|
||||
int peer; // peer rank for this proxy operation
|
||||
int nSteps; // total number of network transfers for this proxy operation
|
||||
int chunkSize; // chunk size for this proxy operation
|
||||
int isSend; // send/recv channel operation
|
||||
size_t transSize; // transfer data size for this proxy operation
|
||||
struct {
|
||||
int steps; // completed steps for this proxy operation state
|
||||
double timestamp;
|
||||
} states[MAX_PROXY_OP_STATES];
|
||||
double startTs;
|
||||
double stopTs;
|
||||
int stepCount; // last processed network operation for this proxy operation
|
||||
struct proxyStep step[MAX_STEPS]; // array of network transfer events
|
||||
struct taskEventBase* parent; // parent event p2p/collective
|
||||
};
|
||||
|
||||
struct group;
|
||||
struct context;
|
||||
|
||||
// Profiler event for proxy control activity
struct proxyCtrl {
  uint8_t type;         // event type tag
  struct context* ctx;  // profiler context the event belongs to
  double startTs;       // event start timestamp
  double stopTs;        // event stop timestamp
  int state;            // proxy control state
  int appended;         // number of appended proxy operations
};
|
||||
|
||||
// Common header shared by task level events (collective and p2p);
// both event structures embed it as their first member
struct taskEventBase {
  uint8_t type;                // event type tag: collective or p2p
  int rank;                    // rank of the operation in the NCCL communicator
  const char* name;            // FIXME: unused
  uint64_t commHash;           // communicator identifier
  uint8_t func;                // ncclFunc*
  int refCount;                // number of references held on this operation
  struct group* parent;        // group event owning this task event
  struct taskEventBase* next;  // next task event queued in the same group
  double startTs;              // event start timestamp
  double stopTs;               // event stop timestamp
};
|
||||
|
||||
struct collective {
|
||||
struct taskEventBase base; // base structure for this event
|
||||
uint64_t seqNumber; // sequence number for this collective in communicator
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
size_t trafficBytes;
|
||||
int root;
|
||||
uint8_t datatype;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t algo;
|
||||
uint8_t proto;
|
||||
int op;
|
||||
int nWarps;
|
||||
int isCollnet;
|
||||
int isNvls;
|
||||
struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
|
||||
struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
|
||||
};
|
||||
|
||||
struct p2p {
|
||||
struct taskEventBase base; // base structure for this event
|
||||
uint8_t func;
|
||||
void const* buff;
|
||||
size_t count;
|
||||
uint8_t datatype;
|
||||
int peer;
|
||||
struct proxyOp op;
|
||||
};
|
||||
|
||||
// Profiler event for a group; owns a queue of task level events
struct group {
  uint8_t type;                     // event type tag
  struct context* ctx;              // profiler context the group belongs to
  int groupId;                      // id assigned when the group event is started
  int refCount;                     // number of references held on this group
  struct taskEventBase* eventHead;  // first queued task event (NULL when empty)
  struct taskEventBase* eventTail;  // last queued task event
  double startTs;                   // event start timestamp
  double stopTs;                    // event stop timestamp
  struct group* next;               // next group event in queue
};
|
||||
|
||||
// Per-context pools of event objects. Every pool is described by a
// size, a base counter, an index counter and the backing array.
struct context {
  // group events
  int groupPoolSize;
  int groupPoolBase;
  int groupPoolIndex;
  struct group* groupPool;

  // collective events
  int collPoolSize;
  int collPoolBase;
  int collPoolIndex;
  struct collective* collPool;

  // point-to-point events
  int p2pPoolSize;
  int p2pPoolBase;
  int p2pPoolIndex;
  struct p2p* p2pPool;

  // proxy control events
  int proxyCtrlPoolSize;
  int proxyCtrlPoolBase;
  int proxyCtrlPoolIndex;
  struct proxyCtrl* proxyCtrlPool;
};
|
||||
|
||||
// Task event queue of a group: singly linked FIFO threaded through
// taskEventBase::next, with head/tail pointers stored in the group.

// Non-zero when the queue of g holds no events
int taskEventQueueEmpty(struct group* g);
// Append event at the tail of the queue of g
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// First event of the queue of g, left in place
struct taskEventBase* taskEventQueueHead(struct group* g);
// Remove and return the first event of the queue of g
struct taskEventBase* taskEventQueueDequeue(struct group* g);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,15 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef COMMON_H_
|
||||
#define COMMON_H_
|
||||
|
||||
// Debug log levels
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

// Debug log subsystems: one bit each, NCCL_ALL has every bit set
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

// printf-style logging callback: level, flags, source file and line of the
// call site, then the format string and its arguments
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,19 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ERR_H_
|
||||
#define NCCL_ERR_H_
|
||||
|
||||
/* Result codes returned by plugin entry points */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,18 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "err.h"
|
||||
|
||||
#include "profiler_v1.h"
|
||||
|
||||
#endif // end include guard
|
||||
@@ -0,0 +1,150 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_V1_H_
|
||||
#define NCCL_PROFILER_V1_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// Event type descriptors: one bit per event type so that they can be
// combined into an activation bitmask
enum {
  ncclProfileGroup     = 0x01,  // group event type
  ncclProfileColl      = 0x02,  // host collective call event type
  ncclProfileP2p       = 0x04,  // host point-to-point call event type
  ncclProfileProxyOp   = 0x08,  // proxy operation event type
  ncclProfileProxyStep = 0x10,  // proxy step event type
  ncclProfileProxyCtrl = 0x20,  // proxy control event type
  ncclProfileNumEvents = 6,     // number of event types listed above
};
|
||||
|
||||
// Event descriptor handed to the profiler when an event is started.
// The common fields are followed by a union holding the attributes of the
// event kind selected by type.
typedef struct {
  uint8_t type;     // event type descriptor: ncclProfileColl, ...
  void* parentObj;  // pointer to the profiler parent object (for coll is the group)
  int rank;         // originating rank
  union {
    // attributes of a collective event
    struct {
      const char* name;
      uint64_t commHash;
      uint64_t seqNumber;
      uint8_t func;
      void const* sendBuff;
      void* recvBuff;
      size_t count;
      int root;
      uint8_t datatype;
      uint32_t op;
      size_t trafficBytes;
      uint8_t nMaxChannels;
      uint8_t nWarps;
      uint8_t algo;
      uint8_t proto;
      int isCollnet;
      int isNvls;
    } coll;

    // attributes of a point-to-point event
    struct {
      const char* name;
      uint64_t commHash;
      uint8_t func;
      void* buff;
      uint8_t datatype;
      size_t count;
      int peer;
    } p2p;

    // attributes of a proxy operation event
    struct {
      pid_t pid;          // pid of the originating process
      uint8_t channelId;  // channel id for this proxy operation
      int peer;           // remote rank for send/recv
      int nSteps;         // number of steps for this proxy operation
      int chunkSize;      // amount of data transferred by this proxy operation
      int isSend;
    } proxyOp;

    // attributes of a proxy step event
    struct {
      int step;
    } proxyStep;
  };
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
// Event states reported through recordEventState. The enumerators are
// grouped by event kind and numbered consecutively starting from 0.
typedef enum {
  /* Proxy operation send states */
  ncclProfilerProxyOpSendPosted,
  ncclProfilerProxyOpSendRemFifoWait,
  ncclProfilerProxyOpSendTransmitted,
  ncclProfilerProxyOpSendDone,
  /* Proxy operation recv states */
  ncclProfilerProxyOpRecvPosted,
  ncclProfilerProxyOpRecvReceived,
  ncclProfilerProxyOpRecvTransmitted,
  ncclProfilerProxyOpRecvDone,

  /* Legacy proxy profiler states */
  ncclProfilerProxyStepSendGPUWait,
  ncclProfilerProxyStepSendWait,
  ncclProfilerProxyStepRecvWait,
  ncclProfilerProxyStepRecvFlushWait,
  ncclProfilerProxyStepRecvGPUWait,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle,
  ncclProfilerProxyCtrlActive,
  ncclProfilerProxyCtrlSleep,
  ncclProfilerProxyCtrlWakeup,
  ncclProfilerProxyCtrlAppend,
  ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_v1_t;
|
||||
|
||||
// Optional attribute updates passed along with a state transition;
// the member in use depends on the kind of event being updated
typedef union {
  // proxy operation attributes
  struct {
    size_t transSize;
    int steps;
  } proxyOp;

  // proxy control attributes
  struct {
    int appendedProxyOps;
  } proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside and event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v1_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v1_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef NCCL_TYPES_H_
|
||||
#define NCCL_TYPES_H_
|
||||
|
||||
/* Element data types; several values carry two names */
typedef enum {
  ncclInt8     = 0,
  ncclChar     = 0,
  ncclUint8    = 1,
  ncclInt32    = 2,
  ncclInt      = 2,
  ncclUint32   = 3,
  ncclInt64    = 4,
  ncclUint64   = 5,
  ncclFloat16  = 6,
  ncclHalf     = 6,
  ncclFloat32  = 7,
  ncclFloat    = 7,
  ncclFloat64  = 8,
  ncclDouble   = 8,
  ncclBfloat16 = 9,
} ncclDataType_t;
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,492 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
#include <linux/limits.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
#include "event.h"
|
||||
#include "print_event.h"
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))
|
||||
|
||||
// Process-wide profiler state
static int initialized;   // initialization counter for the profiler
static double startTime;  // profiler start time

// Number of entries of each event pool (defaults)
static int groupPoolSize = 16;
static int collPoolSize = 16;
static int p2pPoolSize = 1024;
static int proxyCtrlPoolSize = 16;

// Detach pool: proxyOp events stored outside of any profiler context
static int detachPoolSize = 128;
static int detachPoolBase;
static int detachPoolIndex;
static int detachPoolDone;
static struct proxyOp* detachPool;
|
||||
|
||||
static double freq = -1;
|
||||
__hidden void calibrate() {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
uint64_t timeCycles = __rdtsc();
|
||||
double time = - tv.tv_sec*1e6 - tv.tv_usec;
|
||||
uint64_t total = 0ULL;
|
||||
for (int i = 0; i < 10000; i++) total += __rdtsc();
|
||||
gettimeofday(&tv, NULL);
|
||||
timeCycles = __rdtsc() - timeCycles;
|
||||
time += tv.tv_sec*1e6 + tv.tv_usec;
|
||||
freq = timeCycles / time;
|
||||
}
|
||||
|
||||
__hidden double gettime(void) {
|
||||
return __rdtsc() / freq;
|
||||
}
|
||||
|
||||
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static pid_t pid;
|
||||
|
||||
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
|
||||
pthread_mutex_lock(&lock);
|
||||
if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
|
||||
// first thread initializes event mask, environment and detach pool
|
||||
__atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED);
|
||||
if (getenv("NCCL_PROFILE_EVENT_MASK")) {
|
||||
__atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED);
|
||||
}
|
||||
if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) {
|
||||
groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE"));
|
||||
}
|
||||
if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) {
|
||||
collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE"));
|
||||
}
|
||||
if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) {
|
||||
p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE"));
|
||||
}
|
||||
if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) {
|
||||
proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE"));
|
||||
}
|
||||
if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) {
|
||||
detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE"));
|
||||
}
|
||||
// detach pool is used to store PXN proxyOps and is shared among threads
|
||||
detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
|
||||
if (detachPool == NULL) {
|
||||
pthread_mutex_unlock(&lock);
|
||||
return ncclSystemError;
|
||||
}
|
||||
// Pid of the process initializing the profiler first.
|
||||
// This is compared against the pid of proxyOp events
|
||||
// to figure out if they have a parent event in this
|
||||
// process address space.
|
||||
pid = getpid();
|
||||
|
||||
// calibrate and start timer
|
||||
calibrate();
|
||||
startTime = gettime();
|
||||
}
|
||||
pthread_mutex_unlock(&lock);
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
|
||||
if (ctx->groupPool == NULL) goto fail;
|
||||
|
||||
ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
|
||||
if (ctx->collPool == NULL) goto fail;
|
||||
|
||||
ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
|
||||
if (ctx->p2pPool == NULL) goto fail;
|
||||
|
||||
ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
|
||||
if (ctx->proxyCtrlPool == NULL) goto fail;
|
||||
|
||||
*context = ctx;
|
||||
return ncclSuccess;
|
||||
|
||||
fail:
|
||||
// cleanup resources
|
||||
if (ctx->proxyCtrlPool) free(ctx->proxyCtrlPool);
|
||||
if (ctx->p2pPool) free(ctx->p2pPool);
|
||||
if (ctx->collPool) free(ctx->collPool);
|
||||
if (ctx->groupPool) free(ctx->groupPool);
|
||||
free(ctx);
|
||||
if (detachPool) free(detachPool);
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
// Finalize the profiler for one context.
// When NCCL_PROFILE_DUMP_FILE is set, a dump file named
// "<dump>-<hostname>-<tid>.txt" is opened and the most recent group and
// proxy control events of the context are passed to printEvent. The context
// and its pools are then freed. The caller that brings the initialization
// counter back to zero also prints and frees the shared detach pool.
// A dump file that cannot be named or opened disables the dump; the
// cleanup is carried out regardless. Always returns ncclSuccess.
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  char hostname[64] = { 0 };
  // the last byte is never written, so hostname stays NUL-terminated even
  // when gethostname truncates the name
  gethostname(hostname, sizeof(hostname) - 1);
  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    // bounded formatting: a dump prefix that does not fit is not used
    int len = snprintf(filename, sizeof(filename), "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid));
    if (len > 0 && (size_t)len < sizeof(filename)) fh = fopen(filename, "w");
    if (fh) fprintf(fh, "[\n");
  }

  // print last N groups/collectives/p2ps
  struct context* ctx = (struct context *)context;
  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
  int end = ctx->groupPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
  }

  // print last N proxy control events
  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
  end = ctx->proxyCtrlPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
  }

  free(ctx->groupPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
  free(ctx);

  // last thread cleans up shared detach pool
  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
    end = detachPoolIndex;
    for (int i = start; i < end; i++) {
      printEvent(fh, &detachPool[i%detachPoolSize]);
    }
    free(detachPool);
    // do not leave a dangling pointer behind for a later init/finalize cycle
    detachPool = NULL;
  }

  if (fh) fprintf(fh, "{}]\n");
  if (fh) fclose(fh);

  return ncclSuccess;
}
|
||||
|
||||
__hidden void updateEvent(void* handle);
|
||||
|
||||
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
|
||||
*eHandle = NULL;
|
||||
struct context* ctx = (struct context *)context;
|
||||
if (eDescr->type == ncclProfileGroup) {
|
||||
struct group* event;
|
||||
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
|
||||
// if there are available group events grab one
|
||||
event = &ctx->groupPool[groupId%groupPoolSize];
|
||||
while (!taskEventQueueEmpty(event)) {
|
||||
struct taskEventBase* base = taskEventQueueDequeue(event);
|
||||
if (base->type == ncclProfileColl) {
|
||||
struct collective* c = (struct collective *)base;
|
||||
// reset event proxyOps & proxySteps
|
||||
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
|
||||
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
|
||||
// release collective events in the group and return them to the collective pool
|
||||
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
|
||||
} else if (base->type == ncclProfileP2p) {
|
||||
struct p2p* p = (struct p2p *)base;
|
||||
// reset event proxyOp and proxySteps
|
||||
memset(&p->op, 0, sizeof(struct proxyOp));
|
||||
// release p2p events in the group and return them to the p2p pool
|
||||
__atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
event->type = ncclProfileGroup;
|
||||
__atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED);
|
||||
event->ctx = ctx;
|
||||
event->groupId = groupId;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "GroupStart");
|
||||
} else if (eDescr->type == ncclProfileColl) {
|
||||
// the parent might be null if we run out of events
|
||||
struct group* parent = (struct group *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
struct collective* event;
|
||||
int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
|
||||
// if there are available collective events grab one
|
||||
event = &ctx->collPool[collId%collPoolSize];
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
event->base.type = ncclProfileColl;
|
||||
event->base.rank = eDescr->rank;
|
||||
event->base.name = eDescr->coll.name;
|
||||
event->base.commHash = eDescr->coll.commHash;
|
||||
event->base.func = eDescr->coll.func;
|
||||
event->base.startTs = gettime() - startTime;
|
||||
event->base.parent = parent;
|
||||
event->seqNumber = eDescr->coll.seqNumber;
|
||||
event->sendBuff = eDescr->coll.sendBuff;
|
||||
event->recvBuff = eDescr->coll.recvBuff;
|
||||
event->count = eDescr->coll.count;
|
||||
event->root = eDescr->coll.root;
|
||||
event->datatype = eDescr->coll.datatype;
|
||||
event->op = eDescr->coll.op;
|
||||
event->trafficBytes = eDescr->coll.trafficBytes;
|
||||
event->nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
event->nWarps = eDescr->coll.nWarps;
|
||||
event->algo = eDescr->coll.algo;
|
||||
event->proto = eDescr->coll.proto;
|
||||
event->isCollnet = eDescr->coll.isCollnet;
|
||||
event->isNvls = eDescr->coll.isNvls;
|
||||
*eHandle = event;
|
||||
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
|
||||
// increment the group ref counter so the event will stay open
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "CollStart");
|
||||
} else if (eDescr->type == ncclProfileP2p) {
|
||||
// the parent might be null if we run out of events
|
||||
struct group* parent = (struct group *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
struct p2p* event;
|
||||
int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
|
||||
// if there are available p2p events grab one
|
||||
event = &ctx->p2pPool[p2pId%p2pPoolSize];
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
event->base.type = ncclProfileP2p;
|
||||
event->base.rank = eDescr->rank;
|
||||
event->base.name = eDescr->p2p.name;
|
||||
event->base.commHash = eDescr->p2p.commHash;
|
||||
event->base.func = eDescr->p2p.func;
|
||||
event->base.next = parent->eventHead;
|
||||
event->base.startTs = gettime() - startTime;
|
||||
event->base.parent = parent;
|
||||
event->buff = eDescr->p2p.buff;
|
||||
event->count = eDescr->p2p.count;
|
||||
event->datatype = eDescr->p2p.datatype;
|
||||
event->peer = eDescr->p2p.peer;
|
||||
*eHandle = event;
|
||||
// increment the group ref counter so the event will stay open
|
||||
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
|
||||
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "P2pStart");
|
||||
} else if (eDescr->type == ncclProfileProxyCtrl) {
|
||||
int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize];
|
||||
event->type = ncclProfileProxyCtrl;
|
||||
event->ctx = ctx;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
} else if (eDescr->type == ncclProfileProxyOp) {
|
||||
// the eventBase might be null if we run out of events
|
||||
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
|
||||
if (eventBase == NULL) return ncclSuccess;
|
||||
|
||||
if (eDescr->proxyOp.pid != pid) {
|
||||
// PXN captured proxyOp events
|
||||
struct proxyOp* event;
|
||||
int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
if ((detachId - detachPoolBase) < detachPoolSize) {
|
||||
// if there are available detached proxyOp events grab one
|
||||
event = &detachPool[detachId%detachPoolSize];
|
||||
} else {
|
||||
// else drop this event
|
||||
__atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
event->type = ncclProfileProxyOp;
|
||||
event->channelId = eDescr->proxyOp.channelId;
|
||||
event->pid = eDescr->proxyOp.pid;
|
||||
event->rank = eDescr->rank;
|
||||
event->peer = eDescr->proxyOp.peer;
|
||||
event->nSteps = eDescr->proxyOp.nSteps;
|
||||
event->chunkSize = eDescr->proxyOp.chunkSize;
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->startTs = gettime() - startTime;
|
||||
event->parent = NULL;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "PxnProxyOpStart");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
if (eventBase->type == ncclProfileColl) {
|
||||
struct collective* parent = (struct collective *)eDescr->parentObj;
|
||||
struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId];
|
||||
event->type = ncclProfileProxyOp;
|
||||
event->channelId = eDescr->proxyOp.channelId;
|
||||
event->pid = eDescr->proxyOp.pid;
|
||||
event->rank = eDescr->rank;
|
||||
event->peer = eDescr->proxyOp.peer;
|
||||
event->nSteps = eDescr->proxyOp.nSteps;
|
||||
event->chunkSize = eDescr->proxyOp.chunkSize;
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
} else { // ncclProfileP2p
|
||||
struct p2p* parent = (struct p2p *)eDescr->parentObj;
|
||||
struct proxyOp* event = &parent->op;
|
||||
event->type = ncclProfileProxyOp;
|
||||
event->channelId = eDescr->proxyOp.channelId;
|
||||
event->pid = eDescr->proxyOp.pid;
|
||||
event->rank = eDescr->rank;
|
||||
event->peer = eDescr->proxyOp.peer;
|
||||
event->nSteps = eDescr->proxyOp.nSteps;
|
||||
event->chunkSize = eDescr->proxyOp.chunkSize;
|
||||
event->isSend = eDescr->proxyOp.isSend;
|
||||
event->parent = eventBase;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
|
||||
debugEvent(event, "ProxyOpStart");
|
||||
}
|
||||
} else if (eDescr->type == ncclProfileProxyStep) {
|
||||
// the parent might be null if we run out of events
|
||||
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
|
||||
if (parent == NULL) return ncclSuccess;
|
||||
|
||||
int s = parent->stepCount++ % MAX_STEPS;
|
||||
struct proxyStep* event = &parent->step[s];
|
||||
event->type = ncclProfileProxyStep;
|
||||
event->step = eDescr->proxyStep.step;
|
||||
event->isSend = parent->isSend;
|
||||
event->parent = parent;
|
||||
event->startTs = gettime() - startTime;
|
||||
*eHandle = event;
|
||||
debugEvent(event, "ProxyStepStart");
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void updateEvent(void* handle) {
|
||||
uint8_t type = *(uint8_t *)handle;
|
||||
if (type == ncclProfileGroup) {
|
||||
struct group* event = (struct group *)handle;
|
||||
if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) {
|
||||
event->stopTs = gettime() - startTime;
|
||||
// return group event to the pool
|
||||
__atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
debugEvent(event, "GroupStop");
|
||||
} else if (type == ncclProfileColl) {
|
||||
struct collective* event = (struct collective *)handle;
|
||||
if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
|
||||
event->base.stopTs = gettime() - startTime;
|
||||
debugEvent(event, "CollStop");
|
||||
updateEvent(event->base.parent);
|
||||
return;
|
||||
}
|
||||
debugEvent(event, "CollStop");
|
||||
} else if (type == ncclProfileP2p) {
|
||||
struct p2p* event = (struct p2p *)handle;
|
||||
if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
|
||||
event->base.stopTs = gettime() - startTime;
|
||||
debugEvent(event, "P2pStop");
|
||||
updateEvent(event->base.parent);
|
||||
return;
|
||||
}
|
||||
debugEvent(event, "P2pStop");
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
struct proxyOp* event = (struct proxyOp *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
if (event->pid != pid) {
|
||||
// only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
|
||||
int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1;
|
||||
if (done == detachPoolSize) {
|
||||
// reset the event completed (done) counter
|
||||
__atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
|
||||
// update the base pointer to the top of the pool
|
||||
int index = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(&detachPoolBase, index, __ATOMIC_RELAXED);
|
||||
}
|
||||
debugEvent(event, "ProxyOpStop");
|
||||
return;
|
||||
}
|
||||
updateEvent(event->parent);
|
||||
debugEvent(event, "ProxyOpStop");
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* event = (struct proxyStep *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "ProxyStepStop");
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* event = (struct proxyCtrl *)handle;
|
||||
event->stopTs = gettime() - startTime;
|
||||
debugEvent(event, "ProxyCtrlStop");
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Plugin stop-event entry point.
 *
 * A NULL handle is accepted and ignored (the event handle might be null if we
 * run out of events). For group and collective events only the stop timestamp
 * is refreshed: stopping them in NCCL core does not mean they have completed,
 * only that they were submitted/enqueued, so the event is kept open. Every
 * other event type is handed to updateEvent().
 *
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  if (eHandle != NULL) {
    const uint8_t kind = *(uint8_t *)eHandle;
    if (kind == ncclProfileGroup) {
      ((struct group *)eHandle)->stopTs = gettime() - startTime;
    } else if (kind == ncclProfileColl) {
      ((struct collective *)eHandle)->base.stopTs = gettime() - startTime;
    } else {
      updateEvent(eHandle);
    }
  }
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Plugin record-event-state entry point.
 *
 * eHandle    - event to update; NULL is accepted and ignored (the handle
 *              might be null if we run out of events).
 * eState     - state being reported.
 * eStateArgs - state arguments; dereferenced only for ProxyOp events and for
 *              ProxyCtrl events reporting ncclProfilerProxyCtrlAppendEnd.
 *
 * ProxyOp:   stores the reported step count and a timestamp in the state slot
 *            selected by eState (send or recv indexing depending on isSend)
 *            and updates transSize. A SendRemFifoWait report whose step count
 *            equals the stored one is dropped.
 * ProxyStep: stores a timestamp in the slot selected by eState.
 * ProxyCtrl: remembers the latest state and, on AppendEnd, the number of
 *            appended proxy ops.
 *
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
  if (eHandle == NULL) return ncclSuccess;

  debugEvent(eHandle, "RecordEventState");

  const uint8_t kind = *(uint8_t *)eHandle;
  if (kind == ncclProfileProxyOp) {
    struct proxyOp* op = (struct proxyOp *)eHandle;
    const int slot = op->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState);
    const int recorded = op->states[slot].steps;
    // nothing new to record if the remote-fifo wait is still on the same step
    if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == recorded) return ncclSuccess;
    op->states[slot].steps = eStateArgs->proxyOp.steps;
    op->states[slot].timestamp = gettime() - startTime;
    op->transSize = eStateArgs->proxyOp.transSize;
  } else if (kind == ncclProfileProxyStep) {
    struct proxyStep* step = (struct proxyStep *)eHandle;
    const int slot = step->isSend ? PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState);
    step->timestamp[slot] = gettime() - startTime;
  } else if (kind == ncclProfileProxyCtrl) {
    struct proxyCtrl* ctrl = (struct proxyCtrl *)eHandle;
    if (eState == ncclProfilerProxyCtrlAppendEnd) {
      ctrl->appended = eStateArgs->proxyCtrl.appendedProxyOps;
    }
    ctrl->state = eState;
  }
  return ncclSuccess;
}
|
||||
|
||||
ncclProfiler_v1_t ncclProfiler_v1 = {
|
||||
"Example-profiler",
|
||||
exampleProfilerInit,
|
||||
exampleProfilerStartEvent,
|
||||
exampleProfilerStopEvent,
|
||||
exampleProfilerRecordEventState,
|
||||
exampleProfilerFinalize,
|
||||
};
|
||||
@@ -0,0 +1,277 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "profiler.h"
|
||||
#include "event.h"
|
||||
#include "print_event.h"
|
||||
|
||||
#define __hidden __attribute__ ((visibility("hidden")))

/*
 * Translate a numeric NCCL function id into its API name.
 *
 * Returns a pointer to a static string. Ids outside 0..7 yield "Unknown"
 * rather than NULL, so the result can always be passed to a printf "%s"
 * conversion.
 */
__hidden const char* ncclFuncToString(int func) {
  switch(func) {
  case 0: return "ncclBroadcast";
  case 1: return "ncclReduce";
  case 2: return "ncclAllGather";
  case 3: return "ncclReduceScatter";
  case 4: return "ncclAllReduce";
  case 5: return "ncclSendRecv";
  case 6: return "ncclSend";
  case 7: return "ncclRecv";
  default: return "Unknown";
  }
}

/*
 * Translate a numeric NCCL algorithm id into a printable name.
 *
 * Returns a pointer to a static string. Id 6 is the PAT algorithm
 * (NCCL_ALGO_PAT). Any other unrecognised id yields "Unknown"; previously
 * control ran off the end of the function, leaving the returned pointer
 * undefined.
 */
__hidden const char* ncclAlgoToString(int algo) {
  switch(algo) {
  case 0: return "Tree";
  case 1: return "Ring";
  case 2: return "CollnetDirect";
  case 3: return "CollnetChain";
  case 4: return "Nvls";
  case 5: return "NvlsTree";
  case 6: return "Pat";
  default: return "Unknown";
  }
}

/*
 * Translate a numeric NCCL protocol id into a printable name.
 *
 * Returns a pointer to a static string; unrecognised ids yield "Unknown"
 * (previously control ran off the end of the function).
 */
__hidden const char* ncclProtoToString(int proto) {
  switch(proto) {
  case 0: return "LL";
  case 1: return "LL128";
  case 2: return "Simple";
  default: return "Unknown";
  }
}
|
||||
|
||||
// FIXME: chrome tracing asynchronous events (used below) allow event nesting for events that have the same id and category.
|
||||
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
|
||||
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
|
||||
static __thread int groupId;
|
||||
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n",
|
||||
"Group", groupId, getpid(), 1, event->startTs, event->groupId);
|
||||
}
|
||||
|
||||
__hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"Group", groupId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int collId;
|
||||
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
|
||||
ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels);
|
||||
}
|
||||
|
||||
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs);
|
||||
}
|
||||
|
||||
static __thread int p2pId;
|
||||
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
|
||||
ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
|
||||
}
|
||||
|
||||
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs);
|
||||
}
|
||||
|
||||
static __thread int proxyOpId;
|
||||
__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
|
||||
if (event->isSend) {
|
||||
int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
|
||||
int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
|
||||
int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
|
||||
int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
|
||||
"Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
|
||||
} else {
|
||||
int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
|
||||
int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
|
||||
int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
|
||||
int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
|
||||
"Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
|
||||
}
|
||||
}
|
||||
|
||||
__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
static __thread int proxyStepId;
|
||||
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
|
||||
if (event->isSend) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
|
||||
"RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step);
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
"RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
}
|
||||
|
||||
static __thread int proxyCtrlId;
|
||||
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
|
||||
const char* str;
|
||||
if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) {
|
||||
str = "Idle";
|
||||
} else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) {
|
||||
str = "Sleep";
|
||||
} else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
|
||||
str = "Append";
|
||||
}
|
||||
if (event->state == ncclProfilerProxyCtrlAppendEnd) {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
|
||||
str, proxyCtrlId, getpid(), 1, event->startTs, event->appended);
|
||||
} else {
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
str, proxyCtrlId, getpid(), 1, event->startTs);
|
||||
}
|
||||
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
|
||||
str, proxyCtrlId++, getpid(), 1, event->stopTs);
|
||||
}
|
||||
|
||||
//#define DEBUG_EVENTS
|
||||
/*
 * Debug helper: append a textual dump of an event to "EventDebug-<pid>".
 *
 * eHandle - event to dump; its first byte is read as the event type tag.
 * tag     - free-form label written next to the event address.
 *
 * The function body is compiled only when DEBUG_EVENTS is defined; otherwise
 * the call is a no-op. In debug builds a NULL handle, or a log file that
 * cannot be opened, makes the call return without writing anything.
 */
void debugEvent(void* eHandle, const char* tag) {
#ifdef DEBUG_EVENTS
  if (eHandle == NULL) return;

  char filename[64] = { 0 };
  // bounded formatting: never write past the end of filename
  snprintf(filename, sizeof(filename), "EventDebug-%d", getpid());
  FILE* fh = fopen(filename, "a+");
  // debugging output is best effort: skip the dump if the file cannot be opened
  if (fh == NULL) return;

  uint8_t type = *(uint8_t *)eHandle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    fprintf(fh, "Group event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileColl) {
    struct collective* event = (struct collective *)eHandle;
    fprintf(fh, "Collective event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", (void *)event->base.parent);
    // only channels that actually hold a proxy op are listed
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, (void *)&event->send[i]);
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, (void *)&event->recv[i]);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileP2p) {
    struct p2p* event = (struct p2p *)eHandle;
    fprintf(fh, "P2p event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", (void *)event->base.parent);
    fprintf(fh, " op = %p\n", (void *)&event->op);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    fprintf(fh, "ProxyOp event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " channel = %d\n", event->channelId);
    fprintf(fh, " parent = %p\n", (void *)event->parent);
    fprintf(fh, " rank = %d\n", event->rank);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)eHandle;
    fprintf(fh, "ProxyStep event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " parent = %p\n", (void *)event->parent);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  }
  fclose(fh);
#else
  // debug dumps are compiled out: nothing to do
  (void)eHandle;
  (void)tag;
#endif
}
|
||||
|
||||
void printEvent(FILE* fh, void* handle) {
|
||||
if (handle == NULL || fh == NULL) return;
|
||||
uint8_t type = *(uint8_t *)handle;
|
||||
if (type == ncclProfileGroup) {
|
||||
struct group* g = (struct group *)handle;
|
||||
printGroupEventHeader(fh, g);
|
||||
struct taskEventBase* base = taskEventQueueHead(g);
|
||||
while (base) {
|
||||
struct taskEventBase* next = base->next;
|
||||
printEvent(fh, base);
|
||||
base = next;
|
||||
}
|
||||
printGroupEventTrailer(fh, g);
|
||||
} else if (type == ncclProfileColl) {
|
||||
struct collective* c = (struct collective *)handle;
|
||||
printCollEventHeader(fh, c);
|
||||
for (int i = 0; i < MAX_CHANNELS; i++) {
|
||||
printEvent(fh, &c->send[i]);
|
||||
printEvent(fh, &c->recv[i]);
|
||||
}
|
||||
printCollEventTrailer(fh, c);
|
||||
} else if (type == ncclProfileP2p) {
|
||||
struct p2p* p = (struct p2p *)handle;
|
||||
printP2pEventHeader(fh, p);
|
||||
printEvent(fh, &p->op);
|
||||
printP2pEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyOp) {
|
||||
struct proxyOp* p = (struct proxyOp *)handle;
|
||||
printProxyOpEventHeader(fh, p);
|
||||
for (int i = 0; i < MAX_STEPS; i++) {
|
||||
printEvent(fh, &p->step[i]);
|
||||
}
|
||||
printProxyOpEventTrailer(fh, p);
|
||||
} else if (type == ncclProfileProxyStep) {
|
||||
struct proxyStep* p = (struct proxyStep *)handle;
|
||||
printProxyStepEvent(fh, p);
|
||||
} else if (type == ncclProfileProxyCtrl) {
|
||||
struct proxyCtrl* p = (struct proxyCtrl *)handle;
|
||||
printProxyCtrlEvent(fh, p);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef PRINT_EVENT_H_
#define PRINT_EVENT_H_

// FILE is part of printEvent's signature; include it here so this header can
// be included on its own.
#include <stdio.h>

// Debug helper: when built with DEBUG_EVENTS, appends a dump of the event
// referenced by eHandle, labelled with tag, to a per-process debug file.
// Otherwise it does nothing.
void debugEvent(void* eHandle, const char* tag);

// Prints the event referenced by handle, together with its nested events,
// to fh. Does nothing if either argument is NULL.
void printEvent(FILE* fh, void* handle);

#endif
|
||||
@@ -27,7 +27,7 @@ typedef enum {
|
||||
ncclNumFuncs = 8
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
@@ -35,6 +35,7 @@ typedef enum {
|
||||
#define NCCL_ALGO_COLLNET_CHAIN 3
|
||||
#define NCCL_ALGO_NVLS 4
|
||||
#define NCCL_ALGO_NVLS_TREE 5
|
||||
#define NCCL_ALGO_PAT 6
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_UNDEF -1
|
||||
|
||||
@@ -10,6 +10,7 @@ VERBOSE ?= 0
|
||||
KEEP ?= 0
|
||||
DEBUG ?= 0
|
||||
ASAN ?= 0
|
||||
UBSAN ?= 0
|
||||
TRACE ?= 0
|
||||
PROFAPI ?= 1
|
||||
NVTX ?= 1
|
||||
@@ -93,6 +94,12 @@ LDFLAGS += -fsanitize=address -static-libasan
|
||||
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
|
||||
endif
|
||||
|
||||
ifneq ($(UBSAN), 0)
|
||||
CXXFLAGS += -fsanitize=undefined
|
||||
LDFLAGS += -fsanitize=undefined -static-libubsan
|
||||
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
|
||||
endif
|
||||
|
||||
ifneq ($(VERBOSE), 0)
|
||||
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
|
||||
CXXFLAGS += -Wall -Wextra
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 22
|
||||
NCCL_PATCH := 3
|
||||
NCCL_MINOR := 23
|
||||
NCCL_PATCH := 4
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+712
-238
File diff suppressed because it is too large
Load Diff
@@ -59,6 +59,7 @@ const char* ncclAlgoToString(int algo) {
|
||||
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
|
||||
case NCCL_ALGO_NVLS: return "NVLS";
|
||||
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
|
||||
case NCCL_ALGO_PAT: return "PAT";
|
||||
default: return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
+21
-11
@@ -19,7 +19,7 @@ static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
|
||||
static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV
|
||||
FILE *ncclDebugFile = stdout;
|
||||
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static std::chrono::steady_clock::time_point ncclEpoch;
|
||||
@@ -122,7 +122,7 @@ static void ncclDebugInit() {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX+1] = "";
|
||||
char *dfn = debugFn;
|
||||
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
|
||||
while (ncclDebugFileEnv[c] != '\0' && (dfn - debugFn) < PATH_MAX) {
|
||||
if (ncclDebugFileEnv[c++] != '%') {
|
||||
*dfn++ = ncclDebugFileEnv[c-1];
|
||||
continue;
|
||||
@@ -132,16 +132,24 @@ static void ncclDebugInit() {
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
|
||||
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%d", pid);
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
*dfn++ = ncclDebugFileEnv[c-1];
|
||||
if ((dfn - debugFn) < PATH_MAX) {
|
||||
*dfn++ = ncclDebugFileEnv[c-1];
|
||||
}
|
||||
break;
|
||||
}
|
||||
if ((dfn - debugFn) > PATH_MAX) {
|
||||
// snprintf wanted to overfill the buffer: set dfn to the end
|
||||
// of the buffer (for null char) and it will naturally exit
|
||||
// the loop.
|
||||
dfn = debugFn + PATH_MAX;
|
||||
}
|
||||
}
|
||||
*dfn = '\0';
|
||||
if (debugFn[0] != '\0') {
|
||||
@@ -181,9 +189,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
tid = syscall(SYS_gettid);
|
||||
}
|
||||
|
||||
int cudaDev;
|
||||
int cudaDev = 0;
|
||||
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
|
||||
cudaGetDevice(&cudaDev);
|
||||
(void)cudaGetDevice(&cudaDev);
|
||||
}
|
||||
|
||||
char buffer[1024];
|
||||
@@ -207,11 +215,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
va_start(vargs, fmt);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
|
||||
// vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
|
||||
// Rewind len so that we can replace the final \0 by \n
|
||||
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
buffer[len++] = '\n';
|
||||
if (len) fwrite(buffer, 1, len, ncclDebugFile);
|
||||
if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
|
||||
if (len) {
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
|
||||
|
||||
@@ -23,8 +23,11 @@ namespace {
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
|
||||
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
|
||||
/////////////// begin AllGather steps ///////////////
|
||||
@@ -46,7 +49,7 @@ namespace {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
@@ -54,7 +57,7 @@ namespace {
|
||||
offset = dataOffset + rankDest * count;
|
||||
|
||||
// Final wait/copy.
|
||||
prims.directRecv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -81,6 +84,31 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
size_t count, channelOffset, channelCount, chunkCount;
|
||||
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
|
||||
|
||||
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int last = 0;
|
||||
while (!last) {
|
||||
int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
|
||||
size_t inpIx, outIx;
|
||||
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
|
||||
prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
@@ -165,7 +193,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
@@ -203,19 +231,22 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
if (nSrcs != 0 && outIsDst+nDsts != 0) {
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
|
||||
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
|
||||
/*PreOpSrcs=*/0>
|
||||
(tid, tn, 0, nullptr, false,
|
||||
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
|
||||
return (char*)srcPtrs[src] + railAllOffset;
|
||||
return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
|
||||
return d < outIsDst ? outbuf + userOneBeg
|
||||
: work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
|
||||
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
}
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
@@ -281,15 +312,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 2: Recv network -> deposit output + send to bcast
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
|
||||
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/true> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat, work->direct, 0);
|
||||
}
|
||||
}
|
||||
return;
|
||||
@@ -299,15 +330,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
tn = nWarps3*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 3: Recv bcast -> deposit output
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
|
||||
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
|
||||
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*BcastSendNotRecv=*/false> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/0>(scat, 0, work->direct);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -23,8 +23,11 @@ namespace {
|
||||
int nelem;
|
||||
int chunk;
|
||||
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
|
||||
|
||||
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
|
||||
ssize_t remCount = channelCount - elemOffset;
|
||||
@@ -41,7 +44,7 @@ namespace {
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.send(offset, nelem);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
for (int j = 2; j < nranks; ++j) {
|
||||
@@ -49,7 +52,7 @@ namespace {
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
prims.directRecvReduceDirectSend(offset, offset, nelem);
|
||||
}
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
@@ -58,7 +61,7 @@ namespace {
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
|
||||
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j = 1; j < nranks - 1; ++j) {
|
||||
@@ -66,7 +69,7 @@ namespace {
|
||||
chunkOffset = chunk * chunkCount;
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
@@ -75,7 +78,7 @@ namespace {
|
||||
offset = gridOffset + elemOffset + chunkOffset;
|
||||
nelem = (int)min(chunkCount, remCount - chunkOffset);
|
||||
|
||||
prims.directRecv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,34 +93,34 @@ namespace {
|
||||
int nelem;
|
||||
|
||||
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
|
||||
if (tree->up == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
prims.directRecvReduceCopy(offset, offset, nelem, /*postOp=*/true);
|
||||
}
|
||||
}
|
||||
else if (tree->down[0] == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
prims.directSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
prims.directRecvReduceDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
|
||||
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
|
||||
if (tree->up == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -129,14 +132,14 @@ namespace {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,11 +167,11 @@ namespace {
|
||||
if (tree->up == -1) {
|
||||
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
|
||||
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*doPost=*/true);
|
||||
}
|
||||
}
|
||||
else if (tid < nthreadsSplit) {
|
||||
@@ -180,40 +183,46 @@ namespace {
|
||||
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
|
||||
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
|
||||
*/
|
||||
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
|
||||
// FanAsymmetric<n, 1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
|
||||
if (tree->down[0] == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.send(offset, nelem);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
prims.directRecvReduceDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
|
||||
// FanAsymmetric<1, n>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 1*Proto::MaxGroupWidth);
|
||||
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
|
||||
if (tree->down[0] == -1) {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -264,9 +273,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
|
||||
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
|
||||
// Scatter
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
|
||||
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
|
||||
int nelem = min(direct->nHeads*chunkSize, size-offset);
|
||||
@@ -276,12 +285,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
|
||||
}
|
||||
}
|
||||
// Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
|
||||
// a false positive.
|
||||
// coverity[overrun-call:FALSE]
|
||||
} else if (tid >= tidStartReduce && direct->out != -1) {
|
||||
if (hasDn) {
|
||||
// Reduce, send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
|
||||
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
@@ -323,6 +335,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
|
||||
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
|
||||
if (hasDn) {
|
||||
// Recv from network, broadcast
|
||||
// Coverity complains about a possible overrun inside the class below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[identity_transfer:FALSE]
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
|
||||
@@ -382,7 +397,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
ssize_t offset;
|
||||
int nelem;
|
||||
int remCount = channelCount%(nvls->nHeads*chunkSize);
|
||||
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
|
||||
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));
|
||||
|
||||
if (tid < tidEndScatter) {
|
||||
// Scatter
|
||||
@@ -456,6 +471,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
if (!hasOut) {
|
||||
// Reduce, broadcast through NVLS
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
|
||||
// Coverity complains about a possible overrun inside the class below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[identity_transfer:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
@@ -467,6 +485,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
// Coverity complains about a possible overrun inside the class below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[identity_transfer:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
|
||||
@@ -479,6 +500,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
// Coverity complains about a possible overrun inside the class below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[identity_transfer:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
@@ -564,6 +588,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
|
||||
} else {
|
||||
// Reduce, send to network
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
|
||||
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
|
||||
// FanAsymmetric<3, 1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
|
||||
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
@@ -579,6 +606,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
|
||||
} else if (tid < tidEndBcast && nvls->headRank != -1) {
|
||||
// Recv from network, broadcast
|
||||
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
|
||||
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
|
||||
// FanAsymmetric<1, 3>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
|
||||
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
|
||||
@@ -639,21 +669,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.send(offset, nelem);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
prims.directRecvReduceDirectSend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -668,40 +698,49 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
|
||||
}
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Coverity reports that the callee treats &send as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recv(offset, nelem, /*postOp*/true);
|
||||
prims.directRecv(offset, offset, nelem, /*postOp*/true);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Coverity reports that the callee treats &send as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid * int(chunkSize);
|
||||
int nelem = min(chunkSize, size - offset);
|
||||
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
|
||||
prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Coverity reports that the callee treats &send as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
|
||||
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
|
||||
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
|
||||
if (send == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
}
|
||||
} else {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,8 +24,11 @@ namespace {
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
offset = gridOffset + elemOffset;
|
||||
@@ -33,14 +36,14 @@ namespace {
|
||||
|
||||
if (rank == root) {
|
||||
if (inputBuf == outputBuf) {
|
||||
prims.send(offset, nelem);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
} else {
|
||||
prims.copySend(offset, offset, nelem);
|
||||
prims.directCopySend(offset, offset, nelem);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
prims.recv(offset, nelem);
|
||||
prims.directRecv(offset, offset, nelem);
|
||||
} else {
|
||||
prims.recvCopySend(offset, nelem);
|
||||
prims.directRecvCopyDirectSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,7 +97,7 @@ __device__ inline void barrier_sync_aligned(int name, int nThreads) {
|
||||
|
||||
__device__ inline bool barrier_red_or(bool vote, int name) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
asm volatile("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred p, %2, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
@@ -106,7 +106,7 @@ __device__ inline bool barrier_red_or(bool vote, int name) {
|
||||
}
|
||||
__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
asm volatile("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred p, %2, %3, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
@@ -115,7 +115,7 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
|
||||
}
|
||||
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
|
||||
int ans;
|
||||
asm("{ .reg .pred p;"
|
||||
asm volatile("{ .reg .pred p;"
|
||||
" setp.ne.s32 p, %1, 0;"
|
||||
" barrier.red.or.pred.aligned p, %2, p; "
|
||||
" selp.s32 %0, 1, 0, p; }"
|
||||
@@ -137,9 +137,9 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
|
||||
int offset = 16*tid;
|
||||
if (offset < bytes) {
|
||||
uint64_t a=0, b=0;
|
||||
asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
|
||||
asm volatile("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset) : "memory");
|
||||
uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
|
||||
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
|
||||
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b) : "memory");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,6 +300,9 @@ struct RunWorkBatch {
|
||||
if (work->nWarps != workPrev->nWarps) __syncthreads();
|
||||
}
|
||||
int subtn = work->nWarps*WARP_SIZE;
|
||||
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
|
||||
// However, the code ensures that the participation is on a per-warp basis.
|
||||
// coverity[device_thread_diverged:FALSE]
|
||||
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
|
||||
}
|
||||
}
|
||||
@@ -348,6 +351,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
default:
|
||||
{ int subtid = tid - 2*WARP_SIZE;
|
||||
int subtn = tn - 2*WARP_SIZE;
|
||||
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
|
||||
// However, the code ensures that the participation is on a per-warp basis.
|
||||
// coverity[device_thread_diverged:FALSE]
|
||||
loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
|
||||
} break;
|
||||
}
|
||||
|
||||
@@ -69,6 +69,8 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
#pragma unroll
|
||||
for (int d=0; d < MinDsts; d++)
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
|
||||
|
||||
// We dictate loop termination condition according to whether partial hunks
|
||||
@@ -93,13 +95,17 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
|
||||
#pragma unroll (MinSrcs-1 + !(MinSrcs-1))
|
||||
for (int s=1; s < MinSrcs; s++) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_begin]
|
||||
BytePack<BytePerPack> tmp[Unroll];
|
||||
// coverity[dead_error_line]
|
||||
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
if (s < MultimemSrcs) {
|
||||
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
|
||||
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
|
||||
// coverity[dead_error_line]
|
||||
tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
|
||||
} else {
|
||||
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
|
||||
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
|
||||
@@ -108,6 +114,7 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
// coverity[dead_error_line]
|
||||
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
|
||||
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
|
||||
}
|
||||
@@ -116,6 +123,8 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
|
||||
uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
|
||||
BytePack<BytePerPack> tmp[Unroll];
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
@@ -125,6 +134,8 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
}
|
||||
#pragma unroll Unroll
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
|
||||
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
|
||||
}
|
||||
@@ -139,7 +150,10 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
#pragma unroll (MinDsts + !MinDsts)
|
||||
for (int d=0; d < MinDsts; d++) {
|
||||
#pragma unroll Unroll
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_begin]
|
||||
for (int u=0; u < Unroll; u++) {
|
||||
// coverity[dead_error_condition]
|
||||
if (d < MultimemDsts) {
|
||||
multimem_st_global(minDsts[d], acc[u]);
|
||||
} else {
|
||||
@@ -161,6 +175,8 @@ __device__ __forceinline__ void reduceCopyPacks(
|
||||
#pragma unroll
|
||||
for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
|
||||
#pragma unroll
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
|
||||
threadBytesBehind += nWarps*BytePerHunk;
|
||||
threadBytesAhead -= nWarps*BytePerHunk;
|
||||
|
||||
@@ -7,7 +7,7 @@ all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","Send
|
||||
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
|
||||
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
|
||||
all_protos = ["LL","LL128","SIMPLE"]
|
||||
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
|
||||
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]
|
||||
|
||||
################################################################################
|
||||
# The first command line argument is the path to the directory to generate and
|
||||
@@ -74,11 +74,11 @@ else:
|
||||
################################################################################
|
||||
|
||||
# Maps each collective name to the list of algorithm names given for it.
# NOTE(review): this is a diff listing without +/- markers, so "AllGather",
# "AllReduce" and "ReduceScatter" each appear twice -- presumably the removed
# entry followed by its replacement. The later entries append "PAT" to
# AllGather and ReduceScatter and spell out the AllReduce list explicitly
# instead of referencing all_algos.
algos_of_coll = {
|
||||
"AllGather": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"AllReduce": all_algos,
|
||||
"AllGather": ["RING","COLLNET_DIRECT","NVLS","PAT"],
|
||||
"AllReduce": ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"],
|
||||
"Broadcast": ["RING"],
|
||||
"Reduce": ["RING"],
|
||||
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
|
||||
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS","PAT"],
|
||||
"SendRecv": [None]
|
||||
}
|
||||
|
||||
@@ -253,6 +253,9 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
|
||||
cudart, _ = required_cuda(*kfn)
|
||||
sym = paste("_", "ncclDevKernel", *kfn)
|
||||
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
|
||||
# __global__ below gets removed by the host compiler, which results in
|
||||
# Coverity diagnosing a specifier inconsistency.
|
||||
out("// coverity[declaration]\n")
|
||||
out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
|
||||
if cudart != 0: out("#endif\n")
|
||||
out("\n")
|
||||
|
||||
@@ -19,10 +19,10 @@
|
||||
// Loads one 64-bit value from `ptr` into `v` with a single inline-PTX load.
// When __CUDA_ARCH__ >= 700 the instruction is ld.relaxed.gpu.u64; otherwise
// it is ld.volatile.global.u64.
// NOTE(review): this is a diff listing without +/- markers. Each asm statement
// is followed by two operand lines: one without a clobber list and one adding
// a "memory" clobber -- presumably the removed and the added line respectively.
inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
|
||||
: "=l"(v) : "l"(ptr));
|
||||
: "=l"(v) : "l"(ptr) : "memory");
|
||||
#else
|
||||
asm volatile("ld.volatile.global.u64 {%0}, [%1];"
|
||||
: "=l"(v) : "l"(ptr));
|
||||
: "=l"(v) : "l"(ptr) : "memory");
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -226,6 +226,8 @@ inline __device__ void ncclNetDeviceUnpackInner(
|
||||
|
||||
int PPW = ppw(nbytes, nw);
|
||||
|
||||
// Coverity reports a potential overflow but in reality PPW is tiny so there's no need to store it in a uint64_t.
|
||||
// coverity[overflow_before_widen]
|
||||
for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
|
||||
|
||||
uint64_t iter_meta_cnt = meta_cnt - meta_s;
|
||||
|
||||
@@ -11,28 +11,28 @@
|
||||
|
||||
// Loads two 64-bit values from `ptr` into `v0` and `v1` with one
// ld.volatile.global.v2.u64 instruction.
// NOTE(review): diff listing without +/- markers. The two operand lines differ
// only by the trailing "memory" clobber -- presumably the removed line followed
// by the added one.
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
|
||||
: "=l"(v0), "=l"(v1) : "l"(ptr));
|
||||
: "=l"(v0), "=l"(v1) : "l"(ptr) : "memory");
|
||||
}
|
||||
|
||||
// Stores the two 64-bit values `v0` and `v1` to `ptr` with one
// st.volatile.global.v2.u64 instruction.
// NOTE(review): diff listing without +/- markers. The two operand lines differ
// only by the trailing "memory" clobber -- presumably the removed line followed
// by the added one.
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
|
||||
asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
|
||||
:: "l"(v0), "l"(v1), "l"(ptr));
|
||||
:: "l"(v0), "l"(v1), "l"(ptr) : "memory");
|
||||
}
|
||||
|
||||
// Converts the generic pointer `shmemGenericPtr` with the PTX instruction
// cvta.to.shared.u64 and returns the result, typed as uint64_t*.
// NOTE(review): diff listing without +/- markers. The asm statement appears
// twice, differing only by the "memory" clobber -- presumably the removed
// version followed by the added one.
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
|
||||
uint64_t* shmemAsmPtr;
|
||||
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
|
||||
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr) : "memory");
|
||||
return shmemAsmPtr;
|
||||
}
|
||||
|
||||
// Loads two 64-bit values from `shmemAsmPtr` into `v0` and `v1` with one
// ld.volatile.shared.v2.u64 instruction.
// NOTE(review): the parameter name suggests the pointer is expected to come
// from shmemCvtPtr -- confirm against callers.
// NOTE(review): diff listing without +/- markers. The two operand lines differ
// only by the "memory" clobber -- presumably removed line, then added line.
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
|
||||
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
|
||||
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr) : "memory");
|
||||
}
|
||||
|
||||
// Stores the two 64-bit values `v0` and `v1` to `shmemAsmPtr` with one
// st.volatile.shared.v2.u64 instruction.
// NOTE(review): the parameter name suggests the pointer is expected to come
// from shmemCvtPtr -- confirm against callers.
// NOTE(review): diff listing without +/- markers. The two operand lines differ
// only by the "memory" clobber -- presumably removed line, then added line.
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
|
||||
asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
|
||||
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
|
||||
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr) : "memory");
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
@@ -48,20 +48,20 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
|
||||
// Produce 4 bytes of sub-register type by reading 2 4-byte
|
||||
// aligned values and shifting.
|
||||
uint32_t lo, hi;
|
||||
asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0));
|
||||
asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1));
|
||||
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0) : "memory");
|
||||
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1) : "memory");
|
||||
tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
|
||||
}
|
||||
}
|
||||
else if(sizeof(T) == 4) {
|
||||
#pragma unroll
|
||||
for(int e=0; e < 4; e++)
|
||||
asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e));
|
||||
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e) : "memory");
|
||||
}
|
||||
else /*sizeof(T)==8*/ {
|
||||
#pragma unroll
|
||||
for(int e=0; e < 2; e++)
|
||||
asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e));
|
||||
asm volatile("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e) : "memory");
|
||||
}
|
||||
v0 = tmp8[0];
|
||||
v1 = tmp8[1];
|
||||
@@ -146,6 +146,9 @@ struct BytePackOf<BytePack<0>> {
|
||||
// Returns `value` reinterpreted as its BytePackOf<T>::Pack representation.
// The conversion goes through an anonymous union: `value` is written to the
// T member `v` and the Pack member `p` is returned.
template<typename T>
|
||||
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
|
||||
union { typename BytePackOf<T>::Pack p; T v; };
|
||||
// Coverity recommends the use of std::move here but, given that T is a POD
|
||||
// scalar, a plain copy will be just as efficient.
|
||||
// coverity[copy_assignment_call]
|
||||
v = value;
|
||||
return p;
|
||||
}
|
||||
@@ -183,7 +186,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
|
||||
template<> \
|
||||
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
|
||||
data_cxx_ty tmp; \
|
||||
asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
|
||||
asm volatile("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
|
||||
BytePack<bytes> ans; \
|
||||
ans.native = tmp; \
|
||||
return ans; \
|
||||
@@ -191,7 +194,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
|
||||
template<> \
|
||||
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
|
||||
data_cxx_ty tmp; \
|
||||
asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
|
||||
asm volatile("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
|
||||
BytePack<bytes> ans; \
|
||||
ans.native = tmp; \
|
||||
return ans; \
|
||||
@@ -212,7 +215,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
|
||||
template<> \
|
||||
__device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
|
||||
data_cxx_ty tmp; \
|
||||
asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
|
||||
asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \
|
||||
BytePack<bytes> ans; \
|
||||
ans.native = tmp; \
|
||||
return ans; \
|
||||
@@ -242,18 +245,18 @@ DEFINE_ld_st__size(8, uint64_t, b64, l)
|
||||
template<> \
|
||||
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
|
||||
BytePack<16> ans; \
|
||||
asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
|
||||
asm volatile("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
|
||||
return ans; \
|
||||
} \
|
||||
template<> \
|
||||
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
|
||||
BytePack<16> ans; \
|
||||
asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
|
||||
asm volatile("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
|
||||
return ans; \
|
||||
} \
|
||||
template<> \
|
||||
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
|
||||
asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
|
||||
asm volatile("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
|
||||
}
|
||||
DEFINE_ld_st_16__space(global, uintptr_t, l)
|
||||
DEFINE_ld_st_16__space(shared, uint32_t, r)
|
||||
@@ -262,7 +265,7 @@ DEFINE_ld_st_16__space(shared, uint32_t, r)
|
||||
// 16-byte specialization of ld_relaxed_gpu_global: loads two 64-bit halves
// from global address `addr` into ans.u64[0] and ans.u64[1] with one v2.b64
// load qualified by the PTX_relaxed_gpu string, and returns the pack.
// NOTE(review): diff listing without +/- markers. The statement appears once
// as plain `asm` and once as `asm volatile` with a "memory" clobber --
// presumably the removed line followed by the added one.
template<>
|
||||
__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
|
||||
BytePack<16> ans;
|
||||
asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));
|
||||
asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory");
|
||||
return ans;
|
||||
}
|
||||
template<>
|
||||
@@ -277,33 +280,33 @@ __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePa
|
||||
|
||||
// Returns the 64-bit value at `ptr`, read with ld.volatile.global.u64 after
// passing the pointer through cvta_to_global.
// NOTE(review): diff listing without +/- markers. The statement appears once
// as plain `asm` and once as `asm volatile` with a "memory" clobber --
// presumably the removed line followed by the added one.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
|
||||
uint64_t ans;
|
||||
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
return ans;
|
||||
}
|
||||
// Returns the 64-bit value at `ptr` (converted with cvta_to_global).
// When __CUDA_ARCH__ >= 700 the load is ld.relaxed.sys.global.u64; otherwise
// it falls back to ld.volatile.global.u64.
// NOTE(review): diff listing without +/- markers. In each branch the statement
// appears once as plain `asm` and once as `asm volatile` with a "memory"
// clobber -- presumably the removed line followed by the added one.
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
|
||||
uint64_t ans;
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#else
|
||||
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#endif
|
||||
return ans;
|
||||
}
|
||||
// Returns the 64-bit value at `ptr` (converted with cvta_to_global).
// When __CUDA_ARCH__ >= 700 the load is ld.relaxed.gpu.global.u64; otherwise
// it falls back to ld.volatile.global.u64.
// NOTE(review): diff listing without +/- markers. In each branch the statement
// appears once as plain `asm` and once as `asm volatile` with a "memory"
// clobber -- presumably the removed line followed by the added one.
__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
|
||||
uint64_t ans;
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#else
|
||||
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#endif
|
||||
return ans;
|
||||
}
|
||||
// Returns the 64-bit value at `ptr` (converted with cvta_to_global).
// When __CUDA_ARCH__ >= 700 the load is ld.acquire.sys.global.u64; otherwise
// the asm string issues a volatile load followed by membar.gl.
// NOTE(review): diff listing without +/- markers. In each branch the statement
// appears once as plain `asm` and once as `asm volatile` with a "memory"
// clobber -- presumably the removed line followed by the added one.
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
|
||||
uint64_t ans;
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#else
|
||||
asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
#endif
|
||||
return ans;
|
||||
}
|
||||
|
||||
@@ -115,19 +115,25 @@ struct PrimitivesWithoutDirect {
|
||||
// Forwards to RealPrimitives::sendFromOutput(outIx, eltN) on this object,
// reached through a static_cast of `this` to RealPrimitives*.
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
|
||||
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
|
||||
}
|
||||
// Forwards to RealPrimitives::recv(outIx, eltN, /*postOp=*/false).
// NOTE(review): diff listing without +/- markers. Two signature lines appear,
// presumably the removed one followed by the added one; the later signature
// adds a leading `inpIx` parameter that the body does not use.
__device__ void directRecv(intptr_t outIx, int eltN) {
|
||||
__device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
|
||||
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
|
||||
}
|
||||
// Forwards to RealPrimitives::copySend(inpIx, outIx, eltN, postOp) with all
// arguments passed through unchanged; `postOp` defaults to false.
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Forwards to RealPrimitives::recvCopySend(outIx, eltN, /*postOp=*/false).
// NOTE(review): diff listing without +/- markers. Two signature lines appear,
// presumably the removed one (directRecvCopySend) followed by the added one
// (directRecvCopyDirectSend). The later signature has a `postOp` parameter,
// but the body still passes a literal false -- confirm this is intended.
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
|
||||
__device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
|
||||
}
|
||||
// Forwards to RealPrimitives::recvReduceCopySend(inpIx, outIx, eltN, postOp)
// with all arguments passed through unchanged.
// NOTE(review): diff listing without +/- markers. Two signature lines appear,
// presumably the removed name (directRecvReduceCopySend) followed by the added
// name (recvReduceCopyDirectSend); the parameter lists are identical.
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
__device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
// Direct is only for the send part
|
||||
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Forwards to RealPrimitives::recvReduceSend(inpIx, eltN).
// Only `inpIx` and `eltN` are forwarded; `outIx` and `postOp` are accepted
// but not used by this body.
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
|
||||
static_cast<RealPrimitives*>(this)->recvReduceSend(inpIx, eltN);
|
||||
}
|
||||
// Forwards to RealPrimitives::recvReduceCopySend(inpIx, outIx, eltN, postOp)
// with all arguments passed through unchanged; `postOp` defaults to false.
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
|
||||
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
};
|
||||
|
||||
#include "prims_simple.h"
|
||||
|
||||
@@ -101,7 +101,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
int spins = 0;
|
||||
do {
|
||||
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory");
|
||||
if (checkAbort(spins, 0)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
@@ -112,9 +112,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
// For each index i from BeginIx up to MaxRecv that is also below fan.nrecv(),
// issues one 16-byte ld.volatile.global.v4.u32 from recvPtr(i) + offset and
// stores the four 32-bit words into line[i].data1/flag1/data2/flag2.
// The flags are not checked here.
// NOTE(review): diff listing without +/- markers. The load appears once as
// plain `asm` and once as `asm volatile` with a "memory" clobber -- presumably
// the removed line followed by the added one.
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
|
||||
#pragma unroll
|
||||
for (int i=BeginIx; i < MaxRecv; i++) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
if (i < fan.nrecv()) {
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -123,7 +125,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint32_t flag = recvFlag(i);
|
||||
int spins = 0;
|
||||
while (line[i].flag1 != flag || line[i].flag2 != flag) {
|
||||
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
|
||||
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
|
||||
if (checkAbort(spins, 0)) break;
|
||||
}
|
||||
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
|
||||
@@ -131,7 +133,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
}
|
||||
|
||||
// Writes one 16-byte line to dst->i4 with a single st.volatile.global.v4.u32.
// The four words are, in order: the low 32 bits of `val`, `flag`, the high 32
// bits of `val`, and `flag` again.
// NOTE(review): diff listing without +/- markers. The statement appears twice,
// differing only by the "memory" clobber -- presumably the removed line
// followed by the added one.
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
|
||||
}
|
||||
|
||||
static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T);
|
||||
@@ -145,13 +147,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint64_t u8;
|
||||
};
|
||||
if(sizeof(U) == 1)
|
||||
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
|
||||
asm volatile("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
|
||||
else if(sizeof(U) == 2)
|
||||
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
|
||||
asm volatile("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src) : "memory");
|
||||
else if(sizeof(U) == 4)
|
||||
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
|
||||
asm volatile("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
|
||||
else
|
||||
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
|
||||
asm volatile("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src) : "memory");
|
||||
return elt;
|
||||
}
|
||||
|
||||
@@ -165,13 +167,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
};
|
||||
elt = val;
|
||||
if(sizeof(U) == 1)
|
||||
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
|
||||
asm volatile("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
|
||||
else if(sizeof(U) == 2)
|
||||
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
|
||||
asm volatile("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2) : "memory");
|
||||
else if(sizeof(U) == 4)
|
||||
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
|
||||
asm volatile("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
|
||||
else
|
||||
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
|
||||
asm volatile("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8) : "memory");
|
||||
}
|
||||
|
||||
struct DataLoader {
|
||||
@@ -194,6 +196,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
else {
|
||||
#pragma unroll
|
||||
for(int i=0; i < EltPerLine; i++) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
if(i==0 || i < eltN)
|
||||
elt[i] = load(src + i);
|
||||
}
|
||||
@@ -218,6 +222,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
u8 = val;
|
||||
#pragma unroll
|
||||
for(int i=0; i < EltPerLine; i++) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
if (i==0 || i < eltN)
|
||||
//store(dst+i, elt[i]);
|
||||
dst[i] = elt[i];
|
||||
@@ -261,6 +267,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
if (RECV) {
|
||||
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
|
||||
#pragma unroll MaxRecv
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
|
||||
peerData = readLLFinish(offset, line, i);
|
||||
data = applyReduce(redOp, peerData, data);
|
||||
@@ -271,6 +279,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
|
||||
// Send : inter-node, then intra-node, then local
|
||||
if (SEND) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
|
||||
storeLL(sendPtr(i)+offset, data, sendFlag(i));
|
||||
storeLL(sendPtr(0)+offset, data, sendFlag(0));
|
||||
@@ -288,6 +298,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
postRecv();
|
||||
}
|
||||
if (SEND) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
|
||||
incSend(i, offset);
|
||||
incSend(0, offset);
|
||||
@@ -324,8 +336,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
__device__ Primitives(
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
|
||||
bool userBufReg=false, int stepSize_=0
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
|
||||
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
|
||||
@@ -334,16 +346,23 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
|
||||
int nrecv=0, nsend=0;
|
||||
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
|
||||
nrecv++;
|
||||
}
|
||||
// coverity[dead_error_line]
|
||||
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
|
||||
// happen given the two "while" loops just above.
|
||||
// coverity[var_deref_model:FALSE]
|
||||
loadRecvSync();
|
||||
// coverity[var_deref_model:FALSE]
|
||||
loadSendSync();
|
||||
setDataPtrs(inputBuf, outputBuf);
|
||||
}
|
||||
|
||||
@@ -234,6 +234,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
}
|
||||
}
|
||||
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
|
||||
uint64_t flag = recvFlag(i);
|
||||
uint64_t* ptr = recvPtr(i)+ll128Offset;
|
||||
@@ -272,6 +274,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
|
||||
/************************ Send **************************/
|
||||
if (SEND) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
|
||||
uint64_t flag = sendFlag(i);
|
||||
uint64_t* ptr = sendPtr(i)+ll128Offset;
|
||||
@@ -365,7 +369,7 @@ public:
|
||||
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
|
||||
bool userBufReg=false, int stepSize_=0
|
||||
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
|
||||
@@ -383,7 +387,11 @@ public:
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
|
||||
// happen given the two "while" loops just above.
|
||||
// coverity[var_deref_model:FALSE]
|
||||
loadRecvSync();
|
||||
// coverity[var_deref_model:FALSE]
|
||||
loadSendSync();
|
||||
setDataPtrs(inputBuf, outputBuf);
|
||||
}
|
||||
|
||||
@@ -7,6 +7,12 @@
|
||||
#include "network/unpack/unpack.h"
|
||||
#include <cassert>
|
||||
|
||||
// Mode selector with three values: a default mode (0) and two PAT modes.
// NOTE(review): "Rs" and "Ag" presumably stand for ReduceScatter and
// AllGather, the two collectives that list the PAT algorithm -- confirm
// against the code that consumes this enum.
enum primsMode {
|
||||
primsModeDefault = 0,
|
||||
primsModePatRs = 1,
|
||||
primsModePatAg = 2
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct,
|
||||
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
|
||||
class Primitives<
|
||||
@@ -14,21 +20,25 @@ class Primitives<
|
||||
> {
|
||||
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
|
||||
static constexpr int Input=0, Output=1;
|
||||
static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
|
||||
static constexpr int RoleInput = 0x01,
|
||||
RoleOutput = 0x02,
|
||||
RoleWaitRecv = 0x04,
|
||||
RoleWaitSend = 0x08,
|
||||
RolePostSend = 0x10,
|
||||
RolePostRecv = 0x20,
|
||||
Aborted = 0x40,
|
||||
UserBufferMode = 0x80,
|
||||
NetRegMode = 0x80,
|
||||
ConnFifoEnabled = 0x100,
|
||||
DirectWrite = 0x200,
|
||||
DirectRead = 0x400,
|
||||
// 0x800 is free to use
|
||||
PatMode = 0x800,
|
||||
NvlsMinPolling = 0x1000,
|
||||
NetDeviceUnpack = 0x2000,
|
||||
AnyNetDeviceUnpack = 0x4000,
|
||||
NvlsDirectRead = 0x8000,
|
||||
NvlsDirectWrite = 0x10000;
|
||||
NvlsDirectWrite = 0x10000,
|
||||
IpcWrite = 0x20000,
|
||||
IpcRead = 0x40000;
|
||||
const int tid, tidInBlock;
|
||||
const int nthreads;
|
||||
int nworkers;
|
||||
@@ -38,13 +48,15 @@ class Primitives<
|
||||
int flags;
|
||||
int group;
|
||||
uint64_t step;
|
||||
struct ncclConnInfo* conn = NULL;
|
||||
struct ncclConnFifo* connFifo = NULL;
|
||||
T* connEltsFifo;
|
||||
T* directBuff;
|
||||
T* directBuff = NULL;
|
||||
uint64_t *connStepPtr;
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
int connStepSize; // Connection step size
|
||||
void* netDeviceHandle;
|
||||
uint64_t accSize; // Accumulated size. Used by PAT operations
|
||||
|
||||
// Don't use barrier 0 as it's used by the final sync
|
||||
__device__ void barrier() {
|
||||
@@ -95,7 +107,7 @@ class Primitives<
|
||||
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
|
||||
if (flags & NvlsMinPolling) {
|
||||
uint64_t ans;
|
||||
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
|
||||
asm volatile("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
|
||||
return ans;
|
||||
}
|
||||
#endif
|
||||
@@ -107,8 +119,10 @@ class Primitives<
|
||||
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
|
||||
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
|
||||
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
|
||||
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
|
||||
const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input
|
||||
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
|
||||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
|
||||
int spins = 0;
|
||||
@@ -125,28 +139,30 @@ class Primitives<
|
||||
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
|
||||
: (ncclShmem.groups[group].srcs + Src);
|
||||
if (flags & UserBufferMode) {
|
||||
if (flags & NetRegMode) {
|
||||
// Do nothing
|
||||
} else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
|
||||
} else if (isSendNotRecv && DirectSend) {
|
||||
if (flags & (DirectWrite | NvlsDirectWrite)) {
|
||||
if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
|
||||
ptrs[index] = directBuff + dstIx + offset;
|
||||
} else if (flags & DirectRead) { // empty send
|
||||
} else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
} else if (!isSendNotRecv && DirectRecv) {
|
||||
if (flags & (DirectRead | NvlsDirectRead)) {
|
||||
if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
|
||||
ptrs[index] = directBuff + srcIx + offset;
|
||||
} else if (flags & DirectWrite) {
|
||||
} else if ((flags & DirectWrite) || (flags & IpcWrite)) {
|
||||
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
|
||||
}
|
||||
if (flags & NetDeviceUnpack) {
|
||||
@@ -182,7 +198,7 @@ class Primitives<
|
||||
int slice = 0;
|
||||
int offset = 0;
|
||||
|
||||
if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
|
||||
if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {
|
||||
// Worker-only loop for non-empty slices. Non-workers and empty slices are
|
||||
// processed in the loop following this if block. The benefit of splitting
|
||||
// the loop like this is we pull two branches out of the critical path.
|
||||
@@ -234,7 +250,7 @@ class Primitives<
|
||||
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
|
||||
/* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
|
||||
* so we need to check whether MultimemSrcs and MultimemDsts are 0. */
|
||||
&& MultimemSrcs == 0 && MultimemDsts == 0) {
|
||||
&& MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (Send) {
|
||||
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
|
||||
@@ -250,7 +266,7 @@ class Primitives<
|
||||
Recv, ncclShmem.groups[group].srcs,
|
||||
Dst, ncclShmem.groups[group].dsts,
|
||||
workSize);
|
||||
} else {
|
||||
} else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {
|
||||
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
|
||||
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
|
||||
reduceCopy<Unroll, RedOp, T,
|
||||
@@ -265,6 +281,8 @@ class Primitives<
|
||||
postPeer<Recv, Send>(0 < sliceSize);
|
||||
offset += sliceSize;
|
||||
slice += 1;
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
} while (slice < SlicePerChunk && offset < nelem);
|
||||
}
|
||||
|
||||
@@ -310,12 +328,13 @@ public:
|
||||
}
|
||||
|
||||
template<int Recv, int Send, typename Fn>
|
||||
__device__ __forceinline__ void process(Fn &&fn) {
|
||||
__device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) {
|
||||
#pragma unroll 1
|
||||
for (int slice=0; slice < SlicePerChunk; slice++) {
|
||||
if (tid < nworkers) {
|
||||
int nsend, nrecv;
|
||||
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
|
||||
bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
|
||||
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
|
||||
int spins = 0;
|
||||
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
@@ -326,19 +345,53 @@ public:
|
||||
if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
|
||||
int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
|
||||
ptrs[index] = connEltsFifo + offset/sizeof(T);
|
||||
} else if (Direct && fn.work->regUsed) {
|
||||
if (isSendNotRecv) {
|
||||
if (flags & (DirectWrite | IpcWrite)) {
|
||||
ptrs[index] = directBuff;
|
||||
} else if (flags & (DirectRead | IpcRead)) { // empty send
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
} else {
|
||||
if (flags & (DirectRead | IpcRead)) {
|
||||
ptrs[index] = directBuff;
|
||||
} else if (flags & (DirectWrite | IpcWrite)) {
|
||||
if (Send)
|
||||
ptrs[index] = directBuff; // send to next from my output buffer
|
||||
else
|
||||
ptrs[index] = nullptr;
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
|
||||
}
|
||||
}
|
||||
subBarrier();
|
||||
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
|
||||
(tid, nworkers, slice, stepSize*StepPerSlice,
|
||||
fan.nrecv(), ncclShmem.groups[group].srcs,
|
||||
fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
|
||||
if (Recv == 0 || ncclShmem.groups[group].srcs[0] == nullptr) {
|
||||
nrecv = 0;
|
||||
} else {
|
||||
nrecv = fan.nrecv();
|
||||
}
|
||||
|
||||
if (Send == 0 || ncclShmem.groups[group].dsts[0] == nullptr) {
|
||||
nsend = 0;
|
||||
} else {
|
||||
nsend = fan.nsend();
|
||||
}
|
||||
fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend >
|
||||
(tid, nworkers, slice, stepSize * StepPerSlice,
|
||||
nrecv, ncclShmem.groups[group].srcs,
|
||||
nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag);
|
||||
}
|
||||
barrier();
|
||||
int32_t dstSize = 0;
|
||||
if (flags & Send*RolePostSend) {
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_begin]
|
||||
dstSize = ncclShmem.groups[group].dstSizes[index];
|
||||
ncclShmem.groups[group].dstSizes[index] = 0;
|
||||
if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
|
||||
@@ -421,99 +474,97 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) {
|
||||
auto *conn = &peer->recv[connIndex];
|
||||
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
|
||||
// handle must be a device ptr
|
||||
netDeviceHandle = conn->netDeviceHandle.handle;
|
||||
// Cache the handle
|
||||
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
|
||||
flags |= NetDeviceUnpack;
|
||||
}
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
if (flags & RolePostRecv) {
|
||||
connStepPtr = conn->head;
|
||||
*connStepPtr = step; // Return credits in case we rounded up.
|
||||
}
|
||||
if (flags & RoleWaitRecv) {
|
||||
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (conn->connFifo != nullptr) {
|
||||
flags |= ConnFifoEnabled;
|
||||
connFifo = conn->connFifo;
|
||||
} else if (Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
|
||||
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
|
||||
}
|
||||
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
// direct read not allowed in non-register case
|
||||
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
|
||||
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
|
||||
}
|
||||
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
|
||||
/* NVLS direct */
|
||||
flags |= NvlsDirectRead;
|
||||
// Load the state of one receive connection into this thread's bookkeeping.
//
// Parameters:
//   peer      - channel peer whose recv[] connection array is read.
//   connIndex - which entry of peer->recv[] to use.
//   direct    - bitmask tested against NCCL_DIRECT_READ below; any non-zero
//               value also selects the read ("pull") path when connIndex == 1.
//   regFlag   - when zero, none of the Ipc*/Direct*/NvlsDirectRead flags are set here.
//
// Writes conn, step, connStepPtr, connStepCache, connStepSize, connEltsFifo,
// connFifo, netDeviceHandle and flags, all of which are declared outside this
// function. No role check is made on entry: the role-specific parts are gated
// on RolePostRecv / RoleWaitRecv inside.
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
|
||||
conn = &peer->recv[connIndex];
|
||||
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
|
||||
// handle must be a device ptr
|
||||
netDeviceHandle = conn->netDeviceHandle.handle;
|
||||
// Cache the handle
|
||||
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
|
||||
flags |= NetDeviceUnpack;
|
||||
}
|
||||
// Resume from the step saved on the connection.
// NOTE(review): roundUp presumably aligns step to a multiple of
// SlicePerChunk*StepPerSlice; confirm against its definition.
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
// The posting role tracks conn->head and publishes the (possibly rounded) step through it.
if (flags & RolePostRecv) {
|
||||
connStepPtr = conn->head;
|
||||
*connStepPtr = step; // Return credits in case we rounded up.
|
||||
}
|
||||
// The waiting role tracks conn->tail and also decides how data will be accessed.
if (flags & RoleWaitRecv) {
|
||||
// In PatMode the connection is not published to the shared recvConns[] table.
if ((flags & PatMode) == 0) ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
// conn->stepSize is divided by sizeof(T), so connStepSize counts elements of T.
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
// A connection FIFO takes precedence: the direct/IPC flags below are only
// considered when conn->connFifo is null.
if (conn->connFifo != nullptr) {
|
||||
flags |= ConnFifoEnabled;
|
||||
connFifo = conn->connFifo;
|
||||
} else if (Direct && regFlag) {
|
||||
// User buffers have been registered
|
||||
// Exactly one of the three branches below runs, checked in this order:
// IPC, then direct, then NVLS.
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
|
||||
if (P2p) {
|
||||
// '&' binds tighter than '?:', so this reads (conn->flags & NCCL_IPC_WRITE) ? IpcWrite : IpcRead.
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
|
||||
} else if (connIndex == 1 && direct) {
|
||||
flags |= IpcRead;
|
||||
} else {
|
||||
// Parsed as (direct & NCCL_DIRECT_READ) ? IpcRead : IpcWrite.
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
|
||||
}
|
||||
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
|
||||
if (P2p) {
|
||||
// Parsed as (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : DirectRead.
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
|
||||
} else if (connIndex == 1 && direct) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
// Parsed as (direct & NCCL_DIRECT_READ) ? DirectRead : DirectWrite.
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
|
||||
}
|
||||
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
|
||||
/* NVLS direct */
|
||||
flags |= NvlsDirectRead;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
|
||||
if (flags & (RoleWaitSend|RolePostSend)) {
|
||||
auto *conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
|
||||
conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
|
||||
connFifo = conn->connFifo;
|
||||
if (connFifo != nullptr) flags |= ConnFifoEnabled;
|
||||
connFifo = conn->connFifo;
|
||||
if (connFifo != nullptr) flags |= ConnFifoEnabled;
|
||||
|
||||
if (flags & RolePostSend) {
|
||||
connStepPtr = conn->tail;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
}
|
||||
if (flags & RoleWaitSend) {
|
||||
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (connFifo == nullptr && Direct) {
|
||||
// User buffers have been registered
|
||||
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
|
||||
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
|
||||
}
|
||||
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
|
||||
if (connIndex == 1 && P2p == 0) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
// direct read not allowed in non-register case
|
||||
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
|
||||
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
|
||||
}
|
||||
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
|
||||
/* NVLS direct */
|
||||
flags |= NvlsDirectWrite;
|
||||
if (flags & RolePostSend) {
|
||||
connStepPtr = conn->tail;
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
}
|
||||
if (flags & RoleWaitSend) {
|
||||
if ((flags & PatMode) == 0) ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
|
||||
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
connStepSize = conn->stepSize/sizeof(T);
|
||||
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
|
||||
if (connFifo == nullptr && Direct && regFlag) {
|
||||
// User buffers have been registered
|
||||
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
|
||||
if (P2p) {
|
||||
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
|
||||
} else if (connIndex == 1 && direct) {
|
||||
flags |= IpcRead;
|
||||
} else {
|
||||
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
|
||||
}
|
||||
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
|
||||
if (P2p) {
|
||||
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
|
||||
} else if (connIndex == 1 && direct) {
|
||||
flags |= DirectRead; // scatter-reduce use direct pull
|
||||
} else {
|
||||
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
|
||||
}
|
||||
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
|
||||
/* NVLS direct */
|
||||
flags |= NvlsDirectWrite;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -523,7 +574,8 @@ private:
|
||||
__device__ Primitives(
|
||||
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
|
||||
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,
|
||||
bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault
|
||||
):
|
||||
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
|
||||
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
|
||||
@@ -531,33 +583,71 @@ private:
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
|
||||
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
|
||||
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
constexpr int ThreadPerSync =
|
||||
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
|
||||
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
|
||||
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
|
||||
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
|
||||
|
||||
index = -1;
|
||||
int peer = -1;
|
||||
flags = 0;
|
||||
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
|
||||
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
|
||||
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
|
||||
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
|
||||
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
|
||||
index = -1;
|
||||
if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
|
||||
int nrecv=0, nsend=0;
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_line]
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
|
||||
// coverity[dead_error_line]
|
||||
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
int peer = 0;
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
|
||||
constexpr int ThreadPerSync =
|
||||
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
|
||||
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
|
||||
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
|
||||
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
|
||||
|
||||
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
|
||||
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
|
||||
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
|
||||
// Coverity assumes that index will equal tid based on the line below, but it doesn't consider the setting
|
||||
// of flags. This results in multiple false positive overruns being reported here and in all_reduce.h.
|
||||
// Unfortunately, we've been unsuccessful in trying to silence them with a single directive here so
|
||||
// instead it's being done at the callers.
|
||||
// coverity[assignment:FALSE]
|
||||
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
|
||||
// Yes, for some template arguments this code will be unreachable. That's fine.
|
||||
// coverity[dead_error_begin]
|
||||
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
|
||||
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
|
||||
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
|
||||
|
||||
if (userBufReg) flags |= UserBufferMode;
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
|
||||
} else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
|
||||
flags |= PatMode;
|
||||
accSize = 0;
|
||||
int nranks = ncclShmem.comm.nRanks;
|
||||
int rank = ncclShmem.comm.rank;
|
||||
// A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer.
|
||||
index = tid % 32;
|
||||
uint32_t delta = 1 << index;
|
||||
const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};
|
||||
int block = tid / 32;
|
||||
if (block < 4 && delta < nranks) {
|
||||
int role = roles[block];
|
||||
if (mode == primsModePatRs) {
|
||||
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
|
||||
if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
|
||||
} else if (mode == primsModePatAg) {
|
||||
if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
|
||||
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
|
||||
}
|
||||
flags |= role;
|
||||
} else if (tid == 128) {
|
||||
flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
|
||||
}
|
||||
}
|
||||
|
||||
// Coverity thinks that index could be -1 here but that's not actually the case.
|
||||
// coverity[negative_returns:FALSE]
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
|
||||
// coverity[negative_returns:FALSE]
|
||||
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
|
||||
|
||||
if (netReg) flags |= NetRegMode;
|
||||
|
||||
if (barrierAny(flags & NetDeviceUnpack)) {
|
||||
flags |= AnyNetDeviceUnpack;
|
||||
@@ -569,18 +659,14 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
|
||||
// coverity[negative_returns:FALSE]
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);
|
||||
}
|
||||
|
||||
__device__ ~Primitives() {
|
||||
// Ensure ncclShmem.groups[].send/recvConns are available
|
||||
barrier();
|
||||
// Save steps for the next operation
|
||||
if (flags & (RolePostSend|RolePostRecv)) {
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
|
||||
conns[index]->step = step;
|
||||
}
|
||||
if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
|
||||
if (flags & (RolePostSend|RolePostRecv)) conn->step = step;
|
||||
if ((flags & NetRegMode) && (flags & RoleWaitSend)) {
|
||||
// Make sure we wait until the proxy has sent data before we return.
|
||||
// We don't want the next CUDA kernel to overwrite the send buffer which
|
||||
// was accessed directly.
|
||||
@@ -599,97 +685,111 @@ private:
|
||||
barrier();
|
||||
}
|
||||
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
|
||||
if (tid==0) {
|
||||
ncclShmem.groups[group].userInput = (void*)inputBuf;
|
||||
ncclShmem.groups[group].userOutput = (void*)outputBuf;
|
||||
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
|
||||
}
|
||||
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
|
||||
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
|
||||
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
|
||||
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
|
||||
int regUsed = e != nullptr ? e->coll.regUsed : 0;
|
||||
|
||||
if (Direct && recvProvider) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
if (slot) {
|
||||
while (*slot != nullptr && !checkAbort(spins));
|
||||
directBuff = (T*)outputBuf;
|
||||
// Encode pointer by XOR'ing against some address they definitely wouldn't send
|
||||
// since we want to allow them sending us nullptr while not colliding with
|
||||
// the empty slot value.
|
||||
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
|
||||
}
|
||||
}
|
||||
if (Direct && sendAcceptor) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
void *ptr;
|
||||
while (slot) {
|
||||
ptr = *slot;
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if (slot) {
|
||||
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
|
||||
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
|
||||
*slot = nullptr;
|
||||
} else {
|
||||
/* slot is NULL, it must be regUsed == 1 */
|
||||
directBuff = (T*)e->dnOutputs[index];
|
||||
}
|
||||
}
|
||||
if (Direct && sendProvider) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
|
||||
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
|
||||
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
|
||||
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
|
||||
// Exchange pre-scalers for use in direct pull
|
||||
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
|
||||
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
|
||||
// Encode pointer by XOR'ing against some address they definitely wouldn't send
|
||||
// since we want to allow them sending us nullptr while not colliding with
|
||||
// the empty slot value.
|
||||
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
|
||||
}
|
||||
}
|
||||
if (Direct && recvAcceptor) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
|
||||
void *ptr;
|
||||
while (slot) {
|
||||
ptr = *slot;
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
|
||||
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
|
||||
if (MaxSend != 0) { // reduce group rather than gather group
|
||||
// Store scalers for remote inputs
|
||||
uint64_t arg0, arg1;
|
||||
while (true) {
|
||||
arg0 = *argSlot0;
|
||||
arg1 = *argSlot1;
|
||||
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
|
||||
if (Direct && ipcReg) {
|
||||
bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);
|
||||
bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);
|
||||
bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
|
||||
bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
|
||||
if (recvProvider) {
|
||||
int spins = 0;
|
||||
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
if (slot) {
|
||||
T* exchgPtr;
|
||||
directBuff = (T*)outputBuf;
|
||||
while (*slot != nullptr && !checkAbort(spins));
|
||||
if (P2p) {
|
||||
exchgPtr = (T*)outputBuf;
|
||||
} else {
|
||||
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
|
||||
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
|
||||
}
|
||||
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
|
||||
*slot = reinterpret_cast<void*>(exchgPtr);
|
||||
}
|
||||
}
|
||||
if (sendAcceptor) {
|
||||
int spins = 0;
|
||||
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
void* ptr;
|
||||
while (slot) {
|
||||
ptr = *slot;
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if (slot) {
|
||||
directBuff = reinterpret_cast<T*>(ptr);
|
||||
*slot = nullptr;
|
||||
} else {
|
||||
directBuff = (T*)work->dnOutputs[index];
|
||||
}
|
||||
}
|
||||
if (sendProvider) {
|
||||
int spins = 0;
|
||||
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange + 1;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
T* exchgPtr;
|
||||
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
|
||||
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
|
||||
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
|
||||
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
|
||||
if (P2p) {
|
||||
exchgPtr = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
|
||||
} else {
|
||||
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
|
||||
if (MaxRecv == 0)
|
||||
exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
|
||||
else
|
||||
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
|
||||
}
|
||||
|
||||
// Exchange pre-scalers for use in direct pull
|
||||
*argSlot0 = (uint64_t(1) << 32) | (uint32_t)redOpArg;
|
||||
*argSlot1 = (uint64_t(1) << 32) | (uint32_t)(redOpArg >> 32);
|
||||
*slot = reinterpret_cast<T*>(exchgPtr);
|
||||
}
|
||||
}
|
||||
if (recvAcceptor) {
|
||||
int spins = 0;
|
||||
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange + 1;
|
||||
void* ptr;
|
||||
while (slot) {
|
||||
ptr = *slot;
|
||||
if (ptr != nullptr || checkAbort(spins)) break;
|
||||
}
|
||||
|
||||
if (slot && argSlot0 && argSlot1) {
|
||||
directBuff = reinterpret_cast<T*>(ptr);
|
||||
if (MaxSend != 0) { // reduce group rather than gather group
|
||||
// Store scalers for remote inputs
|
||||
uint64_t arg0, arg1;
|
||||
while (true) {
|
||||
arg0 = *argSlot0;
|
||||
arg1 = *argSlot1;
|
||||
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
|
||||
}
|
||||
*argSlot0 = 0; *argSlot1 = 0;
|
||||
*slot = nullptr;
|
||||
} else {
|
||||
// Coverity complains about work being possibly NULL below. However, slot
|
||||
// being NULL means that the NVLS buffer is registered (regUsed == 1)
|
||||
// so work can't be NULL in this code path.
|
||||
// coverity[var_deref_op]
|
||||
directBuff = (T*)work->dnInputs[index];
|
||||
}
|
||||
*argSlot0 = 0; *argSlot1 = 0;
|
||||
*slot = nullptr;
|
||||
} else {
|
||||
directBuff = (T*)e->dnInputs[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -717,8 +817,8 @@ private:
|
||||
// Thin wrapper: forwards to genericOp<0, 0, 1, 0, -1, Output>, passing -1 as the
// first index argument and outIx, eltN, postOp unchanged.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
|
||||
__device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
|
||||
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
|
||||
@@ -737,8 +837,8 @@ private:
|
||||
// Thin wrapper: forwards to genericOp<0, 0, 1, 1, -1, Output>, passing -1 as the
// first index argument and outIx, eltN, postOp unchanged.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
|
||||
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
|
||||
__device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
|
||||
genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
|
||||
@@ -750,6 +850,9 @@ private:
|
||||
// Thin wrapper: forwards all four arguments unchanged to
// genericOp<0, 0, 1, 0, Input, Output>.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards all four arguments unchanged to
// genericOp<1, 0, 1, 0, Input, Output>. It differs from recvReduceCopy only in
// the first template argument (1 instead of 0).
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
|
||||
@@ -757,14 +860,20 @@ private:
|
||||
// Thin wrapper: forwards to genericOp<1, 0, 1, 1, Input, -1>, passing inpIx as the
// first index argument and -1 as the second; eltN and postOp are passed unchanged.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
|
||||
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards all four arguments unchanged to
// genericOp<1, 1, 1, 1, Input, -1>. Unlike directRecvReduceSend, outIx is passed
// through even though the last template argument is -1.
// NOTE(review): eltN is ssize_t here while most sibling wrappers take int; confirm
// genericOp's element-count parameter type to rule out a narrowing conversion.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
|
||||
genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards all four arguments unchanged to
// genericOp<0, 0, 1, 1, Input, Output>.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
__device__ __forceinline__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
|
||||
// Direct is only for the send part
|
||||
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
// Thin wrapper: forwards all four arguments unchanged to
// genericOp<1, 1, 1, 1, Input, Output>.
// NOTE(review): eltN is ssize_t here while most sibling wrappers take int; confirm
// genericOp's element-count parameter type to rule out a narrowing conversion.
// NOTE(review): the template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
|
||||
genericOp<1, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void
|
||||
scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
|
||||
@@ -783,4 +892,126 @@ private:
|
||||
directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
|
||||
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
|
||||
}
|
||||
|
||||
// One step of the PAT reduce primitive: reduce up to two sources into a single destination.
//
// Parameters (as used by the code below):
//   recvPow2       - >= 0: value compared against this thread's `index` to pick the thread that
//                    sets up the receive side; < 0: no receive source for this step.
//   sendPow2       - >= 0: value compared against `index` to pick the thread that sets up the
//                    send side; < 0: the destination is userOutput + outIx.
//   inpIx, outIx   - element offsets into the user input / user output buffers.
//   recvOffset     - element offset added to the receive FIFO slot address.
//   sendOffset     - element offset added to the send FIFO slot address.
//   sendStepOffset - added to `step` when selecting and waiting for the send FIFO slot.
//   nelem          - number of elements to process; negative values are treated as 0.
//   postRecv/postSend - when non-zero, advance `step` by StepPerSlice on that side and call
//                    postPeer after the copy.
//
// Sources/destination are published through ncclShmem.groups[group].srcs/dsts and consumed by
// reduceCopy after a barrier(). Statement order matters here; do not reorder.
__device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
|
||||
// Clamp: a negative element count means "nothing to do".
nelem = nelem < 0 ? 0 : nelem;
|
||||
T* userInput = (T*)ncclShmem.groups[group].userInput;
|
||||
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
|
||||
|
||||
// Receive side: srcs[0] points into FIFO slot (step % NCCL_STEPS) at recvOffset.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
|
||||
int spins = 0;
|
||||
// Spin, reloading the step counter from connStepPtr, until it reaches step + StepPerSlice
// or checkAbort() tells us to stop.
while (connStepCache < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
if (postRecv) step += StepPerSlice;
|
||||
}
|
||||
// Send side: dsts[0] points into FIFO slot ((step + sendStepOffset) % NCCL_STEPS) at sendOffset.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
// Spin until the reloaded step counter plus NCCL_STEPS covers the slot we want to write,
// or checkAbort() tells us to stop.
while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
|
||||
// accSize is the high-water mark of what has been written so far; it decides whether the
// second source is the user input (first write) or the destination itself (accumulate).
if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) {
|
||||
// New data, add our own data to it.
|
||||
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
|
||||
|
||||
accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
|
||||
// Record the slot's size in bytes when the connection FIFO is enabled.
if (flags & ConnFifoEnabled)
|
||||
connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
|
||||
} else {
|
||||
// There is already data in there, accumulate instead of writing to it.
|
||||
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
|
||||
}
|
||||
if (postSend) step += StepPerSlice;
|
||||
}
|
||||
if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
|
||||
ncclShmem.groups[group].dsts[0] = userOutput + outIx;
|
||||
// Same first-write / accumulate decision as above, tracked against outIx + nelem.
if (accSize < outIx + nelem) {
|
||||
// New data, add our own data to it.
|
||||
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
|
||||
accSize = outIx + nelem;
|
||||
} else {
|
||||
// There is already data in there, accumulate instead of writing to it.
|
||||
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
|
||||
}
|
||||
}
|
||||
// The srcs/dsts pointers set above by individual threads are read by every thread below.
barrier();
|
||||
int nSrcs = 2;
|
||||
void** srcs = ncclShmem.groups[group].srcs;
|
||||
if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
|
||||
|
||||
// When an abort has been flagged, process zero elements but still run through the barriers.
int workSize = ncclShmem.aborted ? 0 : nelem;
|
||||
|
||||
reduceCopy<Unroll, RedOp, T, 0, 1, 2, 0, 1, 1, /*PreOpSrcs*/0>
|
||||
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
|
||||
nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);
|
||||
|
||||
barrier();
|
||||
// Post to the peers only from the threads holding the matching post role.
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
|
||||
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
|
||||
}
|
||||
|
||||
// One step of the PAT copy primitive: copy a single source to up to two destinations.
//
// Parameters (as used by the code below):
//   recvPow2       - >= 0: value compared against this thread's `index` to pick the thread that
//                    sets up the receive side; < 0: the source is userInput + inpIx.
//   sendPow2       - >= 0: value compared against `index` to pick the thread that sets up the
//                    send side; < 0: there is no send destination for this step.
//   inpIx, outIx   - element offsets into the user input / user output buffers.
//   recvOffset     - element offset added to the receive FIFO slot address.
//   sendOffset     - element offset added to the send FIFO slot address.
//   recvStepOffset - added to `step` when selecting and waiting for the receive FIFO slot.
//   nelem          - number of elements to process; negative values are treated as 0.
//   postRecv/postSend - when non-zero, advance `step` by StepPerSlice on that side and call
//                    postPeer after the copy.
//
// dsts[0] is the send FIFO slot and dsts[1] is the local output (or an alias of srcs[0] when no
// local copy is needed). Statement order matters here; do not reorder.
__device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
|
||||
// Clamp: a negative element count means "nothing to do".
nelem = nelem < 0 ? 0 : nelem;
|
||||
T* userInput = (T*)ncclShmem.groups[group].userInput;
|
||||
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
|
||||
|
||||
// Receive side: srcs[0] points into FIFO slot ((step + recvStepOffset) % NCCL_STEPS) at recvOffset.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
|
||||
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
|
||||
int spins = 0;
|
||||
// Spin, reloading the step counter from connStepPtr, until it reaches
// step + recvStepOffset + StepPerSlice or checkAbort() tells us to stop.
while (connStepCache < step + recvStepOffset + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
// accSize is the high-water mark of what has already been copied out; data below it is
// not copied to the user output again.
if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) {
|
||||
// New data, copy to our output buffer.
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
|
||||
accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
|
||||
} else {
|
||||
// Aliasing dsts[1] to srcs[0] makes the srcs[0] == dsts[1] check below drop this destination.
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
|
||||
}
|
||||
if (postRecv) step += StepPerSlice;
|
||||
}
|
||||
// Send side: dsts[0] points into FIFO slot (step % NCCL_STEPS) at sendOffset.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
|
||||
int spins = 0;
|
||||
// Spin until the reloaded step counter plus NCCL_STEPS covers the slot we want to write,
// or checkAbort() tells us to stop.
while (connStepCache + NCCL_STEPS < step + StepPerSlice) {
|
||||
connStepCache = loadStepValue(connStepPtr);
|
||||
if (checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
|
||||
// The slot size is recorded, and step advanced, only on the call that posts the send.
if (postSend) {
|
||||
if (flags & ConnFifoEnabled)
|
||||
connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
|
||||
step += StepPerSlice;
|
||||
}
|
||||
}
|
||||
if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
|
||||
ncclShmem.groups[group].srcs[0] = userInput + inpIx;
|
||||
// Same "already copied?" decision as on the receive side, tracked against inpIx + nelem.
if (accSize < inpIx + nelem) {
|
||||
// New data, copy to our output buffer.
|
||||
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
|
||||
accSize = inpIx + nelem;
|
||||
} else {
|
||||
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
|
||||
}
|
||||
}
|
||||
// The srcs/dsts pointers set above by individual threads are read by every thread below.
barrier();
|
||||
int nDsts = 2;
|
||||
void** dsts = ncclShmem.groups[group].dsts;
|
||||
if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
|
||||
// Dropping the count removes the last destination, which is always the original dsts[1].
if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.
|
||||
|
||||
// When an abort has been flagged, process zero elements but still run through the barriers.
int workSize = ncclShmem.aborted ? 0 : nelem;
|
||||
|
||||
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 2, /*PreOpSrcs*/0>
|
||||
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
|
||||
1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
|
||||
|
||||
barrier();
|
||||
// Post to the peers only from the threads holding the matching post role.
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
|
||||
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@@ -23,6 +23,9 @@ namespace {
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
|
||||
|
||||
@@ -234,10 +234,10 @@ struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
|
||||
uint32_t a = apack.native;
|
||||
uint32_t b = bpack.native;
|
||||
uint32_t ab0 = (a*b) & 0xffu;
|
||||
asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
|
||||
asm volatile("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
|
||||
uint32_t ab1;
|
||||
asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
|
||||
asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
|
||||
asm volatile("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
|
||||
asm volatile("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
|
||||
apack.native = __byte_perm(ab0, ab1, 0x6420);
|
||||
return apack;
|
||||
}
|
||||
@@ -260,8 +260,12 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
|
||||
|
||||
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
|
||||
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
|
||||
// Coverity recommends the use of std::move here but, given that half is a scalar,
|
||||
// a plain copy will be just as efficient.
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
|
||||
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y))
|
||||
#else
|
||||
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y)))
|
||||
@@ -270,6 +274,7 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
|
||||
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
|
||||
#else
|
||||
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
|
||||
@@ -278,10 +283,13 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
|
||||
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
|
||||
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
|
||||
// coverity[copy_constructor_call]
|
||||
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
|
||||
#else
|
||||
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
|
||||
@@ -402,6 +410,9 @@ struct FuncPreMulSum {
|
||||
};
|
||||
|
||||
template<>
|
||||
// Coverity recommends the users of this type to use std::move in certain cases but,
|
||||
// given that half is a scalar, a plain copy will be just as efficient.
|
||||
// coverity[moveable_type]
|
||||
struct FuncPreMulSum<half> {
|
||||
using EltType = half;
|
||||
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
|
||||
@@ -424,6 +435,9 @@ struct FuncPreMulSum<half> {
|
||||
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
template<>
|
||||
// Coverity recommends the users of this type to use std::move in certain cases but,
|
||||
// given that __nv_bfloat16 is a scalar, a plain copy will be just as efficient.
|
||||
// coverity[moveable_type]
|
||||
struct FuncPreMulSum<__nv_bfloat16> {
|
||||
using EltType = __nv_bfloat16;
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
@@ -584,9 +598,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
|
||||
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
@@ -597,13 +611,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
} else { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
} \
|
||||
return ans; \
|
||||
} \
|
||||
@@ -615,12 +629,12 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
|
||||
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
return ans; \
|
||||
} \
|
||||
};
|
||||
@@ -631,19 +645,19 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<PackSize> ans; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
} else { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
|
||||
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
|
||||
: "l"(addr)); \
|
||||
: "l"(addr) : "memory"); \
|
||||
} \
|
||||
return ans; \
|
||||
} \
|
||||
@@ -655,9 +669,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
|
||||
__device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
};
|
||||
@@ -668,13 +682,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
|
||||
__device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
|
||||
BytePack<2*sizeof(T)> tmp; \
|
||||
if (fn.isMinNotMax) { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
} else { \
|
||||
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
|
||||
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
|
||||
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
|
||||
} \
|
||||
return tmp.half[(addr/sizeof(T))%2]; \
|
||||
} \
|
||||
|
||||
@@ -24,6 +24,9 @@ namespace {
|
||||
uint32_t nelem;
|
||||
int rankDest;
|
||||
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
|
||||
|
||||
@@ -74,6 +77,32 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
|
||||
}
|
||||
};
|
||||
|
||||
// ReduceScatter specialization for the PAT algorithm with the Simple protocol.
// run() builds a Primitives object in primsModePatRs mode, then repeatedly asks a
// PatRSAlgorithm instance for the next operation and executes it with prims.patReduce()
// until the algorithm reports the last operation.
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
|
||||
// tid/nthreads are passed straight to Primitives; work supplies the buffers and redOpArg.
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
size_t count, channelOffset, channelCount, chunkCount;
|
||||
// Fills in count/channelOffset/channelCount/chunkCount for this channel.
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
|
||||
|
||||
T *inputBuf = (T*)work->sendbuff;
|
||||
T *outputBuf = (T*)work->recvbuff;
|
||||
// No peer lists are passed here (both are NULL); the last argument selects primsModePatRs.
// NOTE(review): presumably the PAT mode sets up its peers inside Primitives -- confirm.
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);
|
||||
|
||||
PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
|
||||
int last = 0;
|
||||
// getNextOp() sets `last` on the final operation; that operation is still executed.
while (!last) {
|
||||
int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
|
||||
size_t inpIx, outIx;
|
||||
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
|
||||
prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
|
||||
@@ -88,7 +117,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
|
||||
size_t offset;
|
||||
int nelem;
|
||||
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
|
||||
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
|
||||
* and the rest are allocated to scatter. */
|
||||
const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
|
||||
@@ -143,6 +172,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
|
||||
size_t outOffset = gridOffset + elemOffset;
|
||||
size_t inpOffset = outOffset + rank * count;
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
// Coverity complains about a possible overrun inside the method invoked below, but that's actually
|
||||
// a false positive.
|
||||
// coverity[overrun-call:FALSE]
|
||||
prims.directRecvCopy(inpOffset, outOffset, nelem);
|
||||
}
|
||||
|
||||
@@ -164,7 +196,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
|
||||
__device__ __forceinline__ void operator()(
|
||||
int tid, int tn, int slice, int maxSliceSize,
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
|
||||
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
|
||||
) {
|
||||
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
|
||||
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
|
||||
@@ -199,19 +231,23 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
|
||||
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
|
||||
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
if (nDsts != 0) {
|
||||
reduceCopy<ncclCollUnroll(), RedOp, T,
|
||||
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
|
||||
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
|
||||
/*PreOpSrcs=*/1>
|
||||
(tid, tn, work->redOpArg, &work->redOpArg, false,
|
||||
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
|
||||
return s==0 ? (T*)inbuf + userOneBeg
|
||||
: work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
|
||||
? (T*)srcPtrs[s-1] + userOneBeg
|
||||
: (T*)srcPtrs[s-1] + railAllOffset;
|
||||
},
|
||||
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
|
||||
return (T*)dstPtrs[dst] + railAllOffset;
|
||||
},
|
||||
delta);
|
||||
}
|
||||
railAllOffset += delta;
|
||||
node += 1;
|
||||
}
|
||||
@@ -245,15 +281,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
int tn = nWarps1*WARP_SIZE;
|
||||
if (tid < tn) {
|
||||
// Phase 1: Scatter inputs to peers
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
|
||||
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
|
||||
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/true> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -269,15 +305,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
|
||||
__syncwarp();
|
||||
} else {
|
||||
// Phase 2: Reduce from peers + local input -> send to network
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
|
||||
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
|
||||
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
|
||||
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
|
||||
Scatterer</*ReduceSendNotRecv=*/false> scat;
|
||||
scat.work = work;
|
||||
scat.chunkSize = chunkSize;
|
||||
scat.railGridOffset = railGridOffset;
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
|
||||
prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
|
||||
}
|
||||
}
|
||||
return;
|
||||
|
||||
@@ -15,11 +15,11 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
template<typename Proto>
|
||||
__device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
|
||||
size_t bytes = work->sendBytes;
|
||||
int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
|
||||
int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
|
||||
prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
|
||||
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
|
||||
/*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
/*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
size_t cursor = 0;
|
||||
do {
|
||||
int n = min(size_t(chunkSize), bytes-cursor);
|
||||
@@ -31,15 +31,15 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
template<typename Proto>
|
||||
__device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
|
||||
size_t bytes = work->recvBytes;
|
||||
int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
|
||||
int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
|
||||
prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
|
||||
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
|
||||
/*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
/*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
|
||||
size_t cursor = 0;
|
||||
do {
|
||||
int n = min(size_t(chunkSize), bytes-cursor);
|
||||
prims.directRecv(cursor, n);
|
||||
prims.directRecv(cursor, cursor, n);
|
||||
cursor += n;
|
||||
} while (cursor < bytes && work->recvRegistered == 0);
|
||||
}
|
||||
@@ -80,6 +80,9 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
(isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
|
||||
}
|
||||
}
|
||||
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
|
||||
// However, the code ensures that the participation is on a per-warp basis.
|
||||
// coverity[device_thread_diverged:FALSE]
|
||||
uint32_t mask = __ballot_sync(~0u, hasWork);
|
||||
if (lane == 0) {
|
||||
shared->workSendMask = mask>>16;
|
||||
|
||||
+402
-123
@@ -11,6 +11,7 @@
|
||||
#include "bootstrap.h"
|
||||
#include "channel.h"
|
||||
#include "cudawrap.h"
|
||||
#include "profiler.h"
|
||||
#include "transport.h"
|
||||
|
||||
#include <cstring> // std::memcpy
|
||||
@@ -121,6 +122,10 @@ static void addWorkBatchToPlan(
|
||||
if (newBatch || extendBatch) {
|
||||
if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch.
|
||||
struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc<ncclWorkBatchList>(&comm->memScoped);
|
||||
// Coverity thinks that ncclIntruQueueEnqueue will access chan->workBatchQueue->tail, which might
|
||||
// be NULL. But that code is guarded by chan->workBatchQueue->head not being NULL, in which
|
||||
// case tail won't be NULL either.
|
||||
// coverity[var_deref_model:FALSE]
|
||||
ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode);
|
||||
batch = &batchNode->batch;
|
||||
batch->nextExtends = 0;
|
||||
@@ -239,7 +244,29 @@ static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* c
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t registerIntraNodeBuffers(
|
||||
// Decide whether a buffer registration is needed for the connection to `peer` and store the
// answer in *needReg.
//   - If `conn` is already connected: true when any of the NCCL_IPC_READ / NCCL_IPC_WRITE /
//     NCCL_DIRECT_READ / NCCL_DIRECT_WRITE flags is set on the connection, false otherwise.
//   - If `conn` is not connected yet: true when ncclTransports[0]->canConnect() reports that
//     this rank and `peer` can connect, false otherwise.
// Returns ncclSuccess; the canConnect() call is wrapped in NCCLCHECK.
// NOTE(review): NCCLCHECK presumably returns early with the callee's error code, in which case
// *needReg is left untouched -- confirm against the macro definition.
static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
|
||||
if (conn->connected) {
|
||||
if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) {
|
||||
*needReg = true;
|
||||
} else {
|
||||
// network connection
|
||||
*needReg = false;
|
||||
}
|
||||
} else {
|
||||
struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer];
|
||||
struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank];
|
||||
int canConnect = 0;
|
||||
// NOTE(review): ncclTransports[0] is presumably the P2P transport -- confirm against the
// transport table.
NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo));
|
||||
if (canConnect) {
|
||||
*needReg = true;
|
||||
} else {
|
||||
*needReg = false;
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t registerCollBuffers(
|
||||
struct ncclComm* comm, struct ncclTaskColl* info,
|
||||
void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
|
||||
void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
|
||||
@@ -250,8 +277,10 @@ static ncclResult_t registerIntraNodeBuffers(
|
||||
|
||||
info->regBufType = NCCL_REGULAR_BUFFER;
|
||||
*regNeedConnect = true;
|
||||
if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
|
||||
#if CUDART_VERSION >= 11030
|
||||
if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
|
||||
if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
|
||||
if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
|
||||
bool regBufUsed = false;
|
||||
const void *sendbuff = info->sendbuff;
|
||||
void *recvbuff = info->recvbuff;
|
||||
@@ -284,60 +313,6 @@ static ncclResult_t registerIntraNodeBuffers(
|
||||
}
|
||||
info->regBufType = NCCL_NVLS_REG_BUFFER;
|
||||
}
|
||||
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
|
||||
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
|
||||
comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers
|
||||
comm->planner.persistent && 0) {
|
||||
/* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
|
||||
int localRank = comm->localRank;
|
||||
cudaPointerAttributes sattr, rattr;
|
||||
|
||||
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
|
||||
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
|
||||
if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
|
||||
|
||||
if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
|
||||
|
||||
struct HandlePair {
|
||||
cudaIpcMemHandle_t ipc[2]; // {send, recv}
|
||||
size_t offset[2]; // {send, recv}
|
||||
};
|
||||
struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
|
||||
|
||||
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
|
||||
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
|
||||
|
||||
void *baseSend, *baseRecv;
|
||||
size_t size;
|
||||
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
|
||||
handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
|
||||
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
|
||||
handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
|
||||
|
||||
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
|
||||
|
||||
// Open handles locally
|
||||
for (int i=0; i < comm->localRanks; i++) {
|
||||
if (i == localRank) { // Skip self
|
||||
outRegBufSend[i] = nullptr;
|
||||
outRegBufRecv[i] = nullptr;
|
||||
} else {
|
||||
for (int sr=0; sr < 2; sr++) {
|
||||
// Get base address of mapping
|
||||
void* base;
|
||||
CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
|
||||
// Get real buffer address by adding offset in the mapping
|
||||
(sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
|
||||
// Enqueue reminder to close memory handle
|
||||
struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback));
|
||||
cb->base.fn = cleanupIpc;
|
||||
cb->ptr = base;
|
||||
ncclIntruQueueEnqueue(cleanupQueue, &cb->base);
|
||||
info->nCleanupQueueElts += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
info->regBufType = NCCL_IPC_REG_BUFFER;
|
||||
} else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) {
|
||||
size_t elementSize = ncclTypeSize(info->datatype);
|
||||
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
|
||||
@@ -356,27 +331,200 @@ static ncclResult_t registerIntraNodeBuffers(
|
||||
}
|
||||
|
||||
if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
info->sendMhandle = sendHandle;
|
||||
if (sendRegBufFlag) {
|
||||
if (!sendRegBufFlag) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
info->sendMhandle = sendHandle;
|
||||
}
|
||||
if (sendRegBufFlag && !recvRegBufFlag) {
|
||||
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
|
||||
info->recvMhandle = recvHandle;
|
||||
}
|
||||
}
|
||||
|
||||
if (sendRegBufFlag && recvRegBufFlag) {
|
||||
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
|
||||
info->nMaxChannels = 1;
|
||||
info->regBufType = NCCL_COLLNET_REG_BUFFER;
|
||||
if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
|
||||
INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize);
|
||||
}
|
||||
}
|
||||
} else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
// IPC buffer registration
|
||||
if (info->func == ncclFuncReduceScatter) goto exit;
|
||||
if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
|
||||
if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
|
||||
if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
|
||||
|
||||
int peerRanks[NCCL_MAX_LOCAL_RANKS];
|
||||
int nPeers = 0;
|
||||
size_t elementSize = ncclTypeSize(info->datatype);
|
||||
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
|
||||
size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
|
||||
int regBufFlag = 0;
|
||||
memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
|
||||
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
struct ncclChannel* channel = comm->channels;
|
||||
for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
|
||||
for (int updown = 0; updown < 2; ++updown) {
|
||||
int peer;
|
||||
if (updown == 0)
|
||||
peer = channel->collnetDirect.up[r];
|
||||
else
|
||||
peer = channel->collnetDirect.down[r];
|
||||
if (peer != -1) {
|
||||
struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
|
||||
bool needReg = false;
|
||||
|
||||
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
|
||||
if (needReg) {
|
||||
bool found = false;
|
||||
for (int p = 0; p < nPeers; ++p) {
|
||||
if (peerRanks[p] == peer) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) peerRanks[nPeers++] = peer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nPeers > 0) {
|
||||
if (ncclParamLocalRegister())
|
||||
ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
|
||||
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
if (regBufFlag) {
|
||||
if (ncclParamLocalRegister())
|
||||
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
|
||||
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (regBufFlag) {
|
||||
info->regBufType = NCCL_IPC_REG_BUFFER;
|
||||
}
|
||||
} else if (info->algorithm == NCCL_ALGO_RING) {
|
||||
struct ncclReg* recvRegRecord;
|
||||
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
|
||||
if (recvRegRecord == NULL) goto exit;
|
||||
for (int c = 0; c < comm->nChannels; ++c) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
for (int r = 0; r < 2; ++r) {
|
||||
bool needReg = false;
|
||||
int peer;
|
||||
struct ncclConnector* peerConn;
|
||||
// P2P transport
|
||||
if (r == 0)
|
||||
peer = channel->ring.prev;
|
||||
else
|
||||
peer = channel->ring.next;
|
||||
peerConn = &channel->peers[peer]->recv[0];
|
||||
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg));
|
||||
|
||||
if (needReg) {
|
||||
bool found = false;
|
||||
for (int p = 0; p < nPeers; ++p) {
|
||||
if (peerRanks[p] == peer) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) peerRanks[nPeers++] = peer;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nPeers > 0) {
|
||||
if (ncclParamLocalRegister()) {
|
||||
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
|
||||
}
|
||||
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
}
|
||||
if (regBufFlag) {
|
||||
info->regBufType = NCCL_IPC_REG_BUFFER;
|
||||
}
|
||||
} else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
|
||||
struct ncclReg* recvRegRecord;
|
||||
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
|
||||
if (recvRegRecord == NULL) goto exit;
|
||||
for (int c = 0; c < comm->nChannels; ++c) {
|
||||
struct ncclChannel* channel = comm->channels + c;
|
||||
struct ncclTree* tree = NULL;
|
||||
int peers[NCCL_MAX_TREE_ARITY + 1];
|
||||
|
||||
if (info->algorithm == NCCL_ALGO_TREE)
|
||||
tree = &channel->tree;
|
||||
else
|
||||
tree = &channel->collnetChain;
|
||||
for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
|
||||
peers[NCCL_MAX_TREE_ARITY] = tree->up;
|
||||
for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
|
||||
int peer = peers[p];
|
||||
bool peerNeedReg = false;
|
||||
struct ncclConnector* recvConn = NULL;
|
||||
// P2P transport
|
||||
if (peer == -1 || peer == comm->nRanks) continue;
|
||||
recvConn = &channel->peers[peer]->recv[0];
|
||||
NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
|
||||
|
||||
if (peerNeedReg) {
|
||||
bool found = false;
|
||||
for (int pindex = 0; pindex < nPeers; ++pindex) {
|
||||
if (peerRanks[pindex] == peer) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) peerRanks[nPeers++] = peer;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nPeers > 0) {
|
||||
if (ncclParamLocalRegister()) {
|
||||
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
|
||||
}
|
||||
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
|
||||
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
|
||||
}
|
||||
}
|
||||
if (regBufFlag) {
|
||||
info->regBufType = NCCL_IPC_REG_BUFFER;
|
||||
}
|
||||
}
|
||||
|
||||
if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
|
||||
info->nMaxChannels = 16;
|
||||
}
|
||||
}
|
||||
fallback:
|
||||
exit:
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
// Attempt to IPC-register `userbuff` (of `size` bytes) for point-to-point
// traffic with the single peer `peerRank`.
//
// Outputs:
//   *regFlag  - reset to 0 on entry; whatever the registration helpers leave
//               in it afterwards is what the caller sees.
//   *regAddr  - written ONLY when *regFlag ends up non-zero; otherwise it is
//               left untouched, so the caller must pre-initialize it.
//
// The return values of both registration helpers are intentionally not
// checked here, so this function always returns ncclSuccess.
static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
  uintptr_t* rmtAddrs = NULL;   // filled in by the registration helpers
  uintptr_t rmtOffset = 0;      // filled in by the registration helpers

  *regFlag = 0;

  // First attempt: local registration, gated by its parameter.
  if (ncclParamLocalRegister()) {
    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs);
  }

  // Second attempt: graph registration, tried only if the flag is still 0,
  // the planner is marked persistent, and the graph-register parameter is set.
  // No cleanup-element counter is passed for the P2P path (NULL).
  if (*regFlag == 0) {
    if (comm->planner.persistent && ncclParamGraphRegister()) {
      ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
    }
  }

  // The resulting address is the numeric value of the rmtAddrs pointer itself
  // plus the offset (the pointer is not dereferenced).
  if (*regFlag != 0) {
    *regAddr = (void*)((uintptr_t)rmtAddrs + rmtOffset);
  }

  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
|
||||
static ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* task,
|
||||
@@ -500,7 +648,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
|
||||
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
|
||||
bool regNeedConnect = true;
|
||||
registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
|
||||
registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
|
||||
|
||||
if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) {
|
||||
if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) {
|
||||
@@ -517,6 +665,10 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
struct ncclDevWorkColl devWork = {};
|
||||
devWork.sendbuff = (void*)task->sendbuff;
|
||||
devWork.recvbuff = (void*)task->recvbuff;
|
||||
devWork.sendbuffOffset = task->sendbuffOffset;
|
||||
devWork.recvbuffOffset = task->recvbuffOffset;
|
||||
devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
|
||||
devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
|
||||
devWork.root = task->root;
|
||||
devWork.nWarps = task->nWarps;
|
||||
devWork.redOpArg = task->opDev.scalarArg;
|
||||
@@ -527,35 +679,13 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
struct ncclWorkList* workNode;
|
||||
switch (task->regBufType) {
|
||||
case NCCL_REGULAR_BUFFER:
|
||||
case NCCL_IPC_REG_BUFFER:
|
||||
case NCCL_COLLNET_REG_BUFFER:
|
||||
{ workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
|
||||
workNode->workType = ncclDevWorkTypeColl;
|
||||
workNode->size = sizeof(struct ncclDevWorkColl);
|
||||
memcpy((void*)(workNode+1), (void*)&devWork, workNode->size);
|
||||
} break;
|
||||
case NCCL_IPC_REG_BUFFER:
|
||||
{ struct ncclDevWorkCollReg workReg = {};
|
||||
workReg.coll = devWork;
|
||||
struct ncclChannel *channel0 = &comm->channels[0];
|
||||
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
|
||||
int peer = channel0->collnetDirect.down[i];
|
||||
if (peer == -1) break;
|
||||
int j = comm->rankToLocalRank[peer]; // Get intra-node slot
|
||||
workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer
|
||||
workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer
|
||||
}
|
||||
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
|
||||
int peer = channel0->collnetDirect.up[i];
|
||||
if (peer == -1) break;
|
||||
int j = comm->rankToLocalRank[peer];
|
||||
// Output buffer of root peer
|
||||
workReg.upOutputs[i] = regBufRecv[j];
|
||||
}
|
||||
workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
|
||||
workNode->workType = ncclDevWorkTypeCollReg;
|
||||
workNode->size = sizeof(struct ncclDevWorkCollReg);
|
||||
memcpy((void*)(workNode+1), (void*)&workReg, workNode->size);
|
||||
} break;
|
||||
case NCCL_NVLS_REG_BUFFER:
|
||||
{ struct ncclDevWorkCollReg workReg = {};
|
||||
workReg.coll = devWork; // C++ struct assignment
|
||||
@@ -590,6 +720,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
|
||||
int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
|
||||
comm->nChannels, comm->nvlsChannels};
|
||||
constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
|
||||
do {
|
||||
size_t workBytes = 0;
|
||||
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
|
||||
@@ -601,7 +732,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
nPlanColls += 1;
|
||||
workBytes += workNode->size;
|
||||
int kind = 2*task->isCollnet + task->isNvls;
|
||||
trafficBytes[kind] += task->trafficBytes;
|
||||
trafficBytes[kind] += std::max(MinTrafficPerChannel, task->trafficBytes);
|
||||
nChannels[kind] += task->nMaxChannels;
|
||||
nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]);
|
||||
task = task->next;
|
||||
@@ -611,7 +742,6 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
} while (0);
|
||||
|
||||
int kindPrev = -1;
|
||||
constexpr size_t MinTrafficPerChannel = 512;
|
||||
size_t trafficPerChannel = 0;
|
||||
int channelId = 0;
|
||||
size_t currentTraffic = 0;
|
||||
@@ -650,14 +780,16 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) {
|
||||
proxyOp.channelId = c;
|
||||
proxyOp.opCount = proxyOpId;
|
||||
proxyOp.task.coll = task;
|
||||
proxyOp.rank = comm->rank;
|
||||
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
|
||||
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
|
||||
}
|
||||
} else { // not task->isCollnet
|
||||
constexpr size_t cellSize = 16;
|
||||
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
|
||||
size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
|
||||
int elementsPerCell = cellSize/elementSize;
|
||||
size_t cells = divUp(task->count*elementSize, cellSize);
|
||||
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
|
||||
size_t trafficPerElement = elementSize*trafficPerByte;
|
||||
size_t trafficPerCell = cellSize*trafficPerByte;
|
||||
size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell));
|
||||
@@ -665,7 +797,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo"
|
||||
cellsLo = cells;
|
||||
} else {
|
||||
cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell);
|
||||
cellsLo = std::min(cells, divUp((trafficPerChannel-currentTraffic),trafficPerCell));
|
||||
}
|
||||
int nMidChannels = (cells-cellsLo)/cellsPerChannel;
|
||||
size_t cellsHi = (cells-cellsLo)%cellsPerChannel;
|
||||
@@ -725,12 +857,12 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
// Update the current channel and vacant traffic budget.
|
||||
if (countHi != 0) {
|
||||
channelId += nChannels-1;
|
||||
currentTraffic = countHi*trafficPerElement;
|
||||
currentTraffic = cellsHi*elementsPerCell*trafficPerElement;
|
||||
} else if (nMidChannels != 0) {
|
||||
channelId += nChannels;
|
||||
currentTraffic = 0;
|
||||
} else {
|
||||
currentTraffic += countLo*trafficPerElement;
|
||||
currentTraffic += cellsLo*elementsPerCell*trafficPerElement;
|
||||
}
|
||||
|
||||
if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) {
|
||||
@@ -750,7 +882,12 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
}
|
||||
proxyOp->channelId = c;
|
||||
proxyOp->opCount = proxyOpId;
|
||||
proxyOp->task.coll = task;
|
||||
proxyOp->rank = comm->rank;
|
||||
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
|
||||
// Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to
|
||||
// determine if that's actually true but it's also not clear if that would be an issue.
|
||||
// coverity[uninit_use_in_call:FALSE]
|
||||
NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
|
||||
}
|
||||
}
|
||||
@@ -790,6 +927,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
ncclIntruQueueDequeue(&planner->collWorkQueue);
|
||||
nPlanColls -= 1;
|
||||
planner->nTasksColl -= 1;
|
||||
ncclIntruQueueEnqueue(&plan->collTaskQueue, task);
|
||||
ncclIntruQueueEnqueue(&plan->workQueue, workNode);
|
||||
plan->workBytes += workNode->size;
|
||||
}
|
||||
@@ -807,7 +945,8 @@ static ncclResult_t addP2pToPlan(
|
||||
struct ncclComm* comm, struct ncclKernelPlan* plan,
|
||||
int nChannelsMin, int nChannelsMax, int p2pRound,
|
||||
int sendRank, void* sendAddr, ssize_t sendBytes,
|
||||
int recvRank, void* recvAddr, ssize_t recvBytes
|
||||
int recvRank, void* recvAddr, ssize_t recvBytes,
|
||||
struct ncclTaskP2p** p2pTasks
|
||||
) {
|
||||
constexpr int connIndex = 1;
|
||||
bool selfSend = (sendRank == comm->rank);
|
||||
@@ -842,7 +981,8 @@ static ncclResult_t addP2pToPlan(
|
||||
int chunkSize[2];
|
||||
int chunkDataSize[2];
|
||||
int chunkDataSize_u32fp8[2];
|
||||
bool registered[2];
|
||||
bool registered[2] = {false, false};
|
||||
bool ipcRegistered[2] = {false, false};
|
||||
|
||||
for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send
|
||||
if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL;
|
||||
@@ -866,11 +1006,29 @@ static ncclResult_t addP2pToPlan(
|
||||
chunkSize[dir] = chunkDataSize[dir];
|
||||
if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
|
||||
|
||||
registered[dir] = false;
|
||||
if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
|
||||
struct ncclReg* regRecord;
|
||||
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
|
||||
registered[dir] = (regRecord && regRecord->nDevs);
|
||||
if (network[dir]) {
|
||||
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
|
||||
struct ncclReg* regRecord;
|
||||
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
|
||||
registered[dir] = regRecord && regRecord->nDevs;
|
||||
}
|
||||
} else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) {
|
||||
int peerRank = dir ? sendRank : recvRank;
|
||||
int regFlag = 0;
|
||||
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, 0);
|
||||
struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
|
||||
struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex]
|
||||
: &channelPeers[peerRank]->recv[connIndex];
|
||||
void* regAddr = NULL;
|
||||
if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
|
||||
// We require users registering buffers on both sides
|
||||
NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], &regFlag, &regAddr, &plan->cleanupQueue));
|
||||
if (regFlag) {
|
||||
if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr;
|
||||
else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr;
|
||||
}
|
||||
}
|
||||
ipcRegistered[dir] = regFlag ? true : false;
|
||||
}
|
||||
|
||||
if (bytes[dir] == -1) nChannels[dir] = 0;
|
||||
@@ -900,6 +1058,7 @@ static ncclResult_t addP2pToPlan(
|
||||
work->nSendChannels = nChannels[1];
|
||||
work->sendProtoLL = protoLL[1];
|
||||
work->sendRegistered = registered[1];
|
||||
work->sendIpcReg = ipcRegistered[1];
|
||||
work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1];
|
||||
work->sendRank = sendRank;
|
||||
work->sendAddr = sendAddr;
|
||||
@@ -907,6 +1066,7 @@ static ncclResult_t addP2pToPlan(
|
||||
work->nRecvChannels = nChannels[0];
|
||||
work->recvProtoLL = protoLL[0];
|
||||
work->recvRegistered = registered[0];
|
||||
work->recvIpcReg = ipcRegistered[0];
|
||||
work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0];
|
||||
work->recvRank = recvRank;
|
||||
work->recvAddr = recvAddr;
|
||||
@@ -925,6 +1085,9 @@ static ncclResult_t addP2pToPlan(
|
||||
op->pattern = dir ? ncclPatternSend : ncclPatternRecv;
|
||||
op->chunkSize = chunkSize[dir];
|
||||
op->reg = registered[dir];
|
||||
op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
|
||||
op->task.p2p = p2pTasks[dir];
|
||||
op->rank = comm->rank;
|
||||
// The following are modified per channel part in addWorkToChannels():
|
||||
// op->buffer, op->nbytes, op->nsteps = ...;
|
||||
}
|
||||
@@ -1041,13 +1204,16 @@ static ncclResult_t scheduleP2pTasksToPlan(
|
||||
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes));
|
||||
struct ncclTaskP2p* p2pTasks[2] = { recv, send };
|
||||
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks));
|
||||
if (send != nullptr) {
|
||||
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
|
||||
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
|
||||
comm->planner.nTasksP2p -= 1;
|
||||
}
|
||||
if (recv != nullptr) {
|
||||
ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
|
||||
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
|
||||
comm->planner.nTasksP2p -= 1;
|
||||
}
|
||||
}
|
||||
@@ -1100,29 +1266,44 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct uploadWork_cleanup_t {
|
||||
struct ncclCommEventCallback base;
|
||||
void *hostBuf;
|
||||
};
|
||||
ncclResult_t uploadWork_cleanup_fn(
|
||||
struct ncclComm* comm, struct ncclCommEventCallback* cb
|
||||
) {
|
||||
struct uploadWork_cleanup_t* me = (struct uploadWork_cleanup_t*)cb;
|
||||
free(me->hostBuf);
|
||||
CUDACHECK(cudaEventDestroy(me->base.event));
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
size_t workBytes = plan->workBytes;
|
||||
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
|
||||
void* fifoBuf;
|
||||
void* fifoBufHost;
|
||||
uint32_t fifoCursor, fifoMask;
|
||||
|
||||
switch (plan->workStorageType) {
|
||||
case ncclDevWorkStorageTypeArgs:
|
||||
plan->kernelArgs->workBuf = nullptr;
|
||||
fifoBuf = (void*)plan->kernelArgs;
|
||||
fifoBufHost = (void*)plan->kernelArgs;
|
||||
fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes;
|
||||
fifoMask = ~0u;
|
||||
break;
|
||||
case ncclDevWorkStorageTypeFifo:
|
||||
fifoBuf = comm->workFifoBuf;
|
||||
fifoBufHost = comm->workFifoBuf;
|
||||
fifoCursor = comm->workFifoProduced;
|
||||
fifoMask = comm->workFifoBytes-1;
|
||||
waitWorkFifoAvailable(comm, fifoCursor + workBytes);
|
||||
plan->kernelArgs->workBuf = comm->workFifoBufDev;
|
||||
break;
|
||||
case ncclDevWorkStorageTypePersistent:
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16);
|
||||
static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
|
||||
fifoBufHost = malloc(workBytes);
|
||||
fifoCursor = 0;
|
||||
fifoMask = ~0u;
|
||||
break;
|
||||
@@ -1144,7 +1325,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
// Write the channel-shared work structs.
|
||||
struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue);
|
||||
while (workNode != nullptr) {
|
||||
char* dst = (char*)fifoBuf;
|
||||
char* dst = (char*)fifoBufHost;
|
||||
char* src = (char*)(workNode+1);
|
||||
for (int n = workNode->size; n != 0; n -= 16) {
|
||||
memcpy(
|
||||
@@ -1164,11 +1345,39 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence();
|
||||
break;
|
||||
case ncclDevWorkStorageTypePersistent:
|
||||
NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes));
|
||||
plan->kernelArgs->workBuf = plan->workBufPersistent;
|
||||
NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes));
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
break;
|
||||
{ ncclResult_t result = ncclSuccess;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
void* fifoBufDev = nullptr;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
|
||||
// Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
|
||||
// user's graph will be launched later, and it also acquires the deviceStream,
|
||||
// it will observe this upload.
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope);
|
||||
|
||||
CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
|
||||
plan->workBufPersistent = fifoBufDev;
|
||||
plan->kernelArgs->workBuf = fifoBufDev;
|
||||
|
||||
CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
|
||||
cudaEvent_t memcpyDone;
|
||||
CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope);
|
||||
CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
|
||||
|
||||
struct uploadWork_cleanup_t* cleanup;
|
||||
NCCLCHECK(ncclCalloc(&cleanup, 1));
|
||||
cleanup->base.fn = uploadWork_cleanup_fn;
|
||||
cleanup->base.event = memcpyDone;
|
||||
cleanup->hostBuf = fifoBufHost;
|
||||
ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base);
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope);
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope);
|
||||
|
||||
finish_scope:
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
if (result != ncclSuccess) return result;
|
||||
} break;
|
||||
default: break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -1182,6 +1391,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
|
||||
struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue);
|
||||
while (op != nullptr) {
|
||||
op->profilerContext = comm->profilerContext;
|
||||
op->eActivationMask = op->coll <= ncclFuncAllReduce ? op->task.coll->eActivationMask : op->task.p2p->eActivationMask;
|
||||
op->taskEventHandle = op->coll <= ncclFuncAllReduce ? op->task.coll->eventHandle : op->task.p2p->eventHandle;
|
||||
ncclProfilerAddPidToProxyOp(op);
|
||||
|
||||
uint64_t oldId = op->opCount;
|
||||
// Ignoring the bottom tag bit, opCount's are zero-based within plan so
|
||||
// translate them to the tip of the comm's history.
|
||||
@@ -1216,8 +1430,12 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
|
||||
}
|
||||
|
||||
static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
NCCLCHECK(ncclProfilerStartGroupEvent(plan));
|
||||
NCCLCHECK(ncclProfilerStartTaskEvents(plan));
|
||||
NCCLCHECK(uploadProxyOps(comm, plan));
|
||||
NCCLCHECK(ncclProxyStart(comm));
|
||||
NCCLCHECK(ncclProfilerStopTaskEvents(plan));
|
||||
NCCLCHECK(ncclProfilerStopGroupEvent(plan));
|
||||
if (!plan->persistent) {
|
||||
// Notify main thread of our reclaiming. This will reclaim plan concurrently.
|
||||
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
|
||||
@@ -1238,13 +1456,30 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
|
||||
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
|
||||
if (plan->persistent) {
|
||||
comm->persistentRefs -= 1;
|
||||
NCCLCHECK(ncclCudaFree(plan->workBufPersistent));
|
||||
if (plan->workStorageType == ncclDevWorkStorageTypePersistent) {
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECK(cudaFree(plan->workBufPersistent));
|
||||
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
}
|
||||
struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
|
||||
while (q != nullptr) {
|
||||
struct ncclProxyOp* q1 = q->enqNext;
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
|
||||
q = q1;
|
||||
}
|
||||
struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
|
||||
while (ct != nullptr) {
|
||||
struct ncclTaskColl* ct1 = ct->next;
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
|
||||
ct = ct1;
|
||||
}
|
||||
struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
|
||||
while (pt != nullptr) {
|
||||
struct ncclTaskP2p* pt1 = pt->next;
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
|
||||
pt = pt1;
|
||||
}
|
||||
ncclResult_t result = ncclSuccess;
|
||||
while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) {
|
||||
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue);
|
||||
@@ -1286,7 +1521,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
|
||||
plan->comm = comm;
|
||||
plan->reclaimer.fn = reclaimPlan;
|
||||
plan->persistent = persistent;
|
||||
// uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit.
|
||||
// finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit.
|
||||
plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
|
||||
: ncclDevWorkStorageTypeFifo;
|
||||
|
||||
@@ -1554,10 +1789,15 @@ static ncclResult_t updateCollCostTable(
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
|
||||
// CollNetDirect is only supported for up to 8 local GPUs
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
|
||||
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
|
||||
/* now we only support single-node NVLS allgather and reducescatter */
|
||||
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
|
||||
/* Tree reduceScatter doesn't support scaling yet */
|
||||
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
|
||||
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
bool backup;
|
||||
float time;
|
||||
@@ -1601,6 +1841,8 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
info->protocol = protocol;
|
||||
float time = minTime;
|
||||
|
||||
// Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
|
||||
// coverity[check_after_sink]
|
||||
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
|
||||
if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
|
||||
WARN("Error : no algorithm/protocol available");
|
||||
@@ -1610,7 +1852,7 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
info->protocol = backupProto;
|
||||
time = backupTime;
|
||||
}
|
||||
if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
|
||||
if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
|
||||
if (simInfo) simInfo->estimatedTime = time;
|
||||
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
|
||||
|
||||
@@ -1653,6 +1895,7 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
}
|
||||
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
|
||||
if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
|
||||
if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
|
||||
info->nMaxChannels = nc;
|
||||
info->nWarps = nt/WARP_SIZE;
|
||||
return ncclSuccess;
|
||||
@@ -1704,8 +1947,15 @@ static ncclResult_t calcCollChunking(
|
||||
pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo;
|
||||
break;
|
||||
case ncclFuncReduceScatter:
|
||||
pattern =
|
||||
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatUp :
|
||||
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
|
||||
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
|
||||
ncclPatternRing;
|
||||
break;
|
||||
case ncclFuncAllGather:
|
||||
pattern =
|
||||
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatDown :
|
||||
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
|
||||
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
|
||||
ncclPatternRing;
|
||||
@@ -1729,6 +1979,8 @@ static ncclResult_t calcCollChunking(
|
||||
case ncclPatternTreeUp:
|
||||
case ncclPatternTreeDown:
|
||||
case ncclPatternTreeUpDown:
|
||||
case ncclPatternPatUp:
|
||||
case ncclPatternPatDown:
|
||||
case ncclPatternPipelineFrom:
|
||||
case ncclPatternPipelineTo:
|
||||
case ncclPatternCollnetChain:
|
||||
@@ -1776,13 +2028,17 @@ static ncclResult_t calcCollChunking(
|
||||
int maxChunkSize = comm->nvlsChunkSize;
|
||||
if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
|
||||
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
|
||||
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
|
||||
// coverity[overflow_before_widen]
|
||||
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
|
||||
if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
|
||||
if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
|
||||
if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
|
||||
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
|
||||
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
|
||||
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
|
||||
// coverity[overflow_before_widen]
|
||||
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
|
||||
chunkSize = comm->nvlsChunkSize;
|
||||
int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
|
||||
@@ -1796,14 +2052,21 @@ static ncclResult_t calcCollChunking(
|
||||
int nNodes = comm->nNodes;
|
||||
float ppn = comm->nRanks / (float)nNodes;
|
||||
float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
|
||||
// Yes, we are OK with the division on the left side of the < operand being integer.
|
||||
// coverity[integer_division]
|
||||
while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
|
||||
// coverity[integer_division]
|
||||
while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
|
||||
} else if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) {
|
||||
while (chunkSize*nChannels*32 > nBytes && chunkSize > 65536) chunkSize /= 2;
|
||||
} else if (info->func == ncclFuncReduceScatter && info->algorithm == NCCL_ALGO_PAT) {
|
||||
while (chunkSize*nChannels*16 > nBytes && chunkSize > 65536) chunkSize /= 2;
|
||||
}
|
||||
|
||||
// Compute directFlags of work struct.
|
||||
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
// Set direct direction for broadcast-gather (read or write)
|
||||
*outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
|
||||
*outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
*outDirectFlags = 0;
|
||||
}
|
||||
@@ -1852,6 +2115,10 @@ static ncclResult_t calcCollChunking(
|
||||
}
|
||||
}
|
||||
|
||||
if (pattern == ncclPatternPatUp || pattern == ncclPatternPatDown) {
|
||||
proxyOp->nbytes = DIVUP(nBytes, nChannels);
|
||||
}
|
||||
|
||||
*outChunkSize = chunkSize;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1874,6 +2141,7 @@ static ncclResult_t hostToDevRedOp(
|
||||
opFull->proxyOp = op;
|
||||
|
||||
int nbits = 8*ncclTypeSize(datatype);
|
||||
if (nbits <= 0) return ncclInvalidArgument;
|
||||
uint64_t allBits = uint64_t(-1)>>(64-nbits);
|
||||
uint64_t signBit = allBits^(allBits>>1);
|
||||
|
||||
@@ -1947,8 +2215,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
|
||||
p2p->func = info->coll;
|
||||
p2p->buff = (void*)info->recvbuff;
|
||||
p2p->count = info->count;
|
||||
p2p->datatype = info->datatype;
|
||||
p2p->root = info->root;
|
||||
p2p->bytes = nBytes;
|
||||
ncclIntruQueueEnqueue(
|
||||
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
|
||||
@@ -1996,7 +2268,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
} else {
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm);
|
||||
struct ncclTaskColl* t = ncclMemoryStackAlloc<struct ncclTaskColl>(&comm->memScoped);
|
||||
struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
|
||||
t->func = info->coll;
|
||||
t->sendbuff = info->sendbuff;
|
||||
t->recvbuff = info->recvbuff;
|
||||
@@ -2026,7 +2298,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
while (true) {
|
||||
if (l == nullptr) { // Got to the end, this must be a new stream.
|
||||
struct ncclCudaGraph graph;
|
||||
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream))
|
||||
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream));
|
||||
if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) {
|
||||
WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
|
||||
return ncclInvalidUsage;
|
||||
@@ -2075,7 +2347,7 @@ exit:
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
|
||||
* so we have to check state here. */
|
||||
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
|
||||
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)); }
|
||||
return ret;
|
||||
fail:
|
||||
if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
|
||||
@@ -2093,7 +2365,8 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
|
||||
int cap = 2*comm->userRedOpCapacity;
|
||||
if (cap < 4) cap = 4;
|
||||
ncclUserRedOp *ops = new ncclUserRedOp[cap];
|
||||
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
|
||||
if (comm->userRedOpCapacity > 0)
|
||||
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
|
||||
for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
|
||||
ops[ix].freeNext = ix + 1;
|
||||
delete[] comm->userRedOps;
|
||||
@@ -2109,8 +2382,10 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
|
||||
user->datatype = datatype;
|
||||
user->opFull.op = ncclDevPreMulSum;
|
||||
if (residence == ncclScalarHostImmediate) {
|
||||
int size = ncclTypeSize(datatype);
|
||||
if (size < 1) return ncclInternalError;
|
||||
user->opFull.scalarArgIsPtr = false;
|
||||
std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
|
||||
std::memcpy(&user->opFull.scalarArg, scalar, size);
|
||||
} else {
|
||||
user->opFull.scalarArgIsPtr = true;
|
||||
user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
|
||||
@@ -2127,6 +2402,10 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
|
||||
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
// int(ncclMaxRedOp) < int(op) will always be false due to the sizes of
|
||||
// the datatypes involved, and that's by design. We keep the check though
|
||||
// just as a reminder.
|
||||
// coverity[result_independent_of_operands]
|
||||
if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
|
||||
WARN("ncclRedOpDestroy : operator is garbage.");
|
||||
return ncclInvalidArgument;
|
||||
|
||||
@@ -226,6 +226,8 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
}
|
||||
}
|
||||
channel->collnetDirect.nHeads = nHeads;
|
||||
// nHeads should always be greater than 0.
|
||||
// coverity[divide_by_zero]
|
||||
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
|
||||
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
|
||||
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
|
||||
@@ -374,20 +376,21 @@ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
|
||||
int nranks = comm->nRanks;
|
||||
int nNodes = comm->nNodes;
|
||||
int nChannels = comm->nChannels;
|
||||
int minHeadNum = INT_MAX;
|
||||
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
|
||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
|
||||
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
|
||||
|
||||
// Alternate rings to avoid crossing rails
|
||||
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
|
||||
@@ -433,8 +436,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
|
||||
// Connect rings and trees. This should also duplicate the channels.
|
||||
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
|
||||
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
|
||||
NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
|
||||
NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
|
||||
|
||||
// Duplicate ringPrev/ringNext for ncclBuildRing
|
||||
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
|
||||
@@ -459,7 +462,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
|
||||
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
|
||||
}
|
||||
|
||||
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
|
||||
@@ -493,7 +496,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
if (comm->nChannels < comm->nvlsChannels) {
|
||||
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
|
||||
NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
|
||||
#endif
|
||||
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
|
||||
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
|
||||
@@ -501,16 +504,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
}
|
||||
|
||||
// Create rings array and check all is fine
|
||||
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
|
||||
NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
|
||||
|
||||
free(ringRecv);
|
||||
free(ringSend);
|
||||
free(ringPrev);
|
||||
free(ringNext);
|
||||
free(treeToParent);
|
||||
free(treeToChild0);
|
||||
free(treeToChild1);
|
||||
free(nvlsHeads);
|
||||
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
if (ringRecv) free(ringRecv);
|
||||
if (ringSend) free(ringSend);
|
||||
if (ringPrev) free(ringPrev);
|
||||
if (ringNext) free(ringNext);
|
||||
if (treeToParent) free(treeToParent);
|
||||
if (treeToChild0) free(treeToChild0);
|
||||
if (treeToChild1) free(treeToChild1);
|
||||
if (nvlsHeads) free(nvlsHeads);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -36,13 +36,13 @@ NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
|
||||
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
||||
if (baseNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
for (int i=0; i<system->nodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS;
|
||||
}
|
||||
|
||||
// breadth-first search to set all paths to that node in the system
|
||||
struct ncclTopoNodeList nodeList;
|
||||
struct ncclTopoNodeList nextNodeList;
|
||||
struct ncclTopoNodeList nextNodeList = { { 0 }, 0 };
|
||||
nodeList.count = 1; nodeList.list[0] = baseNode;
|
||||
nextNodeList.count = 0;
|
||||
struct ncclTopoLinkList* basePath;
|
||||
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
|
||||
basePath->count = 0;
|
||||
@@ -116,9 +116,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
|
||||
const int linesize = 1024;
|
||||
char line[linesize];
|
||||
#ifdef ENABLE_TRACE
|
||||
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
|
||||
#else
|
||||
snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
|
||||
snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
|
||||
int offset = strlen(line);
|
||||
#endif
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
||||
@@ -155,14 +155,14 @@ ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
|
||||
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
|
||||
// Find the closest CPU to a GPU
|
||||
int minHops = 0;
|
||||
int localCpu = -1;
|
||||
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
|
||||
for (int c=0; c<system->nodes[CPU].count; c++) {
|
||||
int hops = paths[c].count;
|
||||
if (minHops == 0 || hops < minHops) {
|
||||
if (hops > 0 && (minHops == 0 || hops < minHops)) {
|
||||
localCpu = c;
|
||||
minHops = hops;
|
||||
}
|
||||
@@ -193,20 +193,15 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Remove/free paths for a given type
|
||||
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
|
||||
// Remove links _to_ the given type
|
||||
for (int n=0; n<system->nodes[t].count; n++) {
|
||||
struct ncclTopoNode* node = system->nodes[t].nodes+n;
|
||||
free(node->paths[nodeType]);
|
||||
node->paths[nodeType] = NULL;
|
||||
}
|
||||
// Remove links _from_ the given type
|
||||
for (int n=0; n<system->nodes[nodeType].count; n++) {
|
||||
struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
|
||||
free(node->paths[t]);
|
||||
node->paths[t] = NULL;
|
||||
// Remove/free all paths
|
||||
static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
|
||||
for (int t1=0; t1<NCCL_TOPO_NODE_TYPES; t1++) {
|
||||
for (int n=0; n<system->nodes[t1].count; n++) {
|
||||
struct ncclTopoNode* node = system->nodes[t1].nodes+n;
|
||||
for (int t2=0; t2<NCCL_TOPO_NODE_TYPES; t2++) {
|
||||
if (node->paths[t2]) free(node->paths[t2]);
|
||||
node->paths[t2] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -220,6 +215,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
if (str) {
|
||||
int disable = strtol(str, NULL, 0);
|
||||
if (disable == 1) l = 0;
|
||||
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
|
||||
}
|
||||
}
|
||||
if (l == -1) {
|
||||
@@ -241,9 +237,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
|
||||
l = levelsOldToNew[oldLevel];
|
||||
}
|
||||
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
|
||||
}
|
||||
}
|
||||
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
|
||||
*level = l >= 0 ? l : -2;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -252,16 +248,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
|
||||
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
|
||||
|
||||
int ncclTopoUserP2pLevel = -1;
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
|
||||
*p2p = 0;
|
||||
if (read) *read = 0;
|
||||
if (intermediateRank) *intermediateRank = -1;
|
||||
|
||||
// Get GPUs from topology
|
||||
int g1, g2;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
|
||||
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
|
||||
if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
|
||||
if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
|
||||
// GPU not found, we can't use p2p.
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -277,8 +273,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
}
|
||||
}
|
||||
|
||||
// In general, use P2P whenever we can.
|
||||
int p2pLevel = PATH_SYS;
|
||||
// By default don't use P2P across CPU Host Bridges and further apart
|
||||
int p2pLevel = PATH_PXB;
|
||||
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
// Allow P2P between pairs of GPUs on AMD systems
|
||||
if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS;
|
||||
|
||||
// User override
|
||||
if (ncclTopoUserP2pLevel == -1)
|
||||
@@ -288,16 +289,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
goto compare;
|
||||
}
|
||||
|
||||
// Don't use P2P through ARM CPUs
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
|
||||
compare:
|
||||
// Compute the PCI distance and compare with the p2pLevel.
|
||||
@@ -438,7 +429,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
|
||||
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
|
||||
|
||||
// Check whether going through the network would be faster than going through P2P/SHM.
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) {
|
||||
if (ncclParamNetDisableIntra() == 1) {
|
||||
*net = 0;
|
||||
return ncclSuccess;
|
||||
@@ -446,8 +437,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
*net = 1;
|
||||
// First check the current GPU-to-GPU speed.
|
||||
int g1, g2;
|
||||
if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess ||
|
||||
ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) {
|
||||
if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
|
||||
ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -545,7 +536,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
// Precompute paths between GPUs/NICs.
|
||||
|
||||
// Remove everything in case we're re-computing
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
|
||||
ncclTopoRemovePaths(system);
|
||||
|
||||
// Set direct paths to CPUs. We need them in many cases.
|
||||
for (int c=0; c<system->nodes[CPU].count; c++) {
|
||||
@@ -571,11 +562,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
|
||||
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
|
||||
if (p2p == 0) {
|
||||
// Divert all traffic through the CPU
|
||||
int cpu;
|
||||
NCCLCHECK(getLocalCpu(system, g, &cpu));
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &cpu));
|
||||
NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
|
||||
}
|
||||
}
|
||||
@@ -587,10 +578,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
if (p == g) continue;
|
||||
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo));
|
||||
if (p2p == 0) {
|
||||
int shm;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo));
|
||||
if (shm == 0) {
|
||||
// Mark this peer as inaccessible. We'll trim it later.
|
||||
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
|
||||
@@ -631,7 +622,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
if (gdr == 0) {
|
||||
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
|
||||
int localCpu;
|
||||
NCCLCHECK(getLocalCpu(system, g, &localCpu));
|
||||
NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
|
||||
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
|
||||
}
|
||||
@@ -642,11 +633,13 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int *domains;
|
||||
int64_t *ids;
|
||||
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
|
||||
NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
|
||||
int64_t *ids = NULL;
|
||||
int myDomain = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
|
||||
NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
domains[g] = g;
|
||||
@@ -659,7 +652,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
|
||||
}
|
||||
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
if (domains[i] == myDomain) continue;
|
||||
struct ncclTopoNode* gpu = NULL;
|
||||
@@ -670,24 +662,26 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
}
|
||||
if (gpu == NULL) {
|
||||
WARN("Could not find id %lx", ids[i]);
|
||||
free(domains);
|
||||
free(ids);
|
||||
return ncclInternalError;
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
|
||||
NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail);
|
||||
}
|
||||
|
||||
if (system->nodes[GPU].count == comm->nRanks) {
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
|
||||
}
|
||||
exit:
|
||||
free(domains);
|
||||
free(ids);
|
||||
return ncclSuccess;
|
||||
if (ids) free(ids);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
void ncclTopoFree(struct ncclTopoSystem* system) {
|
||||
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
|
||||
ncclTopoRemovePaths(system);
|
||||
free(system);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,17 +6,23 @@
|
||||
|
||||
#include "core.h"
|
||||
|
||||
#define MAXWIDTH 20
|
||||
#define PREFIXLEN 15
|
||||
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
|
||||
void dumpLine(int* values, int nranks, const char* prefix) {
|
||||
int prefixlen = strlen(prefix);
|
||||
char line[STRLENGTH+1];
|
||||
line[STRLENGTH] = '\0';
|
||||
memset(line, ' ', STRLENGTH);
|
||||
strncpy(line, prefix, PREFIXLEN);
|
||||
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
|
||||
INFO(NCCL_INIT,"%s", line);
|
||||
constexpr int line_length = 128;
|
||||
char line[line_length];
|
||||
int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf"
|
||||
int n = snprintf(line, line_length, "%s", prefix);
|
||||
for (int i = 0; i < nranks && n < line_length-1; i++) {
|
||||
n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]);
|
||||
// At this point n may be more than line_length-1, so don't use it
|
||||
// for indexing into "line".
|
||||
}
|
||||
if (n >= line_length) {
|
||||
// Sprintf wanted to write more than would fit in the buffer. Assume
|
||||
// line_length is at least 4 and replace the end with "..." to
|
||||
// indicate that it was truncated.
|
||||
snprintf(line+line_length-4, 4, "...");
|
||||
}
|
||||
INFO(NCCL_INIT, "%s", line);
|
||||
}
|
||||
|
||||
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
|
||||
@@ -32,7 +38,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
|
||||
rings[r*nranks+i] = current;
|
||||
current = next[r*nranks+current];
|
||||
}
|
||||
sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
|
||||
snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
|
||||
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
|
||||
if (current != rank) {
|
||||
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
|
||||
|
||||
@@ -104,6 +104,9 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
|
||||
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
|
||||
revBw += fwBw;
|
||||
}
|
||||
// Coverity thinks that revLink could be NULL below. However, we access it only if revBw is non-0, and the
|
||||
// logic of the code is that revBw can become non-0 only if revLink is non-NULL (see the "if" statement right above).
|
||||
// coverity[var_deref_op]
|
||||
if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
|
||||
SUB_ROUND(link->bw, fwBw);
|
||||
if (revBw) SUB_ROUND(revLink->bw, revBw);
|
||||
@@ -444,6 +447,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
// 2. add other NETs satisfying typeInter but not already in the list.
|
||||
|
||||
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int netCount = 0;
|
||||
int localNetCount;
|
||||
int* localNets;
|
||||
@@ -456,8 +460,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
for (int c = 0; c<MAXCHANNELS; c++) {
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
|
||||
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
|
||||
localNetCount++;
|
||||
}
|
||||
@@ -491,12 +495,15 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
|
||||
}
|
||||
|
||||
*netCountRet = netCount;
|
||||
exit:
|
||||
free(localNets);
|
||||
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if ((*time) <= 0) return ncclSuccess;
|
||||
(*time)--;
|
||||
|
||||
@@ -518,6 +525,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
|
||||
int g = gpu - system->nodes[GPU].nodes;
|
||||
int* nets = NULL;
|
||||
if (step == backToNet) {
|
||||
// first get back to NIC
|
||||
if (system->nodes[NET].count) {
|
||||
@@ -525,15 +533,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
|
||||
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
|
||||
int netCount;
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
|
||||
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
|
||||
for (int i=0; i<netCount; i++) {
|
||||
int n = nets[i];
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
|
||||
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
|
||||
if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
|
||||
} else {
|
||||
if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
|
||||
}
|
||||
|
||||
// Balanced Tree : count half of the bandwidth on first two GPUs
|
||||
int nextBackToNet = -1;
|
||||
@@ -545,18 +555,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
graph->bwInter /= 2;
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
|
||||
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
|
||||
graph->bwInter = bwInterSave;
|
||||
if (net) {
|
||||
graph->inter[graph->nChannels*2+1] = net->id;
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
|
||||
NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
|
||||
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
|
||||
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
|
||||
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
|
||||
graph->bwInter = bwInterSave;
|
||||
}
|
||||
}
|
||||
free(nets);
|
||||
}
|
||||
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
|
||||
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
|
||||
@@ -592,23 +601,29 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
// Next path
|
||||
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
|
||||
}
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
if (nets) free(nets);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
const int bw = graph->bwInter;
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
int netCount;
|
||||
int graphFound = 0;
|
||||
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
|
||||
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
|
||||
for (int i=0; i<netCount; i++) {
|
||||
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
|
||||
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
|
||||
int n = nets[(graph->nChannels+i)%netCount];
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
if (net->net.bw < bw) continue;
|
||||
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
|
||||
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2
|
||||
&& (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
|
||||
|
||||
graph->inter[graph->nChannels*2] = net->id;
|
||||
graph->latencyInter = net->net.latency;
|
||||
@@ -624,31 +639,34 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
|
||||
if (graph->nChannels < netCount) {
|
||||
int gpu;
|
||||
int duplicate = 0;
|
||||
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
|
||||
// check whether there is duplicate head when one GPU connects with multiple NICs
|
||||
for (int gc = 0; gc < graph->nChannels; gc++) {
|
||||
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
|
||||
duplicate = 1;
|
||||
break;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
|
||||
if (gpu != -1) {
|
||||
int duplicate = 0;
|
||||
// check whether there is duplicate head when one GPU connects with multiple NICs
|
||||
for (int gc = 0; gc < graph->nChannels; gc++) {
|
||||
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
|
||||
duplicate = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!duplicate) {
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
|
||||
graphFound = 1;
|
||||
}
|
||||
}
|
||||
if (duplicate) continue;
|
||||
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
|
||||
graphFound = 1;
|
||||
}
|
||||
} else {
|
||||
if (graph->nChannels > 0) {
|
||||
// Try to replay the last channel
|
||||
int g;
|
||||
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
|
||||
NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
|
||||
}
|
||||
if (graph->nChannels == 0 || graph->sameChannels == 0) {
|
||||
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
|
||||
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
|
||||
int t = 1 << 10;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
|
||||
if (t == -1) *time = -1;
|
||||
}
|
||||
|
||||
@@ -660,7 +678,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
if (paths[g].bw > maxBw) {
|
||||
maxBw = paths[g].bw;
|
||||
minHops = paths[g].count;
|
||||
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
|
||||
} else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) {
|
||||
minHops = paths[g].count;
|
||||
}
|
||||
}
|
||||
@@ -668,7 +686,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
int g = (graph->nChannels+i)%system->nodes[GPU].count;
|
||||
if (paths[g].bw == maxBw && paths[g].count == minHops) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
|
||||
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -682,8 +700,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
}
|
||||
}
|
||||
exit:
|
||||
free(nets);
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* Search Patterns
|
||||
@@ -1040,9 +1061,10 @@ search:
|
||||
}
|
||||
tmpGraph.typeInter = PATH_PIX;
|
||||
|
||||
if (crossNic == 2 && tmpGraph.crossNic == 0) {
|
||||
if (crossNic == 2 && tmpGraph.crossNic == 0
|
||||
&& (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
|
||||
// Try again with crossNic if permitted
|
||||
tmpGraph.crossNic = 1;
|
||||
tmpGraph.crossNic = 2;
|
||||
goto search;
|
||||
}
|
||||
tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
|
||||
@@ -1112,7 +1134,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
sprintf(line, "%2d :", c);
|
||||
int offset = strlen(line);
|
||||
if (system->nodes[NET].count > 0) {
|
||||
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
|
||||
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
|
||||
offset = strlen(line);
|
||||
}
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
@@ -1120,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
offset = strlen(line);
|
||||
}
|
||||
if (system->nodes[NET].count > 0) {
|
||||
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]);
|
||||
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
|
||||
offset = strlen(line);
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
@@ -1129,16 +1151,20 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
|
||||
struct ncclXml* xml = NULL;
|
||||
if (str) {
|
||||
INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
|
||||
struct ncclXml* xml;
|
||||
NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
|
||||
NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
|
||||
free(xml);
|
||||
NCCLCHECKGOTO(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoDumpXmlToFile(str, xml), ret, fail);
|
||||
}
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
if (xml) free(xml);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
#include "comm.h"
|
||||
|
||||
@@ -192,6 +192,7 @@ int getBcmGen(uint64_t id, int level) {
|
||||
return 0;
|
||||
}
|
||||
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
for (int s=0; s<system->nodes[PCI].count; s++) {
|
||||
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
|
||||
int gen = getBcmGen(pciSwitch->pci.device, 0);
|
||||
@@ -217,7 +218,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
for (int s=0; s<subs; s++) {
|
||||
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
|
||||
int index;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
|
||||
NCCLCHECKGOTO(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index), ret, fail);
|
||||
struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
|
||||
// Connect all sub PCI devices to the parent switch
|
||||
for (int l=0; l<sub->nlinks; l++) {
|
||||
@@ -226,7 +227,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
// Add link from parent PCI switch -> PCI device
|
||||
if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
|
||||
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
|
||||
return ncclInternalError;
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
|
||||
pciSwitch->nlinks++;
|
||||
@@ -238,16 +240,20 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
|
||||
}
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
|
||||
NCCLCHECKGOTO(ncclTopoRemoveNode(system, PCI, index), ret, fail);
|
||||
}
|
||||
// Set subdevice to 0xffff to make sure we don't merge this switch again.
|
||||
pciSwitch->pci.device |= 0xffff;
|
||||
free(subSwIds);
|
||||
// Restart, as system->nodes[PCI].nodes has changed.
|
||||
s = 0;
|
||||
continue;
|
||||
fail:
|
||||
free(subSwIds);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
|
||||
@@ -281,7 +287,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
for (int l=0; l<node->nlinks; l++) {
|
||||
struct ncclTopoLink* link = node->links+l;
|
||||
if (link->type == LINK_LOC) {
|
||||
sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
|
||||
sprintf(line+offset, "+ %s[%2.1f] - %s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
} else if (link->type != LINK_PCI || link->remNode != prevNode) {
|
||||
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
|
||||
@@ -290,9 +296,9 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
|
||||
} else {
|
||||
if (link->remNode->type == NET) {
|
||||
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
|
||||
sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
|
||||
} else {
|
||||
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
|
||||
sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
|
||||
}
|
||||
INFO(NCCL_GRAPH, "%s", line);
|
||||
}
|
||||
@@ -720,84 +726,87 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclXml* xml;
|
||||
char* mem = NULL;
|
||||
int* localRanks = NULL;
|
||||
int netDevCount = 0;
|
||||
struct ncclXml* rankXml;
|
||||
int localRank = -1, nLocalRanks = 0;
|
||||
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
|
||||
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
|
||||
if (xmlTopoFile) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
|
||||
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
|
||||
NCCLCHECKGOTO(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1), ret, fail);
|
||||
} else {
|
||||
// Try default XML topology location
|
||||
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
|
||||
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail);
|
||||
}
|
||||
if (xml->maxIndex == 0) {
|
||||
// Create top tag
|
||||
struct ncclXmlNode* top;
|
||||
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
|
||||
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
|
||||
NCCLCHECKGOTO(xmlAddNode(xml, NULL, "system", &top), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION), ret, fail);
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
|
||||
NCCLCHECKGOTO(ncclTopoRefreshBcmP2pLinks(), ret, fail);
|
||||
|
||||
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
|
||||
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
|
||||
NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
|
||||
NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail);
|
||||
struct ncclXmlNode* node;
|
||||
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
|
||||
NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail);
|
||||
if (node) {
|
||||
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
|
||||
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
|
||||
}
|
||||
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
|
||||
// so we start with collnet so that it has precedence.
|
||||
int netDevCount = 0;
|
||||
if (collNetSupport(comm)) {
|
||||
NCCLCHECK(collNetDevices(comm, &netDevCount));
|
||||
NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
|
||||
for (int n=0; n<netDevCount; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(collNetGetProperties(comm, n, &props));
|
||||
NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
|
||||
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
|
||||
}
|
||||
}
|
||||
if (netDevCount == 0) {
|
||||
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
|
||||
NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
|
||||
}
|
||||
for (int n=0; n<netDevCount; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
|
||||
NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
|
||||
comm->netDeviceType = props.netDeviceType;
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
|
||||
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
|
||||
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
|
||||
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
|
||||
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
|
||||
}
|
||||
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
NCCLCHECK(ncclTopoTrimXml(xml));
|
||||
NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
|
||||
|
||||
// XML topo fusion.
|
||||
int* localRanks;
|
||||
int localRank = -1, nLocalRanks = 0;
|
||||
if (comm->MNNVL) {
|
||||
// MNNVL clique support
|
||||
nLocalRanks = comm->clique.size;
|
||||
@@ -805,7 +814,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
localRanks = comm->clique.ranks;
|
||||
} else {
|
||||
// Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations.
|
||||
NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
|
||||
NCCLCHECKGOTO(ncclCalloc(&localRanks, comm->nRanks), ret, fail);
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
|
||||
if (i == comm->rank)
|
||||
@@ -814,37 +823,42 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
}
|
||||
}
|
||||
}
|
||||
char* mem;
|
||||
NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
|
||||
NCCLCHECKGOTO(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
|
||||
rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
|
||||
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
|
||||
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
|
||||
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
|
||||
NCCLCHECKGOTO(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1), ret, fail);
|
||||
// nLocalRanks can't actually be 0, or we wouldn't be running at all...
|
||||
// coverity[divide_by_zero]
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
|
||||
if (comm->MNNVL) {
|
||||
// Ensure that we have enough room when fusing topos from multiple nodes.
|
||||
free(xml);
|
||||
NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
|
||||
xml = NULL;
|
||||
NCCLCHECKGOTO(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES), ret, fail);
|
||||
} else {
|
||||
// In the intra-node case there's no need to enlarge the topo xml.
|
||||
xml->maxIndex = 0;
|
||||
free(localRanks);
|
||||
}
|
||||
for (int i = 0; i < nLocalRanks; i++) {
|
||||
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
|
||||
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
|
||||
NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
|
||||
NCCLCHECKGOTO(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
|
||||
}
|
||||
free(mem);
|
||||
|
||||
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
|
||||
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
|
||||
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
|
||||
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
|
||||
NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
|
||||
NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
|
||||
exit:
|
||||
if (!comm->MNNVL && localRanks) free(localRanks);
|
||||
if (mem) free(mem);
|
||||
free(xml);
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
|
||||
@@ -853,6 +867,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
|
||||
int count = 0;
|
||||
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
|
||||
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
|
||||
if (paths == NULL) { *localCount = 0; return ncclSuccess; }
|
||||
for (int i=0; i<system->nodes[resultType].count; i++) {
|
||||
if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
|
||||
maxBw = paths[i].bw;
|
||||
@@ -891,6 +906,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int gpu;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
|
||||
int* localNets;
|
||||
@@ -898,39 +914,46 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
|
||||
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
|
||||
int* localGpus = NULL;
|
||||
int localGpuCount;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
|
||||
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
|
||||
int net;
|
||||
NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
|
||||
net = system->nodes[GPU].nodes[gpu].gpu.dev;
|
||||
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
|
||||
net += channelId%(DIVUP(localNetCount,localGpuCount));
|
||||
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
|
||||
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
|
||||
exit:
|
||||
free(localNets);
|
||||
free(localGpus);
|
||||
return ncclSuccess;
|
||||
if (localGpus) free(localGpus);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int netIndex;
|
||||
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
|
||||
int* localGpus = NULL;
|
||||
int localGpuCount;
|
||||
int foundGpu = -1;
|
||||
NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
for (int lg=0; lg<localGpuCount; lg++) {
|
||||
int g = localGpus[lg];
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
int64_t id;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL));
|
||||
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
|
||||
if (netId == id) {
|
||||
*gpuIndex = g;
|
||||
free(localGpus);
|
||||
return ncclSuccess;
|
||||
foundGpu = g;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
}
|
||||
exit:
|
||||
*gpuIndex = foundGpu;
|
||||
fail:
|
||||
free(localGpus);
|
||||
*gpuIndex = -1;
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/****************************/
|
||||
@@ -948,25 +971,11 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
|
||||
|
||||
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
|
||||
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
|
||||
gpu = system->nodes[GPU].nodes+g;
|
||||
// Find closer CPU
|
||||
int cpuIndex = -1, minHops = 0;
|
||||
for (int c=0; c<system->nodes[CPU].count; c++) {
|
||||
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
|
||||
if (cpuIndex == -1 || nHops < minHops) {
|
||||
cpuIndex = c;
|
||||
minHops = nHops;
|
||||
}
|
||||
}
|
||||
cpu = system->nodes[CPU].nodes+cpuIndex;
|
||||
}
|
||||
}
|
||||
if (cpu == NULL) {
|
||||
WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int gpuIndex, cpuIndex;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex));
|
||||
NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex));
|
||||
gpu = system->nodes[GPU].nodes+gpuIndex;
|
||||
cpu = system->nodes[CPU].nodes+cpuIndex;
|
||||
|
||||
// Query the CPU affinity set we were provided
|
||||
cpu_set_t mask;
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
|
||||
|
||||
#define NCCL_TOPO_NODE_TYPES 7
|
||||
#define NCCL_TOPO_NODE_TYPES 6
|
||||
#define GPU 0
|
||||
#define PCI 1
|
||||
#define NVS 2
|
||||
@@ -103,9 +103,10 @@ struct ncclTopoLinkList {
|
||||
|
||||
#define NCCL_TOPO_UNDEF (-1)
|
||||
|
||||
#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
|
||||
#define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56)
|
||||
#define NCCL_TOPO_ID_LOCAL_ID(id) (id & 0x00ffffffffffffff)
|
||||
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + localid)
|
||||
#define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK)
|
||||
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK))
|
||||
|
||||
struct ncclTopoNode {
|
||||
int type;
|
||||
|
||||
@@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
|
||||
// Latencies in us, Bandwidths in GB/s
|
||||
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
|
||||
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
|
||||
{ 6.8, 14.0, 0 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
|
||||
{ 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
|
||||
{ 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain
|
||||
{ 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
|
||||
|
||||
@@ -64,15 +64,15 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
|
||||
#define NCCL_HW_NET 2
|
||||
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
|
||||
{ /* NVLINK */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 28 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
|
||||
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
||||
/* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
|
||||
/* PCI */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
|
||||
/* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
|
||||
/* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
|
||||
/* NET */
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
|
||||
{ /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
|
||||
/* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
|
||||
/* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
|
||||
};
|
||||
@@ -105,6 +105,15 @@ static const double perChMaxTreeBws[3][3] = {
|
||||
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
|
||||
};
|
||||
|
||||
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
|
||||
static int ncclPatEnable(struct ncclComm* comm) {
|
||||
int patEnable = ncclParamPatEnable();
|
||||
if (patEnable != 2) return patEnable;
|
||||
if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node
|
||||
if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Network post overhead in ns (1000 = 1 us)
|
||||
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
|
||||
|
||||
@@ -146,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
|
||||
// De-penalize Tree/Simple latency on Power systems to favor Tree over Ring
|
||||
if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
|
||||
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
|
||||
float ppn = (float)nRanks / nNodes;
|
||||
|
||||
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
|
||||
@@ -156,18 +165,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
||||
nRanks;
|
||||
int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
|
||||
nNodes;
|
||||
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
|
||||
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
|
||||
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
|
||||
&& a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
|
||||
&& a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
|
||||
if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
|
||||
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
|
||||
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
|
||||
&& a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
|
||||
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
|
||||
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
|
||||
if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
|
||||
@@ -176,11 +185,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
|
||||
// Various model refinements
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
|
||||
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
|
||||
if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
|
||||
if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
|
||||
if (a == NCCL_ALGO_PAT) busBw *= .85;
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
|
||||
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
|
||||
@@ -208,7 +218,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
|
||||
float ratio = 1.0f;
|
||||
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
|
||||
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
|
||||
@@ -222,7 +232,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
|
||||
comm->latencies[coll][a][p] = baseLat[a][p];
|
||||
float intraLat = hwLat[intraHw[a]][a][p];
|
||||
float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter;
|
||||
// With ppn=1 latencies are fully exposed, use the Tree network latency
|
||||
float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p];
|
||||
interLat += graphs[a]->latencyInter;
|
||||
// Also add the flush extra latency
|
||||
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
|
||||
|
||||
@@ -243,11 +255,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
|
||||
}
|
||||
intraLat = std::max(intraLat, netOverhead);
|
||||
int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
|
||||
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
|
||||
}
|
||||
} else if (a == NCCL_ALGO_TREE) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
|
||||
}
|
||||
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
|
||||
comm->latencies[coll][a][p] +=
|
||||
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
|
||||
@@ -258,6 +273,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
|
||||
} else if (a == NCCL_ALGO_NVLS_TREE) {
|
||||
comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
|
||||
} else if (a == NCCL_ALGO_PAT) {
|
||||
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
|
||||
comm->latencies[coll][a][p] = 8 // Base time
|
||||
+ log2i(nNodes) * (interLat/3.5) // Log latency
|
||||
+ nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -266,7 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
// Protocols/Algorithms enable/disable, and user overrides.
|
||||
// All are enabled except ll128 which is enabled by default only in certain cases.
|
||||
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
|
||||
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
|
||||
|
||||
const char *protoStr = ncclGetEnv("NCCL_PROTO");
|
||||
if (protoStr) {
|
||||
@@ -336,23 +357,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
|
||||
if (comm->rank == 0) {
|
||||
char line[1024];
|
||||
for (int block=0; block<2; block++) {
|
||||
for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
|
||||
sprintf(line, " Algorithm |");
|
||||
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
|
||||
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
|
||||
for (int ba=0; ba<3; ba++) {
|
||||
int a = block*3+ba;
|
||||
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
||||
sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
|
||||
}
|
||||
INFO(NCCL_TUNING, "%s", line);
|
||||
sprintf(line, " Protocol |");
|
||||
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
|
||||
for (int ba=0; ba<3; ba++) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
|
||||
}
|
||||
}
|
||||
INFO(NCCL_TUNING, "%s", line);
|
||||
sprintf(line, " Max NThreads |");
|
||||
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
|
||||
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
|
||||
for (int ba=0; ba<3; ba++) {
|
||||
int a = block*3+ba;
|
||||
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
|
||||
}
|
||||
@@ -360,8 +383,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
INFO(NCCL_TUNING, "%s", line);
|
||||
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
|
||||
sprintf(line, "%13s |", ncclFuncStr[c]);
|
||||
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
|
||||
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
|
||||
for (int ba=0; ba<3; ba++) {
|
||||
int a = block*3+ba;
|
||||
if (a >= NCCL_NUM_ALGORITHMS) continue;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
|
||||
}
|
||||
@@ -431,7 +455,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
|
||||
*time = -1.0; return ncclSuccess;
|
||||
}
|
||||
int logSize = log2i(nBytes>>6);
|
||||
if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
|
||||
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
|
||||
&& coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
|
||||
lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
|
||||
|
||||
@@ -468,8 +468,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
|
||||
return ncclInternalError;
|
||||
}
|
||||
// Set affinity
|
||||
char cpumaskPath[] = "/sys/devices/system/node/node0000";
|
||||
sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
|
||||
char cpumaskPath[] = "/sys/devices/system/node/node000000";
|
||||
snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
|
||||
}
|
||||
|
||||
@@ -690,6 +690,9 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
}
|
||||
pciNode->parent = parent;
|
||||
// Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
|
||||
// Coverity complains about dereferenced parent being NULL
|
||||
// but this can never happen.
|
||||
// coverity[var_deref_op]
|
||||
int subIndex = parent->nSubs;
|
||||
const char* newBusId;
|
||||
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
|
||||
|
||||
+39
-19
@@ -57,7 +57,12 @@ ncclResult_t ncclAsyncLaunch(
|
||||
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
|
||||
ret = ncclInvalidArgument;
|
||||
}
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
|
||||
if (ret == ncclSuccess) {
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
|
||||
} else {
|
||||
// no need to undo, the job hasn't run
|
||||
if (destructor) destructor(job);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
@@ -75,7 +80,7 @@ void* ncclAsyncJobMain(void* arg) {
|
||||
|
||||
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
|
||||
ncclResult_t ret;
|
||||
SYSCHECK(pthread_join(job->thread, NULL), "pthread_join");
|
||||
PTHREADCHECK(pthread_join(job->thread, NULL), "pthread_join");
|
||||
if (job->result != ncclSuccess) {
|
||||
WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result);
|
||||
}
|
||||
@@ -165,6 +170,12 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
case NCCL_ALGO_PAT: {
|
||||
NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
|
||||
break;
|
||||
}
|
||||
// Yes, this is dead code. That's fine...
|
||||
// coverity[dead_error_begin]
|
||||
default: {
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
@@ -301,7 +312,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
|
||||
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
comm->planner.peers = tmp;
|
||||
memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
|
||||
if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
|
||||
}
|
||||
|
||||
if (!comm->config.blocking)
|
||||
@@ -329,7 +340,7 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
|
||||
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
|
||||
do {
|
||||
SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail);
|
||||
PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
@@ -341,8 +352,9 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
|
||||
if (state == ncclGroupJobRunning) {
|
||||
jobsDone = false;
|
||||
} else if (state == ncclGroupJobDone) {
|
||||
if (pthread_join(job->thread, nullptr) != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
int err;
|
||||
if ((err = pthread_join(job->thread, nullptr)) != 0) {
|
||||
WARN("Error waiting for pthread_join: %s", strerror(err));
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
job->state = ncclGroupJobJoined;
|
||||
@@ -373,13 +385,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
|
||||
if (ret != ncclSuccess) goto fail;
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
|
||||
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
|
||||
(void) ncclCommSetAsyncError(job->comm, ret);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
@@ -393,6 +398,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
|
||||
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
|
||||
|
||||
bool *groupAbortFlag = gjob->abortFlagPtr;
|
||||
|
||||
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
|
||||
@@ -409,7 +415,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, (struct ncclAsyncJob*)job);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
@@ -422,12 +428,14 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
/* Connect channels at runtime if cumem is supported */
|
||||
if (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
|
||||
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
|
||||
ncclIntruQueueConstruct(&asyncCollJobs);
|
||||
do {
|
||||
bool needConnect = false;
|
||||
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
|
||||
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
|
||||
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
|
||||
|
||||
if (comm->cuMemSupport && needConnect) {
|
||||
@@ -438,21 +446,33 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
|
||||
job->base.destructor = free;
|
||||
job->base.state = ncclGroupJobRunning;
|
||||
job->base.abortFlag = comm->abortFlag;
|
||||
job->base.abortFlagDev = comm->abortFlagDev;
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
|
||||
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
|
||||
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
|
||||
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
|
||||
}
|
||||
comm = comm->groupNext;
|
||||
} while (comm);
|
||||
|
||||
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
|
||||
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
|
||||
while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
}
|
||||
|
||||
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
|
||||
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
|
||||
}
|
||||
|
||||
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
|
||||
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
|
||||
(void) ncclCommSetAsyncError(job->comm, ret);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
while (groupCommHeadMain != nullptr) {
|
||||
struct ncclComm* comm = groupCommHeadMain;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
@@ -517,7 +537,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
|
||||
ncclGroupJobMainPtr = &ncclGroupJobMain;
|
||||
/* make sure ncclGroupBlocking has been set. */
|
||||
assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
|
||||
if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) {
|
||||
if (ncclGroupBlocking == 0) {
|
||||
/* nonblocking group */
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
@@ -539,7 +559,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
|
||||
}
|
||||
|
||||
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
|
||||
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
|
||||
PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail);
|
||||
ret = ncclInProgress;
|
||||
} else {
|
||||
/* blocking group */
|
||||
|
||||
@@ -17,6 +17,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
#endif
|
||||
|
||||
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
|
||||
template<typename T>
|
||||
@@ -24,6 +29,81 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
|
||||
template<>
|
||||
constexpr size_t ncclSizeOfT<void>() { return 1; }
|
||||
|
||||
#if CUDART_VERSION >= 12020
|
||||
|
||||
/*
 * Allocate pinned host memory with the CUDA virtual memory management (cuMem*)
 * API and map it for both the current GPU and the CPU.
 *
 * The allocation is placed on the host NUMA node reported for the current CUDA
 * device (CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID), falling back to node 0 when the
 * reported id is negative.
 *
 * ptr     - out: receives the mapped virtual address.
 * handlep - out, optional: when non-NULL, receives the allocation handle.
 * size    - requested size in bytes; ALIGN_SIZE is applied to it with the
 *           minimum allocation granularity before allocating (presumably
 *           rounding it up in place - confirm against the macro).
 *
 * Returns ncclSuccess when every step succeeds.
 * NOTE(review): CUDACHECK/CUCHECK are defined elsewhere and presumably return
 * early on error; if so, resources obtained by earlier steps are not released
 * on that path - confirm.
 */
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
  ncclResult_t result = ncclSuccess;
  size_t granularity = 0;
  CUdevice currentDev;
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int cpuNumaNodeId = -1;
  CUmemAllocationHandleType type = ncclCuMemHandleType;

  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
  // Fall back to NUMA node 0 when the driver reports a negative id.
  if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
  prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.requestedHandleTypes = type; // So it can be exported
  prop.location.id = cpuNumaNodeId;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);
  /* Allocate the physical memory on the selected host NUMA node */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
  /* Now allow RW access to the newly mapped memory for local GPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  /* Now allow RW access to the newly mapped memory from the CPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  accessDesc.location.id = cpuNumaNodeId;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  if (handlep) *handlep = handle;
  INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zi pointer %p handle %llx numa %d dev %d granularity %ld", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity);
  return result;
}
|
||||
|
||||
/*
 * Release host memory referenced by a cuMem-mapped address.
 *
 * A NULL ptr is a no-op that returns ncclSuccess. Otherwise the allocation
 * handle is looked up from the address, the mapped range size is queried, the
 * range is unmapped, the handle is released and the address range is freed.
 *
 * NOTE(review): cuMemRelease is called twice on the same handle. The first
 * call presumably balances the reference taken by
 * cuMemRetainAllocationHandle and the second drops the allocation's own
 * reference - confirm against the CUDA driver API documentation.
 */
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
  if (!ptr) return ncclSuccess;

  ncclResult_t result = ncclSuccess;
  const CUdeviceptr addr = (CUdeviceptr)ptr;
  CUmemGenericAllocationHandle allocHandle;
  size_t mappedSize = 0;

  // Look up the handle, then immediately give back the reference just taken.
  CUCHECK(cuMemRetainAllocationHandle(&allocHandle, ptr));
  CUCHECK(cuMemRelease(allocHandle));
  // The size of the mapped range is needed for both unmap and address free.
  CUCHECK(cuMemGetAddressRange(NULL, &mappedSize, addr));
  TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zi pointer %p handle 0x%llx", mappedSize, ptr, allocHandle);
  CUCHECK(cuMemUnmap(addr, mappedSize));
  CUCHECK(cuMemRelease(allocHandle));
  CUCHECK(cuMemAddressFree(addr, mappedSize));
  return result;
}
|
||||
|
||||
#else /* CUDART_VERSION >= 12020 */
|
||||
|
||||
// Fallback for builds where CUDART_VERSION < 12020: cuMem host allocation is
// unavailable, so this only logs a warning and reports ncclInternalError.
// None of the arguments are read or written.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) {
  WARN("CUMEM Host is not supported prior to CUDA 12.2");
  return ncclInternalError;
}
|
||||
|
||||
// Fallback for builds where CUDART_VERSION < 12020: there is nothing that
// could have been allocated through the cuMem host path, so this only logs a
// warning and reports ncclInternalError. ptr is not touched.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
  WARN("CUMEM Host is not supported prior to CUDA 12.2");
  return ncclInternalError;
}
|
||||
|
||||
#endif /* CUDART_VERSION >= 12020 */
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
@@ -40,24 +120,25 @@ finish:
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
CUDACHECK(cudaFreeHost(ptr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
if (nelem > 0) {
|
||||
void* p = malloc(nelem*ncclSizeOfT<T>());
|
||||
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
return ncclSystemError;
|
||||
}
|
||||
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
|
||||
memset(p, 0, nelem*ncclSizeOfT<T>());
|
||||
*ptr = (T*)p;
|
||||
*ptr = p;
|
||||
} else {
|
||||
*ptr = NULL;
|
||||
}
|
||||
@@ -67,17 +148,17 @@ ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int li
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem < oldNelem) return ncclInternalError;
|
||||
T* oldp = *ptr;
|
||||
if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError;
|
||||
if (nelem == oldNelem) return ncclSuccess;
|
||||
|
||||
T* oldp = *ptr;
|
||||
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
|
||||
return ncclSystemError;
|
||||
}
|
||||
memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
|
||||
free(oldp);
|
||||
if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT<T>());
|
||||
if (oldp) free(oldp);
|
||||
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
|
||||
*ptr = (T*)p;
|
||||
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
|
||||
@@ -89,6 +170,40 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
#include <cuda.h>
|
||||
#include "cudawrap.h"
|
||||
|
||||
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
|
||||
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
size_t granularity = 0;
|
||||
CUmemAllocationProp prop = {};
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
int cudaDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn));
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
/* Reserve a virtual address range */
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
|
||||
/* Map the virtual address range to the physical allocation */
|
||||
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0));
|
||||
/* Now allow RW access to the newly mapped memory */
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = cudaDev;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
|
||||
TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Undo a cuMem address mapping: query the size of the range containing ptr,
// unmap it and free the reserved virtual address range. No cuMemRelease is
// issued here, so the allocation handle itself is left alone.
// A NULL ptr is a no-op that returns ncclSuccess.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
  if (!ptr) return ncclSuccess;

  ncclResult_t result = ncclSuccess;
  const CUdeviceptr addr = (CUdeviceptr)ptr;
  size_t rangeSize = 0;

  CUCHECK(cuMemGetAddressRange(NULL, &rangeSize, addr));
  CUCHECK(cuMemUnmap(addr, rangeSize));
  CUCHECK(cuMemAddressFree(addr, rangeSize));
  return result;
}
|
||||
|
||||
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
size_t granularity = 0;
|
||||
@@ -106,7 +221,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
@@ -154,6 +269,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Stub used when cuMem is unavailable (per its message, CUDA older than 11.3;
// the enclosing preprocessor guard is not fully visible here - confirm).
// Logs a warning and reports ncclInternalError without touching its arguments.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
||||
|
||||
// Stub used when cuMem is unavailable (per its message, CUDA older than 11.3;
// the enclosing preprocessor guard is not fully visible here - confirm).
// Logs a warning and reports ncclInternalError; ptr is not touched.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
  WARN("CUMEM not supported prior to CUDA 11.3");
  return ncclInternalError;
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
@@ -274,7 +398,8 @@ finish:
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
if (size > 0) {
|
||||
size_t page_size = sysconf(_SC_PAGESIZE);
|
||||
long page_size = sysconf(_SC_PAGESIZE);
|
||||
if (page_size < 0) return ncclSystemError;
|
||||
void* p;
|
||||
int size_aligned = ROUNDUP(size, page_size);
|
||||
int ret = posix_memalign(&p, page_size, size_aligned);
|
||||
|
||||
@@ -185,6 +185,8 @@ inline __host__ __device__ Int pow2Up(Int x) {
|
||||
|
||||
template<typename Int>
|
||||
inline __host__ __device__ Int pow2Down(Int x) {
|
||||
// True, log2Down can return -1, but we don't normally pass 0 as an argument...
|
||||
// coverity[negative_shift]
|
||||
return Int(1)<<log2Down(x);
|
||||
}
|
||||
|
||||
@@ -274,4 +276,13 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
|
||||
return u32fpDecode(x, 3);
|
||||
}
|
||||
|
||||
// Hash the first n bytes of `string` using DJB2a:
//   result = (result * 33) ^ byte, seeded with 5381.
// Exactly n bytes are read, so the buffer does not need to be NUL-terminated.
// For n <= 0 nothing is read and the seed (5381) is returned.
inline __host__ __device__ uint64_t getHash(const char* string, int n) {
  uint64_t result = 5381;
  for (int c = 0; c < n; c++) {
    // Convert through unsigned char first: plain char may be signed, in which
    // case a byte >= 0x80 would be sign-extended and flip the upper 56 bits,
    // making the hash depend on the platform's char signedness.
    const uint64_t byte = (uint64_t)(unsigned char)string[c];
    result = ((result << 5) + result) ^ byte;
  }
  return result;
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -19,8 +19,8 @@ static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Boots
|
||||
ncclResult_t bootstrapNetInit();
|
||||
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
|
||||
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
|
||||
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
|
||||
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
|
||||
ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
|
||||
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
|
||||
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
|
||||
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
|
||||
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
|
||||
|
||||
@@ -38,21 +38,17 @@
|
||||
|
||||
#include <errno.h>
|
||||
// Check system calls
|
||||
#define SYSCHECK(call, name) do { \
|
||||
#define SYSCHECK(statement, name) do { \
|
||||
int retval; \
|
||||
SYSCHECKVAL(call, name, retval); \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKVAL(call, name, retval) do { \
|
||||
SYSCHECKSYNC(call, name, retval); \
|
||||
SYSCHECKSYNC((statement), name, retval); \
|
||||
if (retval == -1) { \
|
||||
WARN("Call to " name " failed : %s", strerror(errno)); \
|
||||
WARN("Call to " name " failed: %s", strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKSYNC(call, name, retval) do { \
|
||||
retval = call; \
|
||||
#define SYSCHECKSYNC(statement, name, retval) do { \
|
||||
retval = (statement); \
|
||||
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
||||
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
|
||||
} else { \
|
||||
@@ -60,14 +56,33 @@
|
||||
} \
|
||||
} while(true)
|
||||
|
||||
#define SYSCHECKGOTO(statement, RES, label) do { \
|
||||
if ((statement) == -1) { \
|
||||
/* Print the back trace*/ \
|
||||
RES = ncclSystemError; \
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
#define SYSCHECKGOTO(statement, name, RES, label) do { \
|
||||
int retval; \
|
||||
SYSCHECKSYNC((statement), name, retval); \
|
||||
if (retval == -1) { \
|
||||
WARN("Call to " name " failed: %s", strerror(errno)); \
|
||||
RES = ncclSystemError; \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
// Pthread calls don't set errno and never return EINTR.
|
||||
/* Evaluate a pthread call exactly once. A non-zero result is passed to
 * strerror() for the warning, and the enclosing function returns
 * ncclSystemError. */
#define PTHREADCHECK(statement, name) do {                     \
    const int retval = (statement);                            \
    if (retval) {                                              \
      WARN("Call to " name " failed: %s", strerror(retval));   \
      return ncclSystemError;                                  \
    }                                                          \
  } while (0)
|
||||
|
||||
/* Same check as PTHREADCHECK, but instead of returning it stores
 * ncclSystemError in RES and jumps to `label`. */
#define PTHREADCHECKGOTO(statement, name, RES, label) do {     \
    const int retval = (statement);                            \
    if (retval) {                                              \
      WARN("Call to " name " failed: %s", strerror(retval));   \
      RES = ncclSystemError;                                   \
      goto label;                                              \
    }                                                          \
  } while (0)
|
||||
|
||||
#define NEQCHECK(statement, value) do { \
|
||||
if ((statement) != value) { \
|
||||
@@ -75,7 +90,7 @@
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define NEQCHECKGOTO(statement, value, RES, label) do { \
|
||||
if ((statement) != value) { \
|
||||
@@ -84,7 +99,7 @@
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define EQCHECK(statement, value) do { \
|
||||
if ((statement) == value) { \
|
||||
@@ -92,7 +107,7 @@
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define EQCHECKGOTO(statement, value, RES, label) do { \
|
||||
if ((statement) == value) { \
|
||||
@@ -101,7 +116,7 @@
|
||||
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
// Propagate errors up
|
||||
#define NCCLCHECK(call) do { \
|
||||
@@ -111,7 +126,7 @@
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
return RES; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define NCCLCHECKGOTO(call, RES, label) do { \
|
||||
RES = call; \
|
||||
@@ -120,7 +135,7 @@
|
||||
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
|
||||
uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
@@ -130,7 +145,7 @@
|
||||
return ncclInternalError; \
|
||||
} \
|
||||
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
|
||||
} while (!(cond));
|
||||
} while (!(cond))
|
||||
|
||||
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
|
||||
uint32_t* tmpAbortFlag = (abortFlagPtr); \
|
||||
@@ -140,7 +155,7 @@
|
||||
goto label; \
|
||||
} \
|
||||
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
|
||||
} while (!(cond));
|
||||
} while (!(cond))
|
||||
|
||||
#define NCCLCHECKTHREAD(a, args) do { \
|
||||
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
|
||||
|
||||
@@ -64,4 +64,490 @@ struct ncclConnFifo {
|
||||
ssize_t size;
|
||||
void* ptr;
|
||||
};
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
template<typename T>
|
||||
class PatRSAlgorithm{
|
||||
size_t offset;
|
||||
size_t end;
|
||||
size_t count;
|
||||
int chunkCount;
|
||||
int nelem;
|
||||
int rank;
|
||||
int nranks;
|
||||
int nrPow2;
|
||||
int postFreq;
|
||||
int lastA;
|
||||
|
||||
int aggFactor;
|
||||
int as; // aggregated steps
|
||||
int a; // step inside aggregated step
|
||||
int sendSkipped; // number of skipped steps during aggregation
|
||||
int recvSkipped; // number of skipped steps during aggregation
|
||||
int phase2recv; // receive offset for phase 2
|
||||
int aggDelta;
|
||||
int scale;
|
||||
int phase;
|
||||
|
||||
__device__ __host__ int min(int a, int b) {
|
||||
return (a<b)?a:b;
|
||||
}
|
||||
|
||||
__device__ __host__ int getNelem() {
|
||||
return min(chunkCount, end-offset);
|
||||
}
|
||||
|
||||
__device__ __host__ int mirrorInvert(int i, int max) {
|
||||
int ret = 0;
|
||||
for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
|
||||
if ((i&mask) == 0) ret += imask;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ int firstBitSet(int i, int max) {
|
||||
int ffs =
|
||||
#ifdef __CUDA_ARCH__
|
||||
__ffs(i);
|
||||
#else
|
||||
__builtin_ffs(i);
|
||||
#endif
|
||||
return ffs ? ffs-1 : max;
|
||||
}
|
||||
|
||||
__device__ __host__ void resetA() {
|
||||
a = 0;
|
||||
sendSkipped = recvSkipped = 0;
|
||||
lastA = aggFactor;
|
||||
if (phase >= 2) lastA /= 2*scale;
|
||||
}
|
||||
|
||||
__device__ __host__ void reset() {
|
||||
nelem = getNelem();
|
||||
phase = 0;
|
||||
scale = 1;
|
||||
phase2recv = 0;
|
||||
as = aggDelta - 1;
|
||||
resetA();
|
||||
}
|
||||
|
||||
__device__ __host__ int nBitsSet(int i) {
|
||||
int nbits =
|
||||
#ifdef __CUDA_ARCH__
|
||||
__popc(i);
|
||||
#else
|
||||
__builtin_popcount(i);
|
||||
#endif
|
||||
return nbits;
|
||||
}
|
||||
|
||||
// Return 1 when only upper bits are set. For example, if nrpow2==16 we'll return 1 for 8, 12, 14, 15.
|
||||
// A number being in the form of 1111000 implies that the complementary is 0000111 meaning it's a power of 2 minus 1.
|
||||
__device__ __host__ int newPeer(int i, int pow2) {
|
||||
//printf("New peer %d/%d -> %d\n", i, pow2, nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0);
|
||||
return nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0;
|
||||
}
|
||||
|
||||
public:
|
||||
__device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
||||
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
||||
|
||||
aggFactor = 1;
|
||||
size_t channelSize = end-offset;
|
||||
while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
|
||||
aggFactor *= 2;
|
||||
aggDelta /= 2;
|
||||
}
|
||||
postFreq = aggFactor;
|
||||
int d = stepDepth;
|
||||
while (d > 1 && aggFactor < nranks/2) {
|
||||
d /= 2;
|
||||
aggFactor *= 2;
|
||||
aggDelta /= 2;
|
||||
}
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
__device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
|
||||
restart:
|
||||
last = 0;
|
||||
nelemOut = nelem;
|
||||
outIx = offset;
|
||||
int skip = 0;
|
||||
//printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
|
||||
if (phase == 0) {
|
||||
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
int sendDataRank = (rank + s) % nranks;
|
||||
inpIx = sendDataRank * count + offset;
|
||||
recvDim = -1;
|
||||
sendDim = 0;
|
||||
outIx = 0;
|
||||
recvOffset = -1;
|
||||
sendOffset = ((a - sendSkipped)%postFreq) * nelem;
|
||||
sendStepOffset = 0;
|
||||
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
postSend = 1;
|
||||
} else {
|
||||
postSend = 0;
|
||||
}
|
||||
postRecv = 0;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 1) {
|
||||
int s = mirrorInvert(a, lastA)*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = firstBitSet(s, nrPow2);
|
||||
sendOffset = ((a - sendSkipped)%postFreq)*nelem;
|
||||
recvOffset = ((a - recvSkipped)%postFreq)*nelem;
|
||||
postSend = 0;
|
||||
if (recvDim == 0) {
|
||||
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
|
||||
sendStepOffset = 0;
|
||||
} else {
|
||||
sendStepOffset = (a - sendSkipped)/postFreq;
|
||||
}
|
||||
if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
|
||||
postRecv = 1;
|
||||
} else {
|
||||
postRecv = 0;
|
||||
}
|
||||
s -= (1<<recvDim);
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (sendDim == -1) {
|
||||
sendOffset = -1;
|
||||
sendStepOffset = 0;
|
||||
} else if (as - (1<<recvDim) == 0) {
|
||||
if (newPeer(a, aggFactor)) sendSkipped = a;
|
||||
int foffset = a - sendSkipped;
|
||||
sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
|
||||
sendOffset = (foffset%postFreq)*nelem;
|
||||
}
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (skip || recvDim == -1) recvSkipped++;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
as--;
|
||||
phase = as % 2 == 1 ? 0 : 1;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 2) {
|
||||
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
|
||||
postRecv = 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = 0;
|
||||
postSend = a == lastA-1 ? 1 : 0;
|
||||
s -= 1;
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
skip = 0;
|
||||
} else if (!skip) {
|
||||
int foffset = phase2recv;
|
||||
phase2recv++;
|
||||
postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
recvOffset = (foffset%postFreq) * nelem;
|
||||
}
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
int foffset = a - sendSkipped;
|
||||
postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
|
||||
sendStepOffset = 0;
|
||||
sendOffset = (foffset%postFreq) * nelem;
|
||||
if (skip || sendDim == -1) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
phase = 3;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 3) {
|
||||
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
|
||||
postRecv = a == lastA-1 ? 1 : 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
recvDim = firstBitSet(s, nrPow2);
|
||||
postSend = 0;
|
||||
s -= (1<<recvDim);
|
||||
int foffset = a - recvSkipped;
|
||||
postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
|
||||
recvOffset = (foffset%postFreq) * nelem;
|
||||
int recvDataRank = (rank + nranks + s) % nranks;
|
||||
inpIx = recvDataRank * count + offset;
|
||||
sendDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
if (s < nranks && skip) {
|
||||
recvDim = -1;
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
|
||||
foffset = a - sendSkipped;
|
||||
sendStepOffset = foffset / postFreq; // Accumulate on next steps
|
||||
sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
|
||||
if (skip || recvDim == -1) recvSkipped++;
|
||||
if (skip) sendSkipped++;
|
||||
if (++a == lastA) {
|
||||
scale *= 2;
|
||||
phase = scale < aggFactor ? 2 : 4;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 4) {
|
||||
recvDim = 0;
|
||||
sendDim = -1;
|
||||
inpIx = rank * count + offset;
|
||||
recvOffset = (phase2recv%postFreq) * nelem;
|
||||
sendStepOffset = 0;
|
||||
sendOffset = -1;
|
||||
postRecv = 1;
|
||||
postSend = 0;
|
||||
offset += chunkCount;
|
||||
if (offset >= end) {
|
||||
last = 1;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
return;
|
||||
}
|
||||
goto restart;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class PatAGAlgorithm{
|
||||
size_t offset;
|
||||
size_t end;
|
||||
size_t count;
|
||||
int chunkCount;
|
||||
int nelem;
|
||||
int rank;
|
||||
int nranks;
|
||||
int nrPow2;
|
||||
int postFreq;
|
||||
int lastA;
|
||||
|
||||
int aggFactor;
|
||||
int as; // aggregated steps
|
||||
int a; // step inside aggregated step
|
||||
int aggDelta;
|
||||
|
||||
int scale;
|
||||
|
||||
int phase;
|
||||
|
||||
// AS computation
|
||||
int asDim;
|
||||
int v;
|
||||
int bitCount[32];
|
||||
int bitZeroStep[32];
|
||||
|
||||
__device__ __host__ int min(int a, int b) {
|
||||
return (a<b)?a:b;
|
||||
}
|
||||
|
||||
__device__ __host__ int getNelem() {
|
||||
return min(chunkCount, end-offset);
|
||||
}
|
||||
|
||||
__device__ __host__ int mirror(int i, int max) {
|
||||
int ret = 0;
|
||||
for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
|
||||
if ((i&mask)) ret += imask;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
__device__ __host__ int firstBitSet(int i, int max) {
|
||||
int ffs =
|
||||
#ifdef __CUDA_ARCH__
|
||||
__ffs(i);
|
||||
#else
|
||||
__builtin_ffs(i);
|
||||
#endif
|
||||
return ffs ? ffs-1 : max;
|
||||
}
|
||||
|
||||
__device__ __host__ void resetA() {
|
||||
a = 0;
|
||||
lastA = aggFactor;
|
||||
if (phase >= 2) lastA /= 2*scale;
|
||||
}
|
||||
|
||||
__device__ __host__ void reset() {
|
||||
nelem = getNelem();
|
||||
scale = aggFactor/2;
|
||||
phase = scale ? 2 : 1;
|
||||
v = 0;
|
||||
for (int i = 0; i<asDim; i++) {
|
||||
bitCount[i] = asDim-i;
|
||||
bitZeroStep[i] = 1;
|
||||
}
|
||||
as = nextAs();
|
||||
resetA();
|
||||
}
|
||||
|
||||
__device__ __host__ int nextAs() {
|
||||
for (int d=0; d<asDim; d++) {
|
||||
int p = 1<<d;
|
||||
bitCount[d]--;
|
||||
if (bitCount[d] == 0) {
|
||||
v ^= p;
|
||||
bitCount[d] = p;
|
||||
if ((v&p) == 0) {
|
||||
bitCount[d] += firstBitSet(bitZeroStep[d], asDim) - 1;
|
||||
if (bitCount[d] == 0) {
|
||||
v ^= p;
|
||||
bitCount[d] = p;
|
||||
}
|
||||
bitZeroStep[d]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
__device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
|
||||
offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
|
||||
aggDelta = nrPow2 = (1<<log2Up(nranks));
|
||||
|
||||
aggFactor = 1;
|
||||
size_t channelSize = end-offset;
|
||||
while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
|
||||
aggFactor *= 2;
|
||||
aggDelta /= 2;
|
||||
}
|
||||
postFreq = aggFactor;
|
||||
int d = stepDepth;
|
||||
while (d > 1 && aggFactor < nranks/2) {
|
||||
d /= 2;
|
||||
aggFactor *= 2;
|
||||
aggDelta /= 2;
|
||||
}
|
||||
//printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);
|
||||
|
||||
asDim = log2Up(aggDelta);
|
||||
reset();
|
||||
}
|
||||
|
||||
__device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
|
||||
restart:
|
||||
//printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
|
||||
last = 0;
|
||||
nelemOut = nelem;
|
||||
inpIx = offset;
|
||||
int skip = 0;
|
||||
if (phase == 0) {
|
||||
int s = a*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
|
||||
int recvDataRank = (rank + s) % nranks;
|
||||
outIx = recvDataRank * count + offset;
|
||||
sendDim = -1;
|
||||
recvDim = 0;
|
||||
inpIx = 0;
|
||||
sendOffset = -1;
|
||||
recvOffset = (a % postFreq) * nelem;
|
||||
recvStepOffset = 0;
|
||||
postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
postSend = 0;
|
||||
a++;
|
||||
if (nextSkip) {
|
||||
as = nextAs();
|
||||
if (as == aggDelta/2) {
|
||||
offset += chunkCount;
|
||||
if (offset >= end) {
|
||||
last = 1;
|
||||
} else {
|
||||
reset();
|
||||
}
|
||||
return;
|
||||
}
|
||||
phase = 1;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 1) {
|
||||
int s = a*aggDelta + as;
|
||||
if (s >= nranks) skip = 1;
|
||||
sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<sendDim);
|
||||
int sendDataRank = (rank + nranks + s) % nranks;
|
||||
outIx = sendDataRank * count + offset;
|
||||
recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
sendOffset = recvOffset = (a % postFreq) * nelem;
|
||||
postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
|
||||
postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
|
||||
recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
|
||||
if (recvDim == -1) {
|
||||
recvOffset = -1;
|
||||
postRecv = 0;
|
||||
} else if (as - (1<<sendDim) == 0) {
|
||||
int foffset = (a*aggDelta) >> (recvDim+1);
|
||||
recvOffset = (foffset%postFreq)*nelem;
|
||||
postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
|
||||
recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
|
||||
}
|
||||
if (s < nranks && sendDim == 0 && skip) {
|
||||
// Don't forget to receive at least once even if we don't send afterwards
|
||||
sendDim = -1;
|
||||
sendOffset = -1;
|
||||
postSend = 0;
|
||||
skip = 0;
|
||||
}
|
||||
if (++a == lastA) {
|
||||
if (as % 2 == 1) {
|
||||
phase = 0;
|
||||
} else {
|
||||
as = nextAs();
|
||||
}
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
} else if (phase == 2) {
|
||||
int s = (2*a+1)*scale*aggDelta;
|
||||
postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
|
||||
postRecv = 0;
|
||||
if (s >= nranks) skip = 1;
|
||||
sendDim = firstBitSet(s, nrPow2);
|
||||
s -= (1<<sendDim);
|
||||
sendOffset = (a%postFreq) * nelem;
|
||||
recvStepOffset = a / postFreq;
|
||||
int sendDataRank = (rank + nranks + s) % nranks;
|
||||
outIx = sendDataRank * count + offset;
|
||||
recvDim = s ? firstBitSet(s, nrPow2) : -1;
|
||||
s -= (1<<recvDim);
|
||||
if (recvDim == -1) {
|
||||
recvOffset = -1;
|
||||
} else {
|
||||
int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
|
||||
recvOffset = (foffset%postFreq)*nelem;
|
||||
recvStepOffset = foffset / postFreq;
|
||||
}
|
||||
if (++a == lastA) {
|
||||
scale /= 2;
|
||||
phase = scale ? 2 : 1;
|
||||
resetA();
|
||||
}
|
||||
if (skip == 0) return;
|
||||
}
|
||||
goto restart;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "nccl_net.h"
|
||||
#include "register.h"
|
||||
#include "graph.h"
|
||||
#include "profiler.h"
|
||||
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
@@ -104,6 +105,11 @@ struct ncclCommCallback {
|
||||
struct ncclCommCallback* next;
|
||||
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
|
||||
};
|
||||
struct ncclCommEventCallback {
|
||||
struct ncclCommEventCallback* next;
|
||||
cudaEvent_t event;
|
||||
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb);
|
||||
};
|
||||
|
||||
struct ncclSharedResources {
|
||||
int refCount;
|
||||
@@ -173,6 +179,54 @@ struct ncclCollnetHandleList {
|
||||
struct ncclProxyConnector* proxyconn;
|
||||
};
|
||||
|
||||
struct ncclTaskColl {
|
||||
struct ncclTaskColl* next;
|
||||
ncclFunc_t func;
|
||||
void const* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
ncclRedOp_t opHost;
|
||||
struct ncclDevRedOpFull opDev;
|
||||
int chunkSteps, sliceSteps;
|
||||
// Computed later:
|
||||
size_t trafficBytes;
|
||||
int32_t nMaxChannels:8;
|
||||
int32_t nWarps:8;
|
||||
int32_t algorithm:8, protocol:8;
|
||||
uint32_t isCollnet:1, isNvls:1;
|
||||
uint32_t devFuncId:30;
|
||||
enum ncclRegBufferType regBufType;
|
||||
// number of elements in planner->ipcMemQueue associated with this collective
|
||||
int nCleanupQueueElts;
|
||||
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
// index for IPC record lookup
|
||||
uintptr_t sendbuffOffset;
|
||||
uintptr_t recvbuffOffset;
|
||||
uintptr_t* sendbuffRmtAddrs;
|
||||
uintptr_t* recvbuffRmtAddrs;
|
||||
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
struct ncclTaskP2p* next;
|
||||
ncclFunc_t func;
|
||||
void* buff;
|
||||
size_t count;
|
||||
ncclDataType_t datatype;
|
||||
int root;
|
||||
size_t bytes;
|
||||
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
void* eventHandle;
|
||||
};
|
||||
|
||||
struct ncclKernelPlan {
|
||||
// A kernel plan is also a callback that reclaims itself. Hence this must
|
||||
// be the first member.
|
||||
@@ -198,40 +252,12 @@ struct ncclKernelPlan {
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
|
||||
void* workBufPersistent;
|
||||
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> p2pTaskQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclTaskColl {
|
||||
struct ncclTaskColl* next;
|
||||
ncclFunc_t func;
|
||||
void const* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
ncclRedOp_t opHost;
|
||||
struct ncclDevRedOpFull opDev;
|
||||
int chunkSteps, sliceSteps;
|
||||
// Computed later:
|
||||
size_t trafficBytes;
|
||||
int32_t nMaxChannels:8;
|
||||
int32_t nWarps:8;
|
||||
int32_t algorithm:8, protocol:8;
|
||||
uint32_t isCollnet:1, isNvls:1;
|
||||
uint32_t devFuncId:30;
|
||||
enum ncclRegBufferType regBufType;
|
||||
// number of elements in planner->ipcMemQueue associated with this collective
|
||||
int nCleanupQueueElts;
|
||||
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
struct ncclTaskP2p* next;
|
||||
void* buff;
|
||||
size_t bytes;
|
||||
// Profiler plugin
|
||||
void* groupEventHandle;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -383,6 +409,8 @@ struct ncclComm {
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
struct ncclProxyConnector* gproxyConn;
|
||||
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
|
||||
|
||||
int netPluginLoaded;
|
||||
ncclNet_t* ncclNet;
|
||||
@@ -395,10 +423,12 @@ struct ncclComm {
|
||||
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
|
||||
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
|
||||
bool runtimeConn; // if dynamic connection is supported
|
||||
bool directMode;
|
||||
int cuMemSupport;
|
||||
|
||||
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
|
||||
|
||||
const char* commName;
|
||||
uint64_t commHash;
|
||||
int rank; // my rank in the communicator
|
||||
int nRanks; // number of GPUs in communicator
|
||||
@@ -504,7 +534,7 @@ struct ncclComm {
|
||||
int collNetSupport;
|
||||
bool collNetRegSupport;
|
||||
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
|
||||
int intraHighestTransportType;
|
||||
bool intraNodeP2pSupport;
|
||||
int* collNetHeads;
|
||||
int collNetHeadsNum;
|
||||
int* collNetDenseToUserRank;
|
||||
@@ -519,6 +549,8 @@ struct ncclComm {
|
||||
struct ncclNvlsSharedRes* nvlsResources;
|
||||
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclTaskColl;
|
||||
struct ncclMemoryPool memPool_ncclTaskP2p;
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
|
||||
@@ -532,6 +564,13 @@ struct ncclComm {
|
||||
|
||||
struct ncclKernelPlanner planner;
|
||||
|
||||
cudaMemPool_t memPool;
|
||||
// Queue of events and associated callbacks for cleaning up asynchronous work.
|
||||
// Using this is preferable to using CUDA host callbacks because host callbacks
|
||||
// won't allow the work following the callback to run until the callback completes,
|
||||
// which comes at expense to perf.
|
||||
struct ncclIntruQueue<struct ncclCommEventCallback, &ncclCommEventCallback::next> eventCallbackQueue;
|
||||
|
||||
// user-created reduction ops
|
||||
int userRedOpCapacity, userRedOpFreeHead;
|
||||
ncclUserRedOp *userRedOps;
|
||||
@@ -553,6 +592,11 @@ struct ncclComm {
|
||||
int tunerPluginLoaded;
|
||||
ncclTuner_t* tuner;
|
||||
void *tunerContext;
|
||||
|
||||
// Profiler plugin
|
||||
void* profilerContext;
|
||||
uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
|
||||
|
||||
// buffer registration cache
|
||||
struct ncclRegCache regCache;
|
||||
uint64_t endMagic;
|
||||
@@ -583,6 +627,27 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Process comm->eventCallbackQueue from the head: synchronize on the head
// entry's CUDA event, dequeue the entry, then run its callback.
//
// The thread's stream-capture mode is exchanged with cudaStreamCaptureModeRelaxed
// for the duration of the loop and exchanged back before returning.
//
// Returns ncclSuccess when the queue was drained (or the head event reported
// cudaErrorNotReady). Otherwise returns the first error produced by a
// callback or by the event synchronization; processing stops at that entry,
// which has already been removed from the queue.
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  while (true) {
    struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
    if (cb == nullptr) break;
    cudaError_t ok = cudaEventSynchronize(cb->event);
    if (ok == cudaErrorNotReady) break;
    ncclIntruQueueDequeue(&comm->eventCallbackQueue);
    if (ok == cudaSuccess) {
      NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
    } else {
      CUDACHECKGOTO(ok, result, finish);
    }
  }
finish:
  // `mode` now holds the capture mode that was active on entry; swap it back.
  cudaThreadExchangeStreamCaptureMode(&mode);
  // Propagate the failure recorded by NCCLCHECKGOTO/CUDACHECKGOTO instead of
  // unconditionally reporting success.
  return result;
}
|
||||
|
||||
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
|
||||
int phase = comm->intraBarrierPhase;
|
||||
if (comm->intraRanks == 1) {
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
// Is cuMem API usage enabled
|
||||
extern int ncclCuMemEnable();
|
||||
extern int ncclCuMemHostEnable();
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
@@ -96,6 +97,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
|
||||
@@ -128,6 +128,8 @@ struct ncclConnInfo {
|
||||
};
|
||||
|
||||
struct ncclProxyConnector {
|
||||
bool initialized;
|
||||
int rank;
|
||||
int tpRank;
|
||||
int tpLocalRank;
|
||||
int sameProcess;
|
||||
@@ -141,6 +143,8 @@ struct ncclConnector {
|
||||
struct ncclTransportComm* transportComm;
|
||||
void* transportResources;
|
||||
struct ncclConnInfo conn;
|
||||
int sendMemSameProcess;
|
||||
int recvMemSameProcess;
|
||||
};
|
||||
|
||||
struct ncclRing {
|
||||
@@ -225,6 +229,7 @@ struct alignas(16) ncclDevWorkP2p {
|
||||
|
||||
uint8_t sendProtoLL:1, recvProtoLL:1;
|
||||
uint8_t sendRegistered:1, recvRegistered:1;
|
||||
uint8_t sendIpcReg:1, recvIpcReg:1;
|
||||
};
|
||||
|
||||
// Compute the subset of the data transfer corresponding to the given part index.
|
||||
@@ -266,6 +271,10 @@ struct alignas(16) ncclDevWorkColl {
|
||||
uint32_t root;
|
||||
void* recvbuff;
|
||||
void* sendbuff;
|
||||
uintptr_t sendbuffOffset;
|
||||
uintptr_t recvbuffOffset;
|
||||
uintptr_t* sendbuffRmtAddrs;
|
||||
uintptr_t* recvbuffRmtAddrs;
|
||||
union {
|
||||
// Continuous-byte-distribution scheduling. The lo and hi channels are of
|
||||
// different size than the channels in the middle.
|
||||
@@ -384,6 +393,7 @@ struct ncclDevComm {
|
||||
int nNodes;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
int p2pChunkSize;
|
||||
int isNvlink;
|
||||
|
||||
// Work fifo return credits
|
||||
uint32_t* workConsumed/*[MAXCHANNELS]*/;
|
||||
@@ -395,6 +405,7 @@ struct ncclDevComm {
|
||||
|
||||
// Channels, device side
|
||||
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
|
||||
int* rankToLocalRank;
|
||||
};
|
||||
|
||||
struct alignas(16) ncclDevCommAndChannels {
|
||||
@@ -539,11 +550,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
|
||||
if (coll == ncclFuncSendRecv) break;
|
||||
row += 1;
|
||||
|
||||
int nAlgos = 3;
|
||||
int nAlgos = 4;
|
||||
if (coll == ncclFuncAllGather) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 2;
|
||||
algo == NCCL_ALGO_NVLS ? 2 :
|
||||
/*algo == NCCL_ALGO_PAT*/ 3;
|
||||
row += algo1*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
@@ -556,7 +568,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
|
||||
}
|
||||
row += nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
nAlgos = NCCL_NUM_ALGORITHMS;
|
||||
nAlgos = 6;
|
||||
if (coll == ncclFuncAllReduce) {
|
||||
row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
@@ -570,11 +582,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
|
||||
}
|
||||
row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
|
||||
|
||||
nAlgos = 3;
|
||||
nAlgos = 4;
|
||||
if (coll == ncclFuncReduceScatter) {
|
||||
int algo1 = algo == NCCL_ALGO_RING ? 0 :
|
||||
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
|
||||
/*algo == NCCL_ALGO_NVLS*/ 2;
|
||||
algo == NCCL_ALGO_NVLS ? 2 :
|
||||
/*algo == NCCL_ALGO_PAT*/ 3;
|
||||
row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -33,13 +33,14 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
|
||||
|
||||
// Query topology
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
|
||||
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
|
||||
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
|
||||
|
||||
// Find CPU affinity
|
||||
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
|
||||
@@ -76,7 +77,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
|
||||
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
|
||||
struct ncclTopoGraph {
|
||||
// Input / output
|
||||
int id; // ring : 0, tree : 1, collnet : 2
|
||||
int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4
|
||||
int pattern;
|
||||
int crossNic;
|
||||
int collNet;
|
||||
|
||||
@@ -50,7 +50,7 @@ typedef enum {
|
||||
ncclNumFuncs = 8
|
||||
} ncclFunc_t;
|
||||
|
||||
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
|
||||
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
|
||||
#define NCCL_ALGO_UNDEF -1
|
||||
#define NCCL_ALGO_TREE 0
|
||||
#define NCCL_ALGO_RING 1
|
||||
@@ -58,6 +58,7 @@ typedef enum {
|
||||
#define NCCL_ALGO_COLLNET_CHAIN 3
|
||||
#define NCCL_ALGO_NVLS 4
|
||||
#define NCCL_ALGO_NVLS_TREE 5
|
||||
#define NCCL_ALGO_PAT 6
|
||||
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_UNDEF -1
|
||||
|
||||
@@ -0,0 +1,150 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
enum {
|
||||
ncclProfileGroup = (1 << 0), // group event type
|
||||
ncclProfileColl = (1 << 1), // host collective call event type
|
||||
ncclProfileP2p = (1 << 2), // host point-to-point call event type
|
||||
ncclProfileProxyOp = (1 << 3), // proxy operation event type
|
||||
ncclProfileProxyStep = (1 << 4), // proxy step event type
|
||||
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
|
||||
ncclProfileNumEvents = ( 6),
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
uint8_t type; // event type descriptor: ncclProfileColl, ...
|
||||
void* parentObj; // pointer to the profiler parent object (for coll is the group)
|
||||
int rank; // originating rank
|
||||
union {
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint64_t seqNumber;
|
||||
uint8_t func;
|
||||
void const* sendBuff;
|
||||
void* recvBuff;
|
||||
size_t count;
|
||||
int root;
|
||||
uint8_t datatype;
|
||||
uint32_t op;
|
||||
size_t trafficBytes;
|
||||
uint8_t nMaxChannels;
|
||||
uint8_t nWarps;
|
||||
uint8_t algo;
|
||||
uint8_t proto;
|
||||
int isCollnet;
|
||||
int isNvls;
|
||||
} coll;
|
||||
|
||||
struct {
|
||||
const char* name;
|
||||
uint64_t commHash;
|
||||
uint8_t func;
|
||||
void* buff;
|
||||
uint8_t datatype;
|
||||
size_t count;
|
||||
int peer;
|
||||
} p2p;
|
||||
|
||||
struct {
|
||||
pid_t pid; // pid of the originating process
|
||||
uint8_t channelId; // channel id for this proxy operation
|
||||
int peer; // remote rank for send/recv
|
||||
int nSteps; // number of steps for this proxy operation
|
||||
int chunkSize; // amount of data transferred by this proxy operation
|
||||
int isSend;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int step;
|
||||
} proxyStep;
|
||||
};
|
||||
} ncclProfilerEventDescr_v1_t;
|
||||
|
||||
typedef enum {
|
||||
ncclProfilerProxyOpSendPosted,
|
||||
ncclProfilerProxyOpSendRemFifoWait,
|
||||
ncclProfilerProxyOpSendTransmitted,
|
||||
ncclProfilerProxyOpSendDone,
|
||||
ncclProfilerProxyOpRecvPosted,
|
||||
ncclProfilerProxyOpRecvReceived,
|
||||
ncclProfilerProxyOpRecvTransmitted,
|
||||
ncclProfilerProxyOpRecvDone,
|
||||
|
||||
/* Legacy proxy profiler states */
|
||||
ncclProfilerProxyStepSendGPUWait,
|
||||
ncclProfilerProxyStepSendWait,
|
||||
ncclProfilerProxyStepRecvWait,
|
||||
ncclProfilerProxyStepRecvFlushWait,
|
||||
ncclProfilerProxyStepRecvGPUWait,
|
||||
|
||||
/* Legacy proxy control states */
|
||||
ncclProfilerProxyCtrlIdle,
|
||||
ncclProfilerProxyCtrlActive,
|
||||
ncclProfilerProxyCtrlSleep,
|
||||
ncclProfilerProxyCtrlWakeup,
|
||||
ncclProfilerProxyCtrlAppend,
|
||||
ncclProfilerProxyCtrlAppendEnd,
|
||||
} ncclProfilerEventState_v1_t;
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
size_t transSize;
|
||||
int steps;
|
||||
} proxyOp;
|
||||
|
||||
struct {
|
||||
int appendedProxyOps;
|
||||
} proxyCtrl;
|
||||
} ncclProfilerEventStateArgs_v1_t;
|
||||
|
||||
typedef struct {
|
||||
const char* name;
|
||||
|
||||
// init - initialize the profiler plugin
|
||||
// Input
|
||||
// - context : opaque profiler context object for separating profiler behavior across comms
|
||||
// Output
|
||||
// - eActivationMask: bitmask of active events set by the plugin
|
||||
ncclResult_t (*init)(void** context, int* eActivationMask);
|
||||
|
||||
// startEvent - initialize and start a new event for the supplied event descriptor inside the event set
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
// - eDescr : pointer to ncclProfilerEventDescr_t object
|
||||
// Output
|
||||
// - eHandle: return event handle for supplied event descriptor object
|
||||
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
|
||||
|
||||
// stopEvent - stop/finalize an event inside an event set
|
||||
// Input
|
||||
// - eHandle: handle to event object
|
||||
ncclResult_t (*stopEvent)(void* eHandle);
|
||||
|
||||
// recordEventState - record event state transitions and event attribute updates
|
||||
// Input
|
||||
// - eHandle : handle to event object created through startEvent
|
||||
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
|
||||
// - eState : event state transition
|
||||
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
|
||||
|
||||
// finalize - finalize the profiler plugin
|
||||
// Input
|
||||
// - context: opaque profiler context object
|
||||
ncclResult_t (*finalize)(void* context);
|
||||
} ncclProfiler_v1_t;
|
||||
|
||||
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
|
||||
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
|
||||
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
|
||||
typedef ncclProfiler_v1_t ncclProfiler_t;
|
||||
|
||||
#endif
|
||||
@@ -16,20 +16,23 @@
|
||||
#endif
|
||||
|
||||
// Define all NCCL-provided static schema IDs here (avoid duplicates).
|
||||
#define NVTX_SID_CommInitRank 0
|
||||
#define NVTX_SID_CommInitAll 1
|
||||
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_AllGather 4
|
||||
#define NVTX_SID_AllReduce 5
|
||||
#define NVTX_SID_Broadcast 6
|
||||
#define NVTX_SID_ReduceScatter 7
|
||||
#define NVTX_SID_Reduce 8
|
||||
#define NVTX_SID_Send 9
|
||||
#define NVTX_SID_Recv 10
|
||||
#define NVTX_SID_CommInitRank 0
|
||||
#define NVTX_SID_CommInitAll 1
|
||||
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_AllGather 4
|
||||
#define NVTX_SID_AllReduce 5
|
||||
#define NVTX_SID_Broadcast 6
|
||||
#define NVTX_SID_ReduceScatter 7
|
||||
#define NVTX_SID_Reduce 8
|
||||
#define NVTX_SID_Send 9
|
||||
#define NVTX_SID_Recv 10
|
||||
#define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank
|
||||
#define NVTX_SID_CommSplit 13
|
||||
|
||||
// Define static schema ID for the reduction operation.
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
|
||||
|
||||
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
|
||||
|
||||
|
||||
@@ -34,11 +34,36 @@ typedef union {
|
||||
// Legacy CUDA IPC
|
||||
cudaIpcMemHandle_t devIpc;
|
||||
// cuMem API support
|
||||
ncclCuDesc cuDesc;
|
||||
struct {
|
||||
ncclCuDesc cuDesc;
|
||||
CUmemGenericAllocationHandle memHandle;
|
||||
};
|
||||
} ncclIpcDesc;
|
||||
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
|
||||
enum ncclIpcRegType {
|
||||
NCCL_IPC_SENDRECV = 0,
|
||||
NCCL_IPC_COLLECTIVE = 1
|
||||
};
|
||||
|
||||
struct ncclIpcImpInfo {
|
||||
void* rmtRegAddr;
|
||||
bool legacyIpcCap;
|
||||
uintptr_t offset;
|
||||
};
|
||||
|
||||
struct ncclIpcRegInfo {
|
||||
int peerRank;
|
||||
void* baseAddr;
|
||||
struct ncclProxyConnector* ipcProxyconn;
|
||||
struct ncclIpcImpInfo impInfo;
|
||||
};
|
||||
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
|
||||
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
|
||||
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
|
||||
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
|
||||
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
|
||||
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
|
||||
|
||||
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -4,34 +4,52 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_PROFILER_H_
|
||||
#define NCCL_PROFILER_H_
|
||||
#ifndef PROFILER_H_
|
||||
#define PROFILER_H_
|
||||
|
||||
#include "proxy.h"
|
||||
#include <cuda_runtime.h>
|
||||
#include "nccl_profiler.h"
|
||||
|
||||
enum ncclProxyProfileState {
|
||||
ncclProxyProfileBegin = 0,
|
||||
struct ncclProxyArgs;
|
||||
struct ncclKernelPlan;
|
||||
struct ncclTaskColl;
|
||||
struct ncclTaskP2p;
|
||||
struct ncclInfo;
|
||||
struct ncclComm;
|
||||
struct ncclProxyOp;
|
||||
|
||||
ncclProxyProfileSendGPUWait = 1,
|
||||
ncclProxyProfileSendWait = 2,
|
||||
// Plugin Init/Finalize Wrappers
|
||||
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
|
||||
|
||||
ncclProxyProfileRecvWait = 1,
|
||||
ncclProxyProfileRecvFlushWait = 2,
|
||||
ncclProxyProfileRecvGPUWait = 3,
|
||||
// Profiler Start/Stop Group Wrappers
|
||||
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
|
||||
|
||||
ncclProxyProfileEnd = 4,
|
||||
// Profiler Start/Stop Task Events Wrappers
|
||||
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
|
||||
|
||||
ncclProxyProfileSleep = 8,
|
||||
ncclProxyProfileWakeup = 9,
|
||||
// Proxy Op Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
|
||||
|
||||
ncclProxyProfileIdle = 16,
|
||||
ncclProxyProfileActive = 17,
|
||||
// Proxy Step Start/Stop Event Wrappers
|
||||
ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
|
||||
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
|
||||
ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
|
||||
|
||||
ncclProxyProfileAppend = 24,
|
||||
ncclProxyProfileAppendEnd = 25
|
||||
};
|
||||
// Proxy Control Start/Stop Events Wrappers
|
||||
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
|
||||
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
|
||||
|
||||
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
|
||||
void ncclProfilingDump();
|
||||
// Record Event Wrappers
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
|
||||
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
|
||||
|
||||
// Profiler utility functions
|
||||
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
#include "ipcsocket.h"
|
||||
#include "nccl_net.h"
|
||||
#include <pthread.h>
|
||||
#include "shm.h"
|
||||
#include "shmutils.h"
|
||||
#include "p2p.h"
|
||||
|
||||
typedef enum : uint8_t {
|
||||
@@ -28,6 +28,8 @@ typedef enum : uint8_t {
|
||||
ncclPatternCollnetDirect,
|
||||
ncclPatternNvls,
|
||||
ncclPatternNvlsTree,
|
||||
ncclPatternPatUp,
|
||||
ncclPatternPatDown,
|
||||
ncclPatternSend,
|
||||
ncclPatternRecv
|
||||
} ncclPattern_t;
|
||||
@@ -72,6 +74,19 @@ struct ncclProxyOp {
|
||||
|
||||
union ncclProxyOpSpecifics specifics;
|
||||
|
||||
// Profiler plugin
|
||||
union {
|
||||
struct ncclTaskColl* coll;
|
||||
struct ncclTaskP2p* p2p;
|
||||
} task;
|
||||
|
||||
int eActivationMask;
|
||||
void* taskEventHandle;
|
||||
int rank;
|
||||
int peer;
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
|
||||
@@ -100,7 +115,15 @@ struct ncclProxySubArgs {
|
||||
uint64_t done;
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
void* profilingEvents[NCCL_STEPS];
|
||||
|
||||
// Profiler plugin
|
||||
int eActivationMask;
|
||||
int rank;
|
||||
void* taskEventHandle;
|
||||
void* opEventHandle;
|
||||
void* stepEventHandles[NCCL_STEPS];
|
||||
size_t transSize;
|
||||
|
||||
void* recvRequestsCache[NCCL_STEPS];
|
||||
int recvRequestsSubCount;
|
||||
};
|
||||
@@ -129,6 +152,10 @@ struct ncclProxyArgs {
|
||||
|
||||
int idle;
|
||||
|
||||
// Profiler plugin
|
||||
pid_t pid;
|
||||
void* profilerContext;
|
||||
|
||||
// Element linking
|
||||
struct ncclProxyArgs* next;
|
||||
struct ncclProxyArgs* nextPeer;
|
||||
@@ -261,6 +288,7 @@ struct ncclProxyState {
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
uint32_t* abortFlag;
|
||||
bool directMode;
|
||||
// Service threads
|
||||
pthread_t thread;
|
||||
pthread_t threadUDS;
|
||||
@@ -281,6 +309,9 @@ struct ncclProxyState {
|
||||
// Progress thread
|
||||
struct ncclProxyProgressState progressState;
|
||||
|
||||
// Profiler plugin
|
||||
void* profilerContext;
|
||||
|
||||
// Queue of expected responses from the proxy
|
||||
struct ncclExpectedProxyResponse* expectedResponses;
|
||||
};
|
||||
@@ -332,8 +363,9 @@ enum ncclProxyMsgType {
|
||||
ncclProxyMsgAbort = 7,
|
||||
ncclProxyMsgStop = 8,
|
||||
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
|
||||
ncclProxyMsgRegister = 10,
|
||||
ncclProxyMsgDeregister = 11
|
||||
ncclProxyMsgQueryFd = 10,
|
||||
ncclProxyMsgRegister = 11,
|
||||
ncclProxyMsgDeregister = 12
|
||||
};
|
||||
|
||||
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
|
||||
@@ -347,6 +379,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
|
||||
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
|
||||
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);
|
||||
|
||||
ncclResult_t ncclProxyStop(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
|
||||
|
||||
@@ -11,7 +11,13 @@ enum {
|
||||
NVLS_REG_COMPLETE = 0x02,
|
||||
NVLS_REG_POSSIBLE = 0x04,
|
||||
NVLS_REG_NO_SUPPORT = 0x08,
|
||||
COLLNET_REG_COMPLETE = 0x10
|
||||
COLLNET_REG_COMPLETE = 0x10,
|
||||
IPC_REG_COMPLETE = 0x20
|
||||
};
|
||||
|
||||
struct ncclPeerRegIpcAddr {
|
||||
uintptr_t* devPeerRmtAddrs;
|
||||
uintptr_t* hostPeerRmtAddrs;
|
||||
};
|
||||
|
||||
struct ncclReg {
|
||||
@@ -34,7 +40,10 @@ struct ncclReg {
|
||||
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
|
||||
// collnet reg
|
||||
void* collnetHandle;
|
||||
struct ncclProxyConnector* proxyconn;
|
||||
struct ncclProxyConnector* collnetProxyconn;
|
||||
// general ipc reg
|
||||
struct ncclPeerRegIpcAddr regIpcAddrs;
|
||||
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
|
||||
};
|
||||
|
||||
struct ncclRegCache {
|
||||
|
||||
@@ -1,26 +1,37 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_SHM_H_
|
||||
#define NCCL_SHM_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
|
||||
typedef void* ncclShmHandle_t;
|
||||
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
|
||||
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
|
||||
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
|
||||
|
||||
struct ncclShmemCollBuff {
|
||||
volatile size_t *cnt[2];
|
||||
volatile void *ptr[2];
|
||||
int round;
|
||||
size_t maxTypeSize;
|
||||
struct shmLegacyIpc {
|
||||
char shmSuffix[7];
|
||||
ncclShmHandle_t handle;
|
||||
size_t shmSize;
|
||||
};
|
||||
|
||||
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
|
||||
struct shmCuIpc {
|
||||
union {
|
||||
CUmemFabricHandle handle;
|
||||
CUmemGenericAllocationHandle data;
|
||||
};
|
||||
int tpProxyRank;
|
||||
void *ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct shmIpcDesc {
|
||||
union
|
||||
{
|
||||
struct shmLegacyIpc shmli;
|
||||
struct shmCuIpc shmci;
|
||||
};
|
||||
bool legacy;
|
||||
};
|
||||
|
||||
typedef struct shmIpcDesc ncclShmIpcDesc_t;
|
||||
|
||||
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
|
||||
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
|
||||
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_SHMUTILS_H_
|
||||
#define NCCL_SHMUTILS_H_
|
||||
|
||||
#include "nccl.h"
|
||||
|
||||
typedef void* ncclShmHandle_t;
|
||||
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
|
||||
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
|
||||
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
|
||||
|
||||
struct ncclShmemCollBuff {
|
||||
volatile size_t *cnt[2];
|
||||
volatile void *ptr[2];
|
||||
int round;
|
||||
size_t maxTypeSize;
|
||||
};
|
||||
|
||||
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
|
||||
|
||||
#endif
|
||||
@@ -33,15 +33,15 @@ static double startTimes[8];
|
||||
#define TIME_START(index) do { \
|
||||
counts[index]++; \
|
||||
startTimes[index] = gettime(); \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define TIME_STOP(index) do { \
|
||||
times[index] += gettime() - startTimes[index]; \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define TIME_CANCEL(index) do { \
|
||||
counts[index]--; \
|
||||
} while (0);
|
||||
} while (0)
|
||||
|
||||
#define TIME_PRINT(name) do { \
|
||||
printf("%s stats", name); \
|
||||
@@ -50,11 +50,11 @@ static double startTimes[8];
|
||||
counts[i] = 0; \
|
||||
} \
|
||||
printf("\n"); \
|
||||
} while (0);
|
||||
} while (0)
|
||||
#else
|
||||
#define TIME_START(index) while(0);
|
||||
#define TIME_STOP(index) while(0);
|
||||
#define TIME_CANCEL(index) while(0);
|
||||
#define TIME_START(index) do {} while(0)
|
||||
#define TIME_STOP(index) do {} while(0)
|
||||
#define TIME_CANCEL(index) do {} while(0)
|
||||
#define TIME_PRINT(name)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -48,9 +48,10 @@ struct ncclPeerInfo {
|
||||
// MNNVL support
|
||||
nvmlGpuFabricInfoV_t fabricInfo;
|
||||
int cuMemSupport;
|
||||
int version;
|
||||
};
|
||||
|
||||
#define CONNECT_SIZE 128
|
||||
#define CONNECT_SIZE 256
|
||||
struct ncclConnect {
|
||||
char data[CONNECT_SIZE];
|
||||
};
|
||||
@@ -91,7 +92,6 @@ struct ncclCollNetSharedRes {
|
||||
void* resources;
|
||||
int nChannels;
|
||||
size_t buffSize;
|
||||
int intraHighestTransportType;
|
||||
};
|
||||
|
||||
struct ncclTransportComm {
|
||||
@@ -109,13 +109,14 @@ struct ncclTransportComm {
|
||||
|
||||
struct ncclTransport {
|
||||
const char name[8];
|
||||
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
|
||||
ncclResult_t (*canConnect)(int*, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
|
||||
struct ncclTransportComm send;
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
|
||||
|
||||
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
|
||||
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
|
||||
@@ -127,7 +128,7 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
|
||||
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
|
||||
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
|
||||
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
|
||||
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
|
||||
@@ -136,6 +137,7 @@ ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConne
|
||||
|
||||
ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
|
||||
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm);
|
||||
|
||||
ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
|
||||
ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
|
||||
|
||||
@@ -27,7 +27,6 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id);
|
||||
ncclResult_t getBusId(int cudaDev, int64_t *busId);
|
||||
|
||||
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
uint64_t getHash(const char* string, int n);
|
||||
uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
ncclResult_t getRandomData(void* buffer, size_t bytes);
|
||||
|
||||
+300
-122
@@ -37,7 +37,7 @@
|
||||
#endif
|
||||
|
||||
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" };
|
||||
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
|
||||
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
|
||||
|
||||
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
|
||||
@@ -101,9 +101,15 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
|
||||
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
|
||||
NCCLCHECK(ncclInit());
|
||||
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
|
||||
struct ncclBootstrapHandle handle;
|
||||
NCCLCHECK(bootstrapGetUniqueId(&handle));
|
||||
// ncclUniqueId and bootstrapHandle don't have the same size and alignment
|
||||
// reset to 0 to avoid undefined data
|
||||
memset(out, 0, sizeof(*out));
|
||||
// copy to avoid alignment mismatch
|
||||
memcpy(out, &handle, sizeof(handle));
|
||||
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
|
||||
return res;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Prevent compiler from optimizing out these operations
|
||||
@@ -147,7 +153,7 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
|
||||
}
|
||||
|
||||
static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
|
||||
CUDACHECK(cudaFreeHost(dtor->obj));
|
||||
NCCLCHECK(ncclCudaHostFree(dtor->obj));
|
||||
return ncclSuccess;
|
||||
}
|
||||
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
|
||||
@@ -180,13 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
* free all intra-process communicators; therefore, we only need to focus on local
|
||||
* resource cleanup in commFree(). */
|
||||
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
|
||||
pthread_join(comm->proxyState->thread, nullptr);
|
||||
PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
pthread_join(comm->proxyState->threadUDS, nullptr);;
|
||||
PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
|
||||
}
|
||||
}
|
||||
|
||||
CUDACHECK(cudaMemPoolDestroy(comm->memPool));
|
||||
|
||||
delete[] comm->userRedOps;
|
||||
|
||||
free(comm->connectSend);
|
||||
@@ -244,12 +252,14 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
|
||||
free(comm->topParentRanks);
|
||||
free(comm->topParentLocalRanks);
|
||||
free(comm->gproxyConn);
|
||||
|
||||
NCCLCHECK(ncclRegCleanup(comm));
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
|
||||
|
||||
commPoison(comm); // poison comm before free to avoid comm reuse.
|
||||
NCCLCHECK(ncclProfilerPluginFinalize(comm));
|
||||
NCCLCHECK(ncclNetFinalize(comm));
|
||||
NCCLCHECK(ncclNetPluginUnload(comm));
|
||||
free(comm);
|
||||
@@ -328,6 +338,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
|
||||
NCCLCHECK(ncclNetPluginLoad(comm));
|
||||
NCCLCHECK(ncclNetInit(comm));
|
||||
NCCLCHECK(ncclProfilerPluginInit(comm));
|
||||
INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
|
||||
|
||||
if (parent && parent->config.splitShare) {
|
||||
@@ -393,8 +404,28 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
}
|
||||
|
||||
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
|
||||
ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
|
||||
|
||||
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
|
||||
|
||||
do {
|
||||
cudaMemPoolProps props = {};
|
||||
props.allocType = cudaMemAllocationTypePinned;
|
||||
props.handleTypes = cudaMemHandleTypeNone;
|
||||
props.location.type = cudaMemLocationTypeDevice;
|
||||
props.location.id = comm->cudaDev;
|
||||
CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
|
||||
uint64_t releaseThreshold = ~uint64_t(0);
|
||||
CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
|
||||
} while (0);
|
||||
|
||||
ncclIntruQueueConstruct(&comm->eventCallbackQueue);
|
||||
|
||||
// setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
|
||||
comm->intraComm0 = comm;
|
||||
comm->intraRank = 0;
|
||||
comm->intraRanks = 1;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -408,12 +439,16 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, devCommAndChans);
|
||||
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
|
||||
comm->devComm = &devCommAndChans->comm;
|
||||
tmpCommAndChans.comm.rank = comm->rank;
|
||||
tmpCommAndChans.comm.nRanks = nRanks;
|
||||
tmpCommAndChans.comm.node = comm->node;
|
||||
tmpCommAndChans.comm.nNodes = comm->nNodes;
|
||||
tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
|
||||
tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
|
||||
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
|
||||
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
|
||||
}
|
||||
@@ -498,10 +533,13 @@ static void showVersion() {
|
||||
}
|
||||
}
|
||||
|
||||
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
|
||||
|
||||
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
|
||||
info->rank = comm->rank;
|
||||
info->cudaDev = comm->cudaDev;
|
||||
info->nvmlDev = comm->nvmlDev;
|
||||
NCCLCHECK(ncclGetVersion(&info->version));
|
||||
info->hostHash=getHostHash()+commHash;
|
||||
info->pidHash=getPidHash()+commHash;
|
||||
info->cuMemSupport = ncclCuMemEnable();
|
||||
@@ -534,6 +572,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
|
||||
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
|
||||
}
|
||||
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -677,7 +716,8 @@ static int checkMNNVL(struct ncclComm* comm) {
|
||||
#define TIMER_INIT_TOPO 4
|
||||
#define TIMER_INIT_GRAPHS 5
|
||||
#define TIMER_INIT_CONNECT 6
|
||||
#define TIMERS_INIT_COUNT 7
|
||||
#define TIMER_INIT_ALLOC 7
|
||||
#define TIMERS_INIT_COUNT 8
|
||||
|
||||
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
|
||||
// We use 2 AllGathers
|
||||
@@ -693,7 +733,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
|
||||
struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
|
||||
struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
|
||||
struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph };
|
||||
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
|
||||
|
||||
struct graphInfo {
|
||||
int pattern;
|
||||
@@ -722,7 +762,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
struct ncclProxyConnector proxyConn;
|
||||
int* pxnPeers = NULL;
|
||||
int *topParentLocalRanks = NULL;
|
||||
int tpProxyRank;
|
||||
|
||||
timers[TIMER_INIT_ALLGATHER] = clockNano();
|
||||
// AllGather1 - begin
|
||||
@@ -732,6 +771,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
|
||||
comm->cuMemSupport = 1;
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
|
||||
WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
|
||||
i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
|
||||
ret = ncclInvalidUsage;
|
||||
goto fail;
|
||||
}
|
||||
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
|
||||
if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
|
||||
@@ -869,7 +914,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
collNetChainGraph->maxChannels = ringGraph->nChannels;
|
||||
|
||||
memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
|
||||
collNetDirectGraph->id = 2;
|
||||
collNetDirectGraph->id = 4;
|
||||
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
|
||||
collNetDirectGraph->collNet = 1;
|
||||
collNetDirectGraph->minChannels = 1;
|
||||
@@ -1031,18 +1076,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
|
||||
comm->collNetSupport = 0;
|
||||
}
|
||||
comm->collNetRegSupport = true;
|
||||
for (int n=0; n<comm->nNodes; n++) {
|
||||
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
|
||||
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
|
||||
comm->collNetSupport = 0;
|
||||
break;
|
||||
}
|
||||
if (comm->nodeRanks[n].localRanks > 1) {
|
||||
// As long as there is more than 1 rank on any node, we need to disable collnet reg
|
||||
comm->collNetRegSupport = false;
|
||||
}
|
||||
}
|
||||
// As long as there is more than 1 rank on any node, we need to disable collnet reg
|
||||
comm->collNetRegSupport = (comm->maxLocalRanks == 1);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
|
||||
@@ -1085,6 +1120,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
comm->topParentLocalRanks = topParentLocalRanks;
|
||||
|
||||
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail);
|
||||
// Launch proxy service thread, after this, the proxy calls can be used.
|
||||
if (parent && parent->config.splitShare) {
|
||||
comm->proxyState = parent->sharedRes->proxyState;
|
||||
@@ -1092,7 +1128,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
} else {
|
||||
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
|
||||
|
||||
timers[TIMER_INIT_CONNECT] = clockNano();
|
||||
do { // Build p2p schedule
|
||||
int node = comm->node;
|
||||
@@ -1168,6 +1205,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Connect Trees
|
||||
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
|
||||
|
||||
// Connect PAT only for communicators with 1 GPU per node
|
||||
if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
|
||||
|
||||
// Setup NVLS
|
||||
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
|
||||
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
|
||||
@@ -1179,12 +1219,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
if (comm->collNetSupport > 0) {
|
||||
ncclCollNetSetup(comm, parent, graphs);
|
||||
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
|
||||
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
|
||||
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
|
||||
}
|
||||
}
|
||||
|
||||
// Connect to local net proxy
|
||||
tpProxyRank = comm->topParentRanks[comm->rank];
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
|
||||
|
||||
// Then to remote ones when using PXN
|
||||
@@ -1192,8 +1233,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
int nranks;
|
||||
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
|
||||
for (int r=0; r<nranks; r++) {
|
||||
tpProxyRank = comm->topParentRanks[pxnPeers[r]];
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
|
||||
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
|
||||
}
|
||||
}
|
||||
@@ -1286,17 +1326,20 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
|
||||
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
|
||||
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
|
||||
|
||||
#define NCCL_COMMINIT_FUNCNAME_LEN 128
|
||||
// Argument bundle for the asynchronous communicator-init job run by ncclCommInitRankFunc.
// One instance is allocated per init call and handed to ncclAsyncLaunch.
struct ncclCommInitRankAsyncJob {
|
||||
// Generic async-job header; the job pointer is cast to struct ncclAsyncJob* when launched.
struct ncclAsyncJob base;
|
||||
struct ncclComm* comm;
|
||||
struct ncclComm** newcomm;
|
||||
int cudaDev;
|
||||
// For ncclCommInitRank
|
||||
int nranks, myrank;
|
||||
ncclUniqueId commId;
|
||||
// nId is the number of entries in the commId array below.
int nranks, myrank, nId;
|
||||
// Heap copy of the caller's unique IDs (allocated in ncclCommInitRankDev, released by ncclCommInitJobFree).
ncclUniqueId* commId;
|
||||
// for ncclCommSplit
|
||||
struct ncclComm* parent;
|
||||
int color, key;
|
||||
// Name of the calling entry point (copied from the funcName argument of ncclCommInitRankDev);
// used as the prefix of the init log messages.
|
||||
char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
|
||||
};
|
||||
|
||||
struct ncclCommFinalizeAsyncJob {
|
||||
@@ -1306,30 +1349,31 @@ struct ncclCommFinalizeAsyncJob {
|
||||
|
||||
NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
|
||||
|
||||
// Per-rank (key, color) pair used by commGetSplitInfo: each parent rank fills its own
// entry and the whole array is exchanged with a single bootstrapAllGather.
typedef struct{
|
||||
// Ordering key: ranks that share a color are inserted in ascending key order.
int key;
|
||||
// Group selector: entries whose color differs from the caller's are skipped.
int color;
|
||||
} commSplitInfo;
|
||||
static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
|
||||
int* colors = NULL;
|
||||
int* keys = NULL;
|
||||
int nRanks = 0, myRank = 0;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail);
|
||||
commSplitInfo* info = NULL;
|
||||
NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail);
|
||||
|
||||
// Compute nRanks, my rank and the ranks (of the original comm) before and after me
|
||||
colors[parent->rank] = color;
|
||||
keys[parent->rank] = key;
|
||||
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail);
|
||||
info[parent->rank].color = color;
|
||||
info[parent->rank].key = key;
|
||||
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail);
|
||||
|
||||
// Negative color does not create a new comm. Return now.
|
||||
if (color == NCCL_SPLIT_NOCOLOR) goto exit;
|
||||
|
||||
memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);
|
||||
for (int i = 0; i < parent->nRanks; i++) {
|
||||
if (colors[i] != color) continue;
|
||||
if (info[i].color != color) continue;
|
||||
// Find where to insert this rank
|
||||
int insert = 0;
|
||||
while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++;
|
||||
while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++;
|
||||
// Shift ranks by one after insert
|
||||
for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
|
||||
// Insert our rank
|
||||
@@ -1345,8 +1389,7 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par
|
||||
*myRankRet = myRank;
|
||||
|
||||
exit:
|
||||
free(colors);
|
||||
free(keys);
|
||||
free(info);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
@@ -1361,7 +1404,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
int cudaDev = job->cudaDev;
|
||||
int* parentRanks = NULL;
|
||||
int cudaArch;
|
||||
uint64_t timers[TIMERS_INIT_COUNT];
|
||||
double sum_timers = 0;
|
||||
uint64_t timers[TIMERS_INIT_COUNT] = {0};
|
||||
unsigned long long commIdHash;
|
||||
|
||||
timers[TIMER_INIT_TOTAL] = clockNano();
|
||||
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
|
||||
@@ -1379,34 +1424,42 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
}
|
||||
timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
|
||||
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
|
||||
if (job->parent) {
|
||||
NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
|
||||
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
|
||||
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
|
||||
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
|
||||
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano();
|
||||
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
|
||||
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
|
||||
// obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color
|
||||
ncclUniqueId tmpId;
|
||||
memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
|
||||
snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
|
||||
comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
|
||||
NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
|
||||
// debug info, no commId was used
|
||||
commIdHash = 0;
|
||||
} else {
|
||||
timers[TIMER_INIT_ALLOC] = clockNano();
|
||||
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
|
||||
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
|
||||
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
|
||||
// obtain a unique hash using the first commId
|
||||
comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
|
||||
commIdHash = hashUniqueId(job->commId[0]);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
|
||||
NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
|
||||
}
|
||||
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
|
||||
|
||||
comm->cudaArch = cudaArch;
|
||||
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
|
||||
|
||||
if (job->parent) {
|
||||
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
|
||||
} else {
|
||||
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail);
|
||||
if (comm->tuner) {
|
||||
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
|
||||
@@ -1420,23 +1473,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
if (job->parent) {
|
||||
/* unlink child abort flag. */
|
||||
__atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
|
||||
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)",
|
||||
job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
|
||||
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
|
||||
} else {
|
||||
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
|
||||
comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
|
||||
// the name for the replay tool is ncclCommInitRank for all the variations
|
||||
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
|
||||
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
|
||||
}
|
||||
|
||||
if (job->parent) {
|
||||
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
|
||||
} else {
|
||||
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE",
|
||||
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9,
|
||||
timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9,
|
||||
(timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9);
|
||||
sum_timers = 0.0;
|
||||
for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
|
||||
sum_timers += (timers[it] / 1e9);
|
||||
INFO(NCCL_INIT | NCCL_PROFILE,
|
||||
"Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
|
||||
"connections %.2f, rest %.2f)",
|
||||
job->funcName, comm->rank, comm->nRanks,
|
||||
timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
|
||||
timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
|
||||
timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
|
||||
exit:
|
||||
if (job->newcomm) {
|
||||
/* assign it to user pointer. */
|
||||
@@ -1621,17 +1676,24 @@ fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
ncclComm_t comm = NULL;
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
const char* env = ncclGetEnv("NCCL_COMM_ID");
|
||||
if (env && myrank == 0) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
|
||||
}
|
||||
// Destructor callback for a ncclCommInitRankAsyncJob (passed to ncclAsyncLaunch by
// ncclCommInitRankDev). Releases the job's commId array, then the job itself.
// _job: pointer to the ncclCommInitRankAsyncJob to release.
static void ncclCommInitJobFree(void* _job) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)_job;
|
||||
// commId is a separate heap allocation, so it must be freed before the job that points to it.
free(job->commId);
|
||||
free(_job);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) {
|
||||
if (nId <= 0 || nId > nranks) {
|
||||
WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
ncclResult_t res = ncclSuccess;
|
||||
const char* commIdEnv = NULL;
|
||||
ncclComm_t comm = NULL;
|
||||
struct ncclCommInitRankAsyncJob* job = NULL;
|
||||
// first call ncclInit, this will setup the environment
|
||||
NCCLCHECKGOTO(ncclInit(), res, fail);
|
||||
|
||||
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
|
||||
static pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&once, showVersion);
|
||||
@@ -1659,19 +1721,37 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
|
||||
*newcomm = comm;
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
|
||||
job->nId = nId;
|
||||
job->comm = comm;
|
||||
job->nranks = nranks;
|
||||
job->commId = commId; // C++ struct assignment
|
||||
job->myrank = myrank;
|
||||
job->cudaDev = cudaDev;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
|
||||
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName);
|
||||
// need to copy the commIds to allow async commInit and to avoid alignment issues when casting from ncclUniqueId to ncclBootstrapHandle
|
||||
// ncclUniqueIds and ncclBootstrapHandle don't have the same alignment requirements.
|
||||
// Therefore the array of Ids coming from the user might not be properly aligned to be cast into a ncclBootstrapHandle
|
||||
// copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue
|
||||
NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
|
||||
memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
|
||||
|
||||
commIdEnv = ncclGetEnv("NCCL_COMM_ID");
|
||||
if (commIdEnv && myrank == 0) {
|
||||
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv);
|
||||
if (nId > 1) {
|
||||
INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId");
|
||||
job->nId = 1;
|
||||
}
|
||||
// start the bootstrap root before bootstrapping, use only the first handle
|
||||
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
|
||||
|
||||
exit:
|
||||
return ncclGroupErrCheck(res);
|
||||
fail:
|
||||
if (comm) {
|
||||
free(comm->abortFlag);
|
||||
if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev);
|
||||
if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
|
||||
free(comm->abortFlagRefCount);
|
||||
free(comm);
|
||||
}
|
||||
@@ -1703,7 +1783,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
|
||||
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
|
||||
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1713,6 +1793,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
int totalnDev;
|
||||
int *gpuFlags = NULL;
|
||||
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
|
||||
int oldDev = 0;
|
||||
|
||||
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
|
||||
@@ -1722,6 +1803,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
|
||||
(void)ncclCudaLibraryInit();
|
||||
|
||||
CUDACHECK(cudaGetDevice(&oldDev));
|
||||
NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
|
||||
if (ndev < 0) {
|
||||
WARN("Invalid device count requested : %d", ndev);
|
||||
@@ -1735,7 +1817,8 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
for (int i = 0; i < ndev; ++i) {
|
||||
/* invalid device check. */
|
||||
if (devlist[i] < 0 || devlist[i] >= totalnDev) {
|
||||
ret = ncclUnhandledCudaError;
|
||||
WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev);
|
||||
ret = ncclInvalidArgument;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
@@ -1756,13 +1839,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
|
||||
for (int i=0; i<ndev; i++) {
|
||||
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
|
||||
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
|
||||
int dev = devlist ? devlist[i] : i;
|
||||
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
|
||||
ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
|
||||
|
||||
fail:
|
||||
exit:
|
||||
cudaSetDevice(oldDev);
|
||||
free(gpuFlags);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
|
||||
@@ -1777,7 +1865,6 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
int cudaDev;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
|
||||
@@ -1785,13 +1872,46 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
|
||||
(void)ncclCudaLibraryInit();
|
||||
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
|
||||
|
||||
if (config == NULL)
|
||||
internalConfigPtr = &internalConfig;
|
||||
else
|
||||
internalConfigPtr = config;
|
||||
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
|
||||
return ret;
|
||||
fail:
|
||||
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
|
||||
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
|
||||
int cudaDev;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
|
||||
ncclConfig_t *internalConfigPtr = NULL;
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
|
||||
(void)ncclCudaLibraryInit();
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
|
||||
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
|
||||
|
||||
if (config == NULL)
|
||||
internalConfigPtr = &internalConfig;
|
||||
else
|
||||
internalConfigPtr = config;
|
||||
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
@@ -1818,13 +1938,25 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
|
||||
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
|
||||
|
||||
if (comm->initState == ncclSuccess) {
|
||||
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail);
|
||||
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail);
|
||||
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) {
|
||||
WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret);
|
||||
}
|
||||
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) {
|
||||
WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
|
||||
// And keep polling until all graphs referencing us die.
|
||||
while (comm->persistentRefs != 0) {
|
||||
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
|
||||
}
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
|
||||
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue);
|
||||
if (cb->fn(comm, cb) != ncclSuccess) {
|
||||
WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((ret = ncclProxyStop(comm)) != ncclSuccess) {
|
||||
@@ -1886,14 +2018,15 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) {
|
||||
/* launch async thread to finalize comm. */
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(ret);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
|
||||
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
|
||||
return ret;
|
||||
fail:
|
||||
free(job);
|
||||
if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
|
||||
goto exit;
|
||||
}
|
||||
@@ -1940,13 +2073,15 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
|
||||
nextIntraComm = nextIntraComm->intraNext;
|
||||
|
||||
if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
|
||||
// We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK.
|
||||
// coverity[pass_freed_arg]
|
||||
WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
|
||||
@@ -1975,12 +2110,11 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
NCCLCHECK(ncclCommEnsureReady(comm));
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
|
||||
|
||||
exit:
|
||||
return res;
|
||||
fail:
|
||||
free(job);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -1991,15 +2125,6 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
|
||||
struct ncclCommFinalizeAsyncJob *job = NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
|
||||
|
||||
// Ask anything that might still be running on the device to quit
|
||||
if (comm->childAbortFlag != nullptr) {
|
||||
__atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
|
||||
@@ -2010,30 +2135,61 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
|
||||
comm->destroyFlag = 1;
|
||||
/* init thread must be joined before we destroy the comm,
|
||||
* and we should ignore the init error here. */
|
||||
ncclCommEnsureReady(comm);
|
||||
(void)ncclCommEnsureReady(comm);
|
||||
|
||||
// once the comm is ready, we can access ranks etc
|
||||
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
|
||||
struct ncclCommFinalizeAsyncJob *job = NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
|
||||
job->comm = comm;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
|
||||
|
||||
exit:
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
free(job);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
struct NvtxParamsCommSplit {
|
||||
int rank;
|
||||
int nranks;
|
||||
int cudaDev;
|
||||
int color;
|
||||
int key;
|
||||
};
|
||||
constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = {
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
|
||||
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
|
||||
};
|
||||
|
||||
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
|
||||
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
|
||||
struct ncclCommInitRankAsyncJob *job = NULL;
|
||||
struct ncclComm* childComm = NCCL_COMM_NULL;
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
|
||||
NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
|
||||
|
||||
int oldDev;
|
||||
CUDACHECK(cudaGetDevice(&oldDev));
|
||||
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
|
||||
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
|
||||
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
|
||||
|
||||
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail);
|
||||
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
|
||||
*newcomm = NCCL_COMM_NULL;
|
||||
if (color == NCCL_SPLIT_NOCOLOR) {
|
||||
@@ -2073,10 +2229,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
|
||||
job->color = color;
|
||||
job->key = key;
|
||||
job->cudaDev = comm->cudaDev;
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
|
||||
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__);
|
||||
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
|
||||
|
||||
exit:
|
||||
ncclGroupErrCheck(res);
|
||||
cudaSetDevice(oldDev);
|
||||
(void)ncclGroupErrCheck(res);
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
return res;
|
||||
fail:
|
||||
@@ -2179,7 +2337,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
CUmemGenericAllocationHandle handle;
|
||||
int cudaDev;
|
||||
int flag = 0;
|
||||
int flag;
|
||||
int dcnt;
|
||||
int mcSupport = 0;
|
||||
|
||||
@@ -2193,12 +2351,18 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
|
||||
|
||||
if (mcSupport) {
|
||||
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
// Query device to see if FABRIC handle support is available
|
||||
flag = 0;
|
||||
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
|
||||
if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
memprop.requestedHandleTypes = ncclCuMemHandleType;
|
||||
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
|
||||
memprop.location.id = currentDev;
|
||||
// Query device to see if RDMA support is available
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
|
||||
flag = 0;
|
||||
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
|
||||
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
|
||||
@@ -2207,14 +2371,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
mcprop.size = size;
|
||||
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
|
||||
mcprop.numDevices = dcnt;
|
||||
mcprop.handleTypes = ncclCuMemHandleType;
|
||||
mcprop.handleTypes = requestedHandleTypes;
|
||||
mcprop.flags = 0;
|
||||
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
|
||||
/* only size needs to be aligned to mcGran */
|
||||
ALIGN_SIZE(size, mcGran);
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
|
||||
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
|
||||
/* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
|
||||
CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
|
||||
if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
|
||||
requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
|
||||
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
|
||||
}
|
||||
} else {
|
||||
/* Allocate the physical memory on the device */
|
||||
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
|
||||
}
|
||||
/* Reserve a virtual address range */
|
||||
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
|
||||
/* Map the virtual address range to the physical allocation */
|
||||
@@ -2234,6 +2409,9 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
|
||||
|
||||
fallback:
|
||||
#endif
|
||||
// Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
|
||||
// we want CUDA to return an error to the caller.
|
||||
// coverity[var_deref_model]
|
||||
CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
|
||||
|
||||
exit:
|
||||
@@ -2272,7 +2450,7 @@ fallback:
|
||||
CUDACHECKGOTO(cudaFree(ptr), ret, fail);
|
||||
|
||||
exit:
|
||||
cudaSetDevice(saveDevice);
|
||||
CUDACHECK(cudaSetDevice(saveDevice));
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
|
||||
@@ -53,6 +53,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
// ncclMaxRedOp < info->op will always be false due to the sizes of
|
||||
// the datatypes involved, and that's by design. We keep the check though
|
||||
// just as a reminder.
|
||||
// coverity[result_independent_of_operands]
|
||||
if (info->op < 0 || ncclMaxRedOp < info->op) {
|
||||
WARN("%s : invalid reduction operation %d", info->opName, info->op);
|
||||
return ncclInvalidArgument;
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
|
||||
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
|
||||
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
|
||||
|
||||
NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
|
||||
// Handle type used for cuMemCreate()
|
||||
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
|
||||
@@ -49,6 +49,14 @@ int ncclCuMemEnable() {
|
||||
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
|
||||
}
|
||||
|
||||
// Reports whether cuMem-based host memory allocation is enabled.
// Returns the NCCL_CUMEM_HOST_ENABLE parameter when built against
// CUDA runtime 12.2 or newer, and 0 on older runtimes.
int ncclCuMemHostEnable() {
#if CUDART_VERSION >= 12020
  return ncclParamCuMemHostEnable();
#else
  return 0;
#endif
}
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
@@ -81,6 +89,7 @@ DECLARE_CUDA_PFN(cuMemRelease);
|
||||
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
|
||||
DECLARE_CUDA_PFN(cuMemSetAccess);
|
||||
DECLARE_CUDA_PFN(cuMemUnmap);
|
||||
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
|
||||
/* ncclMemAlloc/Free */
|
||||
DECLARE_CUDA_PFN(cuPointerGetAttribute);
|
||||
#if CUDA_VERSION >= 11070
|
||||
@@ -107,7 +116,7 @@ bool ncclCudaLaunchBlocking = false;
|
||||
|
||||
#if CUDART_VERSION >= 12000
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
cudaDriverEntryPointQueryResult driverStatus; \
|
||||
cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
|
||||
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
|
||||
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
|
||||
if (!ignore) { \
|
||||
@@ -157,6 +166,7 @@ static ncclResult_t cudaPfnFuncLoader(void) {
|
||||
LOAD_SYM(cuMemRetainAllocationHandle, 1);
|
||||
LOAD_SYM(cuMemSetAccess, 1);
|
||||
LOAD_SYM(cuMemUnmap, 1);
|
||||
LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
|
||||
/* ncclMemAlloc/Free */
|
||||
LOAD_SYM(cuPointerGetAttribute, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
@@ -208,6 +218,20 @@ static void initOnceFunc() {
|
||||
// Determine whether we support the cuMem APIs or not
|
||||
ncclCuMemSupported = ncclIsCuMemSupported();
|
||||
|
||||
#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
|
||||
/* To use cuMem* for host memory allocation, we need to create context on each
|
||||
* visible device. This is a workaround needed in CUDA 12.3, which is fixed in 12.4. */
|
||||
if (ncclCuMemSupported && ncclCuMemHostEnable()) {
|
||||
int deviceCnt, saveDevice;
|
||||
cudaGetDevice(&saveDevice);
|
||||
cudaGetDeviceCount(&deviceCnt);
|
||||
for (int i = 0; i < deviceCnt; ++i) {
|
||||
cudaSetDevice(i);
|
||||
cudaFree(NULL);
|
||||
}
|
||||
cudaSetDevice(saveDevice);
|
||||
}
|
||||
#endif
|
||||
initResult = ret;
|
||||
return;
|
||||
error:
|
||||
|
||||
@@ -41,6 +41,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
|
||||
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
|
||||
if (len > (sizeof(cliaddr.sun_path) - 1)) {
|
||||
WARN("UDS: Cannot bind provided name to socket. Name too large");
|
||||
close(fd);
|
||||
return ncclInternalError;
|
||||
}
|
||||
#ifndef USE_ABSTRACT_SOCKET
|
||||
@@ -66,7 +67,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
|
||||
// Mark socket as non-blocking
|
||||
if (handle->abortFlag) {
|
||||
int flags;
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl");
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
|
||||
@@ -186,20 +187,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
|
||||
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
|
||||
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
|
||||
|
||||
if (sendFd != -1) {
|
||||
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
|
||||
msg.msg_control = control_un.control;
|
||||
msg.msg_controllen = sizeof(control_un.control);
|
||||
|
||||
msg.msg_control = control_un.control;
|
||||
msg.msg_controllen = sizeof(control_un.control);
|
||||
|
||||
cmptr = CMSG_FIRSTHDR(&msg);
|
||||
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
cmptr->cmsg_level = SOL_SOCKET;
|
||||
cmptr->cmsg_type = SCM_RIGHTS;
|
||||
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
|
||||
}
|
||||
cmptr = CMSG_FIRSTHDR(&msg);
|
||||
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
cmptr->cmsg_level = SOL_SOCKET;
|
||||
cmptr->cmsg_type = SCM_RIGHTS;
|
||||
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
|
||||
|
||||
msg.msg_name = (void *)&cliaddr;
|
||||
msg.msg_namelen = sizeof(struct sockaddr_un);
|
||||
|
||||
@@ -102,6 +102,10 @@ ncclResult_t ncclNvmlEnsureInitialized() {
|
||||
for(Symbol sym: symbols) {
|
||||
*sym.ppfn = dlsym(libhandle, sym.name);
|
||||
}
|
||||
// Coverity complains that we never dlclose this object, but that's
|
||||
// deliberate, since we want the loaded object to remain in memory until
|
||||
// the process terminates, so that we can use its code.
|
||||
// coverity[leaked_storage]
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ void setEnvFile(const char* fileName) {
|
||||
while (line[s] != '\0' && line[s] != '=') s++;
|
||||
if (line[s] == '\0') continue;
|
||||
strncpy(envVar, line, std::min(1023,s));
|
||||
envVar[s] = '\0';
|
||||
envVar[std::min(1023,s)] = '\0';
|
||||
s++;
|
||||
strncpy(envValue, line+s, 1023);
|
||||
envValue[1023]='\0';
|
||||
@@ -48,17 +48,28 @@ void setEnvFile(const char* fileName) {
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
void initEnv() {
|
||||
static void initEnvFunc() {
|
||||
char confFilePath[1024];
|
||||
const char * userDir = userHomeDir();
|
||||
if (userDir) {
|
||||
sprintf(confFilePath, "%s/.nccl.conf", userDir);
|
||||
const char* userFile = getenv("NCCL_CONF_FILE");
|
||||
if (userFile && strlen(userFile) > 0) {
|
||||
snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
|
||||
setEnvFile(confFilePath);
|
||||
} else {
|
||||
const char* userDir = userHomeDir();
|
||||
if (userDir) {
|
||||
snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
|
||||
setEnvFile(confFilePath);
|
||||
}
|
||||
}
|
||||
sprintf(confFilePath, "/etc/nccl.conf");
|
||||
snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf");
|
||||
setEnvFile(confFilePath);
|
||||
}
|
||||
|
||||
// Public entry point for environment initialization: runs initEnvFunc
// through pthread_once so repeated or concurrent callers trigger it only once.
void initEnv() {
  static pthread_once_t envInitGuard = PTHREAD_ONCE_INIT;
  pthread_once(&envInitGuard, initEnvFunc);
}
|
||||
|
||||
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
|
||||
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
pthread_mutex_lock(&mutex);
|
||||
@@ -80,8 +91,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
|
||||
pthread_mutex_unlock(&mutex);
|
||||
}
|
||||
|
||||
const char *ncclGetEnv(const char *name) {
|
||||
static pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&once, initEnv);
|
||||
const char* ncclGetEnv(const char* name) {
|
||||
initEnv();
|
||||
return getenv(name);
|
||||
}
|
||||
|
||||
@@ -1,115 +1,524 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "param.h"
|
||||
#include "checks.h"
|
||||
#include "comm.h"
|
||||
#include "enqueue.h"
|
||||
#include "utils.h"
|
||||
#include "proxy.h"
|
||||
#include "profiler.h"
|
||||
|
||||
//#define PROFILE_PROXY 1
|
||||
#ifdef PROFILE_PROXY
|
||||
#include "timer.h"
|
||||
#include "alloc.h"
|
||||
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int profilerPluginRefCount;
|
||||
static void* profilerPluginLib;
|
||||
static ncclProfiler_t* ncclProfiler;
|
||||
|
||||
static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
|
||||
static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
|
||||
static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
|
||||
struct ncclProxyProfileEvent {
|
||||
double timestamp[6];
|
||||
uint64_t opCount;
|
||||
int peer;
|
||||
int step;
|
||||
uint16_t channel;
|
||||
uint8_t type; // send / recv
|
||||
uint8_t opIndex;
|
||||
};
|
||||
#define MAX_STR_LEN 256
|
||||
#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"
|
||||
|
||||
struct ncclProxyProfileEvent* profilingEvents = NULL;
|
||||
int profilingIndex = 0;
|
||||
double profilingStart = 0;
|
||||
#define MAX_EVENTS 200000
|
||||
|
||||
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
|
||||
if (profilingEvents == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
|
||||
profilingStart = gettime();
|
||||
static void* tryOpenLib(char* name, int *err, char* errStr) {
|
||||
if (nullptr == name || strlen(name) == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
struct ncclProxyProfileEvent* event = NULL;
|
||||
if (state%8 == 0) {
|
||||
if (profilingIndex == MAX_EVENTS) return ncclSuccess;
|
||||
args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
|
||||
if (state == ncclProxyProfileBegin) {
|
||||
// Proxy operation information
|
||||
event->opCount = args->opCount;
|
||||
event->channel = args->subs[sub].channelId;
|
||||
event->peer = args->subs[sub].peer;
|
||||
event->type = args->pattern;
|
||||
event->step = step;
|
||||
event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
|
||||
} else event->peer = -state;
|
||||
|
||||
if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
|
||||
name = nullptr;
|
||||
}
|
||||
|
||||
void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
|
||||
if (nullptr == handle) {
|
||||
strncpy(errStr, dlerror(), MAX_STR_LEN);
|
||||
errStr[MAX_STR_LEN] = 0;
|
||||
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
|
||||
*err = ENOENT;
|
||||
}
|
||||
}
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
// Records the outcome of a failed tryOpenLib() attempt.
//
// openErr      error code reported by tryOpenLib (ENOENT when the file was not found)
// openErrStr   dlerror() text captured by tryOpenLib
// nameList     write cursor into the "could not find" name buffer
// nameListLen  in/out: bytes still available at nameList
// name         library name that was tried
//
// When the library was simply not found, " <name>" is appended at nameList and
// the cursor and remaining length advance by what was actually written. Any
// other error is logged and the cursor is left untouched.
// Returns the updated write cursor.
static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
  if (openErr != ENOENT) {
    INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
    return nameList;
  }

  // Nothing left to write into: a non-positive length must never reach
  // snprintf, where it would be converted to a huge size_t.
  if (*nameListLen <= 0) return nameList;

  int written = snprintf(nameList, (size_t)*nameListLen, " %s", name);
  if (written < 0) return nameList;
  // On truncation snprintf returns the length it wanted to write; only
  // nameListLen-1 characters (plus the terminator) actually landed in the buffer.
  if (written >= *nameListLen) written = *nameListLen - 1;

  nameList += written;
  *nameListLen -= written;
  return nameList;
}
|
||||
|
||||
static void* openProfilerPluginLib(char* couldNotFindNames, int len) {
|
||||
int openErr;
|
||||
void *pluginLib;
|
||||
char profilerPluginLibName[PATH_MAX];
|
||||
char openErrStr[MAX_STR_LEN + 1] = { 0 };
|
||||
|
||||
const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
|
||||
if (envProfilerPluginName && strlen(envProfilerPluginName)) {
|
||||
snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
|
||||
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
|
||||
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
|
||||
} else {
|
||||
event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
|
||||
if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
|
||||
if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
|
||||
snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");
|
||||
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
|
||||
if (pluginLib) {
|
||||
return pluginLib;
|
||||
}
|
||||
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
|
||||
}
|
||||
// Timestamp
|
||||
event->timestamp[state%8] = gettime()-profilingStart;
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// States of the profiler plugin load attempt, stored in profilerPluginStatus.
enum {
  profilerPluginLoadFailed  = -1,  // a load attempt failed; ncclProfilerPluginLoad returns early
  profilerPluginLoadReady   =  0,  // initial state, also restored when the last user unloads
  profilerPluginLoadSuccess =  1,  // library opened and plugin symbol resolved
};

// Current load state; starts out as "ready".
static int profilerPluginStatus = profilerPluginLoadReady;

// Pid of the process that loaded the profiler plugin (set in ncclProfilerPluginLoad).
static pid_t pid;

// Multiplier used to size the "could not find" name buffer (MAX_PLUGIN_LOAD * PATH_MAX).
#define MAX_PLUGIN_LOAD 2
|
||||
|
||||
// Loads the profiler plugin library and resolves NCCL_PROFILER_PLUGIN_SYMBOL.
//
// The first successful call opens the library; later calls only bump the
// reference count. A failed attempt is remembered in profilerPluginStatus and
// is not retried. Always returns ncclSuccess: running without a profiler is
// not an error.
static ncclResult_t ncclProfilerPluginLoad(void) {
  // Fast path: a previous attempt already failed.
  if (profilerPluginLoadFailed == profilerPluginStatus) {
    return ncclSuccess;
  }

  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
  pthread_mutex_lock(&profilerLock);
  // Re-check under the lock: another thread may have failed the load while
  // this one was waiting, and a failed load must not be retried.
  if (profilerPluginLoadFailed == profilerPluginStatus) {
    goto exit;
  }
  if (profilerPluginLoadSuccess == profilerPluginStatus) {
    ++profilerPluginRefCount;
    goto exit;
  }

  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
  if (profilerPluginLib == nullptr) {
    if (strlen(couldNotFindNames)) {
      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
    }
    goto fail;
  }

  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
  if (ncclProfiler == nullptr) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
    goto fail;
  }

  ++profilerPluginRefCount;
  profilerPluginStatus = profilerPluginLoadSuccess;

  // Store the pid of the process loading the profiler.
  // This is attached to the proxyOp event descriptor
  // so the plugin can figure out if the parent event
  // is in the same address space or not
  pid = getpid();

exit:
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
fail:
  if (profilerPluginLib) {
    dlclose(profilerPluginLib);
    // Do not keep a handle (or a symbol resolved from it) that is no longer valid.
    profilerPluginLib = nullptr;
  }
  ncclProfiler = nullptr;
  profilerPluginStatus = profilerPluginLoadFailed;
  goto exit;
}
|
||||
|
||||
// Drops one reference to the profiler plugin. When the last reference goes
// away the library is closed and the load state returns to "ready" so a later
// load can start from scratch. Always returns ncclSuccess.
static ncclResult_t ncclProfilerPluginUnload(void) {
  pthread_mutex_lock(&profilerLock);
  if (0 == (--profilerPluginRefCount)) {
    // ncclProfilerPluginInit clears ncclProfiler when the plugin's init()
    // fails while the reference taken by the load is still held, so the
    // pointer may be NULL here and must not be dereferenced blindly.
    INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler ? ncclProfiler->name : "(unknown)");
    if (profilerPluginLib) dlclose(profilerPluginLib);
    profilerPluginLib = nullptr;
    ncclProfiler = nullptr;
    profilerPluginStatus = profilerPluginLoadReady;
  }
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
}
|
||||
|
||||
void ncclProfilingDump() {
|
||||
static int dumpDone = 0;
|
||||
if (dumpDone) return;
|
||||
dumpDone = 1;
|
||||
const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
|
||||
if (!str) { free(profilingEvents); return; }
|
||||
FILE* f = fopen(str, "w");
|
||||
fprintf(f, "[\n");
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
|
||||
for (int i=0; i<profilingIndex; i++) {
|
||||
struct ncclProxyProfileEvent* e = profilingEvents+i;
|
||||
const int sendrecv = e->peer >= 0;
|
||||
const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
|
||||
profilingEventStr[-(e->peer/8)];
|
||||
#if ENABLE_TIMER
// Per-event bookkeeping for the self-timing instrumentation:
//   <event>Count  number of times TIME_START_EVENT(<event>) ran
//   <event>Ts[0]  timestamp taken by the most recent TIME_START_EVENT
//   <event>Ts[1]  time accumulated across all TIME_STOP_EVENT calls
#define TIME_DECLARE_EVENT(event) \
  static int64_t event ## Count; \
  static double event ## Ts[2]

TIME_DECLARE_EVENT(elapsed);
TIME_DECLARE_EVENT(init);
TIME_DECLARE_EVENT(finalize);
TIME_DECLARE_EVENT(groupStart);
TIME_DECLARE_EVENT(groupStop);
TIME_DECLARE_EVENT(taskStart);
TIME_DECLARE_EVENT(taskStop);
TIME_DECLARE_EVENT(proxyOpStart);
TIME_DECLARE_EVENT(proxyOpStop);
TIME_DECLARE_EVENT(proxyStepStart);
TIME_DECLARE_EVENT(proxyStepStop);
TIME_DECLARE_EVENT(proxyCtrlStart);
TIME_DECLARE_EVENT(proxyCtrlStop);
TIME_DECLARE_EVENT(proxyOpRecord);
TIME_DECLARE_EVENT(proxyStepRecord);
TIME_DECLARE_EVENT(proxyCtrlRecord);

// Count one occurrence of the event and remember when it started.
#define TIME_START_EVENT(event) do { \
  ++(event ## Count); \
  (event ## Ts)[0] = gettime(); \
} while(0)

// Add the time elapsed since the matching TIME_START_EVENT to the event total.
#define TIME_STOP_EVENT(event) do { \
  (event ## Ts)[1] += gettime() - (event ## Ts)[0]; \
} while(0)

// Print "[event] total/count = average" followed by sep, only for events that fired.
#define TIME_PRINT_EVENT(event, sep) do { \
  if (event ## Count) printf("[" #event "] %g/%ld = %g" sep, (event ## Ts)[1], event ## Count, (event ## Ts)[1]/(event ## Count)); \
} while(0)

// Print a one-line summary of every event that fired, prefixed by name.
#define TIME_PRINT_EVENTS(name) do { \
  printf("%s ", name); \
  TIME_PRINT_EVENT(elapsed, " "); \
  TIME_PRINT_EVENT(init, " "); \
  TIME_PRINT_EVENT(finalize, " "); \
  TIME_PRINT_EVENT(groupStart, " "); \
  TIME_PRINT_EVENT(groupStop, " "); \
  TIME_PRINT_EVENT(taskStart, " "); \
  TIME_PRINT_EVENT(taskStop, " "); \
  TIME_PRINT_EVENT(proxyOpStart, " "); \
  TIME_PRINT_EVENT(proxyOpStop, " "); \
  TIME_PRINT_EVENT(proxyStepStart, " "); \
  TIME_PRINT_EVENT(proxyStepStop, " "); \
  TIME_PRINT_EVENT(proxyCtrlStart, " "); \
  TIME_PRINT_EVENT(proxyCtrlStop, " "); \
  TIME_PRINT_EVENT(proxyOpRecord, " "); \
  TIME_PRINT_EVENT(proxyStepRecord, " "); \
  TIME_PRINT_EVENT(proxyCtrlRecord, ""); \
  printf("\n"); \
} while(0)
#else
// Timing disabled: the instrumentation compiles away to nothing.
#define TIME_START_EVENT(event) do {} while(0)
#define TIME_STOP_EVENT(event) do {} while(0)
#define TIME_PRINT_EVENTS(name) do {} while(0)
#endif
|
||||
|
||||
|
||||
if (sendrecv) {
|
||||
int state = ncclProxyProfileBegin;
|
||||
const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
|
||||
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
|
||||
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
|
||||
static int eActivationMask; // Set by profiler
|
||||
static int eActivationMaskGroup; // Cached for current group
|
||||
|
||||
while (state<ncclProxyProfileEnd) {
|
||||
if (e->timestamp[state]) {
|
||||
const char* name = stateStr[state];
|
||||
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
|
||||
name, i, e->channel, e->timestamp[state]);
|
||||
state++;
|
||||
while (e->timestamp[state] == 0) state++;
|
||||
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
|
||||
name, i, e->channel, e->timestamp[state]);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
|
||||
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
|
||||
} else {
|
||||
if (e->peer == -ncclProxyProfileAppend) {
|
||||
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
|
||||
typeStr, i, e->timestamp[0], e->opCount);
|
||||
} else {
|
||||
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
|
||||
typeStr, i, e->timestamp[0]);
|
||||
}
|
||||
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
|
||||
typeStr, i, e->timestamp[1]);
|
||||
// Load the profiler plugin and initialize it for the given communicator.
//
// The plugin's init() receives the communicator's profilerContext slot and the
// address of the file-level eActivationMask. If init() reports an error the
// failure is logged and ncclProfiler is cleared so that every other profiler
// entry point becomes a no-op. Always returns ncclSuccess.
//
// Note: the "elapsed" timer started here has no matching stop in this
// function; ncclProfilerPluginFinalize issues TIME_STOP_EVENT(elapsed).
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
  TIME_START_EVENT(elapsed);
  TIME_START_EVENT(init);
  ncclProfilerPluginLoad();
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);
    if (err) {
      WARN("Profiler init failed with error (%d). Continue without profiler.", err);
      // Disable profiling for the rest of the run rather than failing init.
      ncclProfiler = NULL;
    }
  }
  TIME_STOP_EVENT(init);
  return ncclSuccess;
}
|
||||
|
||||
// Tear down profiling for a communicator: call the plugin's finalize() on the
// communicator's profiler context (only when a profiler is loaded), then
// unload the plugin and print the collected timer events.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {
  TIME_START_EVENT(finalize);

  if (__builtin_expect(ncclProfiler != NULL, 0)) ncclProfiler->finalize(comm->profilerContext);

  // Unload is attempted whether or not a profiler is currently active.
  ncclProfilerPluginUnload();

  TIME_STOP_EVENT(finalize);
  TIME_STOP_EVENT(elapsed);
  TIME_PRINT_EVENTS("Profiler");
  return ncclSuccess;
}
|
||||
|
||||
// Start the group event for a kernel plan.
//
// The global activation mask is snapshotted into eActivationMaskGroup on every
// call (even when no profiler is loaded). When a profiler is loaded and the
// snapshot enables any of the Coll, P2p, ProxyOp or ProxyStep event types, a
// ncclProfileGroup event is started and its handle is stored in
// plan->groupEventHandle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStart);
  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) {
      // Use the unversioned descriptor type, like every other event start in this file.
      ncclProfilerEventDescr_t eDescr = { 0 };
      eDescr.type = ncclProfileGroup;
      ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr);
    }
  }
  TIME_STOP_EVENT(groupStart);
  return ncclSuccess;
}
|
||||
|
||||
// Stop the group event of a kernel plan. Nothing happens unless a profiler is
// loaded and the plan carries a non-NULL group event handle.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (plan->groupEventHandle) ncclProfiler->stopEvent(plan->groupEventHandle);
  }
  TIME_STOP_EVENT(groupStop);
  return ncclSuccess;
}
|
||||
|
||||
// Start one profiler event per task queued in a kernel plan.
//
// Runs only when a profiler is loaded, the plan has a group event handle, and
// the cached group activation mask enables ProxyOp, ProxyStep or Coll events.
// Each collective task gets a ncclProfileColl event and each point-to-point
// task a ncclProfileP2p event, both parented to the plan's group event. The
// cached group mask is also copied into every task. Always returns ncclSuccess.
//
// NOTE(review): P2P task events are gated on ncclProfileColl rather than
// ncclProfileP2p, whereas ncclProfilerStartGroupEvent also accepts
// ncclProfileP2p -- confirm this is intended.
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStart);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    const int taskMask = ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl;
    if (plan->groupEventHandle && (eActivationMaskGroup & taskMask)) {
      struct ncclComm* comm = plan->comm;

      // Collective tasks.
      for (struct ncclTaskColl* collTask = ncclIntruQueueHead(&plan->collTaskQueue); collTask; collTask = collTask->next) {
        ncclProfilerEventDescr_t descr = { 0 };
        descr.type = ncclProfileColl;
        descr.parentObj = plan->groupEventHandle;
        descr.rank = comm->rank;
        descr.coll.name = comm->commName;
        descr.coll.commHash = comm->commHash;
        // Per-function sequence number; incremented once for every task started here.
        descr.coll.seqNumber = comm->seqNumber[collTask->func]++;
        descr.coll.func = collTask->func;
        descr.coll.sendBuff = collTask->sendbuff;
        descr.coll.recvBuff = collTask->recvbuff;
        descr.coll.count = collTask->count;
        descr.coll.root = collTask->root;
        descr.coll.datatype = collTask->datatype;
        descr.coll.op = collTask->opHost;
        descr.coll.trafficBytes = collTask->trafficBytes;
        descr.coll.nMaxChannels = collTask->nMaxChannels;
        descr.coll.nWarps = collTask->nWarps;
        descr.coll.algo = collTask->algorithm;
        descr.coll.proto = collTask->protocol;
        descr.coll.isCollnet = collTask->isCollnet;
        descr.coll.isNvls = collTask->isNvls;
        ncclProfiler->startEvent(comm->profilerContext, &collTask->eventHandle, &descr);

        // update collective task with group event activation mask
        collTask->eActivationMask = eActivationMaskGroup;
      }

      // Point-to-point tasks.
      for (struct ncclTaskP2p* p2pTask = ncclIntruQueueHead(&plan->p2pTaskQueue); p2pTask; p2pTask = p2pTask->next) {
        ncclProfilerEventDescr_t descr = { 0 };
        descr.type = ncclProfileP2p;
        descr.parentObj = plan->groupEventHandle;
        descr.rank = comm->rank;
        descr.p2p.name = comm->commName;
        descr.p2p.commHash = comm->commHash;
        descr.p2p.func = p2pTask->func;
        descr.p2p.buff = p2pTask->buff;
        descr.p2p.count = p2pTask->count;
        descr.p2p.datatype = p2pTask->datatype;
        // The task's "root" field is reported as the peer of the transfer.
        descr.p2p.peer = p2pTask->root;
        ncclProfiler->startEvent(comm->profilerContext, &p2pTask->eventHandle, &descr);

        // update p2p task with group event activation mask
        p2pTask->eActivationMask = eActivationMaskGroup;
      }
    }
  }
  TIME_STOP_EVENT(taskStart);
  return ncclSuccess;
}
|
||||
|
||||
// Stop the task events of a kernel plan.
//
// Uses the same gate as ncclProfilerStartTaskEvents: a profiler is loaded, the
// plan has a group event handle, and the cached group mask enables ProxyOp,
// ProxyStep or Coll events. stopEvent is then called on the event handle of
// every collective task followed by every point-to-point task.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    const int taskMask = ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl;
    if (plan->groupEventHandle && (eActivationMaskGroup & taskMask)) {
      for (struct ncclTaskColl* collTask = ncclIntruQueueHead(&plan->collTaskQueue); collTask; collTask = collTask->next) {
        ncclProfiler->stopEvent(collTask->eventHandle);
      }
      for (struct ncclTaskP2p* p2pTask = ncclIntruQueueHead(&plan->p2pTaskQueue); p2pTask; p2pTask = p2pTask->next) {
        ncclProfiler->stopEvent(p2pTask->eventHandle);
      }
    }
  }
  TIME_STOP_EVENT(taskStop);
  return ncclSuccess;
}
|
||||
|
||||
// Start a ncclProfileProxyOp event for the send side of sub-operation `s` of
// `args`.
//
// The event is started only when a profiler is loaded and the sub-operation's
// activation mask enables ProxyStep or ProxyOp events. It is parented to the
// sub-operation's task event handle and the resulting handle is stored in
// sub->opEventHandle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      (subArgs->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp))) {
    ncclProfilerEventDescr_t descr = { 0 };
    descr.type = ncclProfileProxyOp;
    descr.parentObj = subArgs->taskEventHandle;
    descr.rank = subArgs->rank;
    descr.proxyOp.isSend = 1;  // send-side proxy operation
    descr.proxyOp.pid = args->pid;
    descr.proxyOp.channelId = subArgs->channelId;
    descr.proxyOp.peer = subArgs->peer;
    descr.proxyOp.nSteps = subArgs->nsteps;
    descr.proxyOp.chunkSize = args->chunkSize;
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &descr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
|
||||
|
||||
// Start a ncclProfileProxyOp event for the receive side of sub-operation `s`
// of `args`.
//
// The event is started only when a profiler is loaded and the sub-operation's
// activation mask enables ProxyStep or ProxyOp events. It is parented to the
// sub-operation's task event handle and the resulting handle is stored in
// sub->opEventHandle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      (subArgs->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp))) {
    ncclProfilerEventDescr_t descr = { 0 };
    descr.type = ncclProfileProxyOp;
    descr.parentObj = subArgs->taskEventHandle;
    descr.rank = subArgs->rank;
    descr.proxyOp.isSend = 0;  // receive-side proxy operation
    descr.proxyOp.pid = args->pid;
    descr.proxyOp.channelId = subArgs->channelId;
    descr.proxyOp.peer = subArgs->peer;
    descr.proxyOp.nSteps = subArgs->nsteps;
    descr.proxyOp.chunkSize = args->chunkSize;
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &descr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
|
||||
|
||||
// Stop the proxy-op event of sub-operation `s` of `args`, if one is active.
// After stopEvent the stored handle is reset to NULL, so a repeated call finds
// nothing to stop. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfiler->stopEvent(subArgs->opEventHandle);
      subArgs->opEventHandle = NULL;
    }
  }
  TIME_STOP_EVENT(proxyOpStop);
  return ncclSuccess;
}
|
||||
|
||||
// Start one ncclProfileProxyStep event for every step in [stepLo, stepHi) of
// sub-operation `s` (send side).
//
// Requires a loaded profiler, an active proxy-op event on the sub-operation
// (used as the parent of each step event) and ncclProfileProxyStep in the
// sub-operation's activation mask. The handle for a step is stored in slot
// step % NCCL_STEPS of sub->stepEventHandles. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t step = stepLo;
    while (step < stepHi) {
      ncclProfilerEventDescr_t descr = { 0 };
      descr.type = ncclProfileProxyStep;
      descr.parentObj = subArgs->opEventHandle;
      descr.rank = subArgs->rank;
      descr.proxyStep.step = step;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[step%NCCL_STEPS], &descr);
      step++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
|
||||
|
||||
// Start one ncclProfileProxyStep event for every step in [stepLo, stepHi) of
// sub-operation `s` (receive side).
//
// Requires a loaded profiler, an active proxy-op event on the sub-operation
// (used as the parent of each step event) and ncclProfileProxyStep in the
// sub-operation's activation mask. The handle for a step is stored in slot
// step % NCCL_STEPS of sub->stepEventHandles. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t step = stepLo;
    while (step < stepHi) {
      ncclProfilerEventDescr_t descr = { 0 };
      descr.type = ncclProfileProxyStep;
      descr.parentObj = subArgs->opEventHandle;
      descr.rank = subArgs->rank;
      descr.proxyStep.step = step;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[step%NCCL_STEPS], &descr);
      step++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
|
||||
|
||||
// Stop the step events for steps [stepLo, stepHi) of sub-operation `s`.
// Only slots (step % NCCL_STEPS) holding a non-NULL handle are stopped, and
// each stopped slot is reset to NULL. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    for (uint64_t step = stepLo; step < stepHi; step++) {
      const uint64_t slot = step%NCCL_STEPS;
      if (!subArgs->stepEventHandles[slot]) continue;  // no event started for this slot
      ncclProfiler->stopEvent(subArgs->stepEventHandles[slot]);
      subArgs->stepEventHandles[slot] = NULL;
    }
  }
  TIME_STOP_EVENT(proxyStepStop);
  return ncclSuccess;
}
|
||||
|
||||
// Start a ncclProfileProxyCtrl event and return its handle through *eHandle.
//
// profilerContext: context passed through to the plugin's startEvent.
// eHandle:         out-parameter; receives the event handle, or NULL when no
//                  event was started.
//
// *eHandle is set to NULL before anything else, so the caller always gets a
// defined value even if the plugin's startEvent does not store a handle.
// Callers in this file pass an uninitialized local and later test it for NULL
// before stopping the event. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle) {
  TIME_START_EVENT(proxyCtrlStart);
  *eHandle = NULL;
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    // for proxy control events we allow profiling mode to change on a per event basis
    int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
    if (eActivationMaskProxy & ncclProfileProxyCtrl) {
      ncclProfilerEventDescr_t eDescr = { 0 };
      eDescr.type = ncclProfileProxyCtrl;
      ncclProfiler->startEvent(profilerContext, eHandle, &eDescr);
    }
  }
  TIME_STOP_EVENT(proxyCtrlStart);
  return ncclSuccess;
}
|
||||
|
||||
// Stop a proxy control event. A NULL handle, or the absence of a loaded
// profiler, makes this a no-op. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
  TIME_START_EVENT(proxyCtrlStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (eHandle) ncclProfiler->stopEvent(eHandle);
  }
  TIME_STOP_EVENT(proxyCtrlStop);
  return ncclSuccess;
}
|
||||
|
||||
// Record a state transition on the proxy-op event of sub-operation `s`.
// `steps` and `transSize` are forwarded to the plugin in the proxyOp member of
// the state arguments. Nothing is recorded unless a profiler is loaded and the
// sub-operation has an active proxy-op event. Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyOpRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfilerEventStateArgs_t stateArgs = { 0 };
      stateArgs.proxyOp.steps = steps;
      stateArgs.proxyOp.transSize = transSize;
      ncclProfiler->recordEventState(subArgs->opEventHandle, eState, &stateArgs);
    }
  }
  TIME_STOP_EVENT(proxyOpRecord);
  return ncclSuccess;
}
|
||||
|
||||
// Record state `eState` on the step events for steps [stepLo, stepHi) of
// sub-operation `s`. Requires a loaded profiler and an active proxy-op event on
// the sub-operation; slots (step % NCCL_STEPS) without a handle are skipped.
// No state arguments are passed to the plugin. Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyStepRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      for (uint64_t step = stepLo; step < stepHi; step++) {
        const uint64_t slot = step%NCCL_STEPS;
        if (!subArgs->stepEventHandles[slot]) continue;
        ncclProfiler->recordEventState(subArgs->stepEventHandles[slot], eState, 0);
      }
    }
  }
  TIME_STOP_EVENT(proxyStepRecord);
  return ncclSuccess;
}
|
||||
|
||||
// Record a state transition on a proxy control event. `appended` is forwarded
// as proxyCtrl.appendedProxyOps. The global activation mask is re-read here,
// so nothing is recorded unless ncclProfileProxyCtrl is enabled at the time of
// the call, a profiler is loaded and the handle is non-NULL.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyCtrlRecord);
  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle &&
      (__atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl)) {
    ncclProfilerEventStateArgs_t stateArgs = { 0 };
    stateArgs.proxyCtrl.appendedProxyOps = appended;
    ncclProfiler->recordEventState(eHandle, eState, &stateArgs);
  }
  TIME_STOP_EVENT(proxyCtrlRecord);
  return ncclSuccess;
}
|
||||
|
||||
// Copy the file-scope `pid` value into the proxy operation's pid field.
// NOTE(review): `pid` is defined outside this view -- presumably the id of the
// current process; confirm against its definition.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {
  op->pid = pid;
  return ncclSuccess;
}
|
||||
#else
|
||||
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
|
||||
void ncclProfilingDump() {}
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "shm.h"
|
||||
#include "shmutils.h"
|
||||
#include "comm.h"
|
||||
#include "checks.h"
|
||||
#include <sys/types.h>
|
||||
@@ -75,7 +75,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
goto fail;
|
||||
}
|
||||
} else {
|
||||
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
|
||||
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
|
||||
}
|
||||
|
||||
retry_fallocate:
|
||||
@@ -90,7 +90,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
}
|
||||
INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
|
||||
} else {
|
||||
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
|
||||
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
|
||||
}
|
||||
|
||||
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
@@ -114,7 +114,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
|
||||
}
|
||||
|
||||
if (devShmPtr) {
|
||||
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
|
||||
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
|
||||
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ fail:
|
||||
shmPath, shmSize, strerror(errno), errno);
|
||||
if (tmphandle) {
|
||||
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
|
||||
ncclShmClose((ncclShmHandle_t)tmphandle);
|
||||
(void)ncclShmClose((ncclShmHandle_t)tmphandle);
|
||||
tmphandle = NULL;
|
||||
}
|
||||
hptr = NULL;
|
||||
@@ -182,7 +182,7 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
|
||||
|
||||
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int curRound = shmem->round;
|
||||
int curRound;
|
||||
size_t mycnt;
|
||||
|
||||
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
|
||||
@@ -190,6 +190,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
|
||||
goto exit;
|
||||
}
|
||||
|
||||
curRound = shmem->round;
|
||||
memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
|
||||
/* sync among local ranks */
|
||||
mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
|
||||
|
||||
@@ -284,6 +284,7 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
|
||||
sin6.sin6_scope_id = 0; // should be global scope, set to 0
|
||||
} else {
|
||||
WARN("Net : unsupported IP family");
|
||||
freeaddrinfo(p);
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
|
||||
@@ -408,7 +409,7 @@ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress*
|
||||
|
||||
static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
|
||||
socklen_t socklen = sizeof(union ncclSocketAddress);
|
||||
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
|
||||
sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
|
||||
if (sock->fd != -1) {
|
||||
sock->state = ncclSocketStateAccepted;
|
||||
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
|
||||
@@ -501,8 +502,9 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
|
||||
} else if (ret < 0) {
|
||||
WARN("socketPollConnect poll() failed with error %s", strerror(errno));
|
||||
return ncclRemoteError;
|
||||
} else {
|
||||
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
|
||||
} else if (ret != 1 || (pfd.revents & POLLOUT) == 0) {
|
||||
WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events");
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
/* check socket status */
|
||||
@@ -734,13 +736,17 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
|
||||
/* Set socket as non-blocking if async or if we need to be able to abort */
|
||||
if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
|
||||
int flags;
|
||||
EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
|
||||
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
|
||||
SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
|
||||
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
|
||||
}
|
||||
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (sock->fd != -1) {
|
||||
close(sock->fd);
|
||||
sock->fd = -1;
|
||||
}
|
||||
goto exit;
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +77,8 @@ static void* tryOpenLib(const char* name, int* err, char* errStr) {
|
||||
if (nullptr == handle) {
|
||||
strncpy(errStr, dlerror(), MAX_STR_LEN);
|
||||
errStr[MAX_STR_LEN] = '\0';
|
||||
// "handle" and "name" won't be NULL at the same time.
|
||||
// coverity[var_deref_model]
|
||||
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
|
||||
*err = ENOENT;
|
||||
}
|
||||
|
||||
@@ -65,15 +65,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Hash the first n bytes of `string`.
// Based on DJB2a: start from 5381 and, for each byte, multiply the running
// value by 33 and XOR in the byte. A non-positive n yields the seed 5381.
uint64_t getHash(const char* string, int n) {
  uint64_t hash = 5381;
  int idx = 0;
  while (idx < n) {
    // hash * 33, written as shift-and-add, then XOR with the next character.
    const uint64_t times33 = (hash << 5) + hash;
    hash = times33 ^ string[idx];
    idx++;
  }
  return hash;
}
|
||||
|
||||
static uint64_t hostHashValue = 0;
|
||||
/* Generate a hash of the unique identifying string for this host
|
||||
* that will be unique for both bare-metal and container instances
|
||||
* Equivalent of a hash of;
|
||||
@@ -83,7 +75,7 @@ uint64_t getHash(const char* string, int n) {
|
||||
* This string can be overridden by using the NCCL_HOSTID env var.
|
||||
*/
|
||||
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
|
||||
uint64_t getHostHash(void) {
|
||||
static void getHostHashOnce() {
|
||||
char hostHash[1024];
|
||||
const char *hostId;
|
||||
|
||||
@@ -103,8 +95,8 @@ uint64_t getHostHash(void) {
|
||||
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
|
||||
free(p);
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
// Make sure the string is terminated
|
||||
@@ -112,7 +104,12 @@ uint64_t getHostHash(void) {
|
||||
|
||||
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
|
||||
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
hostHashValue = getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
uint64_t getHostHash(void) {
|
||||
static pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&once, getHostHashOnce);
|
||||
return hostHashValue;
|
||||
}
|
||||
|
||||
/* Generate a hash of the unique identifying string for this process
|
||||
|
||||
@@ -168,6 +168,13 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
|
||||
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
|
||||
|
||||
/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
|
||||
* Allows the use of more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
|
||||
* The number of ncclUniqueIds and their order must be the same for every rank.
|
||||
*/
|
||||
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
|
||||
ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
|
||||
|
||||
/* Returns a string for each error code. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
|
||||
@@ -355,6 +355,8 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
|
||||
if (nullptr == handle) {
|
||||
strncpy(errStr, dlerror(), MAX_STR_LEN);
|
||||
errStr[MAX_STR_LEN] = '\0';
|
||||
// "handle" and "name" won't be NULL at the same time.
|
||||
// coverity[var_deref_model]
|
||||
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
|
||||
*err = ENOENT;
|
||||
}
|
||||
@@ -422,11 +424,10 @@ static int netPluginStatus = netPluginLoadReady;
|
||||
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
|
||||
char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
|
||||
if (netPluginLoadFailed == netPluginStatus) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
if (netPluginLoadFailed == netPluginStatus) {
|
||||
goto exit;
|
||||
}
|
||||
if (netPluginLoadSuccess == netPluginStatus) {
|
||||
++netPluginRefCount;
|
||||
goto exit;
|
||||
|
||||
+225
-79
@@ -8,18 +8,21 @@
|
||||
#include "info.h"
|
||||
#include "collectives.h"
|
||||
#include "socket.h"
|
||||
#include "shm.h"
|
||||
#include "shmutils.h"
|
||||
#include "profiler.h"
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
#include "profiler.h"
|
||||
#include "transport.h"
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <assert.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
#include <sched.h>
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
void* ncclProxyServiceUDS(void* _args);
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
|
||||
@@ -67,8 +70,10 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
memcpy(elem->respBuff, respBuff, respSize);
|
||||
free(respBuff);
|
||||
if (respSize > 0) {
|
||||
memcpy(elem->respBuff, respBuff, respSize);
|
||||
free(respBuff);
|
||||
}
|
||||
elem->done = true;
|
||||
elem->res = res;
|
||||
return ncclSuccess;
|
||||
@@ -360,12 +365,17 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
|
||||
sub->nsteps = op->nsteps;
|
||||
sub->nbytes = op->nbytes;
|
||||
sub->offset = 0;
|
||||
sub->peer = op->root;
|
||||
sub->peer = op->peer;
|
||||
sub->reg = op->reg;
|
||||
sub->sendMhandle = op->sendMhandle;
|
||||
sub->recvMhandle = op->recvMhandle;
|
||||
sub->sendbuff = op->sendbuff;
|
||||
sub->recvbuff = op->recvbuff;
|
||||
sub->eActivationMask = op->eActivationMask;
|
||||
sub->taskEventHandle = op->taskEventHandle;
|
||||
sub->rank = op->rank;
|
||||
args->pid = op->pid;
|
||||
args->profilerContext = op->profilerContext;
|
||||
args->nsubs = subIndex+1;
|
||||
if (subIndex) {
|
||||
if ((args->sliceSteps != op->sliceSteps) ||
|
||||
@@ -527,6 +537,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
|
||||
|
||||
if (justInquire) *justInquire = true;
|
||||
else {
|
||||
op->peer = peer;
|
||||
NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op));
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -588,6 +599,64 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
|
||||
NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
|
||||
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
|
||||
} break;
|
||||
case ncclPatternPatUp: {
  // Run full algorithm to count the number of steps for each peer.
  // The per-dimension counters are heap-allocated, so every exit path below
  // (including allocation or SaveProxy failures) goes through the frees at the
  // end before the error, if any, is propagated.
  int *nstepsSend = NULL, *nstepsRecv = NULL;
  const int rank = comm->rank, nranks = comm->nRanks;
  const int nDims = log2Up(nranks);
  ncclResult_t patRes = ncclCalloc(&nstepsSend, nDims);
  if (patRes == ncclSuccess) patRes = ncclCalloc(&nstepsRecv, nDims);
  if (patRes == ncclSuccess) {
    const ssize_t size = op->nbytes/comm->nRanks;
    PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
    int last = 0;
    while (last == 0) {
      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
      size_t inpIx, outIx;
      algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
      if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
      if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
    }
    // Stop at the first SaveProxy failure, as the original NCCLCHECK did.
    for (int i=0; patRes == ncclSuccess && i<nDims; i++) {
      if (nstepsSend[i]) {
        int sendPeer = (rank + (1<<i)) % nranks;
        op->nsteps = nstepsSend[i];
        patRes = SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire);
      }
      if (patRes == ncclSuccess && nstepsRecv[i]) {
        int recvPeer = (rank - (1<<i) + nranks) % nranks;
        op->nsteps = nstepsRecv[i];
        patRes = SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire);
      }
    }
  }
  free(nstepsSend);
  free(nstepsRecv);
  NCCLCHECK(patRes);
} break;
|
||||
case ncclPatternPatDown: {
  // Run full algorithm to count the number of steps for each peer.
  // The per-dimension counters are heap-allocated, so every exit path below
  // (including allocation or SaveProxy failures) goes through the frees at the
  // end before the error, if any, is propagated.
  int *nstepsSend = NULL, *nstepsRecv = NULL;
  const int rank = comm->rank, nranks = comm->nRanks;
  const int nDims = log2Up(nranks);
  ncclResult_t patRes = ncclCalloc(&nstepsSend, nDims);
  if (patRes == ncclSuccess) patRes = ncclCalloc(&nstepsRecv, nDims);
  if (patRes == ncclSuccess) {
    const ssize_t size = op->nbytes/comm->nRanks;
    PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
    int last = 0;
    while (last == 0) {
      int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
      size_t inpIx, outIx;
      algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
      if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
      if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
    }
    // Stop at the first SaveProxy failure, as the original NCCLCHECK did.
    for (int i=0; patRes == ncclSuccess && i<nDims; i++) {
      if (nstepsSend[i]) {
        int sendPeer = (rank - (1<<i) + nranks) % nranks;
        op->nsteps = nstepsSend[i];
        patRes = SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire);
      }
      if (patRes == ncclSuccess && nstepsRecv[i]) {
        int recvPeer = (rank + (1<<i)) % nranks;
        op->nsteps = nstepsRecv[i];
        patRes = SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire);
      }
    }
  }
  free(nstepsSend);
  free(nstepsRecv);
  NCCLCHECK(patRes);
} break;
|
||||
case ncclPatternSend:
|
||||
case ncclPatternRecv: {
|
||||
if (op->root == comm->rank) return ncclSuccess;
|
||||
@@ -657,9 +726,9 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
|
||||
if (state->opsPool == NULL) return ncclInternalError;
|
||||
struct ncclProxyOpsPool* pool = state->opsPool;
|
||||
|
||||
struct ncclProxyArgs profArgs; // Only used for profiling purposes
|
||||
if (state->nextOps != -1) goto process_nextops;
|
||||
|
||||
void* eHandle;
|
||||
// If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
|
||||
// to be available. Exit, continue progress, and come back later.
|
||||
if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
|
||||
@@ -667,10 +736,11 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
|
||||
if (state->active == NULL) {
|
||||
pthread_mutex_lock(&pool->mutex);
|
||||
while (pool->nextOps == -1 && !state->stop) {
|
||||
struct ncclProxyArgs profArgs; // Only used for profiling purposes
|
||||
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
|
||||
pthread_cond_wait(&pool->cond, &pool->mutex);
|
||||
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
|
||||
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
|
||||
ncclProfilerStopProxyCtrlEvent(eHandle);
|
||||
}
|
||||
if (state->stop) { // We might have been woken up to stop.
|
||||
pthread_mutex_unlock(&pool->mutex);
|
||||
@@ -684,7 +754,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
|
||||
if (state->nextOps == -1) return ncclInternalError;
|
||||
|
||||
process_nextops:
|
||||
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
|
||||
TIME_START(2);
|
||||
int freeOp[NCCL_MAX_LOCAL_RANKS];
|
||||
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
|
||||
@@ -720,6 +791,10 @@ process_nextops:
|
||||
if (freeOp[i] == -1) continue;
|
||||
int newFree = freeOp[i];
|
||||
int oldFree = pool->freeOps[i];
|
||||
// Coverity gets confused by the complex code structure here. The previous "for" loop ensures that freeOpEnd[i]
|
||||
// is initialized so long as freeOp[i] is initialized (is not -1). In the current loop we filter out uninitialized
|
||||
// freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
|
||||
// coverity[uninit_use:FALSE]
|
||||
pool->ops[freeOpEnd[i]].next = oldFree;
|
||||
if (oldFree == -1) {
|
||||
// Nothing for the main thread to consume, we can set it.
|
||||
@@ -735,8 +810,8 @@ process_nextops:
|
||||
}
|
||||
}
|
||||
}
|
||||
profArgs.opCount = *added;
|
||||
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
|
||||
ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd);
|
||||
ncclProfilerStopProxyCtrlEvent(eHandle);
|
||||
TIME_STOP(2);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -758,6 +833,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
|
||||
if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
|
||||
WARN("Unable to create thread context due to old driver, disabling.");
|
||||
createThreadContext = 0;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -767,15 +843,17 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
|
||||
NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
|
||||
createThreadContext = 0;
|
||||
goto exit;
|
||||
}
|
||||
} else {
|
||||
if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
|
||||
return 0;
|
||||
goto exit;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
exit:
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
@@ -787,12 +865,14 @@ NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
|
||||
void* ncclProxyProgress(void *proxyState_) {
|
||||
struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_;
|
||||
if (setProxyThreadContext(proxyState)) {
|
||||
INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
|
||||
INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev);
|
||||
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
|
||||
}
|
||||
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
||||
INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
|
||||
|
||||
struct ncclProxyProgressState* state = &proxyState->progressState;
|
||||
state->nextOps = -1;
|
||||
const int sig = ncclParamProxyDumpSignal();
|
||||
@@ -809,9 +889,7 @@ void* ncclProxyProgress(void *proxyState_) {
|
||||
* ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
|
||||
* frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
|
||||
int proxyOpAppendCounter = 0;
|
||||
struct ncclProxyArgs profArgs; // Only used for profiling purposes
|
||||
while ((state->stop == 0 || (state->stop == 1 && state->active)) &&
|
||||
__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) {
|
||||
while (state->stop == 0 || (state->stop == 1 && state->active)) {
|
||||
int idle = 1;
|
||||
ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
|
||||
if (ret != ncclSuccess) {
|
||||
@@ -819,8 +897,11 @@ void* ncclProxyProgress(void *proxyState_) {
|
||||
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
|
||||
continue;
|
||||
}
|
||||
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
|
||||
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
|
||||
void* eHandle;
|
||||
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
|
||||
if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
|
||||
if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
|
||||
ncclProfilerStopProxyCtrlEvent(eHandle);
|
||||
if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
|
||||
int added = 0;
|
||||
proxyOpAppendCounter = 0;
|
||||
@@ -860,7 +941,7 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
|
||||
static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
|
||||
struct ncclProxyProgressState* state = &proxyState->progressState;
|
||||
if (!state->thread) {
|
||||
pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
|
||||
PTHREADCHECK(pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState), "pthread_create");
|
||||
ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -875,7 +956,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
|
||||
state->stop = 1;
|
||||
pthread_cond_signal(&state->opsPool->cond);
|
||||
pthread_mutex_unlock(&state->opsPool->mutex);
|
||||
pthread_join(state->thread, NULL);
|
||||
PTHREADCHECK(pthread_join(state->thread, NULL), "pthread_join");
|
||||
}
|
||||
|
||||
// Free off any memory allocated for the proxy arg pools
|
||||
@@ -885,7 +966,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
|
||||
state->pools = next;
|
||||
}
|
||||
|
||||
ncclProfilingDump();
|
||||
TIME_PRINT("Proxy");
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -962,23 +1042,17 @@ struct ncclProxyInitResp {
|
||||
char devShmPath[6]; // "XXXXXX" - May or may not be set
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) {
|
||||
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn) {
|
||||
struct ncclSocket* sock;
|
||||
int ready, proxyRank = -1;
|
||||
int ready;
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
int tpProxyRank = comm->topParentRanks[proxyRank];
|
||||
|
||||
// Keep one connection per local rank
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
/* find the proxy rank in comm. */
|
||||
if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
|
||||
proxyRank = comm->localRankToRank[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
|
||||
// Keep one connection per local rank
|
||||
proxyConn->connection = NULL;
|
||||
proxyConn->tpRank = tpProxyRank;
|
||||
proxyConn->rank = proxyRank;
|
||||
if (sharedProxyState->peerSocks == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
|
||||
NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
|
||||
@@ -1020,68 +1094,93 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
|
||||
}
|
||||
}
|
||||
proxyConn->initialized = true;
|
||||
INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// UDS support
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int* reqFd, int *respFd) {
|
||||
ncclResult_t res = ncclSuccess;
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
void *opId;
|
||||
NCCLCHECK(getRandomData(&opId, sizeof(opId)));
|
||||
int reqFdtmp = -1;
|
||||
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
struct ncclProxyState* sharedProxyState = comm->proxyState;
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
|
||||
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, respFd, opId);
|
||||
|
||||
// cuMem: Create a UDS socket to receive the response
|
||||
NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
|
||||
|
||||
if (reqFd) {
|
||||
reqFdtmp = *reqFd;
|
||||
} else {
|
||||
// give a dummy fd for the other side of UDS socket
|
||||
NCCLCHECK(ncclIpcSocketGetFd(&ipcSock, &reqFdtmp));
|
||||
}
|
||||
|
||||
ncclIpcHdr hdr;
|
||||
hdr.type = type;
|
||||
hdr.rank = rank;
|
||||
hdr.reqSize = reqSize;
|
||||
hdr.respSize = respSize;
|
||||
hdr.opId = opId;
|
||||
|
||||
assert(reqSize <= sizeof(hdr.data));
|
||||
memcpy(&hdr.data, reqBuff, reqSize);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), reqFdtmp, proxyConn->tpRank, pidHash), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
|
||||
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
|
||||
|
||||
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
|
||||
comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
|
||||
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
|
||||
|
||||
return res;
|
||||
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
|
||||
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", proxyConn->tpRank, pidHash, res);
|
||||
return res;
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
// Request the allocation of a UDS fd for the handle
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
|
||||
if (comm->gproxyConn[proxyRank].initialized == false) {
|
||||
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error);
|
||||
|
||||
// We have now received the converted fd over UDS
|
||||
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
|
||||
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess);
|
||||
|
||||
return ret;
|
||||
|
||||
error:
|
||||
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
|
||||
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", comm->topParentRanks[proxyRank], *(uint64_t*)handle, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, proxyConn, ncclProxyMsgQueryFd, NULL, 0, (void*)rmtFd, sizeof(int), &localFd, NULL), ret, fail);
|
||||
exit:
|
||||
// We have now received the converted fd over UDS
|
||||
INFO(NCCL_PROXY, "UDS: ClientQueryFd localFd %d tpRank %d remote fd %d sameProcess %d", localFd, proxyConn->tpRank, *rmtFd, proxyConn->sameProcess);
|
||||
return ret;
|
||||
fail:
|
||||
WARN("ncclProxyClientQueryFdBlocking call to tpRank %d localFd %d failed : %d", proxyConn->tpRank, localFd, ret);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
|
||||
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
|
||||
struct ncclSocket* sock;
|
||||
@@ -1091,7 +1190,6 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
|
||||
if (sharedProxyState->peerSocks == NULL) return ncclInternalError;
|
||||
|
||||
sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
|
||||
if (sock == NULL) return ncclInternalError;
|
||||
|
||||
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
|
||||
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
|
||||
@@ -1267,6 +1365,22 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
struct ncclIpcSocket ipcSock = { 0 };
|
||||
uint64_t hash = (uint64_t) opId;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
|
||||
NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
|
||||
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
|
||||
exit:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
return ncclSuccess;
|
||||
#else
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
}
|
||||
|
||||
// cuMem API support
|
||||
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
@@ -1286,7 +1400,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void
|
||||
error:
|
||||
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
|
||||
// We can now safely close the exported fd
|
||||
(void) close(fd);
|
||||
SYSCHECK(close(fd), "close");
|
||||
return ret;
|
||||
#else
|
||||
return ncclInternalError;
|
||||
@@ -1352,30 +1466,37 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
|
||||
}
|
||||
|
||||
static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclSocket* sock = &peer->sock;
|
||||
struct ncclProxyAsyncOp* asyncOp;
|
||||
NCCLCHECK(ncclCalloc(&asyncOp, 1));
|
||||
|
||||
asyncOp->type = type;
|
||||
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
|
||||
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)), ret, fail);
|
||||
|
||||
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
|
||||
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)), ret, fail);
|
||||
if (asyncOp->reqSize) {
|
||||
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
|
||||
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
|
||||
NCCLCHECKGOTO(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
|
||||
}
|
||||
|
||||
// Store opId for completion response
|
||||
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
|
||||
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)), ret, fail);
|
||||
|
||||
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
|
||||
if (asyncOp->respSize) NCCLCHECKGOTO(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize), ret, fail);
|
||||
|
||||
asyncProxyOpEnqueue(peer, asyncOp);
|
||||
|
||||
(*asyncOpCount)++;
|
||||
NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (asyncOp->reqBuff) free(asyncOp->reqBuff);
|
||||
if (asyncOp->respBuff) free(asyncOp->respBuff);
|
||||
free(asyncOp);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
#include <poll.h>
|
||||
@@ -1395,6 +1516,12 @@ static bool proxyMatchOpType(int type) {
|
||||
}
|
||||
}
|
||||
|
||||
enum {
|
||||
PROXY_RUNNING = 0,
|
||||
PROXY_STOP = 1,
|
||||
PROXY_ABORT = 2
|
||||
};
|
||||
|
||||
void* ncclProxyService(void* _args) {
|
||||
struct ncclProxyState* proxyState = (struct ncclProxyState*) _args;
|
||||
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
@@ -1405,6 +1532,8 @@ void* ncclProxyService(void* _args) {
|
||||
}
|
||||
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
||||
INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
|
||||
|
||||
// Prepare poll descriptor
|
||||
struct ncclProxyConnectionPool connectionPool;
|
||||
connectionPool.pools = NULL;
|
||||
@@ -1426,13 +1555,13 @@ void* ncclProxyService(void* _args) {
|
||||
|
||||
int maxnpeers = 0;
|
||||
int npeers = 0;
|
||||
int stop = 0;
|
||||
int stop = PROXY_RUNNING;
|
||||
int asyncOpCount = 0;
|
||||
while (stop == 0 || (stop == 1 && npeers > 0)) {
|
||||
while (stop == PROXY_RUNNING || npeers > 0) {
|
||||
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
|
||||
* connections. Need to wait until all other related comms call abort and safely exit
|
||||
* together, or we could face segmentation fault. */
|
||||
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1;
|
||||
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = PROXY_ABORT;
|
||||
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
|
||||
int ret;
|
||||
do {
|
||||
@@ -1474,10 +1603,14 @@ void* ncclProxyService(void* _args) {
|
||||
if (pollfds[s].fd == -1) continue;
|
||||
|
||||
// Progress all ops for this ncclProxyLocalPeer
|
||||
if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
|
||||
ncclProxyAsyncOp* op = peer->asyncOps;
|
||||
while (op != nullptr) {
|
||||
ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
|
||||
type = op->type;
|
||||
// Coverity gets confused here by complex code structure. Yes, connectionPool.pools gets dereferenced, and
|
||||
// while calling proxyProgressAsync() connectionPool.pools is NULL, but that changes before it's dereferenced.
|
||||
// coverity[var_deref_model:FALSE]
|
||||
res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
|
||||
if (res == ncclSuccess || res == ncclInProgress) {
|
||||
op = opnext;
|
||||
@@ -1494,14 +1627,15 @@ void* ncclProxyService(void* _args) {
|
||||
int closed;
|
||||
res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
|
||||
if (res != ncclSuccess && res != ncclInProgress) {
|
||||
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
|
||||
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
|
||||
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
|
||||
closeConn = 1;
|
||||
} else if (closed) {
|
||||
INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
|
||||
closeConn = 1;
|
||||
} else if (res == ncclSuccess) { // We received something from the sock
|
||||
if (type == ncclProxyMsgStop) {
|
||||
stop = 1;
|
||||
stop = PROXY_STOP;
|
||||
closeConn = 1;
|
||||
} else if (type == ncclProxyMsgClose) {
|
||||
closeConn = 1;
|
||||
@@ -1518,12 +1652,13 @@ void* ncclProxyService(void* _args) {
|
||||
closeConn = 1;
|
||||
}
|
||||
if (res != ncclSuccess && res != ncclInProgress) {
|
||||
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
|
||||
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
|
||||
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
|
||||
closeConn = 1;
|
||||
}
|
||||
|
||||
if (closeConn) {
|
||||
ncclSocketClose(sock);
|
||||
(void)ncclSocketClose(sock);
|
||||
|
||||
if (op != nullptr) {
|
||||
asyncProxyOpDequeue(peer, op);
|
||||
@@ -1540,10 +1675,10 @@ void* ncclProxyService(void* _args) {
|
||||
WARN("[Proxy Service] proxyDestroy failed");
|
||||
}
|
||||
for (int s=0; s<maxnpeers; s++) {
|
||||
ncclSocketClose(&peers[s].sock);
|
||||
(void)ncclSocketClose(&peers[s].sock);
|
||||
}
|
||||
ncclProxyFreeConnections(&connectionPool, proxyState);
|
||||
ncclSocketClose(proxyState->listenSock);
|
||||
(void)ncclSocketClose(proxyState->listenSock);
|
||||
free(proxyState->listenSock);
|
||||
proxyOpsFree(proxyState);
|
||||
return NULL;
|
||||
@@ -1553,12 +1688,17 @@ void* ncclProxyService(void* _args) {
|
||||
// Process a request on the UDS socket
|
||||
static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
|
||||
ncclIpcHdr hdr;
|
||||
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL));
|
||||
int rmtFd = -1;
|
||||
|
||||
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
|
||||
if (hdr.type == ncclProxyMsgGetFd) {
|
||||
// cuMem API support
|
||||
uint64_t handle = *(uint64_t*)hdr.data;
|
||||
INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
|
||||
return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
|
||||
} else if (hdr.type == ncclProxyMsgQueryFd) {
|
||||
INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
|
||||
return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
|
||||
}
|
||||
|
||||
return ncclInternalError;
|
||||
@@ -1570,11 +1710,13 @@ void* ncclProxyServiceUDS(void* _args) {
|
||||
struct pollfd pollfds[1];
|
||||
|
||||
if (setProxyThreadContext(proxyState)) {
|
||||
INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
|
||||
INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev);
|
||||
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
|
||||
WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
|
||||
}
|
||||
|
||||
INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
|
||||
|
||||
if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
|
||||
WARN("[Proxy Service UDS] Get listenSock fd fails");
|
||||
return NULL;
|
||||
@@ -1593,7 +1735,7 @@ void* ncclProxyServiceUDS(void* _args) {
|
||||
}
|
||||
|
||||
// Check for stop/abort
|
||||
if (proxyState->stop || *proxyState->abortFlag) break;
|
||||
if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) || __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE)) break;
|
||||
|
||||
if (pollfds[0].revents) {
|
||||
// A request was seen on the UDS fd
|
||||
@@ -1638,14 +1780,16 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
|
||||
proxyState->dmaBufSupport = comm->dmaBufSupport;
|
||||
proxyState->ncclNet = comm->ncclNet;
|
||||
proxyState->ncclCollNet = comm->ncclCollNet;
|
||||
proxyState->profilerContext = comm->profilerContext;
|
||||
proxyState->directMode = comm->directMode;
|
||||
memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
|
||||
|
||||
pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
|
||||
PTHREADCHECK(pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState), "pthread_create");
|
||||
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
|
||||
|
||||
// UDS support
|
||||
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
|
||||
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
|
||||
PTHREADCHECK(pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState), "pthread_create");
|
||||
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -1658,17 +1802,17 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
|
||||
if (comm->proxyState->threadUDS) {
|
||||
// UDS support
|
||||
comm->proxyState->stop = 1;
|
||||
__atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
if (sharedProxyState->peerAddresses) {
|
||||
if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
|
||||
struct ncclSocket sock;
|
||||
int type = ncclProxyMsgStop;
|
||||
ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
|
||||
if (ncclSocketConnect(&sock) == ncclSuccess) {
|
||||
ncclSocketSend(&sock, &type, sizeof(int));
|
||||
(void)ncclSocketSend(&sock, &type, sizeof(int));
|
||||
}
|
||||
ncclSocketClose(&sock);
|
||||
(void)ncclSocketClose(&sock);
|
||||
}
|
||||
|
||||
if (sharedProxyState->peerSocks) {
|
||||
@@ -1686,7 +1830,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
int type = ncclProxyMsgClose;
|
||||
ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
|
||||
(void)ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
|
||||
NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
|
||||
}
|
||||
}
|
||||
@@ -1700,13 +1844,15 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
|
||||
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
|
||||
struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
|
||||
|
||||
assert(sharedProxyState->refCount == 0);
|
||||
free(sharedProxyState->peerAddresses);
|
||||
free(sharedProxyState->peerAddressesUDS);
|
||||
free(sharedProxyState->peerSocks);
|
||||
free(sharedProxyState->proxyOps);
|
||||
free(sharedProxyState->sharedDevMems);
|
||||
expectedProxyResponseFree(sharedProxyState);
|
||||
free(sharedProxyState);
|
||||
if (sharedProxyState) {
|
||||
assert(sharedProxyState->refCount == 0);
|
||||
free(sharedProxyState->peerAddresses);
|
||||
free(sharedProxyState->peerAddressesUDS);
|
||||
free(sharedProxyState->peerSocks);
|
||||
free(sharedProxyState->proxyOps);
|
||||
free(sharedProxyState->sharedDevMems);
|
||||
expectedProxyResponseFree(sharedProxyState);
|
||||
free(sharedProxyState);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -26,8 +26,8 @@ ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
|
||||
|
||||
ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
int netCount;
|
||||
NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
|
||||
int netCount = 0;
|
||||
if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
|
||||
if (netCount == 0) return ncclSuccess;
|
||||
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
@@ -105,7 +105,11 @@ ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, s
|
||||
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
|
||||
|
||||
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
|
||||
if (!ncclParamLocalRegister()) return ncclSuccess;
|
||||
if (!ncclParamLocalRegister()) {
|
||||
*handle = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
uintptr_t pageSize = cache->pageSize;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
@@ -166,6 +170,10 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
struct ncclReg* reg = (struct ncclReg*)handle;
|
||||
struct ncclRegCache* cache = &comm->regCache;
|
||||
int slot;
|
||||
int saveDev;
|
||||
if (handle == NULL) goto exit;
|
||||
CUDACHECK(cudaGetDevice(&saveDev));
|
||||
CUDACHECK(cudaSetDevice(comm->cudaDev));
|
||||
for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
|
||||
if (slot == cache->population) {
|
||||
WARN("Deregister: Could not find handle");
|
||||
@@ -178,10 +186,19 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
|
||||
reg->regAddr = (CUdeviceptr)NULL;
|
||||
}
|
||||
if (reg->state & COLLNET_REG_COMPLETE) {
|
||||
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
|
||||
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
|
||||
}
|
||||
if (reg->state & IPC_REG_COMPLETE) {
|
||||
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
|
||||
if (reg->ipcInfos[i])
|
||||
NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
|
||||
if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
|
||||
if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
|
||||
}
|
||||
free(reg);
|
||||
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
|
||||
cache->population -= 1;
|
||||
CUDACHECK(cudaSetDevice(saveDev));
|
||||
exit:
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclTransport *transport = ncclTransports[t];
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
|
||||
@@ -70,25 +70,52 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
|
||||
NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
|
||||
#include <sys/time.h>
|
||||
|
||||
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) {
|
||||
bool supportFlag = true;
|
||||
bool directFlag = false;
|
||||
if (comm->localRanks == 1) {
|
||||
supportFlag = false;
|
||||
} else {
|
||||
for (int i = 0; i < comm->localRanks; ++i) {
|
||||
for (int j = i + 1; j < comm->localRanks; ++j) {
|
||||
int ipeer = comm->localRankToRank[i];
|
||||
int jpeer = comm->localRankToRank[j];
|
||||
struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer];
|
||||
struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer];
|
||||
int canConnect = 0;
|
||||
NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo));
|
||||
if (!canConnect && supportFlag == true) {
|
||||
supportFlag = false;
|
||||
}
|
||||
if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true;
|
||||
if (!supportFlag && directFlag) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
*intraNodeP2pSupport = supportFlag;
|
||||
*directMode = directFlag;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int highestType = TRANSPORT_UNDEFINED; // track highest transport type
|
||||
struct ncclConnect** data; // Store intermediate send/recvData structs for connect
|
||||
struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel
|
||||
struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel
|
||||
struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
|
||||
struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
|
||||
int done = 0;
|
||||
|
||||
int maxPeers = ncclParamConnectRoundMaxPeers();
|
||||
NCCLCHECK(ncclCalloc(&data, maxPeers));
|
||||
NCCLCHECK(ncclCalloc(&recvData, maxPeers));
|
||||
NCCLCHECK(ncclCalloc(&sendData, maxPeers));
|
||||
|
||||
struct timeval timeStart, timeLast;
|
||||
gettimeofday(&timeStart, NULL);
|
||||
timeLast = timeStart; // struct copy
|
||||
bool timeReported = false;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&data, maxPeers));
|
||||
NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
|
||||
|
||||
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
|
||||
// First time initialization
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
@@ -104,7 +131,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
// The next M entries contain sendData, connection information for send connections
|
||||
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
|
||||
int p = i-(done+1);
|
||||
if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
|
||||
if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
|
||||
recvData[p] = data[p];
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
int type;
|
||||
@@ -163,7 +190,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
|
||||
// This connector hasn't completed connection yet
|
||||
if (conn->connected == 0) {
|
||||
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
|
||||
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset, 1, comm->rank, conn), ret, fail);
|
||||
if (ret == ncclSuccess) {
|
||||
conn->connected = 1;
|
||||
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
|
||||
@@ -172,6 +199,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
allChannelsConnected = false;
|
||||
}
|
||||
}
|
||||
sendDataOffset++;
|
||||
}
|
||||
TIME_STOP(3);
|
||||
|
||||
@@ -181,7 +209,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
|
||||
// This connector hasn't completed connection yet
|
||||
if (conn->connected == 0) {
|
||||
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
|
||||
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset, 1, comm->rank, conn), ret, fail);
|
||||
if (ret == ncclSuccess) {
|
||||
conn->connected = 1;
|
||||
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
|
||||
@@ -190,6 +218,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
allChannelsConnected = false;
|
||||
}
|
||||
}
|
||||
recvDataOffset++;
|
||||
}
|
||||
TIME_STOP(4);
|
||||
}
|
||||
@@ -198,7 +227,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
data[p] = NULL;
|
||||
}
|
||||
}
|
||||
if (ncclParamReportConnectProgress() && comm->rank == 0) {
|
||||
if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
|
||||
struct timeval now;
|
||||
gettimeofday(&now, NULL);
|
||||
if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
|
||||
@@ -236,34 +265,31 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
int flag = 0;
|
||||
|
||||
if (recvPeer != sendPeer) {
|
||||
if (comm->connectSend[sendPeer] != 0UL)
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
if (comm->connectRecv[recvPeer] != 0UL)
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
|
||||
if (comm->connectSend[sendPeer] != 0UL)
|
||||
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
if (comm->connectRecv[recvPeer] != 0UL)
|
||||
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
} else {
|
||||
if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) {
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
|
||||
}
|
||||
|
||||
free(data);
|
||||
free(sendData);
|
||||
free(recvData);
|
||||
|
||||
if (highestTransportType != NULL) *highestTransportType = highestType;
|
||||
TIME_PRINT("P2P Setup/Connect");
|
||||
exit:
|
||||
for(int i=0; i<maxPeers; ++i){
|
||||
if(data[i]) free(data[i]);
|
||||
}
|
||||
free(data);
|
||||
if (sendData) free(sendData);
|
||||
if (recvData) free(recvData);
|
||||
|
||||
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
|
||||
return ret;
|
||||
@@ -275,8 +301,8 @@ extern struct ncclTransport collNetTransport;
|
||||
|
||||
// All ranks must participate in collNetSetup call
|
||||
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
|
||||
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
|
||||
int fail = 1;
|
||||
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int rank = comm->rank;
|
||||
int nranks = comm->nRanks;
|
||||
int nMasters = comm->nNodes;
|
||||
@@ -297,24 +323,23 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
|
||||
conn->transportComm = transportComm;
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster) {
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
||||
}
|
||||
// prepare connect handles
|
||||
ncclResult_t res;
|
||||
struct ncclConnect myConnect = { 0 };
|
||||
struct {
|
||||
int isMaster;
|
||||
ncclConnect connect;
|
||||
} *allConnects = NULL;
|
||||
ncclConnect *masterConnects = NULL;
|
||||
if (isMaster) {
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
||||
}
|
||||
// prepare connect handles
|
||||
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
|
||||
if (type == collNetRecv) { // recv side: AllGather
|
||||
// all ranks must participate
|
||||
NCCLCHECK(ncclCalloc(&allConnects, nranks));
|
||||
NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), ret, cleanup);
|
||||
allConnects[rank].isMaster = isMaster;
|
||||
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
|
||||
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
|
||||
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), ret, cleanup);
|
||||
// consolidate
|
||||
int c = 0;
|
||||
for (int r = 0; r < nranks; r++) {
|
||||
@@ -328,21 +353,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
}
|
||||
// connect
|
||||
if (isMaster) {
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), ret, cleanup);
|
||||
struct ncclDevChannelPeer* devRoot;
|
||||
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
|
||||
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), ret, cleanup);
|
||||
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
|
||||
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
|
||||
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), ret, cleanup);
|
||||
}
|
||||
if (isMaster && type == collNetRecv) {
|
||||
memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
|
||||
}
|
||||
fail = 0;
|
||||
cleanup:
|
||||
if (allConnects != NULL) free(allConnects);
|
||||
if (masterConnects != NULL) free(masterConnects);
|
||||
return fail;
|
||||
return ret != ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
|
||||
|
||||
@@ -18,15 +18,15 @@ int64_t ncclParamGdrCopySyncEnable();
|
||||
int64_t ncclParamGdrCopyFlushEnable();
|
||||
|
||||
// Connect information produced on the CollNet receive side.
// recvSetup() casts its `struct ncclConnect* connectInfo` argument to this
// type, so the struct is stored inside a struct ncclConnect and must not be
// larger than one (enforced by the static_asserts in this file).
struct collNetRecvConnectInfo {
|
||||
int rank; // NOTE(review): presumably this rank's index within the CollNet group - confirm
|
||||
int nranks; // NOTE(review): presumably the number of ranks in the CollNet group - confirm
|
||||
collNetHandle_t collNetHandle; // CollNet handle carried with the connect info; NOTE(review): presumably the listen handle - confirm
|
||||
};
|
||||
static_assert(sizeof(collNetRecvConnectInfo) <= CONNECT_SIZE, "Collnet Recv Connect info is too large");
|
||||
|
||||
// Connect information consumed on the CollNet send side.
// sendProxyConnect() reads it by casting an entry of args->connectInfos
// (indexed by args->rank), so it is stored inside a struct ncclConnect and
// must not be larger than one (enforced by the static_asserts in this file).
struct collNetSendConnectInfo {
|
||||
void* mhandles[NCCL_NUM_PROTOCOLS]; // one opaque handle slot per protocol; NOTE(review): presumably memory-registration handles - confirm
|
||||
void* reqFifo; // NOTE(review): presumably a pointer to the receiver's request FIFO - confirm
|
||||
};
|
||||
static_assert(sizeof(collNetSendConnectInfo) <= CONNECT_SIZE, "Collnet Send Connect info is too large");
|
||||
|
||||
#define COLLNET_GROUP_NSUBS 8
|
||||
#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
|
||||
@@ -135,7 +135,7 @@ struct recvResources {
|
||||
int collNetRank;
|
||||
};
|
||||
|
||||
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// This transport cannot be used for p2p
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
@@ -154,15 +154,14 @@ struct setupReq {
|
||||
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
struct setupReq req = { 0 };
|
||||
|
||||
int proxyRank, tpProxyRank;
|
||||
int proxyRank;
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
|
||||
tpProxyRank = comm->topParentRanks[myInfo->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
|
||||
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
|
||||
req.collNet = comm->collNetSharedRes;
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
@@ -175,7 +174,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
|
||||
struct setupReq req = { 0 };
|
||||
|
||||
int proxyRank, tpProxyRank;
|
||||
int proxyRank;
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
|
||||
@@ -184,8 +183,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
|
||||
|
||||
recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
|
||||
tpProxyRank = comm->topParentRanks[myInfo->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
|
||||
static_assert(sizeof(collNetRecvConnectInfo) <= sizeof(struct ncclConnect), "Collnet Recv Connect info is too big");
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
|
||||
req.collNet = comm->collNetSharedRes;
|
||||
@@ -442,6 +441,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
|
||||
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
|
||||
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
|
||||
static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big");
|
||||
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
|
||||
|
||||
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
||||
@@ -1039,7 +1039,7 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
|
||||
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
|
||||
if (handle) {
|
||||
regRecord->state |= COLLNET_REG_COMPLETE;
|
||||
regRecord->proxyconn = proxyconn;
|
||||
regRecord->collnetProxyconn = proxyconn;
|
||||
*outHandle = regRecord->collnetHandle = handle;
|
||||
*outRegBufFlag = 1;
|
||||
}
|
||||
@@ -1091,7 +1091,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
|
||||
record->size = buffSize;
|
||||
*outHandle = record->mhandle = handle;
|
||||
*outRegBufFlag = 1;
|
||||
ncclIntruQueueEnqueue(cleanupQueue, &record->base);
|
||||
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
|
||||
*nCleanupQueueElts += 1;
|
||||
|
||||
exit:
|
||||
@@ -1214,23 +1214,6 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
|
||||
|
||||
// Exchange highest intra-node transport type among ranks
|
||||
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
|
||||
if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) {
|
||||
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED };
|
||||
|
||||
comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
|
||||
for (int i = 0; i < comm->localRanks; i++) {
|
||||
if (highestTypes[i] > comm->intraHighestTransportType)
|
||||
comm->intraHighestTransportType = highestTypes[i];
|
||||
}
|
||||
if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType)
|
||||
comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType;
|
||||
} else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) {
|
||||
// reuse previous shared intraHighestTransportType
|
||||
comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType;
|
||||
}
|
||||
INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);
|
||||
|
||||
exit:
|
||||
|
||||
@@ -34,3 +34,26 @@ exit:
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/*
 * Connect this rank to its peers at power-of-two rank distances
 * (1, 2, 4, ... while the distance is below comm->nRanks), on every channel
 * and in both directions, running a P2P setup pass after each direction.
 *
 * Does nothing and returns ncclSuccess when comm is NULL or has at most one
 * rank. Otherwise returns ncclSuccess once every round has completed, or the
 * first error reported by ncclTransportP2pConnect / ncclTransportP2pSetup.
 */
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;

  // No communicator, or a single rank: nothing to connect.
  if (comm == NULL || comm->nRanks <= 1) return ret;

  // The peer distance doubles every round.
  for (int dist = 1; dist < comm->nRanks; dist <<= 1) {
    // Peers 'dist' ranks ahead of and behind this rank, wrapping around.
    int peerAhead = (comm->rank + dist) % comm->nRanks;
    int peerBehind = (comm->rank + comm->nRanks - dist) % comm->nRanks;
    int ch;

    // ReduceScatter direction.
    // NOTE(review): presumably the 4th argument lists recv peers and the 6th send peers - confirm.
    for (ch = 0; ch < comm->nChannels; ++ch) {
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &peerAhead, 1, &peerBehind, 0), ret, fail);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);

    // AllGather direction: same peers with the two roles swapped.
    for (ch = 0; ch < comm->nChannels; ++ch) {
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &peerBehind, 1, &peerAhead, 0), ret, fail);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
  }
  INFO(NCCL_INIT, "Connected binomial trees");

exit:
  return ret;
fail:
  goto exit;
}
|
||||
|
||||
@@ -10,10 +10,11 @@
|
||||
#include "proxy.h"
|
||||
#include "collectives.h"
|
||||
#include "gdrwrap.h"
|
||||
#include "shm.h"
|
||||
#include "shmutils.h"
|
||||
#include "p2p.h"
|
||||
#include "profiler.h"
|
||||
#include "transport.h"
|
||||
#include "shm.h"
|
||||
|
||||
static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
|
||||
|
||||
@@ -62,9 +63,8 @@ struct connectMapMem{
|
||||
char* cpuPtr;
|
||||
int size;
|
||||
ncclIpcDesc ipcDesc;
|
||||
char shmPath[PATH_MAX];
|
||||
ncclShmHandle_t attachHandle;
|
||||
ncclShmHandle_t createHandle;
|
||||
ncclShmIpcDesc_t attachDesc;
|
||||
ncclShmIpcDesc_t createDesc;
|
||||
};
|
||||
|
||||
struct connectMap {
|
||||
@@ -142,11 +142,11 @@ struct recvNetResources {
|
||||
};
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
if (info1->hostHash == info2->hostHash) {
|
||||
// If on the same host, check intra-node net is not disabled.
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret));
|
||||
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, ret));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -173,9 +173,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
* information for this peer */
|
||||
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
struct setupReq req = { 0 };
|
||||
int tpProxyRank;
|
||||
|
||||
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
send->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
req.channelId = channelId;
|
||||
req.connIndex = connIndex;
|
||||
|
||||
@@ -185,8 +184,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
|
||||
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
|
||||
|
||||
tpProxyRank = comm->topParentRanks[proxyRank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
|
||||
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
|
||||
req.tpRank = comm->topParentRanks[myInfo->rank];
|
||||
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
|
||||
@@ -199,7 +197,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
}
|
||||
*((int*)connectInfo) = tpProxyRank;
|
||||
*((int*)connectInfo) = comm->topParentRanks[proxyRank];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -212,12 +210,12 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
|
||||
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
|
||||
struct setupReq req = { 0 };
|
||||
|
||||
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
recv->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
req.channelId = channelId;
|
||||
req.connIndex = connIndex;
|
||||
|
||||
// Use myInfo->rank as the receiver uses its own NIC
|
||||
int proxyRank, tpProxyRank;
|
||||
int proxyRank;
|
||||
int64_t netId;
|
||||
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
|
||||
@@ -226,8 +224,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
|
||||
|
||||
// We don't support PXN on receive yet
|
||||
tpProxyRank = comm->topParentRanks[myInfo->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
|
||||
|
||||
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
|
||||
req.tpRank = comm->topParentRanks[myInfo->rank];
|
||||
@@ -238,26 +235,24 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t netMapShm(struct connectMapMem* mem) {
|
||||
mem->cpuPtr = NULL;
|
||||
mem->gpuPtr = NULL;
|
||||
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle));
|
||||
static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) {
|
||||
NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc));
|
||||
return ncclSuccess;
|
||||
}
|
||||
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
|
||||
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
|
||||
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle));
|
||||
|
||||
static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) {
|
||||
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t netDumpMap(struct connectMap* map) {
|
||||
printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
|
||||
struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
|
||||
printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
mem = map->mems+NCCL_NET_MAP_DEVMEM;
|
||||
printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
|
||||
printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
|
||||
printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
|
||||
printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
|
||||
@@ -328,10 +323,10 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
}
|
||||
}
|
||||
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
|
||||
if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
|
||||
if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM));
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL;
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
|
||||
map->mems[NCCL_NET_MAP_DEVMEM].size,
|
||||
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
@@ -341,7 +336,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
|
||||
void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
|
||||
if (*sharedDevMemPtr == NULL) {
|
||||
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL;
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
|
||||
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
|
||||
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
|
||||
sharedDevMemPtr));
|
||||
@@ -463,24 +458,19 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
|
||||
if (map) {
|
||||
int cudaDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
if (map->sameProcess && map->cudaDev == cudaDev) {
|
||||
// Our own GPU, so it wasn't mapped in
|
||||
free(map);
|
||||
return ncclSuccess;
|
||||
}
|
||||
if (!map->sameProcess || ncclCuMemEnable()) {
|
||||
if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
|
||||
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
// Legacy CUDA IPC support
|
||||
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
}
|
||||
if (map->cudaDev != cudaDev && map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
|
||||
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
// Legacy CUDA IPC support
|
||||
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
}
|
||||
}
|
||||
if (!map->sameProcess) {
|
||||
NCCLCHECK(ncclShmIpcClose(&map->mems[NCCL_NET_MAP_HOSTMEM].attachDesc));
|
||||
}
|
||||
free(map);
|
||||
}
|
||||
|
||||
@@ -518,7 +508,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
|
||||
if (cuda && state->cudaBuff == NULL) {
|
||||
if (sameProcess == 0 || ncclCuMemEnable()) {
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, 0, &state->ipcDesc, (void**)&state->cudaBuff));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
|
||||
}
|
||||
@@ -527,7 +517,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
|
||||
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
|
||||
}
|
||||
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
|
||||
if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
|
||||
if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL;
|
||||
if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -543,7 +533,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan
|
||||
static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
|
||||
if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
|
||||
struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
|
||||
if (peer == NULL) NCCLCHECK(ncclInternalError;)
|
||||
if (peer == NULL) NCCLCHECK(ncclInternalError);
|
||||
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
|
||||
if (state->size == 0) NCCLCHECK(ncclInternalError);
|
||||
if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
|
||||
@@ -746,7 +736,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
if (resources->shared == 0) {
|
||||
if (!map->sameProcess || ncclCuMemEnable()) {
|
||||
ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
|
||||
@@ -758,7 +748,11 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
|
||||
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
|
||||
} else {
|
||||
NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
|
||||
NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM));
|
||||
void* sendMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
|
||||
void* recvMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
|
||||
memset(sendMem, 0, sizeof(struct ncclSendMem));
|
||||
memset(recvMem, 0, sizeof(struct ncclRecvMem));
|
||||
}
|
||||
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
|
||||
uint64_t *cpuPtr, *gpuPtr;
|
||||
@@ -896,7 +890,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
|
||||
if (resources->shared == 0) {
|
||||
if (ncclCuMemEnable()) {
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
|
||||
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
|
||||
@@ -968,7 +962,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
if (resources->map.sameProcess) {
|
||||
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
|
||||
} else {
|
||||
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
|
||||
NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc));
|
||||
}
|
||||
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
|
||||
if (!resources->map.sameProcess || ncclCuMemEnable()) {
|
||||
@@ -1050,7 +1044,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
// Set step base for next op
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->transmitted = sub->done = 0;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
ncclProfilerStartSendProxyOpEvent(s, args);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
} else {
|
||||
@@ -1072,6 +1066,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
|
||||
// Post buffers to the GPU
|
||||
if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
|
||||
ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
|
||||
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
|
||||
if (resources->shared) {
|
||||
if (!sub->reg) {
|
||||
@@ -1087,9 +1082,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
|
||||
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
|
||||
} else sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
|
||||
ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
|
||||
}
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
|
||||
ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
@@ -1130,12 +1124,18 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
|
||||
}
|
||||
if (ready) {
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
|
||||
// Data is ready, try to send.
|
||||
// Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
|
||||
// since size is a plain integer.
|
||||
// coverity[use_invalid:FALSE]
|
||||
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
|
||||
sub->transmitted += args->sliceSteps;
|
||||
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
|
||||
ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
|
||||
sub->transSize += size;
|
||||
args->idle = 0;
|
||||
continue;
|
||||
}
|
||||
@@ -1165,7 +1165,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
__sync_synchronize();
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
|
||||
ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
|
||||
|
||||
if (resources->shared == 0) {
|
||||
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
|
||||
@@ -1188,6 +1189,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
}
|
||||
if (args->done == args->nsubs) {
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
ncclProfilerStopProxyOpEvent(s, args);
|
||||
}
|
||||
args->state = ncclProxyOpNone;
|
||||
}
|
||||
}
|
||||
@@ -1229,7 +1233,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
resources->step = sub->base + sub->nsteps;
|
||||
sub->posted = sub->received = sub->transmitted = sub->done = 0;
|
||||
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
|
||||
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
|
||||
ncclProfilerStartRecvProxyOpEvent(s, args);
|
||||
if (sub->reg && sub->nbytes > 0) {
|
||||
// Register buffer
|
||||
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
|
||||
@@ -1254,6 +1258,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
if (sub->posted < sub->nsteps) {
|
||||
if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
|
||||
ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
if (sub->reg) maxDepth = 1;
|
||||
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
|
||||
@@ -1294,7 +1299,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup+i;
|
||||
sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
|
||||
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
|
||||
}
|
||||
args->idle = 0;
|
||||
}
|
||||
@@ -1337,7 +1343,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
}
|
||||
sub->received += args->sliceSteps;
|
||||
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
|
||||
sub->transSize += sizes[i];
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
|
||||
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
|
||||
if (step < sub->nsteps) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
if (resources->useGdr) needFlush |= resources->needFlush;
|
||||
@@ -1393,7 +1401,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
sub->transmitted += args->sliceSteps;
|
||||
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
|
||||
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
|
||||
if (step < sub->nsteps) {
|
||||
__sync_synchronize();
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
@@ -1431,7 +1440,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
|
||||
}
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
|
||||
ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
|
||||
ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
|
||||
args->idle = 0;
|
||||
if (sub->done == sub->nsteps) {
|
||||
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
|
||||
@@ -1447,6 +1457,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
|
||||
}
|
||||
if (args->done == args->nsubs) {
|
||||
args->state = ncclProxyOpNone;
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
ncclProfilerStopProxyOpEvent(s, args);
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
@@ -49,6 +49,11 @@ struct alignas(64) ncclIbMergedDev {
|
||||
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
|
||||
int speed;
|
||||
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
|
||||
int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no
|
||||
};
|
||||
|
||||
// Error statistics attached to an IB device (ncclIbDev::stats) or to a
// communicator (ncclIbNetCommBase::stats).
struct ncclIbStats {
|
||||
// Number of fatal errors recorded so far; read and written with __atomic
// builtins by the ncclIbStats* helpers.
int fatalErrorCount;
|
||||
};
|
||||
|
||||
static int ncclNIbDevs = -1;
|
||||
@@ -69,6 +74,7 @@ struct alignas(64) ncclIbDev {
|
||||
struct ncclIbMrCache mrCache;
|
||||
int ar; // ADAPTIVE_ROUTING
|
||||
struct ibv_port_attr portAttr;
|
||||
struct ncclIbStats stats;
|
||||
};
|
||||
|
||||
#define MAX_IB_DEVS 32
|
||||
@@ -80,7 +86,7 @@ static int ncclIbRelaxedOrderingEnabled = 0;
|
||||
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
|
||||
NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1);
|
||||
NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
|
||||
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20);
|
||||
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
|
||||
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
|
||||
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
|
||||
@@ -90,6 +96,32 @@ NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
|
||||
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
|
||||
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
|
||||
NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0);
|
||||
NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
|
||||
NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
|
||||
|
||||
// Reset the fatal-error counter of `stat` to zero (relaxed atomic store).
// Always returns ncclSuccess.
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
|
||||
__atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED);
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Record one fatal error in `stat` by atomically incrementing its counter
// (relaxed ordering).
static void ncclIbStatsFatalError(struct ncclIbStats* stat){
|
||||
__atomic_fetch_add(&stat->fatalErrorCount, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
// Report whether a fatal IB error has been recorded in `stat`.
// Returns ncclSystemError (after a WARN naming `funcName`) when the
// IbAsyncEvents parameter is non-zero and fatalErrorCount is non-zero;
// returns ncclSuccess otherwise. `funcName` is only used in the warning text.
static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) {
|
||||
// Relaxed load: only the zero/non-zero state of the counter is tested here.
if (ncclParamIbAsyncEvents() && __atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED)) {
|
||||
WARN("communicator encountered a fatal error (detected in %s)\n", funcName);
|
||||
return ncclSystemError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
// Record a fatal error against the stats object stored in the QP's context.
// Assumes qp->qp_context points to a struct ncclIbStats, as set through the
// qp_context argument of ncclIbCreateQp.
static void ncclIbQpFatalError(struct ibv_qp* qp) {
|
||||
ncclIbStatsFatalError((struct ncclIbStats*)qp->qp_context);
|
||||
}
|
||||
// Record a fatal error against the stats object stored in the CQ's context.
// Assumes cq->cq_context points to a struct ncclIbStats, as set through the
// cq_context argument of ncclIbInitCommDevBase.
static void ncclIbCqFatalError(struct ibv_cq* cq) {
|
||||
ncclIbStatsFatalError((struct ncclIbStats*)cq->cq_context);
|
||||
}
|
||||
// Record a fatal error against the device-wide stats embedded in `dev`.
static void ncclIbDevFatalError(struct ncclIbDev* dev) {
|
||||
ncclIbStatsFatalError(&dev->stats);
|
||||
}
|
||||
|
||||
pthread_t ncclIbAsyncThread;
|
||||
static void* ncclIbAsyncThreadMain(void* args) {
|
||||
@@ -98,9 +130,53 @@ static void* ncclIbAsyncThreadMain(void* args) {
|
||||
struct ibv_async_event event;
|
||||
if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; }
|
||||
char *str;
|
||||
struct ibv_cq* cq = event.element.cq; // only valid if CQ error
|
||||
struct ibv_qp* qp = event.element.qp; // only valid if QP error
|
||||
struct ibv_srq* srq = event.element.srq; // only valid if SRQ error
|
||||
if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
|
||||
if (event.event_type != IBV_EVENT_COMM_EST)
|
||||
WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str);
|
||||
switch (event.event_type) {
|
||||
case IBV_EVENT_DEVICE_FATAL:
|
||||
// the above is device fatal error
|
||||
WARN("NET/IB : %s:%d async fatal event: %s", dev->devName, dev->portNum, str);
|
||||
ncclIbDevFatalError(dev);
|
||||
break;
|
||||
case IBV_EVENT_CQ_ERR:
|
||||
// the above is a CQ fatal error
|
||||
WARN("NET/IB : %s:%d async fatal event on CQ (%p): %s", dev->devName, dev->portNum, cq, str);
|
||||
ncclIbCqFatalError(cq);
|
||||
break;
|
||||
case IBV_EVENT_QP_FATAL:
|
||||
case IBV_EVENT_QP_REQ_ERR:
|
||||
case IBV_EVENT_QP_ACCESS_ERR:
|
||||
// the above are QP fatal errors
|
||||
WARN("NET/IB : %s:%d async fatal event on QP (%p): %s", dev->devName, dev->portNum, qp, str);
|
||||
ncclIbQpFatalError(qp);
|
||||
break;
|
||||
case IBV_EVENT_SRQ_ERR:
|
||||
// SRQ are not used in NCCL
|
||||
WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str);
|
||||
break;
|
||||
case IBV_EVENT_PATH_MIG_ERR:
|
||||
case IBV_EVENT_PORT_ERR:
|
||||
case IBV_EVENT_PATH_MIG:
|
||||
case IBV_EVENT_PORT_ACTIVE:
|
||||
case IBV_EVENT_SQ_DRAINED:
|
||||
case IBV_EVENT_LID_CHANGE:
|
||||
case IBV_EVENT_PKEY_CHANGE:
|
||||
case IBV_EVENT_SM_CHANGE:
|
||||
case IBV_EVENT_QP_LAST_WQE_REACHED:
|
||||
case IBV_EVENT_CLIENT_REREGISTER:
|
||||
case IBV_EVENT_SRQ_LIMIT_REACHED:
|
||||
// the above are non-fatal
|
||||
WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str);
|
||||
break;
|
||||
case IBV_EVENT_COMM_EST:
|
||||
break;
|
||||
default:
|
||||
WARN("NET/IB : %s:%d unknown event type (%d)", dev->devName, dev->portNum, event.event_type);
|
||||
break;
|
||||
}
|
||||
// acknowledgment needs to happen last to avoid use-after-free
|
||||
if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
|
||||
}
|
||||
return NULL;
|
||||
@@ -140,11 +216,11 @@ static void* envIbAddrRange(sa_family_t af, int* mask) {
|
||||
char addrString[128] = { 0 };
|
||||
snprintf(addrString, 128, "%s", env);
|
||||
char *addrStrPtr = addrString;
|
||||
char *maskStrPtr = strstr(addrString, "/") + 1;
|
||||
char *maskStrPtr = strstr(addrString, "/");
|
||||
if (NULL == maskStrPtr) {
|
||||
return NULL;
|
||||
}
|
||||
*(maskStrPtr - 1) = '\0';
|
||||
*(maskStrPtr++) = '\0';
|
||||
|
||||
if (inet_pton(af, addrStrPtr, ret) == 0) {
|
||||
WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
|
||||
@@ -242,12 +318,14 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum,
|
||||
|
||||
int fd = open(roceTypePath, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
WARN("NET/IB: open failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
int ret = read(fd, gidRoceVerStr, 15);
|
||||
close(fd);
|
||||
|
||||
if (ret == -1) {
|
||||
WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
@@ -420,7 +498,7 @@ int ncclIbFindMatchingDev(int dev) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
ncclResult_t ret;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (ncclParamIbDisable()) return ncclInternalError;
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
@@ -496,11 +574,12 @@ build_ib_list:
|
||||
ncclIbDevs[ncclNIbDevs].pdRefs = 0;
|
||||
ncclIbDevs[ncclNIbDevs].pd = NULL;
|
||||
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
|
||||
NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
|
||||
NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
|
||||
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
|
||||
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
|
||||
NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
|
||||
|
||||
// Enable ADAPTIVE_ROUTING by default on IB networks
|
||||
// But allow it to be overloaded by an env parameter
|
||||
@@ -510,9 +589,9 @@ build_ib_list:
|
||||
TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
|
||||
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
|
||||
|
||||
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs);
|
||||
PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
|
||||
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
|
||||
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
|
||||
PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
|
||||
|
||||
int mergedDev = ncclNMergedIbDevs;
|
||||
if (mergeNics) {
|
||||
@@ -592,10 +671,11 @@ build_ib_list:
|
||||
}
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
}
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
return ret;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbDevices(int* ndev) {
|
||||
@@ -607,46 +687,63 @@ ncclResult_t ncclIbDevices(int* ndev) {
|
||||
// Returns :
|
||||
// ncclSuccess : GDR works
|
||||
// ncclSystemError : no module or module loaded but not supported by GPU
|
||||
#define KNL_MODULE_LOADED(a) ((access(a, F_OK) == -1) ? 0 : 1)
|
||||
static int ncclIbGdrModuleLoaded = 0; // 1 = true, 0 = false
|
||||
// One-time probe, run through pthread_once from ncclIbGdrSupport().
// Sets ncclIbGdrModuleLoaded to 1 when at least one of the three sysfs paths
// below exists (access(..., F_OK) succeeds), and to 0 otherwise.
static void ibGdrSupportInitOnce() {
|
||||
// Check for the nv_peer_mem module being loaded
|
||||
ncclIbGdrModuleLoaded = KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem/version") ||
|
||||
KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem_nc/version") ||
|
||||
KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version");
|
||||
}
|
||||
ncclResult_t ncclIbGdrSupport() {
|
||||
static int moduleLoaded = -1;
|
||||
if (moduleLoaded == -1) {
|
||||
// Check for the nv_peer_mem module being loaded
|
||||
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
|
||||
// Also support the new nv_mem_nc module
|
||||
(access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 0 : 1;
|
||||
}
|
||||
if (moduleLoaded == 0) return ncclSystemError;
|
||||
static pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&once, ibGdrSupportInitOnce);
|
||||
if (!ncclIbGdrModuleLoaded)
|
||||
return ncclSystemError;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
|
||||
// One-time DMA-BUF probe for a merged device, run through pthread_once from
// ncclIbDmaBufSupport(). pthread_once callbacks take no argument, so the
// merged-device index is passed in the thread-local ibDmaSupportInitDev.
// Result is stored in mergedDev->dmaBufSupported: 1 when every underlying
// device passes the probe, -1 when any device fails it or a PD call fails.
static void ibDmaBufSupportInitOnce(){
|
||||
// Only receives the NCCLCHECKGOTO status; it is not otherwise read in this function.
ncclResult_t res;
|
||||
// Select the merged device chosen by the caller through ibDmaSupportInitDev
|
||||
struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
|
||||
// Test each real device behind the merged device
|
||||
int dev_fail = 0;
|
||||
for (int i = 0; i < mergedDev->ndevs; i++) {
|
||||
int ibDev = mergedDev->devs[i];
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_context* ctx = ncclIbDevs[ibDev].context;
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
|
||||
dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
|
||||
// The temporary PD is released before acting on the probe result.
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
// Stop the search at the first unsupported device and go to failure
|
||||
if (dev_fail) goto failure;
|
||||
}
|
||||
mergedDev->dmaBufSupported = 1;
|
||||
return;
|
||||
failure:
|
||||
mergedDev->dmaBufSupported = -1;
|
||||
return;
|
||||
}
|
||||
// Detect whether DMA-BUF support is present in the kernel
|
||||
// Returns :
|
||||
// ncclSuccess : DMA-BUF support is available
|
||||
// ncclSystemError : DMA-BUF is not supported by the kernel
|
||||
ncclResult_t ncclIbDmaBufSupport(int dev) {
|
||||
static int dmaBufSupported = -1;
|
||||
if (dmaBufSupported == -1) {
|
||||
ncclResult_t res;
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_context* ctx;
|
||||
struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
|
||||
struct oncewrap {
|
||||
pthread_once_t once = PTHREAD_ONCE_INIT;
|
||||
};
|
||||
static oncewrap onces[MAX_IB_DEVS];
|
||||
// init the device only once
|
||||
ibDmaSupportInitDev = dev;
|
||||
pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
|
||||
|
||||
// Test each dev
|
||||
for (int i = 0; i < mergedDev->ndevs; i++) {
|
||||
int ibDev = mergedDev->devs[i];
|
||||
ctx = ncclIbDevs[ibDev].context;
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
|
||||
dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
|
||||
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
}
|
||||
}
|
||||
if (dmaBufSupported == 0) return ncclSystemError;
|
||||
return ncclSuccess;
|
||||
failure:
|
||||
dmaBufSupported = 0;
|
||||
int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported;
|
||||
if (dmaBufSupported == 1) return ncclSuccess;
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
@@ -842,16 +939,19 @@ struct alignas(32) ncclIbNetCommBase {
|
||||
// Track necessary remDevInfo here
|
||||
int nRemDevs;
|
||||
struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
|
||||
// statistics about the comm
|
||||
struct ncclIbStats stats;
|
||||
};
|
||||
|
||||
struct ncclIbSendComm {
|
||||
struct ncclIbNetCommBase base;
|
||||
// Start with fifo and ibv structs as they have alignment restrictions
|
||||
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
|
||||
// Each dev correlates to a mergedIbDev
|
||||
struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
|
||||
struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
|
||||
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
|
||||
struct ncclIbRemSizesFifo remSizesFifo;
|
||||
uint64_t fifoHead;
|
||||
int ar; // Use adaptive routing when all merged devices have it enabled
|
||||
@@ -903,8 +1003,7 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI
|
||||
req->events[devIndex]++;
|
||||
req->devBases[devIndex] = base;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) {
|
||||
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) {
|
||||
base->ibDevN = ibDevN;
|
||||
ncclIbDev* ibDev = ncclIbDevs + ibDevN;
|
||||
pthread_mutex_lock(&ibDev->lock);
|
||||
@@ -921,7 +1020,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base
|
||||
pthread_mutex_unlock(&ibDev->lock);
|
||||
|
||||
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
|
||||
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
|
||||
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -940,9 +1039,10 @@ returning:
|
||||
return res;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) {
|
||||
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
|
||||
struct ibv_qp_init_attr qpInitAttr;
|
||||
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
|
||||
qpInitAttr.qp_context = qp_context;
|
||||
qpInitAttr.send_cq = base->cq;
|
||||
qpInitAttr.recv_cq = base->cq;
|
||||
qpInitAttr.qp_type = IBV_QPT_RC;
|
||||
@@ -1026,6 +1126,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIbListenComm* comm;
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
|
||||
@@ -1033,14 +1134,20 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
memset(handle, 0, sizeof(struct ncclIbHandle));
|
||||
comm->dev = dev;
|
||||
handle->magic = NCCL_SOCKET_MAGIC;
|
||||
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
|
||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
|
||||
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
(void)ncclSocketClose(&comm->sock);
|
||||
free(comm);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
|
||||
struct ncclIbCommStage* stage = &handle->stage;
|
||||
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
|
||||
@@ -1055,16 +1162,18 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
|
||||
WARN("Error: trying to connect already connected sendComm");
|
||||
return ncclInternalError;
|
||||
}
|
||||
stage->buffer = NULL;
|
||||
|
||||
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
|
||||
NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
|
||||
NCCLCHECKGOTO(ncclIbStatsInit(&comm->base.stats), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
|
||||
stage->comm = comm;
|
||||
stage->state = ncclIbCommStateConnect;
|
||||
NCCLCHECK(ncclSocketConnect(&comm->base.sock));
|
||||
NCCLCHECKGOTO(ncclSocketConnect(&comm->base.sock), ret, fail);
|
||||
|
||||
ib_connect_check:
|
||||
/* since ncclSocketConnect is async, we must check if connection is complete */
|
||||
NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready));
|
||||
NCCLCHECKGOTO(ncclSocketReady(&comm->base.sock, &ready), ret, fail);
|
||||
if (!ready) return ncclSuccess;
|
||||
|
||||
// IB Setup
|
||||
@@ -1078,7 +1187,7 @@ ib_connect_check:
|
||||
comm->ar = 1; // Set to 1 for logic
|
||||
for (int i = 0; i < mergedDev->ndevs; i++) {
|
||||
int ibDevN = mergedDev->devs[i];
|
||||
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base));
|
||||
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail);
|
||||
comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
|
||||
}
|
||||
|
||||
@@ -1091,13 +1200,17 @@ ib_connect_check:
|
||||
for (int q = 0; q < comm->base.nqps; q++) {
|
||||
ncclIbSendCommDev* commDev = comm->devs + devIndex;
|
||||
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
|
||||
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q));
|
||||
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
|
||||
comm->base.qps[q].devIndex = devIndex;
|
||||
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
|
||||
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
|
||||
|
||||
// Query ece capabilities (enhanced connection establishment)
|
||||
NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
|
||||
if (ncclParamIbEceEnable()) {
|
||||
// Query ece capabilities (enhanced connection establishment)
|
||||
NCCLCHECKGOTO(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
|
||||
} else {
|
||||
meta.qpInfo[q].ece_supported = 0;
|
||||
}
|
||||
devIndex = (devIndex + 1) % comm->base.ndevs;
|
||||
}
|
||||
|
||||
@@ -1112,13 +1225,13 @@ ib_connect_check:
|
||||
devInfo->lid = ibDev->portAttr.lid;
|
||||
|
||||
// Prepare my fifo
|
||||
NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
devInfo->fifoRkey = commDev->fifoMr->rkey;
|
||||
|
||||
// Pack local GID info
|
||||
devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
|
||||
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex));
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
|
||||
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex), ret, fail);
|
||||
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid), ret, fail);
|
||||
devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix;
|
||||
devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id;
|
||||
|
||||
@@ -1148,12 +1261,12 @@ ib_connect_check:
|
||||
|
||||
stage->state = ncclIbCommStateSend;
|
||||
stage->offset = 0;
|
||||
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)));
|
||||
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
|
||||
|
||||
memcpy(stage->buffer, &meta, sizeof(meta));
|
||||
|
||||
ib_send:
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset), ret, fail);
|
||||
if (stage->offset != sizeof(meta)) return ncclSuccess;
|
||||
|
||||
stage->state = ncclIbCommStateConnecting;
|
||||
@@ -1163,7 +1276,7 @@ ib_send:
|
||||
|
||||
ib_connect:
|
||||
struct ncclIbConnectionMetadata remMeta;
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset), ret, fail);
|
||||
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
|
||||
|
||||
memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
|
||||
@@ -1197,7 +1310,7 @@ ib_connect:
|
||||
}
|
||||
|
||||
for (int i=0; i < comm->base.ndevs; i++) {
|
||||
NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
}
|
||||
comm->base.nRemDevs = remMeta.ndevs;
|
||||
|
||||
@@ -1212,10 +1325,10 @@ ib_connect:
|
||||
|
||||
struct ibv_qp* qp = comm->base.qps[q].qp;
|
||||
if (remQpInfo->ece_supported)
|
||||
NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
|
||||
NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
|
||||
|
||||
NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false));
|
||||
NCCLCHECK(ncclIbRtsQp(qp));
|
||||
NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
|
||||
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
|
||||
}
|
||||
|
||||
if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
|
||||
@@ -1233,19 +1346,23 @@ ib_connect:
|
||||
stage->offset = 0;
|
||||
|
||||
ib_send_ready:
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset), ret, fail);
|
||||
if (stage->offset != sizeof(int)) return ncclSuccess;
|
||||
|
||||
free(stage->buffer);
|
||||
stage->state = ncclIbCommStateStart;
|
||||
|
||||
*sendComm = comm;
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
if (stage->buffer) free(stage->buffer);
|
||||
stage->state = ncclIbCommStateStart;
|
||||
return ret;
|
||||
fail:
|
||||
free(comm);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
|
||||
|
||||
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
|
||||
struct ncclIbCommStage* stage = &lComm->stage;
|
||||
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
|
||||
@@ -1262,22 +1379,23 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
|
||||
NCCLCHECKGOTO(ncclIbStatsInit(&rComm->base.stats), ret, fail);
|
||||
stage->comm = rComm;
|
||||
stage->state = ncclIbCommStateAccept;
|
||||
NCCLCHECK(ncclSocketInit(&rComm->base.sock));
|
||||
NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock));
|
||||
NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail);
|
||||
|
||||
ib_accept_check:
|
||||
NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready));
|
||||
NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail);
|
||||
if (!ready) return ncclSuccess;
|
||||
|
||||
struct ncclIbConnectionMetadata remMeta;
|
||||
stage->state = ncclIbCommStateRecv;
|
||||
stage->offset = 0;
|
||||
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)));
|
||||
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail);
|
||||
|
||||
ib_recv:
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail);
|
||||
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
|
||||
|
||||
/* copy back the received info */
|
||||
@@ -1308,10 +1426,10 @@ ib_recv:
|
||||
for (int i = 0; i < rComm->base.ndevs; i++) {
|
||||
rCommDev = rComm->devs + i;
|
||||
ibDevN = mergedDev->devs[i];
|
||||
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
|
||||
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail);
|
||||
ibDev = ncclIbDevs + ibDevN;
|
||||
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex));
|
||||
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
|
||||
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
|
||||
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
|
||||
}
|
||||
|
||||
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
|
||||
@@ -1336,23 +1454,26 @@ ib_recv:
|
||||
// Local ibDevN
|
||||
ibDevN = rComm->devs[devIndex].base.ibDevN;
|
||||
ibDev = ncclIbDevs + ibDevN;
|
||||
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp));
|
||||
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
|
||||
qp->devIndex = devIndex;
|
||||
devIndex = (devIndex + 1) % rComm->base.ndevs;
|
||||
|
||||
// Set the ece (enhanced connection establishment) on this QP before RTR
|
||||
if (remMeta.qpInfo[q].ece_supported) {
|
||||
NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
|
||||
// Coverity suspects a copy-paste error below due to the use of remMeta in one argument and meta in another.
|
||||
// However, this has been confirmed to be intentional.
|
||||
// coverity[copy_paste_error]
|
||||
NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
|
||||
|
||||
// Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
|
||||
// Store this in our own qpInfo for returning to the requestor
|
||||
if (meta.qpInfo[q].ece_supported)
|
||||
NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
|
||||
NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
|
||||
}
|
||||
|
||||
bool override_tc = (q == 0) ? true : false;
|
||||
NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc));
|
||||
NCCLCHECK(ncclIbRtsQp(qp->qp));
|
||||
NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail);
|
||||
NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail);
|
||||
}
|
||||
|
||||
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
|
||||
@@ -1366,17 +1487,17 @@ ib_recv:
|
||||
// Retain remote fifo info and prepare my RDMA ops
|
||||
rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
|
||||
rComm->remFifo.addr = remMeta.fifoAddr;
|
||||
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
|
||||
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
|
||||
|
||||
// Allocate Flush dummy buffer for GPU Direct RDMA
|
||||
if (rComm->flushEnabled) {
|
||||
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail);
|
||||
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
|
||||
rCommDev->gpuFlush.sge.length = 1;
|
||||
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
|
||||
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp));
|
||||
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
|
||||
struct ncclIbDevInfo devInfo;
|
||||
devInfo.lid = ibDev->portAttr.lid;
|
||||
devInfo.link_layer = ibDev->portAttr.link_layer;
|
||||
@@ -1384,8 +1505,8 @@ ib_recv:
|
||||
devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
|
||||
devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id;
|
||||
devInfo.mtu = ibDev->portAttr.active_mtu;
|
||||
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false));
|
||||
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
|
||||
NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail);
|
||||
NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail);
|
||||
}
|
||||
|
||||
// Fill Handle
|
||||
@@ -1400,7 +1521,7 @@ ib_recv:
|
||||
meta.devs[i].mtu = remMeta.devs[i].mtu;
|
||||
|
||||
// Prepare sizes fifo
|
||||
NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
|
||||
meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey;
|
||||
}
|
||||
meta.fifoAddr = (uint64_t)rComm->sizesFifo;
|
||||
@@ -1415,30 +1536,36 @@ ib_recv:
|
||||
|
||||
stage->state = ncclIbCommStateSend;
|
||||
stage->offset = 0;
|
||||
if (stage->buffer) free(stage->buffer);
|
||||
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)));
|
||||
if (stage->buffer) {
|
||||
free(stage->buffer);
|
||||
stage->buffer = NULL;
|
||||
}
|
||||
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)), ret, fail);
|
||||
memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata));
|
||||
|
||||
ib_send:
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset), ret, fail);
|
||||
if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess;
|
||||
|
||||
stage->offset = 0;
|
||||
stage->state = ncclIbCommStatePendingReady;
|
||||
|
||||
ib_recv_ready:
|
||||
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset));
|
||||
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset), ret, fail);
|
||||
if (stage->offset != sizeof(int)) return ncclSuccess;
|
||||
|
||||
free(stage->buffer);
|
||||
*recvComm = rComm;
|
||||
|
||||
exit:
|
||||
/* reset lComm stage */
|
||||
if (stage->buffer) free(stage->buffer);
|
||||
stage->state = ncclIbCommStateStart;
|
||||
stage->offset = 0;
|
||||
stage->comm = NULL;
|
||||
stage->buffer = NULL;
|
||||
return ncclSuccess;
|
||||
return ret;
|
||||
fail:
|
||||
free(rComm);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) {
|
||||
@@ -1531,16 +1658,21 @@ struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, in
|
||||
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
assert(size > 0);
|
||||
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
|
||||
for (int i = 0; i < base->ndevs; i++) {
|
||||
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
|
||||
struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
|
||||
NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i));
|
||||
NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail);
|
||||
}
|
||||
*mhandle = (void*) mhandleWrapper;
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
free(mhandleWrapper);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
|
||||
@@ -1694,6 +1826,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
|
||||
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
|
||||
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
|
||||
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
|
||||
|
||||
@@ -1858,6 +1991,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
|
||||
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
|
||||
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
|
||||
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
|
||||
|
||||
struct ncclIbRequest* req;
|
||||
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
|
||||
@@ -1937,10 +2071,13 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)
|
||||
|
||||
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
|
||||
*done = 0;
|
||||
while (1) {
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__));
|
||||
if (r->events[0] == 0 && r->events[1] == 0) {
|
||||
TRACE(NCCL_NET, "r=%p done", r);
|
||||
*done = 1;
|
||||
@@ -1996,7 +2133,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d",
|
||||
ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
|
||||
#endif
|
||||
if (req->type == NCCL_NET_IB_REQ_SEND) {
|
||||
if (req && req->type == NCCL_NET_IB_REQ_SEND) {
|
||||
for (int j = 0; j < req->nreqs; j++) {
|
||||
struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
|
||||
if ((sendReq->events[i] <= 0)) {
|
||||
@@ -2018,6 +2155,9 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
req->events[i]--;
|
||||
}
|
||||
}
|
||||
// Once the IB fatal event is reported in the async thread, we want to propagate this error
|
||||
// to communicator and prevent further polling to reduce error pollution.
|
||||
NCCLCHECK(ncclIbStatsCheckFatalCount(&ncclIbDevs[r->devBases[i]->ibDevN].stats,__func__));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -73,22 +73,27 @@ ncclResult_t ncclNetSocketDevices(int* ndev) {
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
*speed = 0;
|
||||
char speedPath[PATH_MAX];
|
||||
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
|
||||
int fd = open(speedPath, O_RDONLY);
|
||||
int fd = -1;
|
||||
SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
|
||||
if (fd != -1) {
|
||||
char speedStr[] = " ";
|
||||
if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
|
||||
int n;
|
||||
// Allow this to silently fail
|
||||
n = read(fd, speedStr, sizeof(speedStr)-1);
|
||||
if (n > 0) {
|
||||
*speed = strtol(speedStr, NULL, 0);
|
||||
}
|
||||
close(fd);
|
||||
}
|
||||
if (*speed <= 0) {
|
||||
INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
|
||||
*speed = 10000;
|
||||
}
|
||||
return ncclSuccess;
|
||||
if (fd != -1) SYSCHECK(close(fd), "close");
|
||||
return ret;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
@@ -235,19 +240,24 @@ void* persistentSocketThread(void *args_) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int nSocksPerThread = ncclParamSocketNsocksPerThread();
|
||||
int nThreads = ncclParamSocketNthreads();
|
||||
if (nThreads > MAX_THREADS) {
|
||||
WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
|
||||
nThreads = MAX_THREADS;
|
||||
}
|
||||
int fd = -1;
|
||||
int nSocks;
|
||||
if (nThreads == -2 || nSocksPerThread == -2) {
|
||||
// Auto-detection
|
||||
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
|
||||
char vendorPath[PATH_MAX];
|
||||
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
|
||||
// Coverity is wrong. NULL second argument to realpath() is OK by POSIX.1-2008.
|
||||
// coverity[alias_transfer:FALSE]
|
||||
char* rPath = realpath(vendorPath, NULL);
|
||||
int fd = open(rPath, O_RDONLY);
|
||||
fd = open(rPath, O_RDONLY);
|
||||
free(rPath);
|
||||
if (fd == -1) {
|
||||
// Could not find device vendor. This is handled silently so
|
||||
@@ -257,9 +267,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||
}
|
||||
char vendor[7];
|
||||
strncpy(vendor, "0x0000", 7);
|
||||
int len;
|
||||
SYSCHECKVAL(read(fd, vendor, 6), "read", len);
|
||||
SYSCHECK(close(fd), "close");
|
||||
SYSCHECKGOTO(read(fd, vendor, 6), "read", ret, fail);
|
||||
if (strcmp(vendor, "0x1d0f") == 0) { // AWS
|
||||
autoNt = 2;
|
||||
autoNs = 8;
|
||||
@@ -271,7 +279,7 @@ end:
|
||||
if (nThreads == -2) nThreads = autoNt;
|
||||
if (nSocksPerThread == -2) nSocksPerThread = autoNs;
|
||||
}
|
||||
int nSocks = nSocksPerThread * nThreads;
|
||||
nSocks = nSocksPerThread * nThreads;
|
||||
if (nSocks > MAX_SOCKETS) {
|
||||
nSocksPerThread = MAX_SOCKETS/nThreads;
|
||||
WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
|
||||
@@ -280,28 +288,38 @@ end:
|
||||
*ns = nSocks;
|
||||
*nt = nThreads;
|
||||
if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
if (fd != -1) close(fd);
|
||||
return ret;
|
||||
fail:
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
|
||||
return ncclInternalError;
|
||||
}
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
|
||||
memset(handle, 0, sizeof(struct ncclNetSocketHandle));
|
||||
static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
|
||||
struct ncclNetSocketListenComm* comm;
|
||||
NCCLCHECK(ncclCalloc(&comm, 1));
|
||||
handle->magic = NCCL_SOCKET_MAGIC;
|
||||
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
|
||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
|
||||
NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
||||
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
|
||||
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
|
||||
NCCLCHECKGOTO(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads), ret, fail);
|
||||
handle->nSocks = comm->nSocks;
|
||||
handle->nThreads = comm->nThreads;
|
||||
comm->dev = dev;
|
||||
*listenComm = comm;
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
(void)ncclSocketClose(&comm->sock);
|
||||
free(comm);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
|
||||
@@ -437,7 +455,7 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
|
||||
res->comm = comm;
|
||||
pthread_mutex_init(&res->threadLock, NULL);
|
||||
pthread_cond_init(&res->threadCond, NULL);
|
||||
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
|
||||
PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
|
||||
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
|
||||
}
|
||||
struct ncclNetSocketTask* r = queue->tasks+queue->next;
|
||||
@@ -482,7 +500,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
|
||||
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
union ncclSocketAddress addr;
|
||||
ncclSocketGetAddr(r->ctrlSock, &addr);
|
||||
NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
|
||||
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
|
||||
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
|
||||
ncclSocketToString(&addr, line), data, r->size);
|
||||
@@ -579,7 +597,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
|
||||
res->stop = 1;
|
||||
pthread_cond_signal(&res->threadCond);
|
||||
pthread_mutex_unlock(&res->threadLock);
|
||||
pthread_join(comm->helperThread[i], NULL);
|
||||
PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join");
|
||||
}
|
||||
free(res->threadTaskQueue.tasks);
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ struct localRegData {
|
||||
intptr_t offset;
|
||||
};
|
||||
|
||||
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
// This transport cannot be used for p2p
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
@@ -71,28 +71,31 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
|
||||
|
||||
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
|
||||
int fd = -1;
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
|
||||
|
||||
// Import and map the remote memory descriptor to the local GPU
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
// cuMem UDS support
|
||||
int fd = -1;
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
|
||||
int tpProxyRank = comm->topParentRanks[rank];
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
|
||||
NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, rank, shareableHandle, &fd), ret, fail);
|
||||
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type), ret, fail);
|
||||
SYSCHECK(close(fd), "close");
|
||||
} else {
|
||||
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
|
||||
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
|
||||
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type), ret, fail);
|
||||
} else {
|
||||
memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (fd != -1) close(fd);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) {
|
||||
@@ -100,7 +103,7 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
|
||||
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev);
|
||||
|
||||
// Unbind physical memory from group for the given device
|
||||
CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
|
||||
if (size) CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -117,14 +120,18 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
|
||||
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
|
||||
|
||||
// Release the UC memory and mapping
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
|
||||
CUCHECK(cuMemRelease(*ucHandle));
|
||||
if (ucptr) {
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
|
||||
CUCHECK(cuMemRelease(*ucHandle));
|
||||
}
|
||||
|
||||
// Release the MC memory and mapping
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
|
||||
CUCHECK(cuMemRelease(*mcHandle));
|
||||
if (mcptr) {
|
||||
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
|
||||
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
|
||||
CUCHECK(cuMemRelease(*mcHandle));
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -191,7 +198,9 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
|
||||
size_t size = *sizePtr;
|
||||
size_t originSize = size;
|
||||
size_t ucgran, mcgran;
|
||||
int allocMcHandle = 0;
|
||||
|
||||
*ucptr = *mcptr = NULL;
|
||||
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
|
||||
mcprop.numDevices = comm->localRanks;
|
||||
mcprop.handleTypes = ncclCuMemHandleType;
|
||||
@@ -203,10 +212,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
|
||||
|
||||
if (comm->localRank == 0) {
|
||||
NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
|
||||
allocMcHandle = 1;
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
|
||||
} else {
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
|
||||
NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail);
|
||||
allocMcHandle = 1;
|
||||
}
|
||||
|
||||
CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail);
|
||||
@@ -226,6 +237,8 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
|
||||
CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
|
||||
CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
|
||||
|
||||
// intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
|
||||
// Bind physical memory to the Multicast group
|
||||
// NB: It will block until all ranks have been added to the Group
|
||||
CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
|
||||
@@ -239,6 +252,7 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle));
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -350,10 +364,10 @@ setup:
|
||||
struct ncclNvlsSharedRes* resources = NULL;
|
||||
int nHeads = comm->channels[0].nvls.nHeads;
|
||||
int nChannels = comm->nChannels;
|
||||
size_t memSize = 16;
|
||||
size_t memSize = 64;
|
||||
size_t creditSize = nChannels * 2 * memSize * nHeads;
|
||||
int nvlsStepSize = comm->nvlsChunkSize;
|
||||
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
|
||||
comm->nvlsResources->inited = false;
|
||||
comm->nvlsResources->refCount = 1;
|
||||
@@ -466,7 +480,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
|
||||
if (!comm->MNNVL && resources->nvlsShmemHandle)
|
||||
NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));
|
||||
|
||||
if (resources->ucCredit && resources->mcCredit) {
|
||||
if (resources->ucCredit || resources->mcCredit) {
|
||||
NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
|
||||
NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
|
||||
}
|
||||
@@ -490,7 +504,6 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
char shareableHandle[NVLS_HANDLE_SIZE];
|
||||
CUmemGenericAllocationHandle mcHandle;
|
||||
size_t minSize = SIZE_MAX;
|
||||
bool localRegBufUsed = false;
|
||||
struct localRegData* regData = NULL;
|
||||
cudaPointerAttributes attr;
|
||||
size_t ucgran, mcgran;
|
||||
@@ -500,7 +513,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
if (userBuff) {
|
||||
NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
|
||||
if (regRecord) {
|
||||
CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
|
||||
CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail);
|
||||
if (attr.type == cudaMemoryTypeDevice) {
|
||||
size_t regSize = regRecord->pages * comm->regCache.pageSize;
|
||||
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
|
||||
@@ -508,7 +521,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
mcprop.handleTypes = ncclCuMemHandleType;
|
||||
mcprop.flags = 0;
|
||||
mcprop.size = regSize;
|
||||
CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
|
||||
CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
memset(&ucprop, 0, sizeof(CUmemAllocationProp));
|
||||
ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
@@ -517,7 +530,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
ucprop.requestedHandleTypes = ncclCuMemHandleType;
|
||||
CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
|
||||
|
||||
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
|
||||
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);
|
||||
if (regSize % mcgran == 0) {
|
||||
regRecord->regSize = regSize;
|
||||
} else {
|
||||
@@ -560,6 +573,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
}
|
||||
|
||||
CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
|
||||
// Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
|
||||
// (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
|
||||
// coverity[var_deref_op]
|
||||
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
|
||||
|
||||
// Create a VA for the NVLS
|
||||
@@ -584,15 +600,13 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
|
||||
}
|
||||
}
|
||||
|
||||
localRegBufUsed = true;
|
||||
|
||||
*regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
|
||||
*regUsed = true;
|
||||
exit:
|
||||
if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
|
||||
*regUsed = localRegBufUsed;
|
||||
free(regData);
|
||||
return ret;
|
||||
fail:
|
||||
localRegBufUsed = false;
|
||||
*regUsed = false;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -862,19 +876,21 @@ exit:
|
||||
}
|
||||
|
||||
if (recvRecord) {
|
||||
// Yes, this is dead code. That's fine...
|
||||
// coverity[dead_error_begin]
|
||||
ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
|
||||
free(recvRecord);
|
||||
}
|
||||
} else {
|
||||
if (sendRecord) {
|
||||
*outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
|
||||
ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
|
||||
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
|
||||
*nCleanupQueueEltsAdded += 1;
|
||||
}
|
||||
|
||||
if (recvRecord) {
|
||||
*outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
|
||||
ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
|
||||
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
|
||||
*nCleanupQueueEltsAdded += 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -7,9 +7,11 @@
|
||||
#include "comm.h"
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "shm.h"
|
||||
#include "shmutils.h"
|
||||
#include "p2p.h"
|
||||
#include "transport.h"
|
||||
#include <assert.h>
|
||||
#include "shm.h"
|
||||
|
||||
enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
|
||||
|
||||
@@ -19,16 +21,28 @@ struct ncclP2pBuff {
|
||||
ncclIpcDesc ipcDesc;
|
||||
};
|
||||
|
||||
// Request payload passed to the P2P proxy with ncclProxyMsgSetup; the proxy
// setup handlers reject any request whose size is not sizeof(struct ncclP2pRequest).
struct ncclP2pRequest {
|
||||
// Size of the buffer the proxy should allocate (forwarded to ncclP2pAllocateShareableBuffer).
size_t size;
|
||||
// Number of extra references the allocator takes on the allocation handle; the
// setup code bumps it once per rank sharing the proxy rank's PID on a different CUDA device.
int refcount;
|
||||
};
|
||||
|
||||
struct p2pConnectInfo {
|
||||
int rank;
|
||||
int read;
|
||||
struct ncclP2pBuff p2pBuff;
|
||||
// Used by CE memcpy
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
ncclShmIpcDesc_t desc;
|
||||
};
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
|
||||
|
||||
// Export-side description of a user buffer, sent to the peer's proxy with
// ncclProxyMsgRegister so the peer can map the buffer.
struct p2pIpcExpInfo {
|
||||
// Handle for the buffer: legacy CUDA IPC handle, cuMem shareable handle, or raw
// memHandle (same-process case), depending on how the buffer could be exported.
ncclIpcDesc ipcDesc;
|
||||
// True when ipcDesc holds a legacy cudaIpcGetMemHandle handle.
bool legacyIpcCap;
|
||||
// File descriptor returned by ncclProxyClientQueryFdBlocking; only filled in on the
// POSIX file-descriptor export path.
int impFd;
|
||||
// Size of the whole underlying allocation (as reported by cuMemGetAddressRange).
size_t size;
|
||||
// Offset of the registered address from the base of the underlying allocation.
uintptr_t offset;
|
||||
};
|
||||
|
||||
struct p2pShm {
|
||||
struct ncclSendMem sendMem;
|
||||
struct ncclRecvMem recvMem;
|
||||
@@ -37,9 +51,7 @@ struct p2pShmProxyInfo {
|
||||
// Shared memory between proxy and receiving GPU
|
||||
struct p2pShm* shm;
|
||||
struct p2pShm* devShm;
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
ncclShmHandle_t handle;
|
||||
ncclShmIpcDesc_t desc;
|
||||
|
||||
// Intermediate step for sender
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
@@ -62,13 +74,14 @@ struct p2pResources {
|
||||
struct ncclRecvMem* recvDevMem;
|
||||
};
|
||||
void* sendMemIpc;
|
||||
int sendMemSameProc;
|
||||
void* recvMemIpc;
|
||||
int recvMemSameProc;
|
||||
// CE memcpy support
|
||||
struct p2pShmProxyInfo proxyInfo;
|
||||
struct p2pShm* shm;
|
||||
struct p2pShm* devShm;
|
||||
int shmSize;
|
||||
ncclShmHandle_t handle;
|
||||
ncclShmIpcDesc_t desc;
|
||||
};
|
||||
|
||||
// cuMem API support
|
||||
@@ -104,12 +117,12 @@ static void initCeOperation();
|
||||
extern int64_t ncclParamMNNVLEnable();
|
||||
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
initCeOperation();
|
||||
|
||||
// MNNVL support
|
||||
if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
|
||||
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
|
||||
if (comm->MNNVL && info1->hostHash != info2->hostHash) {
|
||||
NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
|
||||
if (*ret) return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -121,7 +134,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
// Check topology / p2p level.
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) {
|
||||
if (useMemcpy) *ret = 0;
|
||||
@@ -130,7 +143,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
// Check if NET would work better
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
|
||||
if (useNet) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
@@ -197,7 +210,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
} while (0)
|
||||
|
||||
// cuMem API support
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
|
||||
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDesc *ipcDesc, void **ptr) {
|
||||
if (ncclCuMemEnable()) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
@@ -211,6 +224,10 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
|
||||
} else {
|
||||
CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
|
||||
}
|
||||
if (refcount) {
|
||||
memcpy(&ipcDesc->memHandle, &handle, sizeof(handle));
|
||||
for (int r = 0; r < refcount; ++r) CUCHECK(cuMemRetainAllocationHandle(&handle, *ptr));
|
||||
}
|
||||
#else
|
||||
return ncclInternalError;
|
||||
#endif
|
||||
@@ -233,7 +250,7 @@ ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
|
||||
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
|
||||
if (ncclCuMemEnable()) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
@@ -241,16 +258,25 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
|
||||
CUmemAllocationHandleType type = ncclCuMemHandleType;
|
||||
CUmemGenericAllocationHandle handle;
|
||||
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
|
||||
CUmemAllocationProp prop = {};
|
||||
size_t granularity = 0;
|
||||
|
||||
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
prop.requestedHandleTypes = type;
|
||||
prop.location.id = comm->cudaDev;
|
||||
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||
ALIGN_SIZE(size, granularity);
|
||||
|
||||
// Import and map the remote memory descriptor to the local GPU
|
||||
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
|
||||
// UDS fd support
|
||||
int fd = -1;
|
||||
// Send cuMem handle to remote for conversion to an fd
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
|
||||
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
|
||||
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, peer, &cuDesc->data, &fd));
|
||||
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, peer);
|
||||
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
|
||||
(void) close(fd);
|
||||
SYSCHECK(close(fd), "close");
|
||||
} else {
|
||||
CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
|
||||
}
|
||||
@@ -291,7 +317,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
|
||||
int p2p;
|
||||
// Queries the topology to see if the GPUs are Ampere and
|
||||
// connected via NVLink, if so we enable P2P Read by default
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
|
||||
|
||||
int readEnable = ncclParamP2pReadEnable();
|
||||
if (readEnable != -2) *read = readEnable;
|
||||
@@ -311,24 +337,23 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
|
||||
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
|
||||
return ncclInternalError;
|
||||
}
|
||||
#if CUDART_VERSION >= 11030
|
||||
// cuMem API support
|
||||
if (ncclCuMemEnable()) {
|
||||
// Allow direct access to the remote buffer from the local GPU
|
||||
CUmemAccessDesc accessDesc = {};
|
||||
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
accessDesc.location.id = myInfo->cudaDev;
|
||||
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
|
||||
CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
|
||||
// for intra-process ranks, we should map memHandle of the peers to increase refcount.
|
||||
// Otherwise, if peers abort and free the buffer, the rank can suffer invalid access.
|
||||
NCCLCHECK(ncclCuMemAllocAddr(devMem, &p2pBuff->ipcDesc.memHandle, p2pBuff->size));
|
||||
CUCHECK(cuMemRelease(p2pBuff->ipcDesc.memHandle));
|
||||
*ipcPtr = *devMem;
|
||||
} else {
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
}
|
||||
*devMem = p2pBuff->directPtr;
|
||||
*ipcPtr = NULL;
|
||||
} else {
|
||||
// Different PID
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
NCCLCHECK(ncclP2pImportShareableBuffer(comm, peerInfo->rank, p2pBuff->size, &p2pBuff->ipcDesc, devMem));
|
||||
*ipcPtr = *devMem;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -338,7 +363,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
|
||||
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
struct p2pResources* resources;
|
||||
int tpProxyRank;
|
||||
struct ncclP2pRequest req;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
int useRead, intermediateRank;
|
||||
@@ -387,15 +412,18 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
comm->peerInfo[intermediateRank].nvmlDev, useReadStr);
|
||||
}
|
||||
|
||||
tpProxyRank = comm->topParentRanks[info->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
|
||||
req.size = sendSize;
|
||||
req.refcount = 0;
|
||||
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
|
||||
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
|
||||
info->shmSize = resources->proxyInfo.shmSize;
|
||||
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
|
||||
memcpy(&info->desc, &resources->proxyInfo.desc, sizeof(ncclShmIpcDesc_t));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
|
||||
resources->sendMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
@@ -405,7 +433,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
|
||||
struct p2pResources* resources;
|
||||
int tpProxyRank;
|
||||
struct ncclP2pRequest req;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
int useRead, intermediateRank;
|
||||
@@ -444,11 +472,15 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
info->rank = intermediateRank;
|
||||
}
|
||||
|
||||
tpProxyRank = comm->topParentRanks[info->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
req.size = recvSize;
|
||||
req.refcount = 0;
|
||||
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
|
||||
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
|
||||
NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
|
||||
resources->recvMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -459,6 +491,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
|
||||
NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
|
||||
resources->recvMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
|
||||
|
||||
char* buff = (char*)(remDevMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
@@ -499,17 +532,14 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
struct ncclSendMem* remDevMem = NULL;
|
||||
|
||||
if (useMemcpy) {
|
||||
char shmPath[PATH_MAX];
|
||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
resources->shmSize = info->shmSize;
|
||||
// Attach to peer's SHM segment
|
||||
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
|
||||
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
|
||||
|
||||
recv->conn.tail = &resources->devShm->recvMem.tail;
|
||||
recv->conn.head = &resources->devShm->sendMem.head;
|
||||
} else {
|
||||
NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
|
||||
resources->sendMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
|
||||
|
||||
struct ncclRecvMem* devMem = resources->recvDevMem;
|
||||
recv->conn.tail = &devMem->tail;
|
||||
@@ -538,8 +568,21 @@ ncclResult_t p2pSendFree(struct ncclConnector* send) {
|
||||
if (resources) {
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
|
||||
if (resources->sendMemIpc) {
|
||||
if (resources->sendMemSameProc) {
|
||||
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
|
||||
}
|
||||
}
|
||||
|
||||
if (resources->recvMemIpc) {
|
||||
if (resources->recvMemSameProc) {
|
||||
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
|
||||
@@ -555,14 +598,27 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
|
||||
if (resources) {
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
|
||||
if (resources->sendMemIpc) {
|
||||
if (resources->sendMemSameProc) {
|
||||
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
|
||||
}
|
||||
}
|
||||
|
||||
if (resources->recvMemIpc) {
|
||||
if (resources->recvMemSameProc) {
|
||||
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
|
||||
} else {
|
||||
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclShmClose(resources->handle));
|
||||
NCCLCHECK(ncclShmIpcClose(&resources->desc));
|
||||
}
|
||||
}
|
||||
free(resources);
|
||||
@@ -574,29 +630,27 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
|
||||
if (useMemcpy) {
|
||||
// CE memcpy support
|
||||
struct p2pShmProxyInfo* proxyInfo;
|
||||
size_t shmSize;
|
||||
|
||||
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
|
||||
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
|
||||
connection->transportResources = proxyInfo;
|
||||
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
|
||||
// Create a SHM segment for the peer to attach to
|
||||
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
|
||||
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
|
||||
shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
|
||||
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
|
||||
|
||||
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
|
||||
memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
|
||||
} else {
|
||||
if (reqSize != sizeof(int)) return ncclInternalError;
|
||||
int size = *((int*)reqBuff);
|
||||
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
|
||||
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
|
||||
int size = req->size;
|
||||
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
|
||||
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
|
||||
p2pBuff->size = size;
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
@@ -613,11 +667,12 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
|
||||
}
|
||||
|
||||
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
if (reqSize != sizeof(int)) return ncclInternalError;
|
||||
int size = *((int*)reqBuff);
|
||||
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
|
||||
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
|
||||
int size = req->size;
|
||||
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
|
||||
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
|
||||
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
|
||||
p2pBuff->size = size;
|
||||
if (ncclCuMemEnable()) {
|
||||
// cuMem API support
|
||||
@@ -651,7 +706,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
|
||||
if (useMemcpy) {
|
||||
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
|
||||
if (proxyInfo) {
|
||||
NCCLCHECK(ncclShmClose(proxyInfo->handle));
|
||||
NCCLCHECK(ncclShmIpcClose(&proxyInfo->desc));
|
||||
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
|
||||
NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
|
||||
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
|
||||
@@ -752,11 +807,382 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Register (or reuse an existing registration of) a user buffer with a set of local
// peers over IPC, so that the peers can access the buffer directly.
//
//   comm            communicator; nothing is done when NULL
//   userbuff        user buffer; must lie within a range found by ncclRegFind()
//   buffSize        size of the user buffer in bytes
//   peerRanks       ranks of the peers to register with (nPeers entries)
//   nPeers          number of peers; must be 1 unless type is NCCL_IPC_COLLECTIVE
//   type            NCCL_IPC_COLLECTIVE or a point-to-point registration
//   regBufFlag      out: 1 when the buffer is registered with the peers, 0 otherwise
//   offsetOut       out: offset of userbuff from the start of the registered range
//   peerRmtAddrsOut out: for NCCL_IPC_COLLECTIVE, the device array of per-local-rank
//                   remote addresses; otherwise the single remote address of the peer,
//                   cast to a pointer
//
// Returns ncclSuccess with *regBufFlag == 0 when the buffer simply cannot be registered
// (the "goto fail" exits that do not set ret); returns an error code when a checked
// call fails. In both cases all outputs are reset.
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
  ncclResult_t ret = ncclSuccess;
  struct ncclReg *regRecord = NULL;
  // Registration info allocated for the current peer and not yet owned by regRecord.
  struct ncclIpcRegInfo* newInfo = NULL;
  uintptr_t* peerRmtAddrs = NULL;
  bool legacyIpcCap = false;
  size_t baseSize = 0;
  void* baseAddr = NULL;
  bool needUpdate = false;

  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  if (comm && userbuff && buffSize > 0 && nPeers > 0) {
    NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
    if (regRecord) {
      // buffer was registered by users, we need to start to register or reuse it
      int peerLocalRank;
      for (int p = 0; p < nPeers; p++) {
        int peerRank = peerRanks[p];
        peerLocalRank = comm->rankToLocalRank[peerRank];
        if (regRecord->ipcInfos[peerLocalRank]) {
          // We already have IPC info for peerLocalRank, no need to register it, we can reuse it
          *regBufFlag = 1;
          INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
        } else {
          // Register buffer with peerLocalRank
          struct ncclProxyConnector* proxyConn = NULL;
          struct p2pIpcExpInfo ipcInfo;

          // Query the underlying allocation only once; it is the same for every peer.
          if (baseAddr == NULL) {
            CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
            CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
          }
          if (comm->gproxyConn[peerRank].initialized == false) {
            NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
          }
          proxyConn = &comm->gproxyConn[peerRank];

          ipcInfo.legacyIpcCap = legacyIpcCap;
          // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
          // get the CUDA legacy mem handle, or through cuMem*.
          if (ipcInfo.legacyIpcCap) {
            // legacy export
            if (comm->directMode) goto fail;
            CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
          } else if (ncclCuMemEnable()) {
            CUmemGenericAllocationHandle handle;
            if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
              // if cuMem* export fails, retry legacy export
              if (comm->directMode) goto fail;
              CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
              ipcInfo.legacyIpcCap = true;
            } else {
              // cuMem* export to file descriptor or fabric handle
              // NOTE(review): the checked calls below jump to fail without releasing
              // `handle` (or closing expFd) — confirm whether that leak matters.
              if (proxyConn->sameProcess) {
                memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
              } else {
                if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
                  int expFd = -1;
                  CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
                  NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
                  SYSCHECKGOTO(close(expFd), "close", ret, fail);
                } else {
                  // Allow this to silently fail for cases where the user buff cannot be registered
                  if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
                    CUCHECKGOTO(cuMemRelease(handle), ret, fail);
                    goto fail;
                  }
                }
              }
              CUCHECKGOTO(cuMemRelease(handle), ret, fail);
            }
          } else {
            // nothing works, just return
            goto fail;
          }

          void* rmtRegAddr = NULL;
          ipcInfo.size = baseSize;
          ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
          // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
          // and get the remote register address back.
          if (proxyConn) {
            NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
          }
          if (rmtRegAddr) {
            NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
            assert(regRecord->ipcInfos[peerLocalRank] == NULL);
            regRecord->state |= IPC_REG_COMPLETE;
            newInfo->peerRank = peerRank;
            newInfo->baseAddr = baseAddr;
            newInfo->impInfo.rmtRegAddr = rmtRegAddr;
            newInfo->impInfo.offset = ipcInfo.offset;
            newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
            newInfo->ipcProxyconn = proxyConn;
            regRecord->ipcInfos[peerLocalRank] = newInfo;
            // Ownership has moved to regRecord. Clear the local pointer so that a later
            // failure (below, for another peer, or in the stream operations) does not
            // free memory that regRecord->ipcInfos still references.
            newInfo = NULL;
            if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
              NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
            }
            regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
            needUpdate = true;
            *regBufFlag = 1;
            INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
          }
        }
      }

      if (*regBufFlag) {
        if (type == NCCL_IPC_COLLECTIVE) {
          // for collective, store registered remote buffers into dev memory for future reference
          if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
            NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
            if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) {
              NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
            }
            if (needUpdate) {
              NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
            }
            NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
            NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
          }
          peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
        } else {
          assert(nPeers == 1);
          // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
          peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
        }
        *offsetOut = (uintptr_t)userbuff - regRecord->addr;
        *peerRmtAddrsOut = peerRmtAddrs;
      }
    }
  }

exit:
  return ret;
fail:
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  // Only frees an info block that was never attached to regRecord; free(NULL) is a no-op.
  free(newInfo);
  goto exit;
}
|
||||
|
||||
struct ncclIpcCleanupCallback {
|
||||
struct ncclCommCallback base;
|
||||
bool isAddrs;
|
||||
union {
|
||||
struct ncclIpcRegInfo regInfo;
|
||||
struct ncclPeerRegIpcAddr regIpcAddrs;
|
||||
};
|
||||
};
|
||||
|
||||
// Callback that releases one ncclIpcCleanupCallback record.
//   comm - communicator used to reach the peer proxy when deregistering.
//   cb   - pointer to the `base` member of a heap-allocated ncclIpcCleanupCallback.
// Returns the first error reported by ncclCudaFree()/ncclIpcDeregBuffer(), or ncclSuccess.
// The record itself is always freed, even when releasing its payload fails, so a
// failing deregistration no longer leaks the record.
// NOTE(review): assumes the caller never touches `cb` again after this returns,
// including on error - confirm against the queue-draining code.
static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
  ncclResult_t ret = ncclSuccess;
  struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
  if (obj->isAddrs) {
    // free(NULL) is a no-op, so the host array needs no guard.
    free(obj->regIpcAddrs.hostPeerRmtAddrs);
    if (obj->regIpcAddrs.devPeerRmtAddrs)
      NCCLCHECKGOTO(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs), ret, exit);
  } else {
    NCCLCHECKGOTO(ncclIpcDeregBuffer(comm, &obj->regInfo), ret, exit);
  }
exit:
  free(obj);
  return ret;
}
|
||||
|
||||
// Registers the allocation containing `userbuff` with each peer's proxy and
// queues the matching cleanup records.
//
//   comm              - communicator; its gproxyConn[] entries are connected on demand.
//   userbuff/buffSize - user buffer; buffSize is only used for logging here.
//   peerRanks/nPeers  - peers to register with (nPeers must be 1 unless type is
//                       NCCL_IPC_COLLECTIVE).
//   type              - NCCL_IPC_COLLECTIVE returns a device array of per-local-rank
//                       remote addresses; otherwise the single remote address is
//                       returned directly in *peerRmtAddrsOut.
//   regBufFlag        - out: 1 when at least one peer returned a remote address, else 0.
//   offsetOut         - out: offset of userbuff from the allocation base.
//   peerRmtAddrsOut   - out: see `type`.
//   cleanupQueuePtr   - ncclIntruQueue<ncclCommCallback> receiving cleanup records for
//                       non-legacy registrations (legacy ones go to
//                       comm->legacyRegCleanupQueue).
//   nCleanupQueueElts - optional counter, incremented once per peer record queued.
//
// Returns ncclSuccess when registration is simply not possible (direct mode with a
// legacy handle, or neither legacy IPC nor cuMem available); in that case the outputs
// are reset and *regBufFlag stays 0. Real failures return the underlying error.
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
  ncclResult_t ret = ncclSuccess;
  struct ncclProxyConnector* proxyConn = NULL;
  struct p2pIpcExpInfo ipcInfo;
  void* baseAddr;
  size_t baseSize;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
  uintptr_t* peerRmtAddrs = NULL;
  struct ncclIpcCleanupCallback* addrsRecord = NULL;
  // Ownership of addrsRecord moves to a cleanup queue once it is enqueued; until
  // then the fail path must release it.
  bool addrsRecordQueued = false;

  *regBufFlag = 0;
  // The whole struct is shipped to the proxy; do not send indeterminate bytes.
  memset(&ipcInfo, 0, sizeof(ipcInfo));
  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
  CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);

  if (type == NCCL_IPC_COLLECTIVE) {
    // collective needs host memory array to hold all remote buffer addrs.
    // We need to put this into graph release queue
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
    addrsRecord->base.fn = cleanupIpc;
    addrsRecord->isAddrs = true;
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
  } else {
    assert(nPeers == 1);
    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register
    // peer one by one so nPeers must be 1
  }

  for (int p = 0; p < nPeers; ++p) {
    int peerRank = peerRanks[p];
    if (comm->gproxyConn[peerRank].initialized == false)
      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
    proxyConn = &comm->gproxyConn[peerRank];
    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through
    // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
    if (ipcInfo.legacyIpcCap) {
      if (comm->directMode) goto fail;
      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
    } else if (ncclCuMemEnable()) {
      // cuMem* export
      CUmemGenericAllocationHandle handle;
      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
        // Not a cuMem allocation after all: fall back to the legacy handle.
        if (comm->directMode) goto fail;
        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
        ipcInfo.legacyIpcCap = true;
      } else {
        if (proxyConn->sameProcess) {
          // Same process: the raw allocation handle can be passed as-is.
          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
        } else if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
          // Export an fd, ask the peer proxy to import it, then drop our copy.
          // (The original re-tested sameProcess here; that branch was unreachable.)
          int expFd = -1;
          CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
          NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
          SYSCHECKGOTO(close(expFd), "close", ret, fail);
        } else {
          CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
        }
        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
      }
    } else {
      // nothing works, just return
      goto fail;
    }

    void* rmtRegAddr = NULL;
    ipcInfo.size = baseSize;
    ipcInfo.offset = 0;
    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
    if (rmtRegAddr) {
      struct ncclIpcCleanupCallback* record;
      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
      record->base.fn = cleanupIpc;
      record->isAddrs = false;
      record->regInfo.peerRank = peerRank;
      record->regInfo.baseAddr = baseAddr;
      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
      record->regInfo.impInfo.offset = 0;
      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
      record->regInfo.ipcProxyconn = proxyConn;
      // store the remote address into host addr array
      if (type == NCCL_IPC_COLLECTIVE)
        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
      else
        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
      *regBufFlag = 1;
      if (ipcInfo.legacyIpcCap)
        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
      else
        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
    }
  }

  if (type == NCCL_IPC_COLLECTIVE) {
    // allocate the dev addr array and copy all previously stored addrs into it.
    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    // Both arrays hold comm->localRanks entries; copying comm->nRanks entries
    // would run past them whenever the job spans more than one node.
    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
    if (ipcInfo.legacyIpcCap)
      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
    else
      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
    addrsRecordQueued = true;
  }
  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
  *peerRmtAddrsOut = peerRmtAddrs;

exit:
  return ret;
fail:
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  // Per-peer records already enqueued stay on their queues and are released by
  // cleanupIpc(); only the not-yet-queued address record is still ours to free.
  if (addrsRecord && !addrsRecordQueued) {
    free(addrsRecord->regIpcAddrs.hostPeerRmtAddrs);
    if (addrsRecord->regIpcAddrs.devPeerRmtAddrs) (void)ncclCudaFree(addrsRecord->regIpcAddrs.devPeerRmtAddrs);
    free(addrsRecord);
    addrsRecord = NULL;
  }
  goto exit;
}
|
||||
|
||||
// Asks the peer proxy recorded in `regInfo` to undo one IPC registration.
//   comm    - communicator used for the blocking proxy call and for logging.
//   regInfo - registration to release; its impInfo is sent as the request payload.
// Returns the error from ncclProxyCallBlocking(), or ncclSuccess.
// `regInfo` itself is not freed here.
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo) {
  NCCLCHECK(ncclProxyCallBlocking(comm, regInfo->ipcProxyconn, ncclProxyMsgDeregister, &regInfo->impInfo, sizeof(struct ncclIpcImpInfo), NULL, 0));
  INFO(NCCL_REG, "rank %d - IPC deregistered buffer %p peer %d ipc remote buffer %p", comm->rank, regInfo->baseAddr, regInfo->peerRank, regInfo->impInfo.rmtRegAddr);
  return ncclSuccess;
}
|
||||
|
||||
// Proxy-side handler for ncclProxyMsgRegister: imports the buffer described by the
// request and replies with the address at which it is mapped locally.
//   reqBuff/reqSize   - a struct p2pIpcExpInfo (size is asserted).
//   respBuff/respSize - receives one void*: mapped address plus the requested
//                       offset, or NULL on failure.
//   done              - set to 1 whenever the exit label is reached.
// Returns ncclSuccess or the first CUDA/system error. On the cuMem path a failure
// unwinds the mapping, address reservation and imported handle in that order.
// NOTE(review): the CUCHECK calls in the fail path return immediately if the
// unwinding itself fails, skipping the response copy and `*done` - confirm this
// is acceptable to the proxy service loop.
static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
  void* regAddr = NULL;
  ncclResult_t ret = ncclSuccess;
  bool mapped = false;
  bool imported = false;
  CUmemGenericAllocationHandle handle;

  assert(sizeof(struct p2pIpcExpInfo) == reqSize);
  assert(sizeof(void*) == respSize);

  // request peer passes all necessary buffer info to import. The proxy thread would register
  // the buffer locally and return register addr back
  if (ipcExpInfo->legacyIpcCap) {
    // legacy import
    CUDACHECKGOTO(cudaIpcOpenMemHandle(&regAddr, ipcExpInfo->ipcDesc.devIpc, cudaIpcMemLazyEnablePeerAccess), ret, fail);
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  } else {
    // cuMem import
    if (connection->sameProcess) {
      // if proxy is same process as request peer, we just need to map the handle.
      memcpy(&handle, &ipcExpInfo->ipcDesc.memHandle, sizeof(CUmemGenericAllocationHandle));
    } else {
      if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)(uintptr_t)ipcExpInfo->impFd, ncclCuMemHandleType), ret, fail);
        // The fd is only needed for the import itself.
        SYSCHECKGOTO(close(ipcExpInfo->impFd), "close", ret, fail);
      } else {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)&ipcExpInfo->ipcDesc.cuDesc, ncclCuMemHandleType), ret, fail);
      }
    }
    imported = true;
    CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&regAddr, ipcExpInfo->size, /* alignment */ 0, /* addr */ 0, /* flags */ 0), ret, fail);
    CUCHECKGOTO(cuMemMap((CUdeviceptr)regAddr, ipcExpInfo->size, /* offset */ 0, handle, /* flags */ 0), ret, fail);
    mapped = true;
    // Allow access by the local GPU
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = proxyState->cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail);
    // Apply the offset last so the fail path always sees the mapping base.
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  }
  INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);

exit:
  memcpy(respBuff, (void*)&regAddr, sizeof(void*));
  *done = 1;
  return ret;
fail:
  if (!ipcExpInfo->legacyIpcCap) {
    if (mapped) CUCHECK(cuMemUnmap((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (regAddr) CUCHECK(cuMemAddressFree((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (imported) CUCHECK(cuMemRelease(handle));
  }
  // Report failure to the requester as a NULL address.
  regAddr = NULL;
  goto exit;
}
|
||||
|
||||
// Proxy-side handler for ncclProxyMsgDeregister: releases a mapping previously
// created by p2pProxyRegister.
//   reqBuff/reqSize - a struct ncclIpcImpInfo (size is asserted).
//   done            - always set to 1 before returning.
// Returns ncclSuccess or the error from the release call.
static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  struct ncclIpcImpInfo* impInfo = (struct ncclIpcImpInfo*)reqBuff;
  ncclResult_t ret = ncclSuccess;
  assert(sizeof(struct ncclIpcImpInfo) == reqSize);

  // rmtRegAddr includes the registration offset; every release call wants the base.
  void* mappedBase = (void*)((uintptr_t)impInfo->rmtRegAddr - impInfo->offset);

  if (impInfo->legacyIpcCap) {
    CUDACHECKGOTO(cudaIpcCloseMemHandle(mappedBase), ret, exit);
  } else if (connection->sameProcess) {
    NCCLCHECKGOTO(ncclCuMemFreeAddr(mappedBase), ret, exit);
  } else {
    NCCLCHECKGOTO(ncclCudaFree(mappedBase), ret, exit);
  }

exit:
  *done = 1;
  return ret;
}
|
||||
|
||||
struct ncclTransport p2pTransport = {
|
||||
"P2P",
|
||||
p2pCanConnect,
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister }
|
||||
};
|
||||
|
||||
static void initCeOperation() {
|
||||
|
||||
+313
-101
@@ -5,35 +5,58 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "shmutils.h"
|
||||
#include "shm.h"
|
||||
#include "transport.h"
|
||||
|
||||
struct shmConnectInfo {
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
#define SHM_PATH_MAX 128
|
||||
#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
|
||||
|
||||
// Pair of pointers describing one shared buffer, filled in by the proxy setup reply.
struct shmBuffInfo {
  void *hptr;  // stored by the setup functions as resources->hostMem
  void *dptr;  // stored by the setup functions as resources->devHostMem
};
|
||||
|
||||
struct shmConnectInfo {
|
||||
ncclShmIpcDesc_t desc;
|
||||
struct shmBuffInfo buf;
|
||||
};
|
||||
static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
|
||||
|
||||
struct shmSendResources {
|
||||
int remShmSize;
|
||||
struct ncclRecvMem* remHostMem;
|
||||
struct ncclRecvMem* devRemHostMem;
|
||||
ncclShmHandle_t remHandle;
|
||||
int shmSize;
|
||||
ncclShmIpcDesc_t remDesc;
|
||||
struct ncclSendMem* hostMem;
|
||||
struct ncclSendMem* devHostMem;
|
||||
ncclShmHandle_t hostHandle;
|
||||
};
|
||||
|
||||
struct shmRecvResources {
|
||||
int remShmSize;
|
||||
struct ncclSendMem* remHostMem;
|
||||
struct ncclSendMem* devRemHostMem;
|
||||
ncclShmHandle_t remHandle;
|
||||
int shmSize;
|
||||
ncclShmIpcDesc_t remDesc;
|
||||
struct ncclRecvMem* hostMem;
|
||||
struct ncclRecvMem* devHostMem;
|
||||
ncclShmHandle_t hostHandle;
|
||||
};
|
||||
|
||||
struct shmProxyInfo {
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
char* devFifo;
|
||||
char* shmFifo;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
|
||||
// used by progress only
|
||||
uint64_t step;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t events[NCCL_STEPS];
|
||||
|
||||
// ipc desc
|
||||
ncclShmIpcDesc_t desc;
|
||||
};
|
||||
|
||||
// Payload of the ncclProxyMsgSetup request sent by shmSendSetup/shmRecvSetup.
struct shmRequest {
  size_t size;  // number of bytes of shared memory to allocate
  bool legacy;  // set by the setup functions when both peers have the same hostHash and pidHash
};
|
||||
|
||||
#define SHM_SEND_SIDE 1
|
||||
@@ -48,14 +71,14 @@ static int shmLocality = 0;
|
||||
static void initCeOperation();
|
||||
|
||||
/* Determine two peers can communicate with SHM */
|
||||
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
static ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
initCeOperation();
|
||||
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
|
||||
if (useNet) return ncclSuccess;
|
||||
|
||||
// Same host?
|
||||
@@ -76,22 +99,29 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
struct shmSendResources* resources;
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
size_t shmSize = sizeof(struct ncclSendMem);
|
||||
struct shmRequest req;
|
||||
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
|
||||
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
int shmSize = sizeof(struct ncclSendMem);
|
||||
if (shmLocality == SHM_SEND_SIDE) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
|
||||
}
|
||||
info->shmSize = resources->shmSize = shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||
req.size = shmSize;
|
||||
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
|
||||
req.legacy = true;
|
||||
else
|
||||
req.legacy = false;
|
||||
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
|
||||
|
||||
resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
|
||||
resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct");
|
||||
return ncclSuccess;
|
||||
@@ -99,52 +129,43 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
|
||||
|
||||
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
|
||||
struct shmRecvResources* resources;
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
size_t shmSize = sizeof(struct ncclRecvMem);
|
||||
struct shmRequest req;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
|
||||
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
int shmSize = sizeof(struct ncclRecvMem);
|
||||
if (shmLocality == SHM_RECV_SIDE) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
|
||||
}
|
||||
info->shmSize = resources->shmSize = shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||
req.size = shmSize;
|
||||
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
|
||||
req.legacy = true;
|
||||
else
|
||||
req.legacy = false;
|
||||
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
|
||||
|
||||
resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
|
||||
resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct shmProxyInfo {
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
char* devFifo;
|
||||
char* shmFifo;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
|
||||
// used by progress only
|
||||
uint64_t step;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t events[NCCL_STEPS];
|
||||
};
|
||||
|
||||
/* Connect to this peer */
|
||||
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
|
||||
char* buff;
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||
resources->remShmSize = info->shmSize;
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
|
||||
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
|
||||
|
||||
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||
buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
send->conn.buffs[p] = buff;
|
||||
buff += comm->buffSizes[p];
|
||||
@@ -157,9 +178,6 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
send->conn.connFifo = resources->devRemHostMem->connFifo;
|
||||
}
|
||||
if (useMemcpySend) {
|
||||
int tpProxyRank;
|
||||
tpProxyRank = comm->topParentRanks[comm->rank];
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
|
||||
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
@@ -177,14 +195,11 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
// Setup device pointers
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
char* buff;
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||
resources->remShmSize = info->shmSize;
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
|
||||
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
|
||||
|
||||
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||
buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = buff;
|
||||
buff += comm->buffSizes[p];
|
||||
@@ -194,7 +209,6 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
|
||||
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
|
||||
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
@@ -210,8 +224,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
|
||||
if (resources) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
|
||||
free(resources);
|
||||
send->transportResources = NULL;
|
||||
}
|
||||
@@ -221,8 +234,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
if (resources) {
|
||||
NCCLCHECK(ncclShmClose(resources->hostHandle));
|
||||
NCCLCHECK(ncclShmClose(resources->remHandle));
|
||||
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
|
||||
free(resources);
|
||||
recv->transportResources = NULL;
|
||||
}
|
||||
@@ -230,51 +242,76 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
}
|
||||
|
||||
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
struct shmProxyInfo* proxyInfo;
|
||||
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
|
||||
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
memcpy(proxyInfo, reqBuff, reqSize);
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
|
||||
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
|
||||
|
||||
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
|
||||
proxyInfo->shmFifo = reqInfo->shmFifo;
|
||||
proxyInfo->sendMem = reqInfo->sendMem;
|
||||
proxyInfo->recvMem = reqInfo->recvMem;
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
|
||||
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
|
||||
}
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
connection->transportResources = proxyInfo;
|
||||
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
memcpy(respBuff, proxyInfo, respSize);
|
||||
return ncclSuccess;
|
||||
*done = 1;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
|
||||
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
|
||||
free(proxyInfo);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
struct shmProxyInfo* proxyInfo;
|
||||
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
|
||||
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
memcpy(proxyInfo, reqBuff, reqSize);
|
||||
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
|
||||
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
|
||||
|
||||
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
|
||||
proxyInfo->shmFifo = reqInfo->shmFifo;
|
||||
proxyInfo->sendMem = reqInfo->sendMem;
|
||||
proxyInfo->recvMem = reqInfo->recvMem;
|
||||
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
|
||||
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
|
||||
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
|
||||
}
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
connection->transportResources = proxyInfo;
|
||||
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
|
||||
memcpy(respBuff, proxyInfo, respSize);
|
||||
return ncclSuccess;
|
||||
*done = 1;
|
||||
exit:
|
||||
return ret;
|
||||
fail:
|
||||
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
|
||||
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
|
||||
free(proxyInfo);
|
||||
goto exit;
|
||||
}
|
||||
|
||||
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
|
||||
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
|
||||
|
||||
if (resources) {
|
||||
CUDACHECK(cudaStreamDestroy(resources->stream));
|
||||
NCCLCHECK(ncclCudaFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
if (useMemcpySend) {
|
||||
CUDACHECK(cudaStreamDestroy(resources->stream));
|
||||
NCCLCHECK(ncclCudaFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclShmIpcClose(&resources->desc));
|
||||
free(connection->transportResources);
|
||||
connection->transportResources = NULL;
|
||||
}
|
||||
@@ -285,12 +322,15 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
|
||||
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
|
||||
|
||||
if (resources) {
|
||||
CUDACHECK(cudaStreamDestroy(resources->stream));
|
||||
NCCLCHECK(ncclCudaFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
if (useMemcpyRecv) {
|
||||
CUDACHECK(cudaStreamDestroy(resources->stream));
|
||||
NCCLCHECK(ncclCudaFree(resources->devFifo));
|
||||
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
|
||||
for (int i=0; i<NCCL_STEPS; i++) {
|
||||
CUDACHECK(cudaEventDestroy(resources->events[i]));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclShmIpcClose(&resources->desc));
|
||||
free(connection->transportResources);
|
||||
connection->transportResources = NULL;
|
||||
}
|
||||
@@ -413,12 +453,37 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Descriptor for the shared-memory transport: its name ("SHM"), the
// shmCanConnect callback, then one brace-enclosed callback table for the send
// side and one for the recv side. The tables are positional, so which slot
// each entry fills is fixed by the declaration of struct ncclTransport (not
// shown here).
// The NULL slots are not necessarily final: initCeOperation() assigns
// send/recv proxyConnect, proxyFree and proxyProgress at runtime when the
// corresponding memcpy mode is enabled.
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
|
||||
};
|
||||
/*
 * Proxy-side setup for the send half of an SHM connection.
 *
 * Checks that the request and response buffers have the expected sizes,
 * allocates a shmProxyInfo, creates a shareable host buffer of req->size
 * bytes and copies the resulting descriptor into the shmConnectInfo response
 * (the buffer's host/device pointers are written to info->buf).
 *
 * connection  on success, transportResources is set to the new shmProxyInfo.
 * proxyState  supplies tpRank, which is forwarded to the allocator.
 * reqBuff     struct shmRequest (size, legacy).
 * respBuff    struct shmConnectInfo, filled on success.
 * done        not written by this function.
 *
 * Returns ncclInternalError on a size mismatch, otherwise the status of the
 * allocation calls.
 */
static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;

exit:
  return ret;
fail:
  // proxyInfo has not been published to the connection yet, so it must be
  // released here or it is lost.
  free(proxyInfo);
  goto exit;
}
|
||||
|
||||
/*
 * Proxy-side setup for the recv half of an SHM connection.
 *
 * Checks that the request and response buffers have the expected sizes,
 * allocates a shmProxyInfo, creates a shareable host buffer of req->size
 * bytes and copies the resulting descriptor into the shmConnectInfo response
 * (the buffer's host/device pointers are written to info->buf).
 *
 * connection  on success, transportResources is set to the new shmProxyInfo.
 * proxyState  supplies tpRank, which is forwarded to the allocator.
 * reqBuff     struct shmRequest (size, legacy).
 * respBuff    struct shmConnectInfo, filled on success.
 * done        not written by this function.
 *
 * Returns ncclInternalError on a size mismatch, otherwise the status of the
 * allocation calls.
 */
static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;

exit:
  return ret;
fail:
  // proxyInfo has not been published to the connection yet, so it must be
  // released here or it is lost.
  free(proxyInfo);
  goto exit;
}
|
||||
|
||||
static void initCeOperation() {
|
||||
static int init = 0;
|
||||
@@ -427,12 +492,10 @@ static void initCeOperation() {
|
||||
useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
|
||||
if (useMemcpySend) {
|
||||
shmTransport.send.proxyConnect = shmSendProxyConnect;
|
||||
shmTransport.send.proxyFree = shmSendProxyFree;
|
||||
shmTransport.send.proxyProgress = shmSendProxyProgress;
|
||||
}
|
||||
if (useMemcpyRecv) {
|
||||
shmTransport.recv.proxyConnect = shmRecvProxyConnect;
|
||||
shmTransport.recv.proxyFree = shmRecvProxyFree;
|
||||
shmTransport.recv.proxyProgress = shmRecvProxyProgress;
|
||||
}
|
||||
shmLocality = ncclParamShmLocality();
|
||||
@@ -443,3 +506,152 @@ static void initCeOperation() {
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// mmap-based allocation path: ncclShmOpen() is given an empty path buffer and
// the part of the resulting path that follows the "/dev/shm/nccl-" prefix is
// recorded in the descriptor (ncclShmImportShareableBuffer() prepends the same
// prefix again when importing).
static ncclResult_t shmAllocateMmapBuffer(size_t size, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
  char shmPath[SHM_PATH_MAX] = { '\0' };
  desc->shmli.shmSize = size;
  NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
  memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
  desc->legacy = true;
  INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
  return ncclSuccess;
}

#if CUDART_VERSION >= 12020
// Exports `handle` into desc->shmci.handle. Kept in its own function so that a
// driver failure comes back as a return value and the caller can release the
// buffer it has just allocated instead of returning straight out of CUCHECK.
static ncclResult_t shmExportCuMemHandle(ncclShmIpcDesc_t *desc, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType type) {
  CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0));
  return ncclSuccess;
}
#endif /* CUDART_VERSION >= 12020 */

/*
 * Allocate a host buffer of `size` bytes and fill `desc` with what
 * ncclShmImportShareableBuffer() needs to map it elsewhere.
 *
 * tpProxyRank  stored in the descriptor on the cuMem file-descriptor path;
 *              must be >= -1.
 * size         requested buffer size in bytes.
 * legacy       when true the mmap path is used even if cuMem host allocation
 *              is enabled (ignored when built with CUDART_VERSION < 12020,
 *              where only the mmap path exists).
 * desc         [out] descriptor; desc->legacy records which path was taken.
 * hptr         [out] host pointer to the buffer.
 * dptr         [out, optional on the cuMem path] set to the same address as
 *              *hptr there; on the mmap path it is passed to ncclShmOpen().
 *
 * Returns ncclInvalidArgument for a NULL desc/hptr or tpProxyRank < -1,
 * otherwise the status of the underlying allocation calls. If exporting the
 * cuMem handle fails, the freshly allocated buffer is released and *hptr is
 * reset to NULL before the error is returned.
 */
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
    return ncclInvalidArgument;
  }
#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !legacy) {
    // cuMem API support
    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle handle;

    NCCLCHECK(ncclCuMemHostAlloc(hptr, &handle, size));
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      // Return the native cuMem handle for later Export/Import via UDS
      memcpy(&desc->shmci.data, &handle, sizeof(handle));
      desc->shmci.tpProxyRank = tpProxyRank;
    } else {
      ncclResult_t exportRes = shmExportCuMemHandle(desc, handle, type);
      if (exportRes != ncclSuccess) {
        // Nobody else knows about this buffer yet: free it with the same
        // routine ncclShmIpcClose() uses for cuMem buffers.
        (void)ncclCuMemHostFree(*hptr);
        *hptr = NULL;
        return exportRes;
      }
    }
    desc->shmci.size = size;
    desc->shmci.ptr = *hptr;
    if (dptr) *dptr = *hptr;
    desc->legacy = false;
    INFO(NCCL_SHM, "CUMEM allocated shareable buffer %p size %zi", desc->shmci.ptr, desc->shmci.size);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */
  NCCLCHECK(shmAllocateMmapBuffer(size, desc, hptr, dptr));
  return ncclSuccess;
}
|
||||
|
||||
// mmap-based import path: rebuild the path from the suffix stored in the
// descriptor and open the existing segment through ncclShmOpen().
static ncclResult_t shmImportMmapBuffer(ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
  char shmPath[SHM_PATH_MAX];
  // Bounded write: the destination is a fixed-size stack buffer.
  snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
  NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
  descOut->legacy = true;
  INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
  return ncclSuccess;
}

#if CUDART_VERSION >= 12020
// Imports a cuMem allocation handle from a file descriptor. Kept in its own
// function so that a driver failure comes back as a return value and the
// caller still gets to close the descriptor.
static ncclResult_t shmImportHandleFromFd(CUmemGenericAllocationHandle *handle, int fd, CUmemAllocationHandleType type) {
  CUCHECK(cuMemImportFromShareableHandle(handle, (void *)(uintptr_t)fd, type));
  return ncclSuccess;
}
#endif /* CUDART_VERSION >= 12020 */

/*
 * Map a buffer described by `desc` (as filled by
 * ncclShmAllocateShareableBuffer()) into the calling process.
 *
 * comm     communicator; used to fetch a file descriptor through the proxy on
 *          the cuMem file-descriptor path.
 * desc     descriptor of the buffer to import; desc->legacy selects the mmap
 *          path.
 * hptr     [out] host pointer to the mapped buffer.
 * dptr     [out, optional on the cuMem path] set to the same address as *hptr
 *          there; on the mmap path it is passed to ncclShmOpen().
 * descOut  [out] local descriptor (shmci.ptr or shmli.handle, plus legacy),
 *          the fields ncclShmIpcClose() reads.
 *
 * Returns ncclInvalidArgument when comm, desc, hptr or descOut is NULL,
 * otherwise the status of the underlying calls.
 *
 * NOTE(review): on the cuMem path, a failure after the handle has been
 * imported returns without releasing the handle or the reserved address range.
 */
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
  if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
    WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
    return ncclInvalidArgument;
  }
#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    // cuMem API support
    CUdeviceptr hostptr = 0;
    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle handle;
    int cudaDev;
    CUdevice currentDev;
    CUmemAccessDesc accessDesc = {};
    int cpuNumaNodeId;
    size_t granularity;
    size_t size = desc->shmci.size;
    CUmemAllocationProp prop = {};

    // Import and map the remote memory descriptor to the local GPU
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      // UDS fd support
      int fd = -1;
      // Send cuMem handle to remote for conversion to an fd
      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
      ncclResult_t importRes = shmImportHandleFromFd(&handle, fd, type);
      // The descriptor is closed whether or not the import succeeded.
      (void) close(fd);
      NCCLCHECK(importRes);
    } else {
      CUCHECK(cuMemImportFromShareableHandle(&handle, &desc->shmci.handle, type));
    }

    // Get cpu numa id
    CUDACHECK(cudaGetDevice(&cudaDev));
    CUCHECK(cuDeviceGet(&currentDev, cudaDev));
    CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
    if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;

    // Get granularity
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.requestedHandleTypes = type;
    prop.location.id = cpuNumaNodeId;
    CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

    // The mapping size is rounded to the allocation granularity.
    ALIGN_SIZE(size, granularity);

    // Reserve and map address
    CUCHECK(cuMemAddressReserve(&hostptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
    CUCHECK(cuMemMap(hostptr, size, /* offset */ 0, handle, /* flags */ 0));

    // Allow access by the local GPU
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    // Allow access by the local numa
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    accessDesc.location.id = cpuNumaNodeId;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    descOut->shmci.ptr = *hptr = (void *)hostptr;
    descOut->legacy = false;
    if (dptr) *dptr = (void *)hostptr;
    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */
  NCCLCHECK(shmImportMmapBuffer(desc, hptr, dptr, descOut));
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Release a shareable buffer described by `desc`.
 *
 * A NULL descriptor is accepted and treated as nothing to do. When built with
 * CUDART_VERSION >= 12020, cuMem host allocation enabled and desc->legacy
 * false, desc->shmci.ptr is released with ncclCuMemHostFree(); in every other
 * case desc->shmli.handle is closed with ncclShmClose().
 *
 * Returns ncclSuccess, or the error reported by the release call.
 */
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc) {
  if (desc == NULL) return ncclSuccess;

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    NCCLCHECK(ncclCuMemHostFree(desc->shmci.ptr));
    return ncclSuccess;
  }
#endif

  // mmap-backed descriptor (the only kind on older CUDA runtimes)
  NCCLCHECK(ncclShmClose(desc->shmli.handle));
  return ncclSuccess;
}
|
||||
|
||||
// Descriptor for the shared-memory transport: its name ("SHM"), the
// shmCanConnect callback, then one brace-enclosed callback table for the send
// side and one for the recv side. The tables are positional, so which slot
// each entry fills is fixed by the declaration of struct ncclTransport (not
// shown here).
// shmSendProxySetup/shmRecvProxySetup and shmSendProxyFree/shmRecvProxyFree
// are registered statically here. initCeOperation() assigns further send/recv
// proxy callbacks (e.g. proxyConnect, proxyProgress) at runtime when the
// corresponding memcpy mode is enabled, so the NULL slots are not necessarily
// final.
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, shmSendProxySetup, NULL, shmSendProxyFree, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, shmRecvProxySetup, NULL, shmRecvProxyFree, NULL }
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user