Merge remote-tracking branch 'nccl/master' into develop

This commit is contained in:
BertanDogancay
2025-03-27 12:51:55 -05:00
melakukan 0b2062c560
92 mengubah file dengan 7322 tambahan dan 2168 penghapusan
+2
Melihat File
@@ -487,6 +487,7 @@ set(SRC_FILES
src/include/ipcsocket.h
src/include/nccl_common.h
src/include/nccl_net.h
src/include/nccl_profiler.h
src/include/nccl_tuner.h
src/include/net_device.h
src/include/net.h
@@ -504,6 +505,7 @@ set(SRC_FILES
src/include/rocmwrap.h
src/include/roctx.h
src/include/shm.h
src/include/shmutils.h
src/include/signals.h
src/include/socket.h
src/include/strongstream.h
+16
Melihat File
@@ -0,0 +1,16 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Location of the NCCL build tree whose include/ directory is added to the header search path.
NCCL_HOME := ../../build
# Header search path: NCCL build headers, CUDA headers and the local nccl/ directory.
# NOTE(review): CUDA_HOME is not set in this file; presumably it comes from the environment.
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
# Name of the shared object built here; it is also passed to the linker as the soname.
PLUGIN_SO := libnccl-profiler.so
# Default goal: build the plugin shared object.
default: $(PLUGIN_SO)
# Compile and link all plugin sources into a single position-independent shared library.
$(PLUGIN_SO): plugin.c event.c print_event.c
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
# Remove the built shared object.
clean:
rm -f $(PLUGIN_SO)
+30
Melihat File
@@ -0,0 +1,30 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "event.h"
/* Tell whether the task-event queue of group `g` holds no events (1 = empty, 0 = not empty). */
int taskEventQueueEmpty(struct group* g) {
  if (g->eventHead == NULL) {
    return 1;
  }
  return 0;
}
/* Append `event` to the tail of the task-event queue of group `g`. */
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  /* The new element always becomes the last one, so it has no successor. */
  event->next = NULL;

  if (g->eventHead == NULL) {
    /* empty queue: the event is also the head */
    g->eventHead = event;
  } else {
    /* non-empty queue: link it after the current tail */
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
/* Peek at the oldest event queued in group `g` without removing it (NULL when the queue is empty). */
struct taskEventBase* taskEventQueueHead(struct group* g) {
  struct taskEventBase* oldest = g->eventHead;
  return oldest;
}
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
struct taskEventBase* tmp = g->eventHead;
g->eventHead = g->eventHead->next;
if (g->eventHead == NULL) g->eventTail = NULL;
return tmp;
}
+167
Melihat File
@@ -0,0 +1,167 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef EVENT_H_
#define EVENT_H_
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include "profiler.h"
// Capacity of the per-collective send/recv proxyOp arrays.
#define MAX_CHANNELS 32
// Number of proxyStep slots kept per proxyOp (steps are stored modulo this value).
#define MAX_STEPS 16

// First enum value of each state family; used to rebase a state into a 0-based array index.
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)

// Number of states in each family (inclusive range first..last).
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)

// Map a state to its 0-based index inside its family. The argument is
// parenthesized so that expressions such as `a ? b : c` expand correctly.
#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET)
#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET)
#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)

// Array sizes large enough for either the send or the recv family.
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
#define MAX_COMM_CLIQUES (32 * 8)
struct proxyOp;
// Event describing one network transfer ("step") issued on behalf of a proxy operation.
// The leading `type` byte must stay first: the plugin reads it through an opaque
// handle to find out which kind of event it was given.
struct proxyStep {
uint8_t type; // type of event: network transfer
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
// per-state timestamps, indexed with PROXY_STEP_SEND_STATE_IDX / PROXY_STEP_RECV_STATE_IDX
double timestamp[MAX_PROXY_STEP_STATES];
double startTs; // time at which the step event was started
double stopTs; // time at which the step event was stopped
struct proxyOp* parent; // proxy operation this step belongs to
};
// Event describing one proxy operation (a send or recv on one channel) together
// with the network transfer steps recorded for it.
// The leading `type` byte must stay first: it is read through opaque handles.
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
pid_t pid; // pid reported in the event descriptor for this proxy operation
int rank; // rank reported in the event descriptor
int peer; // peer rank for this proxy operation
int nSteps; // total number of network transfers for this proxy operation
int chunkSize; // chunk size for this proxy operation
int isSend; // send/recv channel operation
size_t transSize; // transfer data size for this proxy operation
// one slot per proxy-op state, indexed with PROXY_OP_SEND_STATE_IDX / PROXY_OP_RECV_STATE_IDX
struct {
int steps; // completed steps for this proxy operation state
double timestamp; // time at which the state was last recorded
} states[MAX_PROXY_OP_STATES];
double startTs; // time at which the proxy operation event was started
double stopTs; // time at which the proxy operation event was stopped
int stepCount; // last processed network operation for this proxy operation
struct proxyStep step[MAX_STEPS]; // array of network transfer events
struct taskEventBase* parent; // parent event p2p/collective
};
struct group;
struct context;
// Event describing a proxy control state (idle/active/sleep/wakeup/append).
// The leading `type` byte must stay first: it is read through opaque handles.
struct proxyCtrl {
uint8_t type; // type of event: proxy control
struct context* ctx; // profiler context
double startTs; // time at which the control event was started
double stopTs; // time at which the control event was stopped
int state; // last recorded proxy control state
int appended; // appended proxy operations
};
// task level event base structure
// Embedded as the first member of `struct collective` and `struct p2p`, so a
// pointer to either can be handled as a `struct taskEventBase*`.
struct taskEventBase {
uint8_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* name; // FIXME: unused
uint64_t commHash; // communicator identifier
uint8_t func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
double startTs; // time at which the task event was started
double stopTs; // time at which the task event was stopped
};
// Event describing one collective call, plus the per-channel send/recv proxy
// operations attached to it. `base` must stay the first member so the object
// can be used wherever a `struct taskEventBase*` is expected.
struct collective {
struct taskEventBase base; // base structure for this event
uint64_t seqNumber; // sequence number for this collective in communicator
// the fields below are copied from the collective event descriptor
void const* sendBuff;
void* recvBuff;
size_t count;
size_t trafficBytes;
int root;
uint8_t datatype;
uint8_t nMaxChannels;
uint8_t algo;
uint8_t proto;
int op;
int nWarps;
int isCollnet;
int isNvls;
struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
};
// Event describing one point-to-point call and the single proxy operation
// attached to it. `base` must stay the first member so the object can be used
// wherever a `struct taskEventBase*` is expected.
struct p2p {
struct taskEventBase base; // base structure for this event
uint8_t func; // NOTE(review): base.func also holds the function id — confirm which one readers use
void const* buff; // user buffer taken from the event descriptor
size_t count; // element count taken from the event descriptor
uint8_t datatype; // datatype id taken from the event descriptor
int peer; // peer rank taken from the event descriptor
struct proxyOp op; // proxy operation event for this p2p
};
// Event describing a group of task events (collectives/p2ps) kept in a singly
// linked queue. The leading `type` byte must stay first: it is read through
// opaque handles.
struct group {
uint8_t type; // type of event: group
struct context* ctx; // profiler context
int groupId; // id assigned to this group from the context's group pool index
int refCount; // number of references keeping this group event open
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
double startTs; // time at which the group event was started
double stopTs; // time at which the group event was stopped
struct group* next; // next group event in queue
};
// arrays for different event objects
// Each pool is described by three counters and a storage pointer:
//   *PoolSize  - declared capacity field
//   *PoolBase  - count of events that have been released back to the pool
//   *PoolIndex - count of events handed out so far (slot = index % pool size)
//   *Pool      - backing array of event objects
struct context {
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
struct group* groupPool;
int collPoolSize;
int collPoolBase;
int collPoolIndex;
struct collective* collPool;
int p2pPoolSize;
int p2pPoolBase;
int p2pPoolIndex;
struct p2p* p2pPool;
int proxyCtrlPoolSize;
int proxyCtrlPoolBase;
int proxyCtrlPoolIndex;
struct proxyCtrl* proxyCtrlPool;
};
// FIFO queue of task events owned by a group (implemented in event.c).
// Returns non-zero when the group's queue holds no events.
int taskEventQueueEmpty(struct group* g);
// Appends `event` at the tail of the group's queue.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// Returns the oldest queued event without removing it (NULL when empty).
struct taskEventBase* taskEventQueueHead(struct group* g);
// Removes and returns the oldest queued event; callers should check
// taskEventQueueEmpty() before calling.
struct taskEventBase* taskEventQueueDequeue(struct group* g);
#endif
+15
Melihat File
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
/* Severity levels understood by the NCCL debug logger. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

/* Subsystem flags for the NCCL debug logger; each subsystem is a distinct bit. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/* printf-style logging callback: level, subsystem flags, source location, format and arguments. */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
+19
Melihat File
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Result codes returned by plugin entry points (values are fixed, 0 means success). */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;
#endif
+18
Melihat File
@@ -0,0 +1,18 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
#include "err.h"
#include "profiler_v1.h"
#endif // end include guard
+150
Melihat File
@@ -0,0 +1,150 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_V1_H_
#define NCCL_PROFILER_V1_H_
#include <stdint.h>
/* Event type descriptors: one bit per event kind so they can be OR-ed into an activation mask. */
enum {
  ncclProfileGroup     = 0x01, // group event type
  ncclProfileColl      = 0x02, // host collective call event type
  ncclProfileP2p       = 0x04, // host point-to-point call event type
  ncclProfileProxyOp   = 0x08, // proxy operation event type
  ncclProfileProxyStep = 0x10, // proxy step event type
  ncclProfileProxyCtrl = 0x20, // proxy control event type
  ncclProfileNumEvents = 6,    // number of event types listed above
};
// Event descriptor handed to the profiler's startEvent callback.
// `type` selects which member of the anonymous union is meaningful.
// NOTE(review): this header only includes <stdint.h>; size_t and pid_t must be
// made available by an earlier include in the including file.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// attributes of a collective call (type == ncclProfileColl)
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
// attributes of a point-to-point call (type == ncclProfileP2p)
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
// attributes of a proxy operation (type == ncclProfileProxyOp)
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// attributes of a proxy step (type == ncclProfileProxyStep)
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
/* State transitions that can be recorded on an event; the numbering is contiguous from 0. */
typedef enum {
  /* proxy operation states: send side */
  ncclProfilerProxyOpSendPosted       = 0,
  ncclProfilerProxyOpSendRemFifoWait  = 1,
  ncclProfilerProxyOpSendTransmitted  = 2,
  ncclProfilerProxyOpSendDone         = 3,
  /* proxy operation states: recv side */
  ncclProfilerProxyOpRecvPosted       = 4,
  ncclProfilerProxyOpRecvReceived     = 5,
  ncclProfilerProxyOpRecvTransmitted  = 6,
  ncclProfilerProxyOpRecvDone         = 7,

  /* Legacy proxy profiler states */
  ncclProfilerProxyStepSendGPUWait    = 8,
  ncclProfilerProxyStepSendWait       = 9,
  ncclProfilerProxyStepRecvWait       = 10,
  ncclProfilerProxyStepRecvFlushWait  = 11,
  ncclProfilerProxyStepRecvGPUWait    = 12,

  /* Legacy proxy control states */
  ncclProfilerProxyCtrlIdle           = 13,
  ncclProfilerProxyCtrlActive         = 14,
  ncclProfilerProxyCtrlSleep          = 15,
  ncclProfilerProxyCtrlWakeup         = 16,
  ncclProfilerProxyCtrlAppend         = 17,
  ncclProfilerProxyCtrlAppendEnd      = 18,
} ncclProfilerEventState_v1_t;
// Optional attribute updates passed along with a recorded state transition.
// Which member applies depends on the kind of event the state is recorded on.
typedef union {
// updates for proxy operation events
struct {
size_t transSize; // transfer size reported with the state transition
int steps; // step count reported with the state transition
} proxyOp;
// updates for proxy control events
struct {
int appendedProxyOps; // number of appended proxy operations
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Profiler plugin interface (version 1): a name plus the callbacks a plugin exposes.
// Every callback returns an ncclResult_t.
typedef struct {
// name of the profiler plugin
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the event set
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
// Unversioned aliases: code written against these names uses the v1 definitions above.
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v1_t ncclProfiler_t;
#endif
+21
Melihat File
@@ -0,0 +1,21 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Element data types; several names are aliases that share one numeric value. */
typedef enum {
  ncclInt8     = 0,
  ncclChar     = 0,  /* alias of ncclInt8 */
  ncclUint8    = 1,
  ncclInt32    = 2,
  ncclInt      = 2,  /* alias of ncclInt32 */
  ncclUint32   = 3,
  ncclInt64    = 4,
  ncclUint64   = 5,
  ncclFloat16  = 6,
  ncclHalf     = 6,  /* alias of ncclFloat16 */
  ncclFloat32  = 7,
  ncclFloat    = 7,  /* alias of ncclFloat32 */
  ncclFloat64  = 8,
  ncclDouble   = 8,  /* alias of ncclFloat64 */
  ncclBfloat16 = 9,
} ncclDataType_t;
#endif
+492
Melihat File
@@ -0,0 +1,492 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <x86intrin.h>
#include "event.h"
#include "print_event.h"
#define __hidden __attribute__ ((visibility("hidden")))
static int initialized; // initialization counter for profiler
static double startTime; // profiler start time
// Default capacities of the event pools; exampleProfilerInit overrides them
// from the NCCL_PROFILE_*_POOL_SIZE environment variables when those are set.
static int groupPoolSize = 16;
static int collPoolSize = 16;
static int p2pPoolSize = 1024;
static int proxyCtrlPoolSize = 16;
static int detachPoolSize = 128;
static int detachPoolBase; // detach pool index below which slots are considered released
static int detachPoolIndex; // count of detach pool slots handed out (slot = index % detachPoolSize)
static int detachPoolDone; // detached proxyOps completed since detachPoolBase was last advanced
static struct proxyOp* detachPool; // pool of proxyOp events whose pid differs from this process (PXN)
static double freq = -1; // timestamp counter ticks per microsecond; -1 until calibrate() runs
/*
 * Estimate how many timestamp-counter ticks elapse per microsecond and store
 * the result in `freq`. The estimate is the tick delta divided by the
 * gettimeofday() delta measured around a short loop of counter reads.
 */
__hidden void calibrate() {
  struct timeval wall;
  uint64_t spin = 0ULL;

  // first sample: wall clock (negated, in microseconds) and tick counter
  gettimeofday(&wall, NULL);
  uint64_t ticks = __rdtsc();
  double micros = - wall.tv_sec*1e6 - wall.tv_usec;

  // let some time pass so that both deltas are non-trivial
  int iter = 0;
  while (iter < 10000) {
    spin += __rdtsc();
    iter++;
  }

  // second sample: turn both values into elapsed amounts
  gettimeofday(&wall, NULL);
  ticks = __rdtsc() - ticks;
  micros += wall.tv_sec*1e6 + wall.tv_usec;

  freq = ticks / micros;
}
/* Current timestamp-counter reading converted with `freq` (ticks per microsecond). */
__hidden double gettime(void) {
  const uint64_t ticks = __rdtsc();
  return ticks / freq;
}
// Serializes the first-time setup performed inside exampleProfilerInit.
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
// Pid stored by the first exampleProfilerInit call; proxyOp events carrying a
// different pid are treated as detached (PXN) events.
static pid_t pid;
/* Return the strictly positive integer stored in environment variable `name`,
 * or `current` when the variable is unset or does not hold a positive value.
 * Pool sizes are used as divisors (index % size), so zero must never get through. */
static int poolSizeFromEnv(const char* name, int current) {
  const char* value = getenv(name);
  if (value == NULL) return current;
  int parsed = atoi(value);
  return (parsed > 0) ? parsed : current;
}

/*
 * init - create a profiler context.
 *
 * The first caller also performs the process-wide setup under `lock`: event
 * activation mask, pool sizes from the environment, the shared detach pool,
 * the reference pid and the timer calibration. Every caller then gets its own
 * context with pre-allocated event pools.
 *
 * Output
 *  - context        : newly allocated profiler context (only written on success)
 *  - eActivationMask: event mask, written by the first caller only
 * Returns ncclSuccess, or ncclSystemError when an allocation fails. On failure
 * the reference taken on the shared state is dropped again, and the shared
 * detach pool is released only if no other context still holds a reference.
 */
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
    __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED);
    const char* mask = getenv("NCCL_PROFILE_EVENT_MASK");
    if (mask) {
      __atomic_store_n(eActivationMask, atoi(mask), __ATOMIC_RELAXED);
    }
    groupPoolSize = poolSizeFromEnv("NCCL_PROFILE_GROUP_POOL_SIZE", groupPoolSize);
    collPoolSize = poolSizeFromEnv("NCCL_PROFILE_COLL_POOL_SIZE", collPoolSize);
    p2pPoolSize = poolSizeFromEnv("NCCL_PROFILE_P2P_POOL_SIZE", p2pPoolSize);
    proxyCtrlPoolSize = poolSizeFromEnv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE", proxyCtrlPoolSize);
    detachPoolSize = poolSizeFromEnv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE", detachPoolSize);

    // detach pool is used to store PXN proxyOps and is shared among threads
    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
    if (detachPool == NULL) {
      // undo the reference so that a later init attempt redoes the setup
      __atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED);
      pthread_mutex_unlock(&lock);
      return ncclSystemError;
    }

    // Pid of the process initializing the profiler first.
    // This is compared against the pid of proxyOp events
    // to figure out if they have a parent event in this
    // process address space.
    pid = getpid();

    // calibrate and start timer
    calibrate();
    startTime = gettime();
  }
  pthread_mutex_unlock(&lock);

  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
  if (ctx == NULL) goto fail;

  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
  if (ctx->groupPool == NULL) goto fail;

  ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
  if (ctx->collPool == NULL) goto fail;

  ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
  if (ctx->p2pPool == NULL) goto fail;

  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
  if (ctx->proxyCtrlPool == NULL) goto fail;

  *context = ctx;
  return ncclSuccess;

fail:
  // cleanup the partially built context (free(NULL) is a no-op)
  if (ctx) {
    free(ctx->proxyCtrlPool);
    free(ctx->p2pPool);
    free(ctx->collPool);
    free(ctx->groupPool);
    free(ctx);
  }
  // drop our reference on the shared state; only the last holder may free the
  // detach pool, since other contexts may still be recording events into it
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) == 1) {
    free(detachPool);
    detachPool = NULL;
  }
  pthread_mutex_unlock(&lock);
  return ncclSystemError;
}
/*
 * finalize - dump the recorded events and release a profiler context.
 *
 * When NCCL_PROFILE_DUMP_FILE is set, events are written to
 * "<dump>-<hostname>-<tid>.txt". The dump is skipped (events are still
 * released) when the file name does not fit in PATH_MAX or the file cannot be
 * opened. The last context to be finalized also prints and frees the shared
 * detach pool.
 *
 * Input
 *  - context: profiler context created by exampleProfilerInit
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  char hostname[64] = { 0 };
  // leave room for the terminator: gethostname may not add one when truncating
  gethostname(hostname, sizeof(hostname) - 1);
  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    int len = snprintf(filename, sizeof(filename), "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid));
    if (len > 0 && (size_t)len < sizeof(filename)) {
      fh = fopen(filename, "w");
    }
    if (fh) fprintf(fh, "[\n");
  }

  // print last N groups/collectives/p2ps
  // NOTE(review): printEvent is called with fh == NULL when no dump file is
  // open, exactly as before; it is presumably a no-op in that case.
  struct context* ctx = (struct context *)context;
  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
  int end = ctx->groupPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
  }

  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
  end = ctx->proxyCtrlPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
  }

  free(ctx->groupPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
  free(ctx);

  // last thread cleans up shared detach pool
  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
    end = detachPoolIndex;
    for (int i = start; i < end; i++) {
      printEvent(fh, &detachPool[i%detachPoolSize]);
    }
    free(detachPool);
    detachPool = NULL;  // do not leave a dangling pointer behind
  }

  if (fh) {
    fprintf(fh, "{}]\n");
    fclose(fh);
  }

  return ncclSuccess;
}
// Forward declaration: updateEvent is defined after exampleProfilerStartEvent.
__hidden void updateEvent(void* handle);
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
*eHandle = NULL;
struct context* ctx = (struct context *)context;
if (eDescr->type == ncclProfileGroup) {
struct group* event;
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
// if there are available group events grab one
event = &ctx->groupPool[groupId%groupPoolSize];
while (!taskEventQueueEmpty(event)) {
struct taskEventBase* base = taskEventQueueDequeue(event);
if (base->type == ncclProfileColl) {
struct collective* c = (struct collective *)base;
// reset event proxyOps & proxySteps
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
// release collective events in the group and return them to the collective pool
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
} else if (base->type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)base;
// reset event proxyOp and proxySteps
memset(&p->op, 0, sizeof(struct proxyOp));
// release p2p events in the group and return them to the p2p pool
__atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
}
}
} else {
// else drop this event
__atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileGroup;
__atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED);
event->ctx = ctx;
event->groupId = groupId;
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "GroupStart");
} else if (eDescr->type == ncclProfileColl) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct collective* event;
int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
// if there are available collective events grab one
event = &ctx->collPool[collId%collPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->base.type = ncclProfileColl;
event->base.rank = eDescr->rank;
event->base.name = eDescr->coll.name;
event->base.commHash = eDescr->coll.commHash;
event->base.func = eDescr->coll.func;
event->base.startTs = gettime() - startTime;
event->base.parent = parent;
event->seqNumber = eDescr->coll.seqNumber;
event->sendBuff = eDescr->coll.sendBuff;
event->recvBuff = eDescr->coll.recvBuff;
event->count = eDescr->coll.count;
event->root = eDescr->coll.root;
event->datatype = eDescr->coll.datatype;
event->op = eDescr->coll.op;
event->trafficBytes = eDescr->coll.trafficBytes;
event->nMaxChannels = eDescr->coll.nMaxChannels;
event->nWarps = eDescr->coll.nWarps;
event->algo = eDescr->coll.algo;
event->proto = eDescr->coll.proto;
event->isCollnet = eDescr->coll.isCollnet;
event->isNvls = eDescr->coll.isNvls;
*eHandle = event;
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
// increment the group ref counter so the event will staty open
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "CollStart");
} else if (eDescr->type == ncclProfileP2p) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct p2p* event;
int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
// if there are available p2p events grab one
event = &ctx->p2pPool[p2pId%p2pPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->base.type = ncclProfileP2p;
event->base.rank = eDescr->rank;
event->base.name = eDescr->p2p.name;
event->base.commHash = eDescr->p2p.commHash;
event->base.func = eDescr->p2p.func;
event->base.next = parent->eventHead;
event->base.startTs = gettime() - startTime;
event->base.parent = parent;
event->buff = eDescr->p2p.buff;
event->count = eDescr->p2p.count;
event->datatype = eDescr->p2p.datatype;
event->peer = eDescr->p2p.peer;
*eHandle = event;
// increment the group ref counter so the event will staty open
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "P2pStart");
} else if (eDescr->type == ncclProfileProxyCtrl) {
int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize];
event->type = ncclProfileProxyCtrl;
event->ctx = ctx;
event->startTs = gettime() - startTime;
*eHandle = event;
} else if (eDescr->type == ncclProfileProxyOp) {
// the eventBase might be null if we run out of events
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
if (eventBase == NULL) return ncclSuccess;
if (eDescr->proxyOp.pid != pid) {
// PXN captured proxyOp events
struct proxyOp* event;
int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
if ((detachId - detachPoolBase) < detachPoolSize) {
// if there are available detached proxyOp events grab one
event = &detachPool[detachId%detachPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->startTs = gettime() - startTime;
event->parent = NULL;
*eHandle = event;
debugEvent(event, "PxnProxyOpStart");
return ncclSuccess;
}
if (eventBase->type == ncclProfileColl) {
struct collective* parent = (struct collective *)eDescr->parentObj;
struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId];
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "ProxyOpStart");
} else { // ncclProfileP2p
struct p2p* parent = (struct p2p *)eDescr->parentObj;
struct proxyOp* event = &parent->op;
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "ProxyOpStart");
}
} else if (eDescr->type == ncclProfileProxyStep) {
// the parent might be null if we run out of events
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
int s = parent->stepCount++ % MAX_STEPS;
struct proxyStep* event = &parent->step[s];
event->type = ncclProfileProxyStep;
event->step = eDescr->proxyStep.step;
event->isSend = parent->isSend;
event->parent = parent;
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "ProxyStepStart");
}
return ncclSuccess;
}
// Complete (or drop one reference on) the event behind `handle` and propagate
// the completion to its parent event where there is one. The kind of event is
// read from the first byte of the object, which every event struct stores as
// its `type` field. `handle` must not be NULL.
void updateEvent(void* handle) {
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
struct group* event = (struct group *)handle;
// only the call that releases the last reference closes the group
if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) {
event->stopTs = gettime() - startTime;
// return group event to the pool
__atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
}
debugEvent(event, "GroupStop");
} else if (type == ncclProfileColl) {
struct collective* event = (struct collective *)handle;
// last reference: stamp the stop time and release one reference on the parent group
if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
event->base.stopTs = gettime() - startTime;
debugEvent(event, "CollStop");
updateEvent(event->base.parent);
return;
}
debugEvent(event, "CollStop");
} else if (type == ncclProfileP2p) {
struct p2p* event = (struct p2p *)handle;
// last reference: stamp the stop time and release one reference on the parent group
if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) {
event->base.stopTs = gettime() - startTime;
debugEvent(event, "P2pStop");
updateEvent(event->base.parent);
return;
}
debugEvent(event, "P2pStop");
} else if (type == ncclProfileProxyOp) {
struct proxyOp* event = (struct proxyOp *)handle;
event->stopTs = gettime() - startTime;
if (event->pid != pid) {
// only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1;
// once a full pool's worth of detached events completed, release them all at once
if (done == detachPoolSize) {
// reset the event completed (done) counter
__atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
// update the base pointer to the top of the pool
int index = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
__atomic_store_n(&detachPoolBase, index, __ATOMIC_RELAXED);
}
debugEvent(event, "ProxyOpStop");
return;
}
// proxyOp owned by this process: propagate completion to the parent collective/p2p
updateEvent(event->parent);
debugEvent(event, "ProxyOpStop");
} else if (type == ncclProfileProxyStep) {
struct proxyStep* event = (struct proxyStep *)handle;
event->stopTs = gettime() - startTime;
debugEvent(event, "ProxyStepStop");
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* event = (struct proxyCtrl *)handle;
event->stopTs = gettime() - startTime;
debugEvent(event, "ProxyCtrlStop");
}
}
/*
 * stopEvent - stop the event behind eHandle.
 *
 * Group and collective events are only time-stamped here: stopping them in
 * NCCL core means they were submitted/enqueued, not that they completed, so
 * they have to stay open. Every other event kind is completed through
 * updateEvent. A NULL handle (dropped event) is ignored.
 */
__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

  const uint8_t kind = *(uint8_t *)eHandle;
  switch (kind) {
    case ncclProfileGroup: {
      // keep the group open, just record when it was submitted
      struct group* grp = (struct group *)eHandle;
      grp->stopTs = gettime() - startTime;
      break;
    }
    case ncclProfileColl: {
      // keep the collective open, just record when it was submitted
      struct collective* coll = (struct collective *)eHandle;
      coll->base.stopTs = gettime() - startTime;
      break;
    }
    default:
      updateEvent(eHandle);
      break;
  }
  return ncclSuccess;
}
/*
 * recordEventState - record a state transition on an event.
 *
 * Input
 *  - eHandle   : event handle returned by startEvent (NULL handles are ignored)
 *  - eState    : state transition to record
 *  - eStateArgs: optional attribute updates; proxy operation states need them
 *                and are ignored when it is NULL
 *
 * States that do not map to a slot of the event's state array (for instance a
 * recv state recorded on a send event) are ignored instead of being written
 * outside the array. Always returns ncclSuccess.
 */
__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
  // the event handle might be null if we run out of events
  if (eHandle == NULL) return ncclSuccess;

  debugEvent(eHandle, "RecordEventState");
  uint8_t type = *(uint8_t *)eHandle;
  // signed copy of the state so that rebasing below can yield a negative index
  int state = (int)eState;
  if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    if (eStateArgs == NULL) return ncclSuccess;
    int idx = event->isSend ? PROXY_OP_SEND_STATE_IDX(state) : PROXY_OP_RECV_STATE_IDX(state);
    if (idx < 0 || idx >= MAX_PROXY_OP_STATES) return ncclSuccess;
    // a repeated RemFifoWait with an unchanged step count carries no new information
    if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == event->states[idx].steps) return ncclSuccess;
    event->states[idx].steps = eStateArgs->proxyOp.steps;
    event->states[idx].timestamp = gettime() - startTime;
    event->transSize = eStateArgs->proxyOp.transSize;
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)eHandle;
    int idx = event->isSend ? PROXY_STEP_SEND_STATE_IDX(state) : PROXY_STEP_RECV_STATE_IDX(state);
    if (idx < 0 || idx >= MAX_PROXY_STEP_STATES) return ncclSuccess;
    event->timestamp[idx] = gettime() - startTime;
  } else if (type == ncclProfileProxyCtrl) {
    struct proxyCtrl* event = (struct proxyCtrl *)eHandle;
    if (eState == ncclProfilerProxyCtrlAppendEnd && eStateArgs != NULL) {
      event->appended = eStateArgs->proxyCtrl.appendedProxyOps;
    }
    event->state = eState;
  }
  return ncclSuccess;
}
ncclProfiler_v1_t ncclProfiler_v1 = {
"Example-profiler",
exampleProfilerInit,
exampleProfilerStartEvent,
exampleProfilerStopEvent,
exampleProfilerRecordEventState,
exampleProfilerFinalize,
};
+277
Melihat File
@@ -0,0 +1,277 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "profiler.h"
#include "event.h"
#include "print_event.h"
#define __hidden __attribute__ ((visibility("hidden")))
// Map a collective/p2p function index to its printable name.
// Returns NULL when func is outside the known range 0..7.
__hidden const char* ncclFuncToString(int func) {
  static const char* const names[] = {
    "ncclBroadcast",
    "ncclReduce",
    "ncclAllGather",
    "ncclReduceScatter",
    "ncclAllReduce",
    "ncclSendRecv",
    "ncclSend",
    "ncclRecv",
  };
  const int known = (int)(sizeof(names) / sizeof(names[0]));

  if (func < 0 || func >= known) return NULL;
  return names[func];
}
// Map an algorithm index to its printable name.
// The result is always a valid string: values without a known name map to
// "Unknown" instead of falling off the end of the function, because the
// result is handed straight to a "%s" conversion by the callers.
__hidden const char* ncclAlgoToString(int algo) {
  switch(algo) {
    case 0:
      return "Tree";
    case 1:
      return "Ring";
    case 2:
      return "CollnetDirect";
    case 3:
      return "CollnetChain";
    case 4:
      return "Nvls";
    case 5:
      return "NvlsTree";
    case 6:
      // PAT algorithm (index 6).
      return "Pat";
    default:
      return "Unknown";
  }
}
// Map a protocol index to its printable name.
// The result is always a valid string: values without a known name map to
// "Unknown" instead of falling off the end of the function, because the
// result is handed straight to a "%s" conversion by the callers.
__hidden const char* ncclProtoToString(int proto) {
  switch(proto) {
    case 0:
      return "LL";
    case 1:
      return "LL128";
    case 2:
      return "Simple";
    default:
      return "Unknown";
  }
}
// FIXME: chrome tracing asynchronous events (used below) allow event nesting for events that have the same id and category.
// It appears that nesting more than three events causes issues. Therefore, every event is given an increasing id and a
// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET).
// Per-thread id shared by the begin/end records of the group being printed.
static __thread int groupId;

// Write the "begin" (ph = b) trace record of a group event to fh.
// The id is not advanced here; the matching trailer does that.
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, "
          "\"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"groupId\": %d}},\n",
          "Group",
          groupId,
          getpid(),
          1,
          event->startTs,
          event->groupId);
}
// Write the "end" (ph = e) trace record of a group event to fh and advance
// the per-thread group id so the next group gets a fresh one.
__hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
  const int id = groupId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          "Group", id, getpid(), 1, event->stopTs);
}
// Per-thread id shared by the begin/end records of the collective being printed.
static __thread int collId;

// Write the "begin" (ph = b) trace record of a collective event to fh,
// including its sequence number, communicator hash, rank, element count,
// datatype, algorithm, protocol and channel count as args.
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const char* algoName = ncclAlgoToString(event->algo);
  const char* protoName = ncclProtoToString(event->proto);

  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, "
          "\"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
          funcName, collId, getpid(), 1, event->base.startTs,
          event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype,
          algoName, protoName, event->nMaxChannels);
}
// Write the "end" (ph = e) trace record of a collective event to fh and
// advance the per-thread collective id.
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const int id = collId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          funcName, id, getpid(), 1, event->base.stopTs);
}
// Per-thread id shared by the begin/end records of the p2p event being printed.
static __thread int p2pId;

// Write the "begin" (ph = b) trace record of a point-to-point event to fh,
// with communicator hash, rank, peer, element count and datatype as args.
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
  const char* funcName = ncclFuncToString(event->base.func);

  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
          funcName, p2pId, getpid(), 1, event->base.startTs,
          event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
}
// Write the "end" (ph = e) trace record of a point-to-point event to fh and
// advance the per-thread p2p id.
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const int id = p2pId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          funcName, id, getpid(), 1, event->base.stopTs);
}
static __thread int proxyOpId;
__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
if (event->isSend) {
int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
} else {
int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
}
}
// Write the "end" (ph = e) trace record of a proxy operation to fh and
// advance the per-thread proxy-op id.
__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
  const char* direction = event->isSend ? "Send" : "Recv";
  const int id = proxyOpId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          direction, id, getpid(), 1, event->stopTs);
}
// Per-thread id shared by all records of the proxy step being printed.
static __thread int proxyStepId;

// Write one begin/end pair of NET trace records named `name` to fh.
// Both records carry the same id; the begin record also carries the step.
static void printNetPhase(FILE* fh, const char* name, int id, double beginTs, double endTs, int step) {
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
          name, id, getpid(), 1, beginTs, step);
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          name, id, getpid(), 1, endTs);
}

// Print a proxy step as a chain of back-to-back phases: each phase starts
// where the previous one ended, the first at startTs and the last ending at
// stopTs. Send steps have three phases, receive steps four. The per-thread
// step id is advanced once all phases have been written.
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
  const int id = proxyStepId;

  if (event->isSend) {
    const double gpuWaitTs  = event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)];
    const double sendWaitTs = event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)];

    printNetPhase(fh, "SendBufferWait", id, event->startTs, gpuWaitTs, event->step);
    printNetPhase(fh, "SendGpuWait", id, gpuWaitTs, sendWaitTs, event->step);
    printNetPhase(fh, "SendWait", id, sendWaitTs, event->stopTs, event->step);
  } else {
    const double recvWaitTs  = event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)];
    const double flushWaitTs = event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)];
    const double gpuWaitTs   = event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)];

    printNetPhase(fh, "RecvBufferWait", id, event->startTs, recvWaitTs, event->step);
    printNetPhase(fh, "RecvWait", id, recvWaitTs, flushWaitTs, event->step);
    printNetPhase(fh, "RecvFlushWait", id, flushWaitTs, gpuWaitTs, event->step);
    printNetPhase(fh, "RecvGpuWait", id, gpuWaitTs, event->stopTs, event->step);
  }

  proxyStepId++;
}
// Per-thread id shared by the begin/end records of the proxy ctrl event being printed.
static __thread int proxyCtrlId;

// Write the begin/end trace records of a proxy control event to fh.
// The record name is derived from the last recorded state (Idle, Sleep or
// Append); when the event ended an append, the begin record also reports how
// many proxy ops were appended. The per-thread ctrl id is advanced afterwards.
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
  // Fallback label: a state matching none of the branches below must not
  // leave the name uninitialized, since it is passed to "%s".
  const char* str = "Unknown";
  if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) {
    str = "Idle";
  } else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) {
    str = "Sleep";
  } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
    str = "Append";
  }
  if (event->state == ncclProfilerProxyCtrlAppendEnd) {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
            str, proxyCtrlId, getpid(), 1, event->startTs, event->appended);
  } else {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            str, proxyCtrlId, getpid(), 1, event->startTs);
  }
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          str, proxyCtrlId++, getpid(), 1, event->stopTs);
}
//#define DEBUG_EVENTS
// Debugging aid: when DEBUG_EVENTS is defined, append a human-readable dump of
// the event behind eHandle, labelled with tag, to the file "EventDebug-<pid>".
// Without DEBUG_EVENTS this function does nothing.
// The dump is best-effort: a NULL handle or a file that cannot be opened is
// silently skipped so that debugging never takes the profiler down.
void debugEvent(void* eHandle, const char* tag) {
#ifdef DEBUG_EVENTS
  if (eHandle == NULL) return;
  char filename[64] = { 0 };
  snprintf(filename, sizeof(filename), "EventDebug-%d", getpid());
  FILE* fh = fopen(filename, "a+");
  if (fh == NULL) return;
  // The first byte of every event object identifies its type.
  uint8_t type = *(uint8_t *)eHandle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    fprintf(fh, "Group event %p tag = %s {\n", event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileColl) {
    struct collective* event = (struct collective *)eHandle;
    fprintf(fh, "Collective event %p tag = %s {\n", event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", event->base.parent);
    // Only channel slots that actually hold a proxy op are listed.
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]);
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileP2p) {
    struct p2p* event = (struct p2p *)eHandle;
    fprintf(fh, "P2p event %p tag = %s {\n", event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", event->base.parent);
    fprintf(fh, " op = %p\n", &event->op);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " channel = %d\n", event->channelId);
    fprintf(fh, " parent = %p\n", event->parent);
    fprintf(fh, " rank = %d\n", event->rank);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)eHandle;
    fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " parent = %p\n", event->parent);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  }
  fclose(fh);
#else
  // Debug dumps are compiled out; keep the parameters formally used.
  (void)eHandle;
  (void)tag;
#endif
}
void printEvent(FILE* fh, void* handle) {
if (handle == NULL || fh == NULL) return;
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
struct group* g = (struct group *)handle;
printGroupEventHeader(fh, g);
struct taskEventBase* base = taskEventQueueHead(g);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printGroupEventTrailer(fh, g);
} else if (type == ncclProfileColl) {
struct collective* c = (struct collective *)handle;
printCollEventHeader(fh, c);
for (int i = 0; i < MAX_CHANNELS; i++) {
printEvent(fh, &c->send[i]);
printEvent(fh, &c->recv[i]);
}
printCollEventTrailer(fh, c);
} else if (type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)handle;
printP2pEventHeader(fh, p);
printEvent(fh, &p->op);
printP2pEventTrailer(fh, p);
} else if (type == ncclProfileProxyOp) {
struct proxyOp* p = (struct proxyOp *)handle;
printProxyOpEventHeader(fh, p);
for (int i = 0; i < MAX_STEPS; i++) {
printEvent(fh, &p->step[i]);
}
printProxyOpEventTrailer(fh, p);
} else if (type == ncclProfileProxyStep) {
struct proxyStep* p = (struct proxyStep *)handle;
printProxyStepEvent(fh, p);
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* p = (struct proxyCtrl *)handle;
printProxyCtrlEvent(fh, p);
}
return;
}
+13
Melihat File
@@ -0,0 +1,13 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PRINT_EVENT_H_
#define PRINT_EVENT_H_

// FILE is used in the prototypes below; include it here so this header can be
// included on its own, regardless of include order.
#include <stdio.h>

// Debugging aid: dumps the event behind eHandle, labelled with tag, when the
// implementation is built with DEBUG_EVENTS; otherwise does nothing.
void debugEvent(void* eHandle, const char* tag);

// Writes the event behind handle, and the events nested in it, to fh as trace
// records. Does nothing when fh or handle is NULL.
void printEvent(FILE* fh, void* handle);

#endif
+2 -1
Melihat File
@@ -27,7 +27,7 @@ typedef enum {
ncclNumFuncs = 8
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
@@ -35,6 +35,7 @@ typedef enum {
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_ALGO_PAT 6
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
+7
Melihat File
@@ -10,6 +10,7 @@ VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
ASAN ?= 0
UBSAN ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
@@ -93,6 +94,12 @@ LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif
ifneq ($(UBSAN), 0)
CXXFLAGS += -fsanitize=undefined
LDFLAGS += -fsanitize=undefined -static-libubsan
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
+2 -2
Melihat File
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_MINOR := 23
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
+712 -238
Melihat File
File diff ditekan karena terlalu besar Load Diff
+1
Melihat File
@@ -67,6 +67,7 @@ const char* ncclAlgoToString(int algo) {
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
case NCCL_ALGO_NVLS: return "NVLS";
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
case NCCL_ALGO_PAT: return "PAT";
default: return "Unknown";
}
}
+21 -11
Melihat File
@@ -19,7 +19,7 @@ static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
static std::chrono::steady_clock::time_point ncclEpoch;
@@ -124,7 +124,7 @@ static void ncclDebugInit() {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
while (ncclDebugFileEnv[c] != '\0' && (dfn - debugFn) < PATH_MAX) {
if (ncclDebugFileEnv[c++] != '%') {
*dfn++ = ncclDebugFileEnv[c-1];
continue;
@@ -134,16 +134,24 @@ static void ncclDebugInit() {
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = ncclDebugFileEnv[c-1];
if ((dfn - debugFn) < PATH_MAX) {
*dfn++ = ncclDebugFileEnv[c-1];
}
break;
}
if ((dfn - debugFn) > PATH_MAX) {
// snprintf wanted to overfill the buffer: set dfn to the end
// of the buffer (for null char) and it will naturally exit
// the loop.
dfn = debugFn + PATH_MAX;
}
}
*dfn = '\0';
if (debugFn[0] != '\0') {
@@ -183,9 +191,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
tid = syscall(SYS_gettid);
}
int cudaDev;
int cudaDev = 0;
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
(void)cudaGetDevice(&cudaDev);
}
char buffer[4096];
@@ -209,11 +217,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
buffer[len++] = '\n';
if (len) fwrite(buffer, 1, len, ncclDebugFile);
if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
if (len) {
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+55 -24
Melihat File
@@ -54,8 +54,11 @@ namespace {
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, work->connIndex, work->connIndex);
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, work->connIndex, work->connIndex, work);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -106,7 +109,7 @@ namespace {
rankDest = ringRanks[nranks-j];
offset = dataOffset + rankDest * count;
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT)
@@ -128,7 +131,7 @@ namespace {
}
#endif
// Final wait/copy.
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
@@ -171,6 +174,31 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
}
};
// AllGather work runner specialized for the PAT algorithm with the Simple protocol.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
// Device entry point: runs the PAT all-gather for the work item `work` on the
// current channel (ncclShmem.channelId), using threads [tid, nthreads).
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<1, 1>;
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
// Filled in by ncclCollCbdPart for this channel; all four are element counts/offsets
// scaled by sizeof(T) where bytes are needed below.
size_t count, channelOffset, channelCount, chunkCount;
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
// No recv/send peer lists are passed here (NULL, NULL); the primitives are put in
// primsModePatAg mode instead.
// NOTE(review): presumably peers are resolved internally in this mode -- confirm in Primitives.
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
// Schedule generator covering this channel's range [channelOffset, channelOffset + channelCount).
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
// Pull one operation at a time from the schedule and execute it, until the
// generator reports the last one.
// NOTE(review): `last` and the other locals are presumably taken by reference
// by getNextOp -- confirm against its declaration.
int last = 0;
while (!last) {
int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
size_t inpIx, outIx;
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);
}
}
};
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -255,7 +283,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
__device__ __forceinline__ void operator()(
int tid, int tn, int slice, int maxSliceSize,
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
) {
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -293,19 +321,22 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
reduceCopy<ncclCollUnroll(), RedOp, T,
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
/*PreOpSrcs=*/0>
if (nSrcs != 0 && outIsDst+nDsts != 0) {
reduceCopy<ncclCollUnroll(), RedOp, T,
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
/*PreOpSrcs=*/0>
(tid, tn, 0, nullptr, false,
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
return (char*)srcPtrs[src] + railAllOffset;
},
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
return d < outIsDst ? outbuf + userOneBeg
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
},
delta);
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
},
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
return d < outIsDst ? outbuf + userOneBeg
: work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
},
delta);
}
railAllOffset += delta;
node += 1;
}
@@ -371,15 +402,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
__syncwarp();
} else {
// Phase 2: Recv network -> deposit output + send to bcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*BcastSendNotRecv=*/true> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat, work->direct, 0);
}
}
return;
@@ -389,15 +420,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
tn = nWarps3*WARP_SIZE;
if (tid < tn) {
// Phase 3: Recv bcast -> deposit output
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*BcastSendNotRecv=*/false> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
prims.template process</*Recv=*/1, /*Send=*/0>(scat, 0, work->direct);
}
return;
}
+84 -46
Melihat File
@@ -58,9 +58,11 @@ namespace {
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex);
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex, work);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -92,7 +94,7 @@ namespace {
}
#endif
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
if (tid == 0) {
@@ -116,7 +118,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
@@ -141,7 +143,7 @@ namespace {
}
#endif
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
@@ -164,7 +166,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
@@ -188,7 +190,7 @@ namespace {
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
@@ -251,7 +253,7 @@ namespace {
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -271,21 +273,21 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopy(offset, offset, nelem, /*postOp=*/true);
}
}
else if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
@@ -300,7 +302,7 @@ namespace {
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -327,14 +329,14 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
@@ -417,7 +419,7 @@ namespace {
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -436,7 +438,7 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*doPost=*/true);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
@@ -449,15 +451,18 @@ namespace {
}
else if (tid < nthreadsSplit) {
/* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local).
* Why Direct=1????
* Answer: Because despite not performing any direct operations, the ctor
* must assume Direct so that it can exchange direct pointers with remote ctors
* that are Direct, otherwise it hangs. A cleaner solution would be to separate
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
* Why Direct=1????
* Answer: Because despite not performing any direct operations, the ctor
* must assume Direct so that it can exchange direct pointers with remote ctors
* that are Direct, otherwise it hangs. A cleaner solution would be to separate
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
// FanAsymmetric<n, 1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -477,14 +482,14 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
@@ -498,9 +503,12 @@ namespace {
}
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
// FanAsymmetric<1, n>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth);
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -520,14 +528,14 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
@@ -598,7 +606,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -608,12 +616,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
}
// Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
// a false positive.
// coverity[overrun-call:FALSE]
} else if (tid >= tidStartReduce && direct->out != -1) {
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -634,7 +645,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
} else {
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -646,7 +657,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
// Gather
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -655,9 +666,12 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
if (hasDn) {
// Recv from network, broadcast
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -714,7 +728,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
ssize_t offset;
int nelem;
int remCount = channelCount%(nvls->nHeads*chunkSize);
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));
if (tid < tidEndScatter) {
// Scatter
@@ -788,6 +802,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -799,6 +816,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
@@ -811,6 +831,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -896,6 +919,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
// FanAsymmetric<3, 1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -911,6 +937,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
// FanAsymmetric<1, 3>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -971,21 +1000,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
}
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
}
@@ -1000,40 +1029,49 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
}
__syncwarp();
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recv(offset, nelem, /*postOp*/true);
prims.directRecv(offset, offset, nelem, /*postOp*/true);
}
}
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
}
}
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
}
+1 -1
Melihat File
@@ -67,7 +67,7 @@ namespace {
}
// final step: recv
prims.directRecv(recv_offset + prims_offset, prims_nelem);
prims.directRecv(recv_offset + prims_offset, recv_offset + prims_offset, prims_nelem);
}
}
}
+8 -5
Melihat File
@@ -55,8 +55,11 @@ namespace {
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, work->connIndex, work->connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, work->connIndex, work->connIndex, work);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -70,14 +73,14 @@ namespace {
if (rank == root) {
if (inputBuf == outputBuf) {
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
} else {
prims.copySend(offset, offset, nelem);
prims.directCopySend(offset, offset, nelem);
}
} else if (nextRank == root) {
prims.recv(offset, nelem);
prims.directRecv(offset, offset, nelem);
} else {
prims.recvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT)
+16 -10
Melihat File
@@ -214,7 +214,7 @@ __device__ inline void barrier_sync_aligned(int name, int nThreads) {
__device__ inline bool barrier_red_or(bool vote, int name) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred p, %2, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -223,7 +223,7 @@ __device__ inline bool barrier_red_or(bool vote, int name) {
}
__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -232,7 +232,7 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
}
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred.aligned p, %2, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -441,6 +441,9 @@ struct RunWorkBatch {
if (work->nWarps != workPrev->nWarps) __syncthreads();
}
int subtn = work->nWarps*WARP_SIZE;
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto, COLL_UNROLL>().run(tid, subtn, work);
}
}
@@ -466,25 +469,25 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
// do better when we know all threads are querying the same bitmask.
switch (tid/WARP_SIZE) {
case 0:
//ncclShmem.channelId = blockIdx.x;
//ncclShmem.channelId = blockIdx.x;
for (int i = 0; i < num; i++) {
if (args->channelMask.masks[i] & (1ull<<x)) {
y = __popcll(args->channelMask.masks[i] & ((1ull<<x)-1));
y = total + y;
if (blockIdx.x == y) {
ncclShmem.channelId = x + total;
break;
break;
}
}
if (WARP_SIZE < 64) {
x = WARP_SIZE + tid;
if (args->channelMask.masks[i] & (1ull<<x)) {
y = __popcll(args->channelMask.masks[i] & ((1ull<<x)-1));
y = y + total;
y = __popcll(args->channelMask.masks[i] & ((1ull<<x)-1));
y = y + total;
if (blockIdx.x == y) {
ncclShmem.channelId = x + total;
break;
}
ncclShmem.channelId = x + total;
break;
}
}
}
total = total + __popcll(args->channelMask.masks[i]);
@@ -529,6 +532,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
default:
{ int subtid = tid - 2*WARP_SIZE;
int subtn = tn - 2*WARP_SIZE;
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
} break;
}
+17 -1
Melihat File
@@ -71,6 +71,8 @@ __device__ __forceinline__ void reduceCopyPacks(
minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
#pragma unroll
for (int d=0; d < MinDsts; d++)
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
// We dictate loop termination condition according to whether partial hunks
@@ -95,13 +97,17 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll Unroll
for (int s=1; s < MinSrcs; s++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
BytePack<BytePerPack> tmp[Unroll];
// coverity[dead_error_line]
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
// coverity[dead_error_line]
tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
@@ -110,6 +116,7 @@ __device__ __forceinline__ void reduceCopyPacks(
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// coverity[dead_error_line]
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
@@ -118,6 +125,8 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
BytePack<BytePerPack> tmp[Unroll];
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
@@ -127,6 +136,8 @@ __device__ __forceinline__ void reduceCopyPacks(
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
@@ -141,7 +152,10 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll Unroll
for (int d=0; d < MinDsts; d++) {
#pragma unroll Unroll
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
for (int u=0; u < Unroll; u++) {
// coverity[dead_error_condition]
if (d < MultimemDsts) {
multimem_st_global(minDsts[d], acc[u]);
} else {
@@ -163,6 +177,8 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll
for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
#pragma unroll
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
threadBytesBehind += nWarps*BytePerHunk;
threadBytesAhead -= nWarps*BytePerHunk;
+4 -2
Melihat File
@@ -19,10 +19,10 @@
inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
#if __CUDA_ARCH__ >= 700
asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
: "=l"(v) : "l"(ptr));
: "=l"(v) : "l"(ptr) : "memory");
#else
// asm volatile("ld.volatile.global.u64 {%0}, [%1];"
// : "=l"(v) : "l"(ptr));
// : "=l"(v) : "l"(ptr) : "memory");
#endif
}
@@ -226,6 +226,8 @@ inline __device__ void ncclNetDeviceUnpackInner(
int PPW = ppw(nbytes, nw);
// Coverity reports a potential overflow but in reality PPW is tiny so there's no need to store it in a uint64_t.
// coverity[overflow_before_widen]
for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
uint64_t iter_meta_cnt = meta_cnt - meta_s;
+12 -9
Melihat File
@@ -147,6 +147,9 @@ struct BytePackOf<BytePack<0>> {
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
union { typename BytePackOf<T>::Pack p; T v; };
// Coverity recommends the use of std::move here but, given that T is a POD
// scalar, a plain copy will be just as efficient.
// coverity[copy_assignment_call]
v = value;
return p;
}
@@ -212,7 +215,7 @@ template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack
// template<> \
// __device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
// data_cxx_ty tmp; \
// asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
// asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \
// BytePack<bytes> ans; \
// ans.native = tmp; \
// return ans; \
@@ -266,14 +269,14 @@ DEFINE_ld_st_16__space(global, uintptr_t, l)
// template<>
// __device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
// BytePack<16> ans;
// asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));
// asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory");
// return ans;
// }
// template<>
// __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePack<16> value) {
// asm volatile("st." PTX_relaxed_gpu ".global.v2.b64 [%0], {%1,%2};" :: "l"(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory");
// }
//
// #undef PTX_relaxed_gpu
////////////////////////////////////////////////////////////////////////////////
@@ -291,12 +294,12 @@ __device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
}
// __device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
// uint64_t ans;
// #if __CUDA_ARCH__ >= 700
// asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
// #else
// asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
// #endif
// uint64_t ans;
// #if __CUDA_ARCH__ >= 700
// asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
// #else
// asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
// #endif
// return ans;
// }
+9 -3
Melihat File
@@ -154,19 +154,25 @@ struct PrimitivesWithoutDirect {
// Non-direct fallback for directSendFromOutput: casts this object to
// RealPrimitives and calls its sendFromOutput() with the same output index
// (outIx) and element count (eltN).
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
__device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
// Non-direct fallback for directCopySend: forwards the input index, output
// index, element count and postOp flag unchanged to RealPrimitives::copySend().
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
__device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
__device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
// Non-direct fallback for directRecvReduceDirectSend: forwards to
// RealPrimitives::recvReduceSend() with only the input index and element count.
// outIx and postOp are accepted so the signature matches the direct variant's
// call sites, but neither is passed on to recvReduceSend().
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->recvReduceSend(inpIx, eltN);
}
// Non-direct fallback for directRecvReduceCopyDirectSend: forwards the input
// index, output index, element count and postOp flag unchanged to
// RealPrimitives::recvReduceCopySend().
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
};
#include "prims_simple.h"
+34 -15
Melihat File
@@ -169,7 +169,7 @@ private:
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
#else
do {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory");
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
@@ -192,6 +192,8 @@ private:
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
#pragma unroll
for (int i=BeginIx; i < MaxRecv; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (i < fan.nrecv()) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
@@ -203,7 +205,7 @@ private:
line[i].v[1] = __builtin_nontemporal_load(src->v+1);
#endif
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
#endif
}
}
@@ -230,7 +232,7 @@ private:
line[i].v[1] = __builtin_nontemporal_load(src->v+1);
#endif
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
@@ -259,7 +261,7 @@ private:
__builtin_nontemporal_store(i4.v[0], dst->v);
__builtin_nontemporal_store(i4.v[1], dst->v+1);
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
#endif
}
@@ -301,13 +303,13 @@ private:
#endif
#else
if(sizeof(U) == 1)
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
asm volatile("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
else if(sizeof(U) == 2)
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
asm volatile("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src) : "memory");
else if(sizeof(U) == 4)
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
asm volatile("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
else
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
asm volatile("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src) : "memory");
#endif
return elt;
}
@@ -333,13 +335,13 @@ private:
__builtin_nontemporal_store(u8, (uint64_t*)dst);
#else
if(sizeof(U) == 1)
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
asm volatile("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
else if(sizeof(U) == 2)
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
asm volatile("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2) : "memory");
else if(sizeof(U) == 4)
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
asm volatile("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
else
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
asm volatile("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8) : "memory");
#endif
}
@@ -363,6 +365,8 @@ private:
else {
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if(i==0 || i < eltN)
elt[i] = load(src + i);
}
@@ -387,6 +391,8 @@ private:
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (i==0 || i < eltN)
//store(dst+i, elt[i]);
dst[i] = elt[i];
@@ -460,6 +466,8 @@ private:
if (RECV) {
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
#pragma unroll MaxRecv
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = applyReduce(redOp, peerData, data);
@@ -470,6 +478,8 @@ private:
// Send : inter-node, then intra-node, then local
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
storeLL(sendPtr(i)+offset, data, sendFlag(i));
storeLL(sendPtr(0)+offset, data, sendFlag(0));
@@ -502,6 +512,8 @@ private:
postRecv();
}
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
incSend(i, offset);
incSend(0, offset);
@@ -610,12 +622,12 @@ private:
}
}
public:
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
bool userBufReg=false, int stepSize_=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -625,16 +637,23 @@ private:
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
// coverity[dead_error_line]
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
// happen given the two "while" loops just above.
// coverity[var_deref_model:FALSE]
loadRecvSync();
// coverity[var_deref_model:FALSE]
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
+9 -1
Melihat File
@@ -264,6 +264,8 @@ private:
}
}
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
@@ -305,6 +307,8 @@ private:
#endif
/************************ Send **************************/
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
@@ -489,7 +493,7 @@ public:
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
bool userBufReg=false, int stepSize_=0
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
@@ -508,7 +512,11 @@ public:
nsend++;
}
this->fan = Fan(nrecv, nsend);
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
// happen given the two "while" loops just above.
// coverity[var_deref_model:FALSE]
loadRecvSync();
// coverity[var_deref_model:FALSE]
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
+482 -252
Melihat File
@@ -14,28 +14,38 @@
#include "network/unpack/unpack.h"
#include <cassert>
// Peer-selection mode passed to the ProtoSimple Primitives constructor.
enum primsMode {
  // Connect to the ranks listed in sendPeers/recvPeers.
  primsModeDefault = 0,
  // PAT modes: connect to all ranks at distance +/- 2^n from this rank.
  // NOTE(review): "Rs"/"Ag" presumably stand for reduce-scatter/all-gather -- confirm.
  primsModePatRs,  // == 1
  primsModePatAg   // == 2
};
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
UserBufferMode = 0x80,
ConnFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
// 0x800 is free to use
NvlsMinPolling = 0x1000,
NetDeviceUnpack = 0x2000,
AnyNetDeviceUnpack = 0x4000,
NvlsDirectRead = 0x8000,
NvlsDirectWrite = 0x10000;
static constexpr int RoleInput = 0x01,
RoleOutput = 0x02,
RoleWaitRecv = 0x04,
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
NetRegMode = 0x80,
ConnFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
PatMode = 0x800,
NvlsMinPolling = 0x1000,
NetDeviceUnpack = 0x2000,
AnyNetDeviceUnpack = 0x4000,
NvlsDirectRead = 0x8000,
NvlsDirectWrite = 0x10000,
IpcWrite = 0x20000,
IpcRead = 0x40000;
const int tid, tidInBlock;
const int nthreads;
int nworkers;
@@ -45,13 +55,15 @@ class Primitives<
int flags;
const int group;
uint64_t step;
struct ncclConnInfo* conn = NULL;
struct ncclConnFifo* connFifo = NULL;
T* connEltsFifo;
T* directBuff;
T* directBuff = NULL;
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
int connStepSize; // Connection step size
void* netDeviceHandle;
uint64_t accSize; // Accumulated size. Used by PAT operations
uint32_t* next_hdp_reg;
uint64_t* barriers;
uint64_t barrier_next = 0;
@@ -93,7 +105,7 @@ private:
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
return ans;
}
#endif
@@ -109,8 +121,10 @@ private:
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
int spins = 0;
@@ -134,28 +148,30 @@ private:
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
if (flags & UserBufferMode) {
// Do nothing
if (flags & NetRegMode) {
// Do nothing
} else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
} else if (isSendNotRecv && DirectSend) {
if (flags & (DirectWrite | NvlsDirectWrite)) {
if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
} else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & (DirectRead | NvlsDirectRead)) {
if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
} else if ((flags & DirectWrite) || (flags & IpcWrite)) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
}
else {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
if (flags & NetDeviceUnpack) {
@@ -198,7 +214,7 @@ private:
int slice = 0;
int offset = 0;
if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {
// Worker-only loop for non-empty slices. Non-workers and empty slices are
// processed in the loop following this if block. The benefit of splitting
// the loop like this is we pull two branches out of the critical path.
@@ -234,7 +250,7 @@ private:
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* if user abort the kernel, we don't need to actually perform copy/reduce; just set size
* to 0 to avoid unnecessary workload. */
* to 0 to avoid unnecessary workload. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
if (flags & AnyNetDeviceUnpack) {
ncclNetDeviceUnpack<Recv>(tid, tidInBlock, nworkers, group, ncclShmem.groups[group].devicePlugin.unpack.unpackNetDeviceIndexMask, Src, workSize);
@@ -244,8 +260,8 @@ private:
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
/* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
* so we need to check whether MultimemSrcs and MultimemDsts are 0. */
&& MultimemSrcs == 0 && MultimemDsts == 0) {
* so we need to check whether MultimemSrcs and MultimemDsts are 0. */
&& MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
@@ -264,9 +280,9 @@ private:
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
workSize);
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
@@ -300,9 +316,9 @@ private:
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
@@ -318,7 +334,7 @@ private:
}
#endif
} else {
} else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
@@ -338,9 +354,9 @@ private:
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
@@ -361,6 +377,8 @@ private:
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
} while (slice < SlicePerChunk && offset < nelem);
}
@@ -450,12 +468,13 @@ public:
}
template<int Recv, int Send, typename Fn>
__device__ __forceinline__ void process(Fn &&fn) {
__device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) {
#pragma unroll 1
for (int slice=0; slice < SlicePerChunk; slice++) {
if (tid < nworkers) {
int nsend, nrecv;
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
@@ -466,19 +485,53 @@ public:
if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
ptrs[index] = connEltsFifo + offset/sizeof(T);
} else if (Direct && fn.work->regUsed) {
if (isSendNotRecv) {
if (flags & (DirectWrite | IpcWrite)) {
ptrs[index] = directBuff;
} else if (flags & (DirectRead | IpcRead)) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
} else {
if (flags & (DirectRead | IpcRead)) {
ptrs[index] = directBuff;
} else if (flags & (DirectWrite | IpcWrite)) {
if (Send)
ptrs[index] = directBuff; // send to next from my output buffer
else
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
subBarrier();
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
(tid, nworkers, slice, stepSize*StepPerSlice,
fan.nrecv(), ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
if (Recv == 0 || ncclShmem.groups[group].srcs[0] == nullptr) {
nrecv = 0;
} else {
nrecv = fan.nrecv();
}
if (Send == 0 || ncclShmem.groups[group].dsts[0] == nullptr) {
nsend = 0;
} else {
nsend = fan.nsend();
}
fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend >
(tid, nworkers, slice, stepSize * StepPerSlice,
nrecv, ncclShmem.groups[group].srcs,
nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag);
}
barrier();
int32_t dstSize = 0;
if (flags & Send*RolePostSend) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
dstSize = ncclShmem.groups[group].dstSizes[index];
ncclShmem.groups[group].dstSizes[index] = 0;
if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
@@ -561,110 +614,109 @@ private:
}
}
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
// handle must be a device ptr
netDeviceHandle = conn->netDeviceHandle.handle;
// Cache the handle
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
flags |= NetDeviceUnpack;
}
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
connStepPtr = conn->head;
STORE(connStepPtr, step); // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->connFifo != nullptr) {
flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
} else if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectRead;
// Load this thread's receive-side connection state from peer->recv[connIndex].
//
// Caches the connection pointer and the (rounded-up) step, and then, depending on
// which role bits are already set in `flags`, the step pointer to publish or poll,
// the FIFO pointers, and the direct/IPC/NVLS access flags.
//
//   peer      - channel peer whose recv[] array is indexed.
//   connIndex - index into peer->recv[]; also selects the read-only path when it
//               equals 1 and `direct` is non-zero (non-P2p case).
//   direct    - bitmask; tested against NCCL_DIRECT_READ to choose read vs. write.
//   regFlag   - non-zero enables the registered-buffer flag selection below
//               (only when the Direct template parameter is also set).
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
// handle must be a device ptr
netDeviceHandle = conn->netDeviceHandle.handle;
// Cache the handle
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
flags |= NetDeviceUnpack;
}
// Start from the connection's saved step, rounded up to a multiple of
// SlicePerChunk*StepPerSlice.
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
// PostRecv role publishes through conn->head.
connStepPtr = conn->head;
STORE(connStepPtr, step); // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
// Outside PAT mode, publish the connection in ncclShmem for later use by setDataPtrs().
if ((flags & PatMode) == 0) ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
// WaitRecv role polls conn->tail; cache its current value.
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
// conn->stepSize is divided by sizeof(T), i.e. the step size is kept in elements of T.
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
// A non-null connFifo takes precedence: the direct/IPC/NVLS flags below are
// only considered when there is no connFifo.
if (conn->connFifo != nullptr) {
flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
} else if (Direct && regFlag) {
// User buffers have been registered
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
// IPC connection: P2p follows the connection's own flags; otherwise the
// choice comes from connIndex/direct.
if (P2p) {
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
} else if (connIndex == 1 && direct) {
flags |= IpcRead;
} else {
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
// Direct connection: same selection scheme as the IPC case above, but
// setting DirectRead/DirectWrite.
if (P2p) {
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
} else if (connIndex == 1 && direct) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
/* NVLS direct */
flags |= NvlsDirectRead;
}
}
}
}
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
connFifo = conn->connFifo;
if (connFifo != nullptr) flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
if (connFifo != nullptr) flags |= ConnFifoEnabled;
if (flags & RolePostSend) {
connStepPtr = conn->tail;
next_hdp_reg = conn->next_hdp_reg;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (connFifo == nullptr && Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectWrite;
if (flags & RolePostSend) {
connStepPtr = conn->tail;
next_hdp_reg = conn->next_hdp_reg;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
if (flags & RoleWaitSend) {
if ((flags & PatMode) == 0) ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (connFifo == nullptr && Direct && regFlag) {
// User buffers have been registered
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
if (P2p) {
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
} else if (connIndex == 1 && direct) {
flags |= IpcRead;
} else {
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
if (P2p) {
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
} else if (connIndex == 1 && direct) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
/* NVLS direct */
flags |= NvlsDirectWrite;
}
}
}
}
public:
public:
__forceinline__ __device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,
bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -673,33 +725,71 @@ private:
barriers = &ncclShmem.groups[group].barrier;
this->nworkers = nthreads;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync =
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
index = -1;
int peer = -1;
flags = 0;
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
index = -1;
if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
int nrecv=0, nsend=0;
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
// coverity[dead_error_line]
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
int peer = 0;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
constexpr int ThreadPerSync =
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
// Coverity assumes that index will equal tid based on the line below, but it doesn't consider the setting
// of flags. This results in multiple false positive overruns being reported here and in all_reduce.h.
// Unfortunately, we've been unsuccessful in trying to silence them with a single directive here so
// instead it's being done at the callers.
// coverity[assignment:FALSE]
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
if (userBufReg) flags |= UserBufferMode;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
} else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
flags |= PatMode;
accSize = 0;
int nranks = ncclShmem.comm.nRanks;
int rank = ncclShmem.comm.rank;
// A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bit integer.
index = tid % 32;
uint32_t delta = 1 << index;
const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};
int block = tid / 32;
if (block < 4 && delta < nranks) {
int role = roles[block];
if (mode == primsModePatRs) {
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
} else if (mode == primsModePatAg) {
if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
}
flags |= role;
} else if (tid == 128) {
flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
}
}
// Coverity thinks that index could be -1 here but that's not actually the case.
// coverity[negative_returns:FALSE]
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
// coverity[negative_returns:FALSE]
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
if (netReg) flags |= NetRegMode;
// if (barrierAny(flags & NetDeviceUnpack)) {
// flags |= AnyNetDeviceUnpack;
@@ -711,18 +801,14 @@ private:
// }
// }
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
// coverity[negative_returns:FALSE]
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);
}
__forceinline__ __device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
if (flags & (RolePostSend|RolePostRecv)) conn->step = step;
if ((flags & NetRegMode) && (flags & RoleWaitSend)) {
// Make sure we wait until the proxy has sent data before we return.
// We don't want the next CUDA kernel to overwrite the send buffer which
// was accessed directly.
@@ -741,97 +827,111 @@ private:
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
if (tid==0) {
ncclShmem.groups[group].userInput = (void*)inputBuf;
ncclShmem.groups[group].userOutput = (void*)outputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->coll.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
if (slot) {
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (slot) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot) {
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
*slot = nullptr;
} else {
/* slot is NULL, it must be regUsed == 1 */
directBuff = (T*)e->dnOutputs[index];
}
}
if (Direct && sendProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
if (slot && argSlot0 && argSlot1) {
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && recvAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (slot) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot && argSlot0 && argSlot1) {
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
if (Direct && ipcReg) {
bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);
bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);
bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
if (recvProvider) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
if (slot) {
T* exchgPtr;
directBuff = (T*)outputBuf;
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
if (P2p) {
exchgPtr = (T*)outputBuf;
} else {
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
}
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
*slot = reinterpret_cast<void*>(exchgPtr);
}
}
if (sendAcceptor) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void* ptr;
while (slot) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot) {
directBuff = reinterpret_cast<T*>(ptr);
*slot = nullptr;
} else {
directBuff = (T*)work->dnOutputs[index];
}
}
if (sendProvider) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange + 1;
// Wait for consumer to consume previous value before trampling it.
if (slot && argSlot0 && argSlot1) {
T* exchgPtr;
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
if (P2p) {
exchgPtr = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
} else {
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
if (MaxRecv == 0)
exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
else
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
}
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1) << 32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1) << 32) | (uint32_t)(redOpArg >> 32);
*slot = reinterpret_cast<T*>(exchgPtr);
}
}
if (recvAcceptor) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange + 1;
void* ptr;
while (slot) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot && argSlot0 && argSlot1) {
directBuff = reinterpret_cast<T*>(ptr);
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
} else {
// Coverity complains about work being possibly NULL below. However, slot
// being NULL means that the NVLS buffer is registered (regUsed == 1)
// so work can't be NULL in this code path.
// coverity[var_deref_op]
directBuff = (T*)work->dnInputs[index];
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
} else {
directBuff = (T*)e->dnInputs[index];
}
}
}
@@ -867,8 +967,8 @@ private:
// Thin wrapper: forwards to genericOp<0, 0, 1, 0, -1, Output> with -1 as the
// input index, outIx as the output index, eltN elements and the caller's postOp.
// NOTE(review): genericOp's template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; under that reading this is a
// plain (non-direct) receive into the output buffer — confirm against genericOp.
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
__device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
@@ -887,8 +987,8 @@ private:
// Thin wrapper: forwards to genericOp<0, 0, 1, 1, -1, Output> with -1 as the
// input index, outIx as the output index, eltN elements and the caller's postOp.
// NOTE(review): genericOp's template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; under that reading this
// receives, copies to the output buffer and sends onward, none of it direct —
// confirm against genericOp.
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
__device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
@@ -900,6 +1000,9 @@ private:
// Thin wrapper: forwards to genericOp<0, 0, 1, 0, Input, Output> with inpIx/outIx
// as the input/output indices, eltN elements and the caller's postOp.
// NOTE(review): genericOp's template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; under that reading received
// data is reduced with the local input and stored to the output, with no send —
// confirm against genericOp.
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
// Same call shape as recvReduceCopy, but the first genericOp template argument is 1
// instead of 0: forwards to genericOp<1, 0, 1, 0, Input, Output>.
// NOTE(review): the first template argument is presumably the "direct receive"
// switch — confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
@@ -907,14 +1010,20 @@ private:
// Thin wrapper: forwards to genericOp<1, 0, 1, 1, Input, -1> with inpIx as the
// input index and -1 as the output index (no local destination is named).
// NOTE(review): genericOp's template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; under that reading only the
// receive side is direct — confirm against genericOp.
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<1, 1, 1, 1, Input, -1>. Unlike
// directRecvReduceSend it takes an outIx and passes it through, although the
// destination-buffer template argument is still -1.
// Note that eltN is ssize_t here, whereas the neighbouring wrappers take int.
// NOTE(review): the first two template arguments are presumably the direct-receive
// and direct-send switches — confirm against genericOp's declaration.
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<0, 0, 1, 1, Input, Output> with inpIx/outIx
// as the input/output indices, eltN elements and the caller's postOp.
// NOTE(review): genericOp's template arguments appear to be
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf>; under that reading the
// reduced result is both stored to the output and sent onward, with neither side
// direct — confirm against genericOp.
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
__device__ __forceinline__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<1, 1, 1, 1, Input, Output> with inpIx/outIx
// as the input/output indices. eltN is ssize_t here rather than int.
// NOTE(review): the first two template arguments are presumably the direct-receive
// and direct-send switches, both enabled here — confirm against genericOp.
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
@@ -934,6 +1043,127 @@ private:
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
// One step of a PAT-style reduction: reduce up to two sources (an optional peer
// receive slot plus either the user input or the data already at the destination)
// into a single destination (a peer send slot or the user output).
//
//   recvPow2       peer index to receive from; negative means there is no peer
//                  to receive from and the receive source is dropped.
//   sendPow2       peer index to send to; negative means the destination is the
//                  local user output buffer.
//   inpIx, outIx   element offsets into the group's userInput / userOutput.
//   recvOffset     element offset added to the receive FIFO slot pointer.
//   sendOffset     element offset added to the send FIFO slot pointer.
//   sendStepOffset extra steps added to `step` when selecting the send slot.
//   nelem          element count; negative values are clamped to 0.
//   postRecv,      when non-zero, advance `step` by StepPerSlice on that side and
//   postSend       call postPeer for it after the reduction.
__device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
nelem = nelem < 0 ? 0 : nelem;
T* userInput = (T*)ncclShmem.groups[group].userInput;
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
// Receive side: only the thread with RoleWaitRecv whose index matches recvPow2.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
// srcs[0] is the receive FIFO slot for the current step, shifted by recvOffset.
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
int spins = 0;
// Spin until the step counter read via connStepPtr reaches step + StepPerSlice,
// or checkAbort() reports an abort.
while (connStepCache < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
if (postRecv) step += StepPerSlice;
}
// Send side: only the thread with RoleWaitSend whose index matches sendPow2.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
int spins = 0;
// Spin until the slot at step + sendStepOffset is within NCCL_STEPS of the
// step counter read via connStepPtr, or checkAbort() reports an abort.
while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
// accSize acts as a high-water mark: a region ending beyond it has not been
// written yet, so the second source is the user input rather than the slot.
if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) {
// New data, add our own data to it.
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
// Record the number of valid bytes in this slot in the connection FIFO entry.
if (flags & ConnFifoEnabled)
connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
} else {
// There is already data in there, accumulate instead of writing to it.
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
}
if (postSend) step += StepPerSlice;
}
if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
ncclShmem.groups[group].dsts[0] = userOutput + outIx;
// Same high-water-mark logic, measured in output-buffer elements.
if (accSize < outIx + nelem) {
// New data, add our own data to it.
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
accSize = outIx + nelem;
} else {
// There is already data in there, accumulate instead of writing to it.
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
}
}
// The srcs/dsts entries above are written by individual role threads; presumably
// this barrier makes them visible to every thread before reduceCopy reads them.
barrier();
int nSrcs = 2;
void** srcs = ncclShmem.groups[group].srcs;
if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
// Do no work if an abort has been flagged.
int workSize = ncclShmem.aborted ? 0 : nelem;
// Template arguments after T: 0 multimem sources, 1..2 sources, 0 multimem
// destinations, exactly 1 destination, and no pre-op applied to the sources.
reduceCopy<Unroll, RedOp, T, 0, 1, 2, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);
barrier();
// After the reduction, the posting threads call postPeer, passing whether any
// elements were processed.
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
}
// One step of a PAT-style copy: copy a single source (a peer receive slot or the
// user input) to up to two destinations (a peer send slot and/or the user output).
//
//   recvPow2       peer index to receive from; negative means the source is the
//                  local user input buffer.
//   sendPow2       peer index to send to; negative means there is no peer to send
//                  to and the send destination is dropped.
//   inpIx, outIx   element offsets into the group's userInput / userOutput.
//   recvOffset     element offset added to the receive FIFO slot pointer.
//   sendOffset     element offset added to the send FIFO slot pointer.
//   recvStepOffset extra steps added to `step` when selecting the receive slot.
//   nelem          element count; negative values are clamped to 0.
//   postRecv,      when non-zero, advance `step` by StepPerSlice on that side and
//   postSend       call postPeer for it after the copy.
__device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
nelem = nelem < 0 ? 0 : nelem;
T* userInput = (T*)ncclShmem.groups[group].userInput;
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
// Receive side: only the thread with RoleWaitRecv whose index matches recvPow2.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
// srcs[0] is the receive FIFO slot at step + recvStepOffset, shifted by recvOffset.
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
int spins = 0;
// Spin until the step counter read via connStepPtr reaches that slot, or
// checkAbort() reports an abort.
while (connStepCache < step + recvStepOffset + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
// accSize acts as a high-water mark: a region ending beyond it has not been
// copied to the user output yet.
if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) {
// New data, copy to our output buffer.
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
} else {
// Pointing dsts[1] at srcs[0] marks the local copy as unnecessary; the
// comparison before reduceCopy below then drops this destination.
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
}
if (postRecv) step += StepPerSlice;
}
// Send side: only the thread with RoleWaitSend whose index matches sendPow2.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
int spins = 0;
// Spin until the current slot is within NCCL_STEPS of the step counter read
// via connStepPtr, or checkAbort() reports an abort.
while (connStepCache + NCCL_STEPS < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
if (postSend) {
// Record the number of valid bytes in this slot before advancing the step.
if (flags & ConnFifoEnabled)
connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
step += StepPerSlice;
}
}
if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
ncclShmem.groups[group].srcs[0] = userInput + inpIx;
// Same high-water-mark logic, measured in input-buffer elements.
if (accSize < inpIx + nelem) {
// New data, copy to our output buffer.
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
accSize = inpIx + nelem;
} else {
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
}
}
// The srcs/dsts entries above are written by individual role threads; presumably
// this barrier makes them visible to every thread before reduceCopy reads them.
barrier();
int nDsts = 2;
void** dsts = ncclShmem.groups[group].dsts;
if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.
// Do no work if an abort has been flagged.
int workSize = ncclShmem.aborted ? 0 : nelem;
// Template arguments after T: 0 multimem sources, exactly 1 source, 0 multimem
// destinations, 1..2 destinations, and no pre-op applied to the source.
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 2, /*PreOpSrcs*/0>
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
barrier();
// After the copy, the posting threads call postPeer, passing whether any
// elements were processed.
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
}
// MSCCL primitives
__device__ __forceinline__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
@@ -941,4 +1171,4 @@ private:
// Copy eltN elements from srcs to dsts by forwarding to mscclGenericOp<0,1,0,0>.
// The addresses of the by-value pointer parameters are passed as one-element
// source and destination arrays (counts of 1 each).
// NOTE(review): the meaning of the <0,1,0,0> template arguments is not visible
// here — confirm against mscclGenericOp's declaration.
__device__ __forceinline__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
};
};
+3
Melihat File
@@ -28,6 +28,9 @@ namespace {
size_t offset;
int nelem;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex);
+30 -21
Melihat File
@@ -244,10 +244,10 @@ struct Apply_Reduce<FuncMinMax<uint8_t>, /*EltPerPack=*/4> {
// uint32_t a = apack.native;
// uint32_t b = bpack.native;
// uint32_t ab0 = (a*b) & 0xffu;
// asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
// asm volatile("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
// uint32_t ab1;
// asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
// asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
// asm volatile("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
// asm volatile("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
// apack.native = __byte_perm(ab0, ab1, 0x6420);
// return apack;
// }
@@ -273,10 +273,13 @@ SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __h
#if defined(RCCL_BFLOAT16)
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, hip_bfloat16, 1, hip_bfloat16, (hip_bfloat16)((float)(x) + (float)(y)))
@@ -406,6 +409,9 @@ struct FuncPreMulSum {
};
template<>
// Coverity recommends the users of this type to use std::move in certain cases but,
// given that half is a scalar, a plain copy will be just as efficient.
// coverity[moveable_type]
struct FuncPreMulSum<half> {
using EltType = half;
half2 scalar;
@@ -419,6 +425,9 @@ struct FuncPreMulSum<half> {
#if defined(RCCL_BFLOAT16)
template<>
// Coverity recommends the users of this type to use std::move in certain cases but,
// given that __nv_bfloat16 is a scalar, a plain copy will be just as efficient.
// coverity[moveable_type]
struct FuncPreMulSum<hip_bfloat16> {
using EltType = hip_bfloat16;
#if __CUDA_ARCH__ >= 800
@@ -631,9 +640,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
return ans; \
} \
};
@@ -644,13 +653,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} \
return ans; \
} \
@@ -662,12 +671,12 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
return ans; \
} \
};
@@ -678,19 +687,19 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} \
return ans; \
} \
@@ -702,9 +711,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
@@ -715,13 +724,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
} \
return tmp.half[(addr/sizeof(T))%2]; \
} \
+46 -11
Melihat File
@@ -53,7 +53,9 @@ namespace {
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex);
@@ -151,6 +153,32 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
}
};
// ReduceScatter specialization for the PAT algorithm with the Simple protocol.
// Computes this channel's share of the work, builds the primitives object in
// primsModePatRs mode, then executes the operations produced by PatRSAlgorithm one
// at a time until the algorithm reports the last one.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<1, 1>;
    const int nRanks = ncclShmem.comm.nRanks;
    const int myRank = ncclShmem.comm.rank;

    // Partition the collective for this channel.
    size_t count, channelOffset, channelCount, chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);

    // No peer lists are passed (both NULL); the user buffers are handed over directly.
    T* sendData = (T*)work->sendbuff;
    T* recvData = (T*)work->recvbuff;
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims(
        tid, nthreads, NULL, NULL, sendData, recvData, work->redOpArg,
        0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);

    PatRSAlgorithm<T> patAlgo(
        chunkCount*sizeof(T), NCCL_STEPS,
        channelOffset, channelOffset + channelCount,
        count, chunkCount, myRank, nRanks);

    // `last` starts at 0, so the loop body always runs at least once.
    int last = 0;
    do {
      int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
      size_t inpIx, outIx;
      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
    } while (!last);
  }
};
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -220,6 +248,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
size_t outOffset = gridOffset + elemOffset;
size_t inpOffset = outOffset + rank * count;
nelem = min(chunkCount, channelCount - elemOffset);
// Coverity complains about a possible overrun inside the method invoked below, but that's actually
// a false positive.
// coverity[overrun-call:FALSE]
prims.directRecvCopy(inpOffset, outOffset, nelem);
}
@@ -241,7 +272,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
__device__ __forceinline__ void operator()(
int tid, int tn, int slice, int maxSliceSize,
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
) {
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -276,19 +307,23 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
reduceCopy<ncclCollUnroll(), RedOp, T,
if (nDsts != 0) {
reduceCopy<ncclCollUnroll(), RedOp, T,
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
/*PreOpSrcs=*/1>
(tid, tn, work->redOpArg, &work->redOpArg, false,
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
return s==0 ? (T*)inbuf + userOneBeg
: work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
? (T*)srcPtrs[s-1] + userOneBeg
: (T*)srcPtrs[s-1] + railAllOffset;
},
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
return (T*)dstPtrs[dst] + railAllOffset;
},
delta);
}
railAllOffset += delta;
node += 1;
}
@@ -307,7 +342,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
ssize_t chunkSize = int(work->collnet.chunkCount);
ssize_t sizePerRank = work->collnet.count;
// if (direct->out == -1) __trap();
if (direct->out == -1) __builtin_trap();
bool isMultiRail = (direct->nHeads > 1);
int nWarps1 = (isMultiRail ? 2 : 0);
int nWarps2 = (isMultiRail ? 2 : 1);
@@ -322,15 +357,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
int tn = nWarps1*WARP_SIZE;
if (tid < tn) {
// Phase 1: Scatter inputs to peers
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*ReduceSendNotRecv=*/true> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
}
return;
}
@@ -346,15 +381,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
__syncwarp();
} else {
// Phase 2: Reduce from peers + local input -> send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*ReduceSendNotRecv=*/false> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
}
}
return;
+9 -6
Melihat File
@@ -19,8 +19,8 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
template<typename Proto>
__device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->sendBytes;
int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x + group;
@@ -43,7 +43,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1>
prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
/*redOpArg(ignored)=*/0, group, work->sendConnIndex, work->sendConnIndex, nullptr,
/*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
/*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -77,7 +77,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
template<typename Proto>
__device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->recvBytes;
int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
@@ -101,7 +101,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1>
prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
/*redOpArg(ignored)=*/0, group, work->recvConnIndex, work->recvConnIndex, nullptr,
/*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
/*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -120,7 +120,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
size_t cursor = 0;
do {
int n = min(size_t(chunkSize), bytes-cursor);
prims.directRecv(cursor, n);
prims.directRecv(cursor, cursor, n);
cursor += n;
} while (cursor < bytes && work->recvRegistered == 0);
@@ -172,6 +172,9 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
(isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
}
}
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
uint32_t mask = __ballot(hasWork);
if (lane == 0) {
shared->workSendMask = mask>>16;
+380 -121
Melihat File
@@ -18,6 +18,7 @@
#include "channel.h"
#include "rocmwrap.h"
#include "rccl_vars.h"
#include "profiler.h"
#include "transport.h"
#include "common.h"
#include "api_trace.h"
@@ -157,6 +158,10 @@ static void addWorkBatchToPlan(
if (newBatch || extendBatch) {
if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch.
struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc<ncclWorkBatchList>(&comm->memScoped);
// Coverity thinks that ncclIntruQueueEnqueue will access chan->workBatchQueue->tail, which might
// be NULL. But that code is guarded by chan->workBatchQueue->head not being NULL, in which
// case tail won't be NULL either.
// coverity[var_deref_model:FALSE]
ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode);
batch = &batchNode->batch;
batch->nextExtends = 0;
@@ -277,7 +282,29 @@ static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* c
return ncclSuccess;
}
static ncclResult_t registerIntraNodeBuffers(
// Decide whether buffer registration is needed for the connection to `peer`.
//
// If the connector is already connected, *needReg is true exactly when its flags
// contain any of the IPC/direct read/write bits; otherwise it is a network
// connection and *needReg is false. If it is not connected yet, *needReg reflects
// whether ncclTransports[0] (presumably the P2P transport — confirm) reports that
// this rank can connect to the peer.
//
// Returns ncclSuccess, or propagates the error from canConnect via NCCLCHECK, in
// which case *needReg is left untouched.
static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
  if (!conn->connected) {
    // Not connected yet: ask the transport whether a connection is possible.
    struct ncclPeerInfo* self = &comm->peerInfo[comm->rank];
    struct ncclPeerInfo* remote = &comm->peerInfo[peer];
    int reachable = 0;
    NCCLCHECK(ncclTransports[0]->canConnect(&reachable, comm, graph, self, remote));
    *needReg = reachable ? true : false;
    return ncclSuccess;
  }

  // Already connected: any IPC/direct flag means registration is needed.
  *needReg = (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) ? true : false;
  return ncclSuccess;
}
static ncclResult_t registerCollBuffers(
struct ncclComm* comm, struct ncclTaskColl* info,
void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
@@ -288,8 +315,10 @@ static ncclResult_t registerIntraNodeBuffers(
info->regBufType = NCCL_REGULAR_BUFFER;
*regNeedConnect = true;
if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
#if CUDART_VERSION >= 11030
if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
bool regBufUsed = false;
const void *sendbuff = info->sendbuff;
void *recvbuff = info->recvbuff;
@@ -322,60 +351,6 @@ static ncclResult_t registerIntraNodeBuffers(
}
info->regBufType = NCCL_NVLS_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers
comm->planner.persistent && 0) {
/* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
int localRank = comm->localRank;
cudaPointerAttributes sattr, rattr;
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
struct HandlePair {
cudaIpcMemHandle_t ipc[2]; // {send, recv}
size_t offset[2]; // {send, recv}
};
struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
void *baseSend, *baseRecv;
size_t size;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
// Open handles locally
for (int i=0; i < comm->localRanks; i++) {
if (i == localRank) { // Skip self
outRegBufSend[i] = nullptr;
outRegBufRecv[i] = nullptr;
} else {
for (int sr=0; sr < 2; sr++) {
// Get base address of mapping
void* base;
CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
// Get real buffer address by adding offset in the mapping
(sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
// Enqueue reminder to close memory handle
struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback));
cb->base.fn = cleanupIpc;
cb->ptr = base;
ncclIntruQueueEnqueue(cleanupQueue, &cb->base);
info->nCleanupQueueElts += 1;
}
}
}
info->regBufType = NCCL_IPC_REG_BUFFER;
} else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) {
size_t elementSize = ncclTypeSize(info->datatype);
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
@@ -394,27 +369,200 @@ static ncclResult_t registerIntraNodeBuffers(
}
if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) {
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
info->sendMhandle = sendHandle;
if (sendRegBufFlag) {
if (!sendRegBufFlag) {
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
info->sendMhandle = sendHandle;
}
if (sendRegBufFlag && !recvRegBufFlag) {
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
info->recvMhandle = recvHandle;
}
}
if (sendRegBufFlag && recvRegBufFlag) {
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
info->nMaxChannels = 1;
info->regBufType = NCCL_COLLNET_REG_BUFFER;
if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize);
}
}
} else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) {
// IPC buffer registration
if (info->func == ncclFuncReduceScatter) goto exit;
if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
int peerRanks[NCCL_MAX_LOCAL_RANKS];
int nPeers = 0;
size_t elementSize = ncclTypeSize(info->datatype);
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
int regBufFlag = 0;
memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
struct ncclChannel* channel = comm->channels;
for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
for (int updown = 0; updown < 2; ++updown) {
int peer;
if (updown == 0)
peer = channel->collnetDirect.up[r];
else
peer = channel->collnetDirect.down[r];
if (peer != -1) {
struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
bool needReg = false;
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
if (needReg) {
bool found = false;
for (int p = 0; p < nPeers; ++p) {
if (peerRanks[p] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister())
ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
if (regBufFlag) {
if (ncclParamLocalRegister())
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_RING) {
struct ncclReg* recvRegRecord;
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
if (recvRegRecord == NULL) goto exit;
for (int c = 0; c < comm->nChannels; ++c) {
struct ncclChannel* channel = comm->channels + c;
for (int r = 0; r < 2; ++r) {
bool needReg = false;
int peer;
struct ncclConnector* peerConn;
// P2P transport
if (r == 0)
peer = channel->ring.prev;
else
peer = channel->ring.next;
peerConn = &channel->peers[peer]->recv[0];
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg));
if (needReg) {
bool found = false;
for (int p = 0; p < nPeers; ++p) {
if (peerRanks[p] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister()) {
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
}
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
struct ncclReg* recvRegRecord;
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
if (recvRegRecord == NULL) goto exit;
for (int c = 0; c < comm->nChannels; ++c) {
struct ncclChannel* channel = comm->channels + c;
struct ncclTree* tree = NULL;
int peers[NCCL_MAX_TREE_ARITY + 1];
if (info->algorithm == NCCL_ALGO_TREE)
tree = &channel->tree;
else
tree = &channel->collnetChain;
for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
peers[NCCL_MAX_TREE_ARITY] = tree->up;
for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
int peer = peers[p];
bool peerNeedReg = false;
struct ncclConnector* recvConn = NULL;
// P2P transport
if (peer == -1 || peer == comm->nRanks) continue;
recvConn = &channel->peers[peer]->recv[0];
NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
if (peerNeedReg) {
bool found = false;
for (int pindex = 0; pindex < nPeers; ++pindex) {
if (peerRanks[pindex] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister()) {
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
}
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
}
if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
info->nMaxChannels = 16;
}
}
fallback:
#endif
exit:
return result;
}
// Best-effort IPC registration of a point-to-point user buffer with one peer.
//
// Local registration is attempted first when ncclParamLocalRegister() is set.
// If that leaves *regFlag at 0 and the planner is persistent with
// ncclParamGraphRegister() set, graph registration is attempted with
// `cleanupQueue` handed over as an opaque pointer.
//
// Outputs:
//   *regFlag - reset to 0 on entry; whatever the registration helpers leave
//              in it afterwards.
//   *regAddr - written only when *regFlag ends up nonzero; left untouched
//              otherwise.
//
// The results of the two registration helpers are not inspected here, so this
// function always returns ncclSuccess; callers must look at *regFlag.
static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
  uintptr_t bufOffset = 0;
  uintptr_t* rmtAddrs = NULL;

  *regFlag = 0;

  // First choice: registration through the local-register path.
  if (ncclParamLocalRegister()) {
    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &bufOffset, &rmtAddrs);
  }

  // Second choice: graph registration, only for persistent plans and only if
  // the first attempt did not register the buffer.
  const bool tryGraphRegister = (*regFlag == 0) && comm->planner.persistent && ncclParamGraphRegister();
  if (tryGraphRegister) {
    ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &bufOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
  }

  if (*regFlag != 0) {
    // The pointer value returned by the helper is used as an integer base
    // address and advanced by the returned offset.
    *regAddr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(rmtAddrs) + bufOffset);
  }
  return ncclSuccess;
}
static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
static ncclResult_t getAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* task,
@@ -542,7 +690,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
bool regNeedConnect = true;
registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) {
if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) {
@@ -559,6 +707,10 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
struct ncclDevWorkColl devWork = {};
devWork.sendbuff = (void*)task->sendbuff;
devWork.recvbuff = (void*)task->recvbuff;
devWork.sendbuffOffset = task->sendbuffOffset;
devWork.recvbuffOffset = task->recvbuffOffset;
devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
devWork.root = task->root;
devWork.nWarps = task->nWarps;
devWork.redOpArg = task->opDev.scalarArg;
@@ -571,35 +723,13 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
struct ncclWorkList* workNode;
switch (task->regBufType) {
case NCCL_REGULAR_BUFFER:
case NCCL_IPC_REG_BUFFER:
case NCCL_COLLNET_REG_BUFFER:
{ workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
workNode->workType = ncclDevWorkTypeColl;
workNode->size = sizeof(struct ncclDevWorkColl);
memcpy((void*)(workNode+1), (void*)&devWork, workNode->size);
} break;
case NCCL_IPC_REG_BUFFER:
{ struct ncclDevWorkCollReg workReg = {};
workReg.coll = devWork;
struct ncclChannel *channel0 = &comm->channels[0];
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
int peer = channel0->collnetDirect.down[i];
if (peer == -1) break;
int j = comm->rankToLocalRank[peer]; // Get intra-node slot
workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer
workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer
}
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
int peer = channel0->collnetDirect.up[i];
if (peer == -1) break;
int j = comm->rankToLocalRank[peer];
// Output buffer of root peer
workReg.upOutputs[i] = regBufRecv[j];
}
workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
workNode->workType = ncclDevWorkTypeCollReg;
workNode->size = sizeof(struct ncclDevWorkCollReg);
memcpy((void*)(workNode+1), (void*)&workReg, workNode->size);
} break;
case NCCL_NVLS_REG_BUFFER:
{ struct ncclDevWorkCollReg workReg = {};
workReg.coll = devWork; // C++ struct assignment
@@ -636,6 +766,7 @@ static ncclResult_t scheduleCollTasksToPlan(
int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
comm->nChannels, comm->nvlsChannels};
constexpr size_t MinTrafficPerChannel = 512; // Traffic as minimal
do {
size_t workBytes = 0;
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
@@ -647,7 +778,7 @@ static ncclResult_t scheduleCollTasksToPlan(
nPlanColls += 1;
workBytes += workNode->size;
int kind = 2*task->isCollnet + task->isNvls;
trafficBytes[kind] += task->trafficBytes;
trafficBytes[kind] += std::max(MinTrafficPerChannel, task->trafficBytes);
nChannels[kind] += task->nMaxChannels;
nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]);
task = task->next;
@@ -657,7 +788,6 @@ static ncclResult_t scheduleCollTasksToPlan(
} while (0);
int kindPrev = -1;
constexpr size_t MinTrafficPerChannel = 512;
size_t trafficPerChannel = 0;
int channelId = 0;
size_t currentTraffic = 0;
@@ -697,14 +827,16 @@ static ncclResult_t scheduleCollTasksToPlan(
for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) {
proxyOp.channelId = c;
proxyOp.opCount = proxyOpId;
proxyOp.task.coll = task;
proxyOp.rank = comm->rank;
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
}
} else { // not task->isCollnet
constexpr size_t cellSize = 16;
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
int elementsPerCell = cellSize/elementSize;
size_t cells = divUp(task->count*elementSize, cellSize);
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
size_t trafficPerElement = elementSize*trafficPerByte;
size_t trafficPerCell = cellSize*trafficPerByte;
size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell));
@@ -712,7 +844,7 @@ static ncclResult_t scheduleCollTasksToPlan(
if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo"
cellsLo = cells;
} else {
cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell);
cellsLo = std::min(cells, divUp((trafficPerChannel-currentTraffic),trafficPerCell));
}
int nMidChannels = (cells-cellsLo)/cellsPerChannel;
size_t cellsHi = (cells-cellsLo)%cellsPerChannel;
@@ -780,12 +912,12 @@ static ncclResult_t scheduleCollTasksToPlan(
// Update the current channel and vacant traffic budget.
if (countHi != 0) {
channelId += nChannels-1;
currentTraffic = countHi*trafficPerElement;
currentTraffic = cellsHi*elementsPerCell*trafficPerElement;
} else if (nMidChannels != 0) {
channelId += nChannels;
currentTraffic = 0;
} else {
currentTraffic += countLo*trafficPerElement;
currentTraffic += cellsLo*elementsPerCell*trafficPerElement;
}
if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) {
@@ -805,6 +937,8 @@ static ncclResult_t scheduleCollTasksToPlan(
}
proxyOp->channelId = c;
proxyOp->opCount = proxyOpId;
proxyOp->task.coll = task;
proxyOp->rank = comm->rank;
proxyOp->connIndex = 0;
if (task->protocol == NCCL_PROTO_SIMPLE && task->algorithm == NCCL_ALGO_RING) {
if (comm->useIntraNet && nBytes > rcclParamIntraNetThreshold()) {
@@ -812,6 +946,9 @@ static ncclResult_t scheduleCollTasksToPlan(
}
}
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
// Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to
// determine if that's actually true but it's also not clear if that would be an issue.
// coverity[uninit_use_in_call:FALSE]
NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
}
}
@@ -856,6 +993,7 @@ static ncclResult_t scheduleCollTasksToPlan(
ncclIntruQueueDequeue(&planner->collWorkQueue);
nPlanColls -= 1;
planner->nTasksColl -= 1;
ncclIntruQueueEnqueue(&plan->collTaskQueue, task);
ncclIntruQueueEnqueue(&plan->workQueue, workNode);
plan->workBytes += workNode->size;
}
@@ -875,7 +1013,8 @@ static ncclResult_t addP2pToPlan(
int nChannelsMin, int nChannelsMax, int p2pRound,
int sendRank, void* sendAddr, ssize_t sendBytes,
int recvRank, void* recvAddr, ssize_t recvBytes,
uint64_t sendOpCount, uint64_t recvOpCount
uint64_t sendOpCount, uint64_t recvOpCount,
struct ncclTaskP2p** p2pTasks
) {
int connIndex[2] = {1, 1};
bool selfSend = (sendRank == comm->rank);
@@ -918,7 +1057,8 @@ static ncclResult_t addP2pToPlan(
int chunkSize[2];
int chunkDataSize[2];
int chunkDataSize_u32fp8[2];
bool registered[2];
bool registered[2] = {false, false};
bool ipcRegistered[2] = {false, false};
for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send
if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL;
@@ -942,11 +1082,29 @@ static ncclResult_t addP2pToPlan(
chunkSize[dir] = chunkDataSize[dir];
if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
registered[dir] = false;
if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
struct ncclReg* regRecord;
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
registered[dir] = (regRecord && regRecord->nDevs);
if (network[dir]) {
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
struct ncclReg* regRecord;
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
registered[dir] = regRecord && regRecord->nDevs;
}
} else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) {
int peerRank = dir ? sendRank : recvRank;
int regFlag = 0;
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, 0, nChannelsMax, comm->nNodes);
struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex[dir]]
: &channelPeers[peerRank]->recv[connIndex[dir]];
void* regAddr = NULL;
if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
// We require users to register buffers on both sides
NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], &regFlag, &regAddr, &plan->cleanupQueue));
if (regFlag) {
if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr;
else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr;
}
}
ipcRegistered[dir] = regFlag ? true : false;
}
if (bytes[dir] == -1) nChannels[dir] = 0;
@@ -976,6 +1134,7 @@ static ncclResult_t addP2pToPlan(
work->nSendChannels = nChannels[1];
work->sendProtoLL = protoLL[1];
work->sendRegistered = registered[1];
work->sendIpcReg = ipcRegistered[1];
work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1];
work->sendRank = sendRank;
work->sendAddr = sendAddr;
@@ -985,6 +1144,7 @@ static ncclResult_t addP2pToPlan(
work->nRecvChannels = nChannels[0];
work->recvProtoLL = protoLL[0];
work->recvRegistered = registered[0];
work->recvIpcReg = ipcRegistered[0];
work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0];
work->recvRank = recvRank;
work->recvAddr = recvAddr;
@@ -1005,6 +1165,9 @@ static ncclResult_t addP2pToPlan(
op->pattern = dir ? ncclPatternSend : ncclPatternRecv;
op->chunkSize = chunkSize[dir];
op->reg = registered[dir];
op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
op->task.p2p = p2pTasks[dir];
op->rank = comm->rank;
op->connIndex = connIndex[dir];
// The following are modified per channel part in addWorkToChannels():
// op->buffer, op->nbytes, op->nsteps = ...;
@@ -1127,14 +1290,16 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
return ncclSuccess;
}
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes,
send ? send->opCount : 0, recv ? recv->opCount : 0));
struct ncclTaskP2p* p2pTasks[2] = { recv, send };
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, send ? send->opCount : 0, recv ? recv->opCount : 0, p2pTasks));
if (send != nullptr) {
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
comm->planner.nTasksP2p -= 1;
}
if (recv != nullptr) {
ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
comm->planner.nTasksP2p -= 1;
}
}
@@ -1187,29 +1352,43 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
}
}
namespace {
  // Event-callback record used by uploadWork(): pairs the generic callback
  // header with the host staging buffer that must be released later.
  struct uploadWork_cleanup_t {
    // Must remain the first member: the callback below converts the
    // ncclCommEventCallback pointer straight back into this struct.
    struct ncclCommEventCallback base;
    // Host buffer released with free() by the callback.
    void *hostBuf;
  };

  // Callback body for an uploadWork_cleanup_t record: frees the host buffer
  // first, then destroys the CUDA event stored in the callback header.
  // Returns ncclSuccess unless CUDACHECK bails out on the event destruction.
  //
  // NOTE(review): the record itself is not freed here; confirm that whoever
  // drains the event callback queue owns and releases it.
  ncclResult_t uploadWork_cleanup_fn(struct ncclComm* comm, struct ncclCommEventCallback* cb) {
    auto* record = reinterpret_cast<struct uploadWork_cleanup_t*>(cb);
    free(record->hostBuf);
    CUDACHECK(cudaEventDestroy(cb->event));
    return ncclSuccess;
  }
}
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
size_t workBytes = plan->workBytes;
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
void* fifoBuf;
void* fifoBufHost;
uint32_t fifoCursor, fifoMask;
switch (plan->workStorageType) {
case ncclDevWorkStorageTypeArgs:
plan->kernelArgs->workBuf = nullptr;
fifoBuf = (void*)plan->kernelArgs;
fifoBufHost = (void*)plan->kernelArgs;
fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes;
fifoMask = ~0u;
break;
case ncclDevWorkStorageTypeFifo:
fifoBuf = comm->workFifoBuf;
fifoBufHost = comm->workFifoBuf;
fifoCursor = comm->workFifoProduced;
fifoMask = comm->workFifoBytes-1;
waitWorkFifoAvailable(comm, fifoCursor + workBytes);
plan->kernelArgs->workBuf = comm->workFifoBufDev;
break;
case ncclDevWorkStorageTypePersistent:
ncclMemoryStackPush(&comm->memScoped);
fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16);
fifoBufHost = aligned_alloc(16, workBytes); // We rely on 16-byte alignment
fifoCursor = 0;
fifoMask = ~0u;
break;
@@ -1231,7 +1410,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
// Write the channel-shared work structs.
struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue);
while (workNode != nullptr) {
char* dst = (char*)fifoBuf;
char* dst = (char*)fifoBufHost;
char* src = (char*)(workNode+1);
for (int n = workNode->size; n != 0; n -= 16) {
memcpy(
@@ -1251,11 +1430,39 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence();
break;
case ncclDevWorkStorageTypePersistent:
NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes));
plan->kernelArgs->workBuf = plan->workBufPersistent;
NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes));
ncclMemoryStackPop(&comm->memScoped);
break;
{ ncclResult_t result = ncclSuccess;
  // Persistent plans: copy the host-staged work into a device buffer allocated
  // from comm->memPool on the shared device stream, then queue a callback that
  // releases the host staging buffer once the copy's event has been recorded.
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  void* fifoBufDev = nullptr;
  // Swap this thread's capture mode with `mode`; the matching exchange at
  // finish_scope swaps the previous mode back. Every exit after this point
  // must therefore go through finish_scope.
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
  // user's graph will be launched later, and it also acquires the deviceStream,
  // it will observe this upload.
  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope);
  CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
  plan->workBufPersistent = fifoBufDev;
  plan->kernelArgs->workBuf = fifoBufDev;
  CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
  cudaEvent_t memcpyDone;
  CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope);
  CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
  struct uploadWork_cleanup_t* cleanup;
  // Use the GOTO form like every other call in this scope: a plain NCCLCHECK
  // would return directly and leave the thread's capture mode swapped.
  NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, finish_scope);
  cleanup->base.fn = uploadWork_cleanup_fn;
  cleanup->base.event = memcpyDone;
  cleanup->hostBuf = fifoBufHost;
  ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base);
  NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope);
  NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope);
finish_scope:
  // Swap the capture mode back on both the success and the failure path.
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  if (result != ncclSuccess) return result;
} break;
default: break;
}
return ncclSuccess;
@@ -1269,6 +1476,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue);
while (op != nullptr) {
op->profilerContext = comm->profilerContext;
op->eActivationMask = op->coll <= ncclFuncAllReduce ? op->task.coll->eActivationMask : op->task.p2p->eActivationMask;
op->taskEventHandle = op->coll <= ncclFuncAllReduce ? op->task.coll->eventHandle : op->task.p2p->eventHandle;
ncclProfilerAddPidToProxyOp(op);
uint64_t oldId = op->opCount;
// Ignoring the bottom tag bit, opCount's are zero-based within plan so
// translate them to the tip of the comm's history.
@@ -1303,8 +1515,12 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
}
static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
NCCLCHECK(ncclProfilerStartGroupEvent(plan));
NCCLCHECK(ncclProfilerStartTaskEvents(plan));
NCCLCHECK(uploadProxyOps(comm, plan));
NCCLCHECK(ncclProxyStart(comm));
NCCLCHECK(ncclProfilerStopTaskEvents(plan));
NCCLCHECK(ncclProfilerStopGroupEvent(plan));
if (!plan->persistent) {
// Notify main thread of our reclaiming. This will reclaim plan concurrently.
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
@@ -1373,7 +1589,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
plan->comm = comm;
plan->reclaimer.fn = reclaimPlan;
plan->persistent = persistent;
// uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit.
// finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit.
plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
: ncclDevWorkStorageTypeFifo;
@@ -1652,10 +1868,15 @@ static ncclResult_t updateCollCostTable(
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
// CollNetDirect is only supported for up to 8 local GPUs
if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
/* now we only support single-node NVLS allgather and reducescatter */
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
/* Tree reduceScatter doesn't support scaling yet */
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (p == NCCL_PROTO_LL128 && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
table[a][p] = NCCL_ALGO_PROTO_IGNORE;
@@ -1711,6 +1932,8 @@ static ncclResult_t topoGetAlgoInfo(
info->protocol = protocol;
float time = minTime;
// Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
// coverity[check_after_sink]
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
WARN("Error : no algorithm/protocol available");
@@ -1746,7 +1969,7 @@ static ncclResult_t topoGetAlgoInfo(
#endif
}
#endif
if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
if (simInfo) simInfo->estimatedTime = time;
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
@@ -1819,6 +2042,7 @@ static ncclResult_t topoGetAlgoInfo(
info->nMaxChannels = nc;
}
if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
info->nWarps = nt/WARP_SIZE;
return ncclSuccess;
}
@@ -1869,8 +2093,15 @@ static ncclResult_t calcCollChunking(
pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo;
break;
case ncclFuncReduceScatter:
pattern =
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatUp :
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
ncclPatternRing;
break;
case ncclFuncAllGather:
pattern =
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatDown :
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
ncclPatternRing;
@@ -1897,6 +2128,8 @@ static ncclResult_t calcCollChunking(
case ncclPatternTreeUp:
case ncclPatternTreeDown:
case ncclPatternTreeUpDown:
case ncclPatternPatUp:
case ncclPatternPatDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollnetChain:
@@ -1959,13 +2192,17 @@ static ncclResult_t calcCollChunking(
int maxChunkSize = comm->nvlsChunkSize;
if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
// coverity[overflow_before_widen]
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
// coverity[overflow_before_widen]
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
chunkSize = comm->nvlsChunkSize;
int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
@@ -1979,14 +2216,21 @@ static ncclResult_t calcCollChunking(
int nNodes = comm->nNodes;
float ppn = comm->nRanks / (float)nNodes;
float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
// Yes, we are OK with the division on the left side of the < operand being integer.
// coverity[integer_division]
while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
// coverity[integer_division]
while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
} else if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) {
while (chunkSize*nChannels*32 > nBytes && chunkSize > 65536) chunkSize /= 2;
} else if (info->func == ncclFuncReduceScatter && info->algorithm == NCCL_ALGO_PAT) {
while (chunkSize*nChannels*16 > nBytes && chunkSize > 65536) chunkSize /= 2;
}
// Compute directFlags of work struct.
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
// Set direct direction for broadcast-gather (read or write)
*outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
*outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
*outDirectFlags = 0;
}
@@ -2035,6 +2279,10 @@ static ncclResult_t calcCollChunking(
}
}
if (pattern == ncclPatternPatUp || pattern == ncclPatternPatDown) {
proxyOp->nbytes = DIVUP(nBytes, nChannels);
}
*outChunkSize = chunkSize;
return ncclSuccess;
}
@@ -2066,6 +2314,7 @@ static ncclResult_t hostToDevRedOp(
opFull->proxyOp = op;
int nbits = 8*ncclTypeSize(datatype);
if (nbits <= 0) return ncclInvalidArgument;
uint64_t allBits = uint64_t(-1)>>(64-nbits);
uint64_t signBit = allBits^(allBits>>1);
@@ -2151,6 +2400,9 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
ncclGroupCommJoin(info->comm);
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
p2p->buff = (void*)info->recvbuff;
p2p->count = info->count;
p2p->datatype = info->datatype;
p2p->root = info->root;
p2p->bytes = nBytes;
p2p->opCount = comm->opCount;
ncclIntruQueueEnqueue(
@@ -2242,7 +2494,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
while (true) {
if (l == nullptr) { // Got to the end, this must be a new stream.
struct ncclCudaGraph graph;
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream))
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream));
if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) {
WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
return ncclInvalidUsage;
@@ -2294,7 +2546,7 @@ exit:
NCCLCHECK(ncclGroupEndInternal());
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
* so we have to check state here. */
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)); }
return ret;
fail:
if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
@@ -2312,7 +2564,8 @@ ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDa
int cap = 2*comm->userRedOpCapacity;
if (cap < 4) cap = 4;
ncclUserRedOp *ops = new ncclUserRedOp[cap];
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
if (comm->userRedOpCapacity > 0)
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
ops[ix].freeNext = ix + 1;
delete[] comm->userRedOps;
@@ -2328,8 +2581,10 @@ ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDa
user->datatype = datatype;
user->opFull.op = ncclDevPreMulSum;
if (residence == ncclScalarHostImmediate) {
int size = ncclTypeSize(datatype);
if (size < 1) return ncclInternalError;
user->opFull.scalarArgIsPtr = false;
std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
std::memcpy(&user->opFull.scalarArg, scalar, size);
} else {
user->opFull.scalarArgIsPtr = true;
user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
@@ -2346,6 +2601,10 @@ ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) {
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
return ncclInvalidArgument;
}
// int(ncclMaxRedOp) < int(op) will always be false due to the sizes of
// the datatypes involved, and that's by design. We keep the check though
// just as a reminder.
// coverity[result_independent_of_operands]
if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
WARN("ncclRedOpDestroy : operator is garbage.");
return ncclInvalidArgument;
+31 -25
Melihat File
@@ -424,6 +424,8 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
}
}
channel->collnetDirect.nHeads = nHeads;
// nHeads should always be greater than 0.
// coverity[divide_by_zero]
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
@@ -713,21 +715,23 @@ RCCL_PARAM(OutputTrees, "OUTPUT_TREES", 0);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent, int nc) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
ncclResult_t ret = ncclSuccess;
int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
int minHeadNum = INT_MAX;
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
int maxChannels;
int minNchannels, maxNchannels;
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
// Alternate rings to avoid crossing rails
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
@@ -804,7 +808,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
// Only use full MAXCHANNELS for gfx94x and gfx950
int maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ?
((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
@@ -817,7 +821,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
if (nChannels <= maxChannels/2) memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
// Get number of channels after duplication
int maxNchannels = std::min((int)ncclMaxNchannels(), maxChannels);
maxNchannels = std::min((int)ncclMaxNchannels(), maxChannels);
nc = std::min(maxNchannels/comm->nChannels, nc);
nc *= comm->nChannels;
@@ -845,7 +849,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int collNetNchannels = std::min(maxChannels, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -859,7 +863,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
int minNchannels = ncclMinNchannels();
minNchannels = ncclMinNchannels();
if (comm->nNodes > 1) {
minNchannels = std::min(64, minNchannels);
}
@@ -900,7 +904,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
if (comm->nChannels < comm->nvlsChannels) {
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
}
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
#endif
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
@@ -908,16 +912,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
free(ringRecv);
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return ncclSuccess;
exit:
if (ringRecv) free(ringRecv);
if (ringSend) free(ringSend);
if (ringPrev) free(ringPrev);
if (ringNext) free(ringNext);
if (treeToParent) free(treeToParent);
if (treeToChild0) free(treeToChild0);
if (treeToChild1) free(treeToChild1);
if (nvlsHeads) free(nvlsHeads);
return ret;
fail:
goto exit;
}
+70 -77
Melihat File
@@ -38,13 +38,13 @@ NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
for (int i=0; i<system->nodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS;
}
// breadth-first search to set all paths to that node in the system
struct ncclTopoNodeList nodeList;
struct ncclTopoNodeList nextNodeList;
struct ncclTopoNodeList nextNodeList = { { 0 }, 0 };
nodeList.count = 1; nodeList.list[0] = baseNode;
nextNodeList.count = 0;
struct ncclTopoLinkList* basePath;
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
@@ -82,7 +82,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
if (remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path
@@ -121,9 +121,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
const int linesize = 2048;
char line[linesize];
#ifdef ENABLE_TRACE
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
#else
snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
int offset = strlen(line);
#endif
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
@@ -160,14 +160,14 @@ ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
return ncclSuccess;
}
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
// Find the closest CPU to a GPU
int minHops = 0;
int localCpu = -1;
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
for (int c=0; c<system->nodes[CPU].count; c++) {
int hops = paths[c].count;
if (minHops == 0 || hops < minHops) {
if (hops > 0 && (minHops == 0 || hops < minHops)) {
localCpu = c;
minHops = hops;
}
@@ -198,20 +198,15 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
return ncclSuccess;
}
// Remove/free paths for a given type
// For each node type t, frees paths[nodeType] on every node of type t and
// paths[t] on every node of type nodeType, setting each freed pointer to NULL.
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for (int n=0; n<system->nodes[nodeType].count; n++) {
struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
free(node->paths[t]);
node->paths[t] = NULL;
// Remove/free all paths
// Visits every node of every type (t1, n) and, for each destination type t2,
// frees that node's paths[t2] array when it is non-NULL and leaves the pointer NULL.
static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
for (int t1=0; t1<NCCL_TOPO_NODE_TYPES; t1++) {
for (int n=0; n<system->nodes[t1].count; n++) {
struct ncclTopoNode* node = system->nodes[t1].nodes+n;
for (int t2=0; t2<NCCL_TOPO_NODE_TYPES; t2++) {
if (node->paths[t2]) free(node->paths[t2]);
node->paths[t2] = NULL;
}
}
}
}
@@ -225,6 +220,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (str) {
int disable = strtol(str, NULL, 0);
if (disable == 1) l = 0;
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
}
}
if (l == -1) {
@@ -246,9 +242,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
}
}
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
*level = l >= 0 ? l : -2;
}
return ncclSuccess;
@@ -257,16 +253,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
if (read) *read = 0;
if (intermediateRank) *intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
// GPU not found, we can't use p2p.
return ncclSuccess;
}
@@ -282,8 +278,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// By default don't use P2P across CPU Host Bridges and further apart
int p2pLevel = PATH_PXB;
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
// Allow P2P between pairs of GPUs on AMD systems
if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS;
// User override
if (ncclTopoUserP2pLevel == -1)
@@ -293,16 +294,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
goto compare;
}
// Don't use P2P through ARM CPUs
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
@@ -364,7 +355,7 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn
if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
(fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x",
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId);
*ret = 1;
}
return ncclSuccess;
@@ -471,7 +462,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 1);
// Check whether going through the network would be faster than going through P2P/SHM.
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) {
if (ncclParamNetDisableIntra() == 1) {
*net = 0;
return ncclSuccess;
@@ -479,8 +470,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
*net = 1;
// First check the current GPU-to-GPU speed.
int g1, g2;
if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess ||
ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) {
if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
return ncclSuccess;
}
@@ -602,7 +593,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
ncclTopoRemovePaths(system);
// Set direct paths to CPUs. We need them in many cases.
for (int c=0; c<system->nodes[CPU].count; c++) {
@@ -628,11 +619,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int p=0; p<system->nodes[GPU].count; p++) {
int p2p;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
NCCLCHECK(ncclGetLocalCpu(system, g, &cpu));
NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
@@ -644,10 +635,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
if (p == g) continue;
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo));
if (p2p == 0) {
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo));
if (shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
@@ -694,7 +685,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
@@ -708,7 +699,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
if (gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu));
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
@@ -721,11 +712,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
RCCL_PARAM(EnableIntranet, "ENABLE_INTRANET", -2);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
ncclResult_t ret = ncclSuccess;
int *domains;
int64_t *ids;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
int64_t *ids = NULL;
int myDomain = 0;
int ngpus = system->nodes[GPU].count;
int remove = 1;
int gdr = 1;
bool allXgmi = true;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
domains[g] = g;
@@ -738,7 +734,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
for (int i=0; i<ngpus; i++) {
if (domains[i] == myDomain) continue;
struct ncclTopoNode* gpu = NULL;
@@ -749,11 +744,10 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
}
if (gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail);
}
// trim low speed port on same NIC
@@ -772,15 +766,12 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
if (system->nodes[NET].nodes[n].net.bw == 0) break;
}
if (n<system->nodes[NET].count) {
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
}
else
break;
} while (system->nodes[NET].count);
int remove = 1;
int gdr = 1;
bool allXgmi = true;
// detect if all GPUs are connected by XGMI
for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
@@ -788,15 +779,15 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
if (i == j) continue;
int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
NCCLCHECKGOTO(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI), ret, fail);
allXgmi &= isXGMI;
}
}
if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
for (int g = 0; g < system->nodes[GPU].count; g++) {
int64_t netId;
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &netId, nullptr));
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netId, 1, &gdr));
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, 0, &netId, nullptr), ret, fail);
NCCLCHECKGOTO(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netId, 1, &gdr), ret, fail);
if (!gdr) break;
}
if (gdr && !allXgmi) {
@@ -813,16 +804,18 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks && remove) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
}
exit:
free(domains);
free(ids);
return ncclSuccess;
if (ids) free(ids);
return ret;
fail:
goto exit;
}
// Frees the path arrays attached to the system's nodes (via the path-removal
// helper), then frees the ncclTopoSystem allocation itself. `system` must not
// be used after this call.
void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
ncclTopoRemovePaths(system);
free(system);
}
@@ -851,17 +844,17 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp
// Remote rank, use network
int nNetChannels = ncclParamNChannelsPerNetPeer();
if (nNetChannels == -1) {
//start from 2 channels per NIC and reduce with scale
nNetChannels = 2;
//start from 2 channels per NIC and reduce with scale
nNetChannels = 2;
// check if we need to use more than one NIC, hence more than one channel
int netCountByBw = 1, nChannelsMax = nNetChannels;
NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
// check if we need to use more than one NIC, hence more than one channel
int netCountByBw = 1, nChannelsMax = nNetChannels;
NCCLCHECK(getLocalNetCountByBw(system, g, &netCountByBw));
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
while (nChannelsMax*comm->nRanks > comm->p2pnChannels*4 && nChannelsMax > 1) nChannelsMax /= 2;
//allow up to the number of channels required to drive the NICs
nNetChannels = std::max(netCountByBw, nChannelsMax);
//allow up to the number of channels required to drive the NICs
nNetChannels = std::max(netCountByBw, nChannelsMax);
}
*nChannels = nNetChannels;
}
+17 -11
Melihat File
@@ -6,17 +6,23 @@
#include "core.h"
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
// Formats `prefix` followed by the entries of `values` (up to `nranks` of them)
// into one text line and emits it through INFO(NCCL_INIT, ...).
void dumpLine(int* values, int nranks, const char* prefix) {
// Fixed-layout variant: blank-filled buffer of STRLENGTH characters, prefix
// copied in (at most PREFIXLEN characters), then at most MAXWIDTH values
// written as " %3d" at 4-character strides starting at strlen(prefix).
int prefixlen = strlen(prefix);
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
INFO(NCCL_INIT,"%s", line);
// Bounded variant: everything goes through snprintf into a 128-byte buffer.
constexpr int line_length = 128;
char line[line_length];
// Number of characters needed to print nranks-1; used below as the field
// width for every value.
int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf"
// n accumulates the length snprintf reports, which can exceed what was
// actually stored when the output is truncated.
int n = snprintf(line, line_length, "%s", prefix);
// Stop appending once the reported length reaches the last usable byte.
for (int i = 0; i < nranks && n < line_length-1; i++) {
n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]);
// At this point n may be more than line_length-1, so don't use it
// for indexing into "line".
}
if (n >= line_length) {
// Sprintf wanted to write more than would fit in the buffer. Assume
// line_length is at least 4 and replace the end with "..." to
// indicate that it was truncated.
snprintf(line+line_length-4, 4, "...");
}
INFO(NCCL_INIT, "%s", line);
}
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@@ -32,7 +38,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+94 -68
Melihat File
@@ -108,6 +108,9 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
revBw += fwBw;
}
// Coverity thinks that revLink could be NULL below. However, we access it only if revBw is non-0, and the
// logic of the code is that revBw can become non-0 only if revLink is non-NULL (see the "if" statement right above).
// coverity[var_deref_op]
if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
SUB_ROUND(link->bw, fwBw);
if (revBw) SUB_ROUND(revLink->bw, revBw);
@@ -177,7 +180,7 @@ static int gpuPciBw(struct ncclTopoNode* gpu) {
}
/* Choose the order in which we try next GPUs. This is critical for the search
to quickly converge to the best solution even if it eventually times out. */
to quickly converge to the best solution even if it eventually times out. */
struct ncclGpuScore {
int g; // Retain the index
int startIndex; // Least important
@@ -189,15 +192,15 @@ struct ncclGpuScore {
};
// Comparator over two struct ncclGpuScore entries passed as untyped pointers.
// Returns the first non-zero difference among the fields, checked in this order:
//   interBw (s2 - s1), interPciBw (s2 - s1), interNhops (s1 - s2),
//   intraBw (s2 - s1), intraNhops (s1 - s2), and finally startIndex (s1 - s2).
// Assuming the usual qsort convention (negative result places g1 first), this
// ranks higher bandwidth and fewer hops earlier, with startIndex as the
// ascending tie-breaker — TODO confirm against the caller, which is not visible here.
static int cmpScore(const void * g1, const void * g2) {
struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1;
struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2;
int d;
if ((d = (s2->interBw - s1->interBw))) return d;
if ((d = (s2->interPciBw - s1->interPciBw))) return d;
if ((d = (s1->interNhops - s2->interNhops))) return d;
if ((d = (s2->intraBw - s1->intraBw))) return d;
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
return s1->startIndex - s2->startIndex;
// NOTE(review): the statements below repeat the ones above token for token;
// presumably the change to this body was whitespace-only.
struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1;
struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2;
int d;
if ((d = (s2->interBw - s1->interBw))) return d;
if ((d = (s2->interPciBw - s1->interPciBw))) return d;
if ((d = (s1->interNhops - s2->interNhops))) return d;
if ((d = (s2->intraBw - s1->intraBw))) return d;
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
return s1->startIndex - s2->startIndex;
}
static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
@@ -481,6 +484,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
// 2. add other NETs satisfying typeInter but not already in the list.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
ncclResult_t ret = ncclSuccess;
int netCount = 0;
int localNetCount;
int* localNets;
@@ -493,8 +497,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
for (int c = 0; c<MAXCHANNELS; c++) {
int64_t netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
localNetCount++;
}
@@ -528,12 +532,15 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
}
*netCountRet = netCount;
exit:
free(localNets);
return ncclSuccess;
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
ncclResult_t ret = ncclSuccess;
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -555,6 +562,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
}
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
int g = gpu - system->nodes[GPU].nodes;
int* nets = NULL;
if (step == backToNet) {
// first get back to NIC
if (system->nodes[NET].count) {
@@ -562,15 +570,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
int netCount;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
for (int i=0; i<netCount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
} else {
if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
}
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
@@ -582,18 +592,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
graph->bwInter /= 2;
}
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
graph->bwInter = bwInterSave;
if (net) {
graph->inter[graph->nChannels*2+1] = net->id;
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
graph->bwInter = bwInterSave;
}
}
free(nets);
}
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
@@ -629,23 +638,29 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
// Next path
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return ncclSuccess;
exit:
if (nets) free(nets);
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
ncclResult_t ret = ncclSuccess;
const int bw = graph->bwInter;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netCount;
int graphFound = 0;
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
for (int i=0; i<netCount; i++) {
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
int n = nets[(graph->nChannels+i)%netCount];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->collNet && net->net.collSupport == 0) continue;
if (net->net.bw < bw) continue;
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2
&& (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
graph->inter[graph->nChannels*2] = net->id;
graph->latencyInter = net->net.latency;
@@ -661,25 +676,28 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
if (graph->nChannels < netCount) {
int gpu;
int duplicate = 0;
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
// check whether there is duplicate head when one GPU connects with multiple NICs
for (int gc = 0; gc < graph->nChannels; gc++) {
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
duplicate = 1;
break;
NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
if (gpu != -1) {
int duplicate = 0;
// check whether there is duplicate head when one GPU connects with multiple NICs
for (int gc = 0; gc < graph->nChannels; gc++) {
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
duplicate = 1;
break;
}
}
if (!duplicate) {
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
graphFound = 1;
}
}
if (duplicate) continue;
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
graphFound = 1;
}
} else {
if (graph->nChannels > 0) {
// Try to replay the last channel
int g;
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
@@ -699,7 +717,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
if (t == -1) *time = -1;
}
@@ -711,7 +729,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
} else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
@@ -719,7 +737,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
for (int i=0; i<system->nodes[GPU].count; i++) {
int g = (graph->nChannels+i)%system->nodes[GPU].count;
if (paths[g].bw == maxBw && paths[g].count == minHops) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
}
}
}
@@ -733,27 +751,30 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
}
exit:
free(nets);
return ncclSuccess;
return ret;
fail:
goto exit;
}
/* Search Patterns
 *
 *     Intra-node
 * Ring            : GPU a -> GPU b -> .. -> GPU x -> GPU a
 * (=Split Tree Loop)
 * Tree            : GPU a -> GPU b -> .. -> GPU x
 * (=Split Tree)
 *
 *     Inter-node
 * Ring            : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
 * Tree            : NET n -> GPU a -> GPU b -> .. -> GPU x
 *                              `--> NET n (or m if crossNic)
 * Split Tree      : NET n -> GPU a -> GPU b -> .. -> GPU x
 *                                       `--> NET n (or m if crossNic)
 * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
 *                                       `--> NET n (or m if crossNic)
 */
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
@@ -979,7 +1000,7 @@ RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0);
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
graph->crossNic = crossNic == 1 ? 1 : 0;
@@ -1148,9 +1169,10 @@ search:
}
tmpGraph.typeInter = PATH_PIX;
if (crossNic == 2 && tmpGraph.crossNic == 0) {
if (crossNic == 2 && tmpGraph.crossNic == 0
&& (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
// Try again with crossNic if permitted
tmpGraph.crossNic = 1;
tmpGraph.crossNic = 2;
goto search;
}
tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
@@ -1220,7 +1242,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
sprintf(line, "%2d :", c);
int offset = strlen(line);
if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
offset = strlen(line);
}
for (int i=0; i<ngpus; i++) {
@@ -1238,7 +1260,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
}
}
if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks && !graph->nIntraChannels) {
  // Append the NET stored in the second inter[] slot of channel c. Both the
  // system ID and the local ID must be extracted from that same entry
  // (inter[2*c+1]); mixing in inter[2*c] would print the local ID of the
  // NET already shown at the start of the line.
  sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1]));
  offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s", line);
@@ -1247,16 +1269,20 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
}
/* Dump the given graphs as XML to the file named by the NCCL_GRAPH_DUMP_FILE
 * environment variable. Does nothing when the variable is not set.
 *
 * system  : topology the graphs were computed on
 * ngraphs : number of entries in graphs
 * graphs  : graphs to serialize
 *
 * Returns ncclSuccess, or the first error reported by the XML helpers.
 * The temporary XML buffer is released on every path.
 */
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
  ncclResult_t ret = ncclSuccess;
  const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
  // Declared at function scope (and NULL-initialized) so the exit path can
  // release it no matter where a failure happens.
  struct ncclXml* xml = NULL;
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    // Nothing is allocated yet, so a plain early return is safe here.
    NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
    NCCLCHECKGOTO(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml), ret, fail);
    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(str, xml), ret, fail);
  }
exit:
  free(xml);  // free(NULL) is a no-op
  return ret;
fail:
  goto exit;
}
#include "comm.h"
+98 -90
Melihat File
@@ -60,8 +60,8 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
for (int l=0; l<node->nlinks; l++) {
// Go up the PCI tree to find the CPU. Follow only PCI switches.
if (node->links[l].type == LINK_PCI
&& (node->links[l].remNode->type == PCI
|| node->links[l].remNode->type == CPU)) {
&& (node->links[l].remNode->type == PCI
|| node->links[l].remNode->type == CPU)) {
NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
}
if (*cpu != NULL) return ncclSuccess;
@@ -198,6 +198,7 @@ int getBcmGen(uint64_t id, int level) {
return 0;
}
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
ncclResult_t ret = ncclSuccess;
for (int s=0; s<system->nodes[PCI].count; s++) {
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
int gen = getBcmGen(pciSwitch->pci.device, 0);
@@ -223,7 +224,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
for (int s=0; s<subs; s++) {
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
int index;
NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
NCCLCHECKGOTO(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index), ret, fail);
struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
// Connect all sub PCI devices to the parent switch
for (int l=0; l<sub->nlinks; l++) {
@@ -232,7 +233,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
// Add link from parent PCI switch -> PCI device
if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
pciSwitch->nlinks++;
@@ -244,16 +246,20 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
}
}
}
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, PCI, index), ret, fail);
}
// Set subdevice to 0xffff to make sure we don't merge this switch again.
pciSwitch->pci.device |= 0xffff;
free(subSwIds);
// Restart, as system->nodes[PCI].nodes has changed.
s = 0;
continue;
fail:
free(subSwIds);
return ret;
}
}
return ncclSuccess;
return ret;
}
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
@@ -287,7 +293,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) {
sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
sprintf(line+offset, "+ %s[%2.1f] - %s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
INFO(NCCL_GRAPH, "%s", line);
} else if (link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
@@ -296,9 +302,9 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
}
INFO(NCCL_GRAPH, "%s", line);
}
@@ -807,84 +813,87 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
}
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
ncclResult_t ret = ncclSuccess;
struct ncclXml* xml;
char* mem = NULL;
int* localRanks = NULL;
int netDevCount = 0;
struct ncclXml* rankXml;
int localRank = -1, nLocalRanks = 0;
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
NCCLCHECKGOTO(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1), ret, fail);
} else {
// Try default XML topology location
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail);
}
if (xml->maxIndex == 0) {
// Create top tag
struct ncclXmlNode* top;
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
NCCLCHECKGOTO(xmlAddNode(xml, NULL, "system", &top), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION), ret, fail);
}
NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
NCCLCHECKGOTO(ncclTopoRefreshBcmP2pLinks(), ret, fail);
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail);
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail);
if (node) {
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, n, &props));
NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
}
}
if (netDevCount == 0) {
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
comm->netDeviceType = props.netDeviceType;
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
// XML topo fusion.
int* localRanks;
int localRank = -1, nLocalRanks = 0;
if (comm->MNNVL) {
// MNNVL clique support
nLocalRanks = comm->clique.size;
@@ -892,7 +901,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
localRanks = comm->clique.ranks;
} else {
// Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations.
NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
NCCLCHECKGOTO(ncclCalloc(&localRanks, comm->nRanks), ret, fail);
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
if (i == comm->rank)
@@ -901,37 +910,42 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
}
}
}
char* mem;
NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
NCCLCHECKGOTO(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
NCCLCHECKGOTO(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1), ret, fail);
// nLocalRanks can't actually be 0, or we wouldn't be running at all...
// coverity[divide_by_zero]
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
if (comm->MNNVL) {
// Ensure that we have enough room when fusing topos from multiple nodes.
free(xml);
NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
xml = NULL;
NCCLCHECKGOTO(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES), ret, fail);
} else {
// In the intra-node case there's no need to enlarge the topo xml.
xml->maxIndex = 0;
free(localRanks);
}
for (int i = 0; i < nLocalRanks; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
NCCLCHECKGOTO(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0), ret, fail);
NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
}
free(mem);
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
}
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
exit:
if (!comm->MNNVL && localRanks) free(localRanks);
if (mem) free(mem);
free(xml);
return ncclSuccess;
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
@@ -940,6 +954,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
int count = 0;
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
if (paths == NULL) { *localCount = 0; return ncclSuccess; }
for (int i=0; i<system->nodes[resultType].count; i++) {
if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
maxBw = paths[i].bw;
@@ -963,13 +978,13 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
//caveat, this could be wrong if there is a PCIe switch,
//and a narrower link to the CPU
if (system->nodes[GPU].nodes[gpu].links[l].remNode->type == CPU) {
gpuBw = system->nodes[GPU].nodes[gpu].links[l].bw;
gpuBw = system->nodes[GPU].nodes[gpu].links[l].bw;
}
}
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) {
totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw;
}
*count = netCountByBw;
@@ -978,6 +993,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
ncclResult_t ret = ncclSuccess;
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int* localNets = NULL;
@@ -985,13 +1001,13 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
int* localGpus = NULL;
int localGpuCount;
int net = 0;
if (localNetCount == 0) {
*id = -1;
free(localNets);
return ncclSuccess;
}
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
int net = 0;
NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
for (int i = 0; i < localGpuCount; i++) {
if (gpu == localGpus[i]) {
net = i;
@@ -1007,33 +1023,39 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
}
exit:
free(localNets);
free(localGpus);
return ncclSuccess;
if (localGpus) free(localGpus);
return ret;
fail:
goto exit;
}
/* Find a GPU whose local NET (as reported by ncclTopoGetLocalNet for some
 * channel) is the NET identified by netId.
 *
 * system   : topology to search
 * netId    : ID of the NET node
 * gpuIndex : out; index of the matching GPU in system->nodes[GPU].nodes,
 *            or -1 when no GPU matches (or when an error is returned)
 *
 * Returns ncclSuccess (also when nothing matched), or the error reported by
 * one of the topology lookups.
 */
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
  ncclResult_t ret = ncclSuccess;
  int netIndex;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
  int* localGpus = NULL;
  int localGpuCount;
  int foundGpu = -1;
  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
  // Channels are the outer loop: the first (channel, GPU) pair whose local NET
  // equals netId wins.
  for (int c=0; c<MAXCHANNELS; c++) {
    for (int lg=0; lg<localGpuCount; lg++) {
      int g = localGpus[lg];
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      int64_t id;
      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
      if (netId == id) {
        foundGpu = g;
        goto exit;
      }
    }
  }
exit:
  // Single exit point: publish the result, release the candidate list, and
  // propagate any lookup error instead of masking it with ncclSuccess.
  *gpuIndex = foundGpu;
  free(localGpus);
  return ret;
fail:
  goto exit;
}
/****************************/
@@ -1051,25 +1073,11 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
gpu = system->nodes[GPU].nodes+g;
// Find closer CPU
int cpuIndex = -1, minHops = 0;
for (int c=0; c<system->nodes[CPU].count; c++) {
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
if (cpuIndex == -1 || nHops < minHops) {
cpuIndex = c;
minHops = nHops;
}
}
cpu = system->nodes[CPU].nodes+cpuIndex;
}
}
if (cpu == NULL) {
WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
return ncclInternalError;
}
int gpuIndex, cpuIndex;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex));
NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex));
gpu = system->nodes[GPU].nodes+gpuIndex;
cpu = system->nodes[CPU].nodes+cpuIndex;
// Query the CPU affinity set we were provided
cpu_set_t mask;
+4 -3
Melihat File
@@ -37,7 +37,7 @@
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
#define NCCL_TOPO_NODE_TYPES 7
#define NCCL_TOPO_NODE_TYPES 6
#define GPU 0
#define PCI 1
#define NVS 2
@@ -111,9 +111,10 @@ struct ncclTopoLinkList {
#define NCCL_TOPO_UNDEF (-1)
// A topology ID packs a system ID into the bits above bit 55 and a local ID
// into the low 56 bits. Every macro argument is parenthesized so that
// expression arguments (e.g. `a | b`) expand with the intended precedence.
#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
// Extract the system ID (bits 56 and up).
#define NCCL_TOPO_ID_SYSTEM_ID(id) ((id) >> 56)
// Extract the local ID (low 56 bits).
#define NCCL_TOPO_ID_LOCAL_ID(id) ((id) & NCCL_TOPO_ID_LOCAL_ID_MASK)
// Build an ID; localid is masked so it cannot spill into the system ID bits.
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + ((localid) & NCCL_TOPO_ID_LOCAL_ID_MASK))
#define RCCL_TOPO_CR8G 1
#define RCCL_TOPO_4P2H_ROME 2
+52 -29
Melihat File
@@ -25,7 +25,7 @@ static int getNthreads(const char* name, int env, int min, int max, int def, int
} else if (nt < min) {
WARN("Invalid %s %d (minimum %d).", name, nt, min);
nt = min;
}
}
} else {
nt = def;
}
@@ -55,9 +55,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
// Base latency table in us: one row per algorithm, one column per protocol
// { LL, LL128, Simple }. Each row is listed exactly once; any algorithm index
// beyond the six rows given here is zero-initialized.
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
  { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Tree, Ring
  { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 },   // Collnet Direct, Chain
  {    0,    0,    0 }, {    0,    0,    0 }};  // NVLS, NVLS Tree
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -288,6 +288,15 @@ static const double perChMaxTreeBws[3][3] = {
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
// Decide whether the PAT algorithm may be used on this communicator.
// Any PAT_ENABLE value other than 2 is returned unchanged. The value 2
// selects auto-detection, which returns 1 only when both conditions hold:
//  - nNodes == nRanks (PAT only supports 1 GPU per node)
//  - the net device type is NCCL_NET_DEVICE_HOST (PAT doesn't support net
//    device offload)
static int ncclPatEnable(struct ncclComm* comm) {
  const int setting = ncclParamPatEnable();
  if (setting != 2) {
    return setting;
  }
  const int oneGpuPerNode = (comm->nNodes == comm->nRanks);
  const int hostNetDevice = (comm->netDeviceType == NCCL_NET_DEVICE_HOST);
  return (oneGpuPerNode && hostNetDevice) ? 1 : 0;
}
// Network post overhead in ns (1000 = 1 us)
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
@@ -339,7 +348,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
// De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
//if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
float ppn = (float)nRanks / nNodes;
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
@@ -349,19 +358,19 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
&& a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
&& a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
&& a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
@@ -379,11 +388,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
#else
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
if (a == NCCL_ALGO_PAT) busBw *= .85;
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -412,7 +422,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
#endif
// Convert bus BW to algorithm BW
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
float ratio = 1.0f;
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
@@ -426,8 +436,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
//if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
float interLat = ppn == 1 ? rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
interLat += graphs[a]->latencyInter;
// Also add the flush extra latency
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
if (a == NCCL_ALGO_RING) {
@@ -447,11 +458,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
comm->latencies[coll][a][p] +=
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
if (coll == ncclFuncAllReduce) {
comm->latencies[coll][a][p] +=
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
}
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
@@ -461,6 +475,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (nNodes > 1) comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += 2*(nNodes-1)*rcclTuningModel[comm->topo->tuning].hwLat[NCCL_HW_NET][a][p];
} else if (a == NCCL_ALGO_PAT) {
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
comm->latencies[coll][a][p] = 8 // Base time
+ log2i(nNodes) * (interLat/3.5) // Log latency
+ nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
}
}
}
}
@@ -469,7 +489,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
const char *protoStr = ncclGetEnv("NCCL_PROTO");
if (protoStr) {
@@ -551,23 +571,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->rank == 0) {
char line[1024];
for (int block=0; block<2; block++) {
for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
sprintf(line, " Algorithm |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
for (int ba=0; ba<3; ba++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
@@ -575,8 +597,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
@@ -657,7 +680,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
else bw *= rcclTuningModel[comm->topo->tuning].ringCorrectionFactor[protocol][26];
}
#else
if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
&& coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
+5 -2
Melihat File
@@ -480,8 +480,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
return ncclInternalError;
}
// Set affinity
char cpumaskPath[] = "/sys/devices/system/node/node0000";
sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
char cpumaskPath[] = "/sys/devices/system/node/node000000";
snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
}
@@ -711,6 +711,9 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
}
pciNode->parent = parent;
// Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
// Coverity complains about dereferenced parent being NULL
// but this can never happen.
// coverity[var_deref_op]
int subIndex = parent->nSubs;
const char* newBusId;
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
+38 -18
Melihat File
@@ -62,7 +62,12 @@ ncclResult_t ncclAsyncLaunch(
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
ret = ncclInvalidArgument;
}
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
if (ret == ncclSuccess) {
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
} else {
// no need to undo, the job hasn't run
if (destructor) destructor(job);
}
}
return ret;
@@ -80,7 +85,7 @@ void* ncclAsyncJobMain(void* arg) {
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
ncclResult_t ret;
SYSCHECK(pthread_join(job->thread, NULL), "pthread_join");
PTHREADCHECK(pthread_join(job->thread, NULL), "pthread_join");
if (job->result != ncclSuccess) {
WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result);
}
@@ -179,6 +184,12 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
break;
}
case NCCL_ALGO_PAT: {
NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
break;
}
// Yes, it's a dead code. That's fine...
// coverity[dead_error_begin]
default: {
ret = ncclInternalError;
goto fail;
@@ -317,7 +328,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
}
if (!comm->config.blocking)
@@ -345,7 +356,7 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
do {
SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail);
PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
job = job->next;
} while (job != nullptr);
@@ -357,8 +368,9 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (state == ncclGroupJobRunning) {
jobsDone = false;
} else if (state == ncclGroupJobDone) {
if (pthread_join(job->thread, nullptr) != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
int err;
if ((err = pthread_join(job->thread, nullptr)) != 0) {
WARN("Error waiting for pthread_join: %s", strerror(err));
ret = ncclSystemError;
}
job->state = ncclGroupJobJoined;
@@ -389,13 +401,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (ret != ncclSuccess) goto fail;
}
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
exit:
return ret;
fail:
@@ -409,6 +414,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
bool *groupAbortFlag = gjob->abortFlagPtr;
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
@@ -425,7 +431,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
ncclIntruQueueEnqueue(asyncJobsMain, (struct ncclAsyncJob*)job);
struct ncclComm* next = comm->preconnectNext;
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -438,12 +444,14 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
/* Connect channels at runtime if cumem is supported */
if (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
ncclIntruQueueConstruct(&asyncCollJobs);
do {
bool needConnect = false;
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
// CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
if (comm->cuMemSupport && needConnect) {
@@ -454,21 +462,33 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
}
comm = comm->groupNext;
} while (comm);
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
if (job->destructor) job->destructor((void*)job);
}
}
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
}
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
while (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclComm* next = comm->groupNext;
@@ -559,7 +579,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
}
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail);
ret = ncclInProgress;
} else {
/* blocking group */
+135 -10
Melihat File
@@ -19,6 +19,11 @@
#include <string.h>
#include "rccl_vars.h"
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
#endif
uint64_t clockNano(); // from utils.h with which we have a circular dependency
template<typename T>
@@ -26,6 +31,81 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
template<>
constexpr size_t ncclSizeOfT<void>() { return 1; }
#if CUDART_VERSION >= 12020
// Allocates pinned host memory through the cuMem (virtual memory management)
// driver API and maps it so that both the current GPU and the CPU can access it.
//
//   ptr     [out] receives the mapped virtual address.
//   handlep [out, optional] receives the allocation handle when non-NULL.
//   size    requested size in bytes; it is passed through ALIGN_SIZE with the
//           minimum allocation granularity before use (presumably rounding it
//           up in place -- ALIGN_SIZE is defined elsewhere, confirm).
//
// Returns `result`, which is never changed from ncclSuccess in this body.
// NOTE(review): the CUDACHECK/CUCHECK macros are defined elsewhere; if they
// return early on failure, a failure after cuMemCreate leaves the handle, the
// reserved address range and/or the mapping unreleased -- confirm and consider
// a goto-based cleanup path.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
  ncclResult_t result = ncclSuccess;
  size_t granularity = 0;
  CUdevice currentDev;
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int cpuNumaNodeId = -1;
  CUmemAllocationHandleType type = ncclCuMemHandleType;
  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  // Ask the driver which host NUMA node the current device reports; a negative
  // answer is replaced by node 0.
  CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
  if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;
  prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.requestedHandleTypes = type; // So it can be exported
  prop.location.id = cpuNumaNodeId;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);
  /* Allocate the physical memory on the host NUMA node selected in prop */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
  /* Now allow RW access to the newly mapped memory for local GPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
  /* Now allow RW access to the newly mapped memory from the CPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  accessDesc.location.id = cpuNumaNodeId;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
  if (handlep) *handlep = handle;
  // size and granularity are size_t, so they are printed with %zu.
  INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zu pointer %p handle %llx numa %d dev %d granularity %zu", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity);
  return result;
}
// Tears down a cuMem mapping located at `ptr`: drops the allocation handle,
// unmaps the range and frees the reserved virtual addresses.
// A NULL `ptr` is accepted and reported as ncclSuccess.
// Returns `result`, which is never changed from ncclSuccess in this body.
// NOTE(review): presumably `ptr` must be an address produced by
// ncclCuMemHostAlloc -- confirm against callers.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
CUmemGenericAllocationHandle handle;
size_t size = 0;
// Recover the handle backing ptr. Per the CUDA driver documentation the
// retain call adds a reference, which the release right below gives back.
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
// Query the size of the range containing ptr; it is needed for unmap/free.
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
// Second release: presumably drops the reference held since allocation -- confirm.
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
#else /* CUDART_VERSION >= 12020 */
// Fallback used when CUDART_VERSION < 12020: no allocation is attempted.
// All parameters are ignored; a warning is logged and ncclInternalError returned.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) {
WARN("CUMEM Host is not supported prior to CUDA 12.2");
return ncclInternalError;
}
// Fallback used when CUDART_VERSION < 12020: nothing is freed.
// Logs a warning and returns ncclInternalError for every input.
// NOTE(review): unlike the CUDA >= 12.2 variant, a NULL `ptr` is also reported
// as an error here -- confirm that callers do not rely on NULL being a no-op.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
WARN("CUMEM Host is not supported prior to CUDA 12.2");
return ncclInternalError;
}
#endif /* CUDART_VERSION >= 12020 */
template <typename T>
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
@@ -51,24 +131,25 @@ finish:
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
inline ncclResult_t ncclCudaHostFree(void* ptr) {
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
template <typename T>
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
if (nelem > 0) {
void* p = malloc(nelem*ncclSizeOfT<T>());
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
memset(p, 0, nelem*ncclSizeOfT<T>());
*ptr = (T*)p;
*ptr = p;
} else {
*ptr = NULL;
}
@@ -78,17 +159,17 @@ ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int li
template <typename T>
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if (nelem < oldNelem) return ncclInternalError;
T* oldp = *ptr;
if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError;
if (nelem == oldNelem) return ncclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
free(oldp);
if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT<T>());
if (oldp) free(oldp);
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
*ptr = (T*)p;
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
@@ -113,6 +194,40 @@ extern struct allocationTracker allocTracker[];
#include <cuda.h>
#include "cudawrap.h"
// ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
//   ptr      [out] receives the newly reserved and mapped virtual address.
//   handleIn existing allocation handle; it is only read here (its properties
//            are queried and it is passed to cuMemMap), never released.
//   size     size in bytes; passed through ALIGN_SIZE with the allocation's
//            minimum granularity before use (ALIGN_SIZE is defined elsewhere;
//            presumably rounds up in place -- confirm).
// Access is granted read/write to the current CUDA device only.
// Returns `result`, which is never changed from ncclSuccess in this body.
// NOTE(review): if CUCHECK returns early on failure (macro defined elsewhere),
// a failure after cuMemAddressReserve leaves the reservation in place -- confirm.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
// Recover the properties the handle was created with so the granularity
// query matches the original allocation.
CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn));
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0));
/* Now allow RW access to the newly mapped memory */
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
return result;
}
// Unmaps the range containing `ptr` and frees its virtual address reservation.
// No allocation handle is released here; only the mapping and the address
// range are torn down.
// A NULL `ptr` is accepted and reported as ncclSuccess.
// Returns `result`, which is never changed from ncclSuccess in this body.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
size_t size = 0;
// The size of the mapped range is not passed in, so it is queried from the driver.
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
@@ -130,7 +245,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
prop.requestedHandleTypes = type;
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
@@ -178,6 +293,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
return ncclInternalError;
}
// Fallback variant for builds without cuMem support: nothing is mapped.
// All parameters are ignored; a warning is logged and ncclInternalError returned.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
// Fallback variant for builds without cuMem support: nothing is unmapped.
// Logs a warning and returns ncclInternalError for every input, including NULL.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
#endif
template <typename T>
@@ -297,7 +421,8 @@ finish:
// and if they are shared, that could cause a crash in a child process
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
if (size > 0) {
size_t page_size = sysconf(_SC_PAGESIZE);
long page_size = sysconf(_SC_PAGESIZE);
if (page_size < 0) return ncclSystemError;
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
+11
Melihat File
@@ -185,6 +185,8 @@ inline __host__ __device__ Int pow2Up(Int x) {
template<typename Int>
inline __host__ __device__ Int pow2Down(Int x) {
// True, log2Down can return -1, but we don't normally pass 0 as an argument...
// coverity[negative_shift]
return Int(1)<<log2Down(x);
}
@@ -274,4 +276,13 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
return u32fpDecode(x, 3);
}
// DJB2a-style hash of the first n bytes of `string`.
// The accumulator starts at 5381; each byte multiplies it by 33 and is then
// XORed in. Arithmetic is unsigned 64-bit and therefore wraps on overflow.
// Bytes are read as plain `char`, so how values >= 0x80 widen follows the
// platform's char signedness. Returns 5381 when n <= 0.
inline __host__ __device__ uint64_t getHash(const char* string, int n) {
  uint64_t hash = 5381;
  int idx = 0;
  while (idx < n) {
    hash = (hash * 33) ^ string[idx];
    ++idx;
  }
  return hash;
}
#endif
+2 -2
Melihat File
@@ -19,8 +19,8 @@ static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Boots
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+38 -23
Melihat File
@@ -38,21 +38,17 @@
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
#define SYSCHECK(statement, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
WARN("Call to " name " failed: %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
#define SYSCHECKSYNC(statement, name, retval) do { \
retval = (statement); \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
@@ -60,14 +56,33 @@
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
#define SYSCHECKGOTO(statement, name, RES, label) do { \
int retval; \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed: %s", strerror(errno)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0);
} while (0)
// Pthread calls don't set errno and never return EINTR.
// PTHREADCHECK(statement, name): evaluates `statement` exactly once and treats
// any non-zero result as an error code. On error it logs a warning that
// includes `name` (must be a string literal, since it is concatenated into the
// format string) and strerror() of the returned code, then returns
// ncclSystemError from the enclosing function.
// The macro declares a local named `retval`, so `statement` must not refer to
// an outer variable of that name.
#define PTHREADCHECK(statement, name) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
return ncclSystemError; \
} \
} while (0)
// PTHREADCHECKGOTO(statement, name, RES, label): goto-flavoured pthread check.
// Evaluates `statement` exactly once; on a non-zero result it logs a warning
// with `name` (a string literal) and strerror() of the returned code, stores
// ncclSystemError into RES and jumps to `label`. RES is left untouched on success.
// The macro declares a local named `retval`, so `statement` must not refer to
// an outer variable of that name.
#define PTHREADCHECKGOTO(statement, name, RES, label) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0)
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
@@ -75,7 +90,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
@@ -84,7 +99,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
@@ -92,7 +107,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
@@ -101,7 +116,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)
// Propagate errors up
#define NCCLCHECK(call) do { \
@@ -111,7 +126,7 @@
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
} while (0)
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
@@ -120,7 +135,7 @@
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
} while (0)
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
@@ -130,7 +145,7 @@
return ncclInternalError; \
} \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
} while (!(cond))
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
@@ -140,7 +155,7 @@
goto label; \
} \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
} while (!(cond))
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
+486
Melihat File
@@ -72,4 +72,490 @@ struct ncclConnFifo {
ssize_t size;
void* ptr;
};
#include <stdio.h>
template<typename T>
class PatRSAlgorithm{
size_t offset;
size_t end;
size_t count;
int chunkCount;
int nelem;
int rank;
int nranks;
int nrPow2;
int postFreq;
int lastA;
int aggFactor;
int as; // aggregated steps
int a; // step inside aggregated step
int sendSkipped; // number of skipped steps during aggregation
int recvSkipped; // number of skipped steps during aggregation
int phase2recv; // receive offset for phase 2
int aggDelta;
int scale;
int phase;
// Returns the smaller of the two ints (b when they are equal).
__device__ __host__ int min(int a, int b) {
  if (a < b) return a;
  return b;
}
// Number of elements to handle in the current chunk: chunkCount, capped by
// what is left between offset and end.
// The comparison is done in size_t. Narrowing (end-offset) to int first would
// wrap for spans above INT_MAX and could yield a bogus (even zero or negative)
// element count.
__device__ __host__ int getNelem() {
  const size_t remaining = end - offset;
  // A non-positive chunkCount is returned unchanged, as it always was the
  // minimum against any non-negative remainder.
  if (chunkCount <= 0 || remaining >= (size_t)chunkCount) return chunkCount;
  // remaining < chunkCount <= INT_MAX here, so the cast cannot truncate.
  return (int)remaining;
}
// Walks the bit positions below `max` from least significant upwards while a
// weight walks from max/2 downwards; every bit that is CLEAR in `i` adds the
// current weight to the result. Returns 0 when max <= 1.
__device__ __host__ int mirrorInvert(int i, int max) {
  int sum = 0;
  int weight = max/2;
  for (int bit = 1; bit < max; bit <<= 1) {
    if (!(i & bit)) sum += weight;
    weight >>= 1;
  }
  return sum;
}
// Zero-based index of the lowest set bit of `i`, or `max` when no bit is set.
// Uses the device intrinsic when compiling device code and the compiler
// builtin otherwise.
__device__ __host__ int firstBitSet(int i, int max) {
#ifdef __CUDA_ARCH__
  const int pos = __ffs(i);
#else
  const int pos = __builtin_ffs(i);
#endif
  // Both variants report 0 for "no bit set" and a 1-based position otherwise.
  if (pos == 0) return max;
  return pos - 1;
}
// Restarts the inner (per aggregated step) counters: step index and both skip
// counters go back to zero, and lastA is recomputed from aggFactor, divided by
// 2*scale once phase has reached 2 or more.
__device__ __host__ void resetA() {
  a = 0;
  sendSkipped = 0;
  recvSkipped = 0;
  lastA = (phase >= 2) ? aggFactor / (2*scale) : aggFactor;
}
// Puts the state machine back at the beginning for the current offset:
// phase 0, scale 1, no phase-2 receives yet, aggregated step set to
// aggDelta-1, element count refreshed, then the inner counters are reset.
__device__ __host__ void reset() {
  phase = 0;
  scale = 1;
  phase2recv = 0;
  as = aggDelta - 1;
  nelem = getNelem();
  // Must come last: resetA() reads phase, scale and aggFactor.
  resetA();
}
// Population count of `i`, via the device intrinsic in device code and the
// compiler builtin otherwise.
__device__ __host__ int nBitsSet(int i) {
#ifdef __CUDA_ARCH__
  return __popc(i);
#else
  return __builtin_popcount(i);
#endif
}
// Returns 1 when the set bits of `i` form one contiguous run at the top of the
// pow2 range, 0 otherwise. With pow2 == 16 that is the case for 8, 12, 14, 15
// (and also for 0, where no bit is set at all).
// Why it works: flipping the low bits of such a value (XOR with pow2-1) leaves
// a number of the form 0..0111, i.e. a power of two minus one, so adding 1
// yields a value with exactly one bit set.
__device__ __host__ int newPeer(int i, int pow2) {
  const int flipped = i ^ (pow2-1);
  if (nBitsSet(flipped + 1) == 1) return 1;
  return 0;
}
public:
// Builds the algorithm state for one [offset, end) range.
//   stepSize   size of one step, compared against (end-offset)*sizeof(T)
//   stepDepth  halved repeatedly to allow further aggregation
//   offset/end element range covered by this instance
//   count, chunkCount, rank, nranks  stored as-is
// Derives nrPow2 (1 << log2Up(nranks)), the aggregation factor and delta, and
// the post frequency, then calls reset().
// NOTE(review): when end == offset the first loop condition divides by zero --
// confirm that callers never construct an instance with an empty range.
__device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
  offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
  nrPow2 = (1<<log2Up(nranks));
  aggDelta = nrPow2;
  aggFactor = 1;
  const size_t span = end-offset;
  // First round: keep doubling the aggregation while at least two aggregated
  // spans still fit into one step, up to nranks/2.
  while (stepSize / (span*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
    aggDelta /= 2;
    aggFactor *= 2;
  }
  // The post frequency is fixed to the factor reached by the first round only.
  postFreq = aggFactor;
  // Second round: one more doubling per halving of stepDepth, same upper bound.
  for (int depth = stepDepth; depth > 1 && aggFactor < nranks/2; depth /= 2) {
    aggFactor *= 2;
    aggDelta /= 2;
  }
  reset();
}
// Compute the next operation of the PatRSAlgorithm schedule and advance the
// internal state (phase, as, a, scale, skip counters, offset).
// All results are written through the reference parameters:
//   recvDim / sendDim       dimension index, or -1 (set together with an offset of -1;
//                           presumably meaning "no receive" / "no send" - confirm with the caller).
//   inpIx / outIx           element indices; inpIx is <peer rank> * count + offset,
//                           outIx is offset (0 in phase 0).
//   recvOffset / sendOffset (step % postFreq) * nelem, or -1.
//   sendStepOffset          0 or a step index divided by postFreq.
//   nelemOut                nelem of the current chunk.
//   postRecv / postSend     0/1 flags.
//   last                    1 once offset has reached end after the phase-4 step.
// Steps flagged with skip are not returned to the caller: the function jumps
// back to restart and evaluates the following step instead.
__device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
restart:
last = 0;
nelemOut = nelem;
outIx = offset;
int skip = 0;
//printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
if (phase == 0) {
// Phase 0: send-only step (recvDim = -1, sendDim = 0).
int s = mirrorInvert(a, lastA)*aggDelta + as;
// A distance at or beyond nranks has no matching rank: skip this step.
if (s >= nranks) skip = 1;
int sendDataRank = (rank + s) % nranks;
inpIx = sendDataRank * count + offset;
recvDim = -1;
sendDim = 0;
outIx = 0;
recvOffset = -1;
sendOffset = ((a - sendSkipped)%postFreq) * nelem;
sendStepOffset = 0;
// Post once postFreq non-skipped sends have accumulated, or on the last inner step.
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
postSend = 1;
} else {
postSend = 0;
}
postRecv = 0;
if (skip) sendSkipped++;
if (++a == lastA) {
phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
resetA();
}
if (skip == 0) return;
} else if (phase == 1) {
// Phase 1: receive on the lowest set bit of s, then forward on the next set bit (if any).
int s = mirrorInvert(a, lastA)*aggDelta + as;
if (s >= nranks) skip = 1;
recvDim = firstBitSet(s, nrPow2);
sendOffset = ((a - sendSkipped)%postFreq)*nelem;
recvOffset = ((a - recvSkipped)%postFreq)*nelem;
postSend = 0;
if (recvDim == 0) {
if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
sendStepOffset = 0;
} else {
sendStepOffset = (a - sendSkipped)/postFreq;
}
if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
postRecv = 1;
} else {
postRecv = 0;
}
// Clear the receive bit; what remains of s selects the data rank and the send dimension.
s -= (1<<recvDim);
int recvDataRank = (rank + nranks + s) % nranks;
inpIx = recvDataRank * count + offset;
sendDim = s ? firstBitSet(s, nrPow2) : -1;
if (sendDim == -1) {
sendOffset = -1;
sendStepOffset = 0;
} else if (as - (1<<recvDim) == 0) {
// newPeer() restarts the send offset numbering by re-basing sendSkipped at a.
if (newPeer(a, aggFactor)) sendSkipped = a;
int foffset = a - sendSkipped;
sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
sendOffset = (foffset%postFreq)*nelem;
}
// The original s was out of range but the remainder is not: keep the step, drop only the receive.
if (s < nranks && skip) {
recvDim = -1;
recvOffset = -1;
postRecv = 0;
skip = 0;
}
if (skip || recvDim == -1) recvSkipped++;
if (skip) sendSkipped++;
if (++a == lastA) {
as--;
phase = as % 2 == 1 ? 0 : 1;
resetA();
}
if (skip == 0) return;
} else if (phase == 2) {
// Phase 2: always followed by phase 3; receive offsets are numbered by phase2recv.
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
postRecv = 0;
if (s >= nranks) skip = 1;
recvDim = 0;
postSend = a == lastA-1 ? 1 : 0;
s -= 1;
if (s < nranks && skip) {
// s-1 is still in range: keep the step without a receive.
recvDim = -1;
recvOffset = -1;
skip = 0;
} else if (!skip) {
int foffset = phase2recv;
phase2recv++;
postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
recvOffset = (foffset%postFreq) * nelem;
}
int recvDataRank = (rank + nranks + s) % nranks;
inpIx = recvDataRank * count + offset;
sendDim = s ? firstBitSet(s, nrPow2) : -1;
int foffset = a - sendSkipped;
postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
sendStepOffset = 0;
sendOffset = (foffset%postFreq) * nelem;
if (skip || sendDim == -1) sendSkipped++;
if (++a == lastA) {
phase = 3;
resetA();
}
if (skip == 0) return;
} else if (phase == 3) {
// Phase 3: when its inner loop ends, scale doubles and the schedule returns to
// phase 2 while scale < aggFactor, otherwise it moves on to phase 4.
int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
postRecv = a == lastA-1 ? 1 : 0;
if (s >= nranks) skip = 1;
recvDim = firstBitSet(s, nrPow2);
postSend = 0;
s -= (1<<recvDim);
int foffset = a - recvSkipped;
postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
recvOffset = (foffset%postFreq) * nelem;
int recvDataRank = (rank + nranks + s) % nranks;
inpIx = recvDataRank * count + offset;
sendDim = s ? firstBitSet(s, nrPow2) : -1;
if (s < nranks && skip) {
recvDim = -1;
recvOffset = -1;
postRecv = 0;
skip = 0;
}
if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
foffset = a - sendSkipped;
sendStepOffset = foffset / postFreq; // Accumulate on next steps
sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
if (skip || recvDim == -1) recvSkipped++;
if (skip) sendSkipped++;
if (++a == lastA) {
scale *= 2;
phase = scale < aggFactor ? 2 : 4;
resetA();
}
if (skip == 0) return;
} else if (phase == 4) {
// Phase 4: final receive-only step for this rank's own slot (inpIx uses rank),
// then advance to the next chunk or report completion through last.
recvDim = 0;
sendDim = -1;
inpIx = rank * count + offset;
recvOffset = (phase2recv%postFreq) * nelem;
sendStepOffset = 0;
sendOffset = -1;
postRecv = 1;
postSend = 0;
offset += chunkCount;
if (offset >= end) {
last = 1;
} else {
reset();
}
return;
}
// Reached only when the current step was skipped: evaluate the next one.
goto restart;
}
};
// Step generator for the "PatAG" schedule. The constructor derives the
// aggregation parameters from the step size/depth and the rank count;
// getNextOp() then yields one operation per call until it reports last = 1.
// T is only used for sizeof(T) when sizing the aggregation factor.
template<typename T>
class PatAGAlgorithm{
  size_t offset;
  size_t end;
  size_t count;
  int chunkCount;
  int nelem;
  int rank;
  int nranks;
  int nrPow2;
  int postFreq;
  int lastA;
  int aggFactor;
  int as; // aggregated steps
  int a; // step inside aggregated step
  int aggDelta;
  int scale;
  int phase;

  // AS computation
  int asDim;
  int v;
  int bitCount[32];
  int bitZeroStep[32];

  // Smaller of a and b.
  __device__ __host__ int min(int a, int b) {
    return (a<b)?a:b;
  }

  // Number of elements in the current chunk: chunkCount, or what is left before end.
  __device__ __host__ int getNelem() {
    return min(chunkCount, end-offset);
  }

  // For max a power of two, reverse the low log2(max) bits of i.
  __device__ __host__ int mirror(int i, int max) {
    int ret = 0;
    for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
      if ((i&mask)) ret += imask;
    }
    return ret;
  }

  // Zero-based index of the lowest set bit of i, or max when i == 0.
  __device__ __host__ int firstBitSet(int i, int max) {
    int ffs =
#ifdef __CUDA_ARCH__
      __ffs(i);
#else
      __builtin_ffs(i);
#endif
    return ffs ? ffs-1 : max;
  }

  // Restart the inner step counter and recompute lastA
  // (aggFactor, divided by 2*scale once phase >= 2).
  __device__ __host__ void resetA() {
    a = 0;
    lastA = aggFactor;
    if (phase >= 2) lastA /= 2*scale;
  }

  // Re-initialise the per-chunk state. The schedule starts in phase 2 when
  // aggFactor >= 2 (scale != 0) and in phase 1 otherwise.
  __device__ __host__ void reset() {
    nelem = getNelem();
    scale = aggFactor/2;
    phase = scale ? 2 : 1;
    v = 0;
    for (int i = 0; i<asDim; i++) {
      bitCount[i] = asDim-i;
      bitZeroStep[i] = 1;
    }
    as = nextAs();
    resetA();
  }

  // Advance the per-bit countdowns and return the updated value v.
  // Bit d of v is toggled whenever bitCount[d] reaches zero; the countdown is
  // then reloaded with 1<<d, extended when the bit has just been cleared.
  __device__ __host__ int nextAs() {
    for (int d=0; d<asDim; d++) {
      int p = 1<<d;
      bitCount[d]--;
      if (bitCount[d] == 0) {
        v ^= p;
        bitCount[d] = p;
        if ((v&p) == 0) {
          bitCount[d] += firstBitSet(bitZeroStep[d], asDim) - 1;
          if (bitCount[d] == 0) {
            v ^= p;
            bitCount[d] = p;
          }
          bitZeroStep[d]++;
        }
      }
    }
    return v;
  }

public:
  // Build the schedule state for one channel range [offset, end).
  // aggFactor is doubled (aggDelta halved) first while stepSize is at least
  // twice channelSize*sizeof(T)*aggFactor, then once per halving of stepDepth;
  // both stages are capped by aggFactor < nranks/2. postFreq keeps the
  // stage-1 value.
  __device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
    offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
    aggDelta = nrPow2 = (1<<log2Up(nranks));

    aggFactor = 1;
    size_t channelSize = end-offset;
    while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
      aggFactor *= 2;
      aggDelta /= 2;
    }
    postFreq = aggFactor;
    int d = stepDepth;
    while (d > 1 && aggFactor < nranks/2) {
      d /= 2;
      aggFactor *= 2;
      aggDelta /= 2;
    }
    //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);

    asDim = log2Up(aggDelta);
    reset();
  }

  // Compute the next operation and advance the internal state.
  // Results are written through the reference parameters; a dimension or
  // offset of -1 is set when the corresponding side is not used in this step
  // (presumably "no send" / "no receive" - confirm with the caller).
  // last is set to 1 once offset has reached end. Steps flagged with skip are
  // not returned: the function jumps back to restart and tries the next step.
  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
restart:
    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
    last = 0;
    nelemOut = nelem;
    inpIx = offset;
    int skip = 0;
    if (phase == 0) {
      // Phase 0: receive-only step (sendDim = -1, recvDim = 0).
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
      // nextSkip: the following inner step would already be out of range.
      int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
      int recvDataRank = (rank + s) % nranks;
      outIx = recvDataRank * count + offset;
      sendDim = -1;
      recvDim = 0;
      inpIx = 0;
      sendOffset = -1;
      recvOffset = (a % postFreq) * nelem;
      recvStepOffset = 0;
      postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
      postSend = 0;
      a++;
      if (nextSkip) {
        as = nextAs();
        // The as sequence reaching aggDelta/2 ends the current chunk.
        if (as == aggDelta/2) {
          offset += chunkCount;
          if (offset >= end) {
            last = 1;
          } else {
            reset();
          }
          return;
        }
        phase = 1;
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 1) {
      // Phase 1: send on the lowest set bit of s, receive on the next set bit (if any).
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
      sendDim = firstBitSet(s, nrPow2);
      s -= (1<<sendDim);
      int sendDataRank = (rank + nranks + s) % nranks;
      outIx = sendDataRank * count + offset;
      recvDim = s ? firstBitSet(s, nrPow2) : -1;
      sendOffset = recvOffset = (a % postFreq) * nelem;
      postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
      postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
      recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
      if (recvDim == -1) {
        recvOffset = -1;
        postRecv = 0;
      } else if (as - (1<<sendDim) == 0) {
        int foffset = (a*aggDelta) >> (recvDim+1);
        recvOffset = (foffset%postFreq)*nelem;
        postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
        recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
      }
      if (s < nranks && sendDim == 0 && skip) {
        // Don't forget to receive at least once even if we don't send afterwards
        sendDim = -1;
        sendOffset = -1;
        postSend = 0;
        skip = 0;
      }
      if (++a == lastA) {
        if (as % 2 == 1) {
          phase = 0;
        } else {
          as = nextAs();
        }
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 2) {
      // Phase 2: scale is halved after each inner loop; phase 1 follows once it reaches 0.
      int s = (2*a+1)*scale*aggDelta;
      postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
      postRecv = 0;
      if (s >= nranks) skip = 1;
      sendDim = firstBitSet(s, nrPow2);
      s -= (1<<sendDim);
      sendOffset = (a%postFreq) * nelem;
      recvStepOffset = a / postFreq;
      int sendDataRank = (rank + nranks + s) % nranks;
      outIx = sendDataRank * count + offset;
      recvDim = s ? firstBitSet(s, nrPow2) : -1;
      if (recvDim == -1) {
        recvOffset = -1;
      } else {
        // Only clear the receive bit when there is one: shifting by
        // recvDim == -1 would be undefined behaviour.
        s -= (1<<recvDim);
        int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
        recvOffset = (foffset%postFreq)*nelem;
        recvStepOffset = foffset / postFreq;
      }
      if (++a == lastA) {
        scale /= 2;
        phase = scale ? 2 : 1;
        resetA();
      }
      if (skip == 0) return;
    }
    // Reached only when the current step was skipped: evaluate the next one.
    goto restart;
  }
};
#endif
+102 -36
Melihat File
@@ -18,6 +18,7 @@
#include "register.h"
#include "graph.h"
#include "nvmlwrap.h"
#include "profiler.h"
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define HIPRT_CB
@@ -110,6 +111,11 @@ struct ncclCommCallback {
struct ncclCommCallback* next;
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
// One entry of ncclComm::eventCallbackQueue: a CUDA event paired with the
// callback that ncclCommPollEventCallbacks() invokes after waiting on it.
struct ncclCommEventCallback {
// Intrusive link used by ncclIntruQueue<..., &ncclCommEventCallback::next>.
struct ncclCommEventCallback* next;
// Event waited on before fn is called.
cudaEvent_t event;
// Callback; receives the communicator and this entry.
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb);
};
struct ncclSharedResources {
int refCount;
@@ -179,6 +185,56 @@ struct ncclCollnetHandleList {
struct ncclProxyConnector* proxyconn;
};
// Description of one collective task. Instances are chained through next
// (see ncclKernelPlan::collTaskQueue, an ncclIntruQueue keyed on &ncclTaskColl::next).
struct ncclTaskColl {
struct ncclTaskColl* next;
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
ncclRedOp_t opHost;
struct ncclDevRedOpFull opDev;
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
// The four 8-bit fields below share a single 32-bit unit.
int32_t nMaxChannels:8;
int32_t nWarps:8;
int32_t algorithm:8, protocol:8;
// 1 + 1 + 30 bits share a second 32-bit unit.
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
enum ncclRegBufferType regBufType;
uint64_t opCount;
// number of elements in planner->ipcMemQueue associated with this collective
int nCleanupQueueElts;
void* sendMhandle;
void* recvMhandle;
// index for IPC record lookup
uintptr_t sendbuffOffset;
uintptr_t recvbuffOffset;
uintptr_t* sendbuffRmtAddrs;
uintptr_t* recvbuffRmtAddrs;
// Profiler plugin
// NOTE(review): presumably the activation bitmask returned by the profiler's init() - confirm.
int eActivationMask;
// NOTE(review): presumably the handle returned by the profiler's startEvent() - confirm.
void* eventHandle;
};
// Description of one point-to-point task. Instances are chained through next
// (see ncclKernelPlan::p2pTaskQueue, an ncclIntruQueue keyed on &ncclTaskP2p::next).
struct ncclTaskP2p {
struct ncclTaskP2p* next;
ncclFunc_t func;
void* buff;
size_t count;
ncclDataType_t datatype;
int root;
size_t bytes;
uint64_t opCount;
// Profiler plugin
// NOTE(review): presumably the activation bitmask returned by the profiler's init() - confirm.
int eActivationMask;
// NOTE(review): presumably the handle returned by the profiler's startEvent() - confirm.
void* eventHandle;
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
@@ -204,42 +260,12 @@ struct ncclKernelPlan {
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
void* workBufPersistent;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> p2pTaskQueue;
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
};
////////////////////////////////////////////////////////////////////////////////
// NOTE(review): this is a second definition of struct ncclTaskColl in this text.
// It lacks the sendbuffOffset/recvbuffOffset, *RmtAddrs and profiler fields of the
// definition placed before struct ncclKernelPlan, and looks like the pre-merge
// version of the struct - confirm which one is meant to remain.
struct ncclTaskColl {
struct ncclTaskColl* next;
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
ncclRedOp_t opHost;
struct ncclDevRedOpFull opDev;
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
int32_t nMaxChannels:8;
int32_t nWarps:8;
int32_t algorithm:8, protocol:8;
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
enum ncclRegBufferType regBufType;
// number of elements in planner->ipcMemQueue associated with this collective
int nCleanupQueueElts;
void* sendMhandle;
void* recvMhandle;
uint64_t opCount;
};
// NOTE(review): this is a second definition of struct ncclTaskP2p in this text.
// It has no func/count/datatype/root/eActivationMask/eventHandle members and
// carries groupEventHandle instead; it looks like interleaved pre-/post-merge
// lines - confirm which members belong to which struct.
struct ncclTaskP2p {
struct ncclTaskP2p* next;
void* buff;
size_t bytes;
uint64_t opCount;
// Profiler plugin
void* groupEventHandle;
};
////////////////////////////////////////////////////////////////////////////////
@@ -395,6 +421,7 @@ struct ncclPeerInfo {
// MNNVL support
nvmlGpuFabricInfoV_t fabricInfo;
int cuMemSupport;
int version;
};
struct ncclComm {
@@ -410,6 +437,8 @@ struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
struct ncclProxyConnector* gproxyConn;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
int netPluginLoaded;
ncclNet_t* ncclNet;
@@ -422,10 +451,12 @@ struct ncclComm {
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
bool runtimeConn; // if dynamic connection is supported
bool directMode;
int cuMemSupport;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
const char* commName;
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -488,7 +519,7 @@ struct ncclComm {
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
/* This attribute indicates the state of the communicator and the return code of
* asynchronous NCCL operations. */
ncclResult_t asyncResult;
// Flag to ask NCCL kernels to abort
@@ -537,7 +568,7 @@ struct ncclComm {
int collNetSupport;
bool collNetRegSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
int intraHighestTransportType;
bool intraNodeP2pSupport;
int* collNetHeads;
int collNetHeadsNum;
int* collNetDenseToUserRank;
@@ -552,6 +583,8 @@ struct ncclComm {
struct ncclNvlsSharedRes* nvlsResources;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclTaskColl;
struct ncclMemoryPool memPool_ncclTaskP2p;
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -566,6 +599,13 @@ struct ncclComm {
struct ncclKernelPlanner planner;
hipStream_t sideStream; // [RCCL] Cached non-captured stream
cudaMemPool_t memPool;
// Queue of events and associated callbacks for cleaning up asynchronous work.
// Using this is preferable to using CUDA host callbacks because host callbacks
// won't allow the work following the callback to run until the callback completes,
// which comes at expense to perf.
struct ncclIntruQueue<struct ncclCommEventCallback, &ncclCommEventCallback::next> eventCallbackQueue;
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
@@ -614,6 +654,11 @@ struct ncclComm {
int tunerPluginLoaded;
ncclTuner_t* tuner;
void *tunerContext;
// Profiler plugin
void* profilerContext;
uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
// buffer registration cache
struct ncclRegCache regCache;
uint64_t endMagic;
@@ -647,6 +692,27 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
return ncclSuccess;
}
// Drain comm->eventCallbackQueue in FIFO order: wait on the event at the head
// of the queue, dequeue the entry and run its callback.
// The thread's stream-capture mode is exchanged for cudaStreamCaptureModeRelaxed
// for the duration of the call and exchanged back before returning.
// Returns ncclSuccess when the queue was drained (or the head event reported
// cudaErrorNotReady); otherwise the error produced by the failing callback or
// by the CUDA event wait. An entry whose callback or event fails has already
// been removed from the queue.
// NOTE(review): cudaEventSynchronize() blocks until the event completes, so the
// cudaErrorNotReady branch looks unreachable; a non-blocking poll would use
// cudaEventQuery() - confirm the intended behaviour before changing it.
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  while (true) {
    struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
    if (cb == nullptr) break;
    cudaError_t ok = cudaEventSynchronize(cb->event);
    if (ok == cudaErrorNotReady) break;
    ncclIntruQueueDequeue(&comm->eventCallbackQueue);
    if (ok == cudaSuccess) {
      NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
    } else {
      CUDACHECKGOTO(ok, result, finish);
    }
  }
finish:
  // Put back the capture mode that was active on entry (mode now holds it).
  cudaThreadExchangeStreamCaptureMode(&mode);
  // Propagate the recorded failure instead of unconditionally reporting success.
  return result;
}
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
+2
Melihat File
@@ -13,6 +13,7 @@
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
extern int ncclCuMemHostEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
@@ -96,6 +97,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
+21 -8
Melihat File
@@ -54,9 +54,9 @@ struct ncclDevRedOpFull {
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
@@ -144,6 +144,8 @@ struct ncclConnInfo {
};
struct ncclProxyConnector {
bool initialized;
int rank;
int tpRank;
int tpLocalRank;
int sameProcess;
@@ -157,6 +159,8 @@ struct ncclConnector {
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
int sendMemSameProcess;
int recvMemSameProcess;
};
struct ncclRing {
@@ -247,6 +251,8 @@ struct alignas(16) ncclDevWorkP2p {
uint8_t sendProtoLL:1, recvProtoLL:1;
uint8_t sendRegistered:1, recvRegistered:1;
uint8_t sendIpcReg:1, recvIpcReg:1;
uint8_t sendConnIndex:2, recvConnIndex:2;
};
@@ -298,6 +304,10 @@ struct alignas(16) ncclDevWorkColl {
uint16_t pivotA2ANumBiRings;
void* recvbuff;
void* sendbuff;
uintptr_t sendbuffOffset;
uintptr_t recvbuffOffset;
uintptr_t* sendbuffRmtAddrs;
uintptr_t* recvbuffRmtAddrs;
union {
// Continuous-byte-distribution scheduling. The lo and hi channels are of
// different size than the channels in the middle.
@@ -319,9 +329,9 @@ struct alignas(16) ncclDevWorkColl {
// Grain size associated with a protocol id:
//   NCCL_PROTO_LL     -> 16
//   NCCL_PROTO_LL128  -> derived from the LL128 line layout constants
//   NCCL_PROTO_SIMPLE -> 512
// Any other value yields -1.
__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
  return proto == NCCL_PROTO_LL ? 16 :
         proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
         proto == NCCL_PROTO_SIMPLE ? 512 :
         -1;
}
template<typename Int>
@@ -367,7 +377,7 @@ enum ncclDevWorkType: uint8_t {
// Size in bytes of the device work struct matching the given work type:
// ncclDevWorkP2p, ncclDevWorkColl, or ncclDevWorkCollReg for any other type.
constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) {
  return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) :
         type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg);
}
#define NCCL_MAX_DEV_WORK_BATCH_BYTES 128
@@ -493,6 +503,7 @@ struct ncclDevComm {
int nNodes;
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
int isNvlink;
int p2pnChannelsPerPeer;
// Work fifo return credits
@@ -506,6 +517,8 @@ struct ncclDevComm {
// Channels, device side
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
int* rankToLocalRank;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
uint64_t* cpuTimestamp;
@@ -686,7 +699,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
row += (((algo * NCCL_NUM_PROTOCOLS + proto) * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * (algo * NCCL_NUM_PROTOCOLS + proto);
break;
}
row += (NCCL_NUM_ALGORITHMS - 4) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
row += (NCCL_NUM_ALGORITHMS - 5) * NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
// RING / SIMPLE / Sum / int8_t
if (coll == ncclFuncAllToAllPivot) break;
+4 -3
Melihat File
@@ -34,16 +34,17 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
#define MAX_XGMI_INTER_GPUS 4
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -82,7 +83,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4
int pattern;
int crossNic;
int collNet;
+2 -1
Melihat File
@@ -55,7 +55,7 @@ typedef enum {
ncclNumFuncs = 9
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
@@ -63,6 +63,7 @@ typedef enum {
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_ALGO_PAT 6
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
+150
Melihat File
@@ -0,0 +1,150 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#include <cstdint>
// Profiler event types. Each type is a distinct bit so that types can be
// combined into a bitmask (see the eActivationMask output of init()).
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileNumEvents = ( 6), // number of event types listed above (a count, not a bit)
};
// Event descriptor passed to the profiler's startEvent(). The common header
// (type, parentObj, rank) is followed by an anonymous union holding the
// type-specific attributes; presumably `type` selects the active member - confirm.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// Attributes of a host collective call event.
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
// Attributes of a host point-to-point call event.
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
// Attributes of a proxy operation event.
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// Attributes of a proxy step event.
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
// State transitions that can be reported through recordEventState().
// Enumerators take consecutive values starting at 0, in the order listed.
typedef enum {
// Proxy operation states, send side then receive side
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup,
ncclProfilerProxyCtrlAppend,
ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_v1_t;
// Optional attribute updates that accompany a state transition in
// recordEventState(). Only one member is valid at a time (union).
typedef union {
// Update for a proxy operation event.
struct {
size_t transSize;
int steps;
} proxyOp;
// Update for a proxy control event.
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 of the profiler plugin interface: a name plus the function
// pointers the plugin exposes. Every function returns an ncclResult_t.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
// - eState : event state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
// Unversioned aliases: each maps to the v1 type of the profiler interface.
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v1_t ncclProfiler_t;
#endif
+22 -19
Melihat File
@@ -17,32 +17,35 @@
#endif
// Define all NCCL-provided static schema IDs here (avoid duplicates).
#define NVTX_SID_CommInitRank 0
#define NVTX_SID_CommInitAll 1
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_AllGather 4
#define NVTX_SID_AllReduce 5
#define NVTX_SID_AllToAll 6
#define NVTX_SID_AllToAllv 7
#define NVTX_SID_Broadcast 8
#define NVTX_SID_Gather 9
#define NVTX_SID_MSCCL 10
#define NVTX_SID_ReduceScatter 11
#define NVTX_SID_Reduce 12
#define NVTX_SID_Scatter 13
#define NVTX_SID_Send 14
#define NVTX_SID_Recv 15
#define NVTX_SID_CommInitRank 0
#define NVTX_SID_CommInitAll 1
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_AllGather 4
#define NVTX_SID_AllReduce 5
#define NVTX_SID_AllToAll 6
#define NVTX_SID_AllToAllv 7
#define NVTX_SID_Broadcast 8
#define NVTX_SID_Gather 9
#define NVTX_SID_MSCCL 10
#define NVTX_SID_ReduceScatter 11
#define NVTX_SID_Reduce 12
#define NVTX_SID_Scatter 13
#define NVTX_SID_Send 14
#define NVTX_SID_Recv 15
#define NVTX_SID_CommInitRankConfig 16 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommSplit 18
// Define static schema ID for the reduction operation.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
struct nccl_domain{static constexpr char const* name{"NCCL"};};
class payload_schema {
public:
public:
explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
{
schema_attr.name = schemaName;
@@ -59,7 +62,7 @@ class payload_schema {
payload_schema(payload_schema&&) = default;
payload_schema& operator=(payload_schema&&) = default;
private:
private:
nvtxPayloadSchemaAttr_t schema_attr{
NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
+28 -3
Melihat File
@@ -34,11 +34,36 @@ typedef union {
// Legacy CUDA IPC
cudaIpcMemHandle_t devIpc;
// cuMem API support
ncclCuDesc cuDesc;
struct {
ncclCuDesc cuDesc;
CUmemGenericAllocationHandle memHandle;
};
} ncclIpcDesc;
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
// Kind of IPC buffer registration; passed as the `type` argument of
// ncclIpcLocalRegisterBuffer() and ncclIpcGraphRegisterBuffer().
enum ncclIpcRegType {
NCCL_IPC_SENDRECV = 0,
NCCL_IPC_COLLECTIVE = 1
};
// Import-side information of an IPC registration; embedded in ncclIpcRegInfo.
// NOTE(review): field meanings below are inferred from the names - confirm.
struct ncclIpcImpInfo {
void* rmtRegAddr;   // presumably the address of the imported remote registration
bool legacyIpcCap;  // presumably set when legacy CUDA IPC is used
uintptr_t offset;   // presumably the user buffer offset within the registration
};
// Record of one IPC buffer registration; this is the object handed to
// ncclIpcDeregBuffer().
// NOTE(review): field meanings below are inferred from the names - confirm.
struct ncclIpcRegInfo {
int peerRank;                            // presumably the peer the buffer is registered with
void* baseAddr;                          // presumably the base address of the registered buffer
struct ncclProxyConnector* ipcProxyconn; // proxy connector associated with the registration
struct ncclIpcImpInfo impInfo;           // import-side details
};
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
#endif
+38 -20
Melihat File
@@ -4,34 +4,52 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#ifndef PROFILER_H_
#define PROFILER_H_
#include "proxy.h"
#include <cuda_runtime.h>
#include "nccl_profiler.h"
enum ncclProxyProfileState {
ncclProxyProfileBegin = 0,
struct ncclProxyArgs;
struct ncclKernelPlan;
struct ncclTaskColl;
struct ncclTaskP2p;
struct ncclInfo;
struct ncclComm;
struct ncclProxyOp;
ncclProxyProfileSendGPUWait = 1,
ncclProxyProfileSendWait = 2,
// Plugin Init/Finalize Wrappers
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
ncclProxyProfileRecvWait = 1,
ncclProxyProfileRecvFlushWait = 2,
ncclProxyProfileRecvGPUWait = 3,
// Profiler Start/Stop Group Wrappers
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
ncclProxyProfileEnd = 4,
// Profiler Start/Stop Task Events Wrappers
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
ncclProxyProfileSleep = 8,
ncclProxyProfileWakeup = 9,
// Proxy Op Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclProxyProfileIdle = 16,
ncclProxyProfileActive = 17,
// Proxy Step Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclProxyProfileAppend = 24,
ncclProxyProfileAppendEnd = 25
};
// Proxy Control Start/Stop Events Wrappers
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
void ncclProfilingDump();
// Record Event Wrappers
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
// Profiler utility functions
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
#endif
+37 -4
Melihat File
@@ -15,7 +15,7 @@
#include "ipcsocket.h"
#include "nccl_net.h"
#include <pthread.h>
#include "shm.h"
#include "shmutils.h"
#include "p2p.h"
typedef enum : uint8_t {
@@ -30,6 +30,8 @@ typedef enum : uint8_t {
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternPatUp,
ncclPatternPatDown,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
@@ -79,6 +81,19 @@ struct ncclProxyOp {
union ncclProxyOpSpecifics specifics;
// Profiler plugin
union {
struct ncclTaskColl* coll;
struct ncclTaskP2p* p2p;
} task;
int eActivationMask;
void* taskEventHandle;
int rank;
int peer;
pid_t pid;
void* profilerContext;
struct ncclProxyOp *enqNext;
};
@@ -107,7 +122,15 @@ struct ncclProxySubArgs {
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
void* profilingEvents[NCCL_STEPS];
// Profiler plugin
int eActivationMask;
int rank;
void* taskEventHandle;
void* opEventHandle;
void* stepEventHandles[NCCL_STEPS];
size_t transSize;
void* recvRequestsCache[NCCL_STEPS];
int recvRequestsSubCount;
@@ -142,6 +165,10 @@ struct ncclProxyArgs {
int idle;
uint64_t hdp_flushed;
// Profiler plugin
pid_t pid;
void* profilerContext;
// Element linking
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
@@ -279,6 +306,7 @@ struct ncclProxyState {
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
uint32_t* abortFlag;
bool directMode;
// Service threads
pthread_t thread;
pthread_t threadUDS;
@@ -299,6 +327,9 @@ struct ncclProxyState {
// Progress thread
struct ncclProxyProgressState progressState;
// Profiler plugin
void* profilerContext;
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
};
@@ -350,8 +381,9 @@ enum ncclProxyMsgType {
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
ncclProxyMsgRegister = 10,
ncclProxyMsgDeregister = 11
ncclProxyMsgQueryFd = 10,
ncclProxyMsgRegister = 11,
ncclProxyMsgDeregister = 12
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@@ -365,6 +397,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
// UDS support
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
+11 -2
Melihat File
@@ -11,7 +11,13 @@ enum {
NVLS_REG_COMPLETE = 0x02,
NVLS_REG_POSSIBLE = 0x04,
NVLS_REG_NO_SUPPORT = 0x08,
COLLNET_REG_COMPLETE = 0x10
COLLNET_REG_COMPLETE = 0x10,
IPC_REG_COMPLETE = 0x20
};
// Pair of peer-address arrays kept for IPC buffer registration; struct ncclReg
// stores one instance in its `regIpcAddrs` field.
// NOTE(review): going by the field names, one array is meant to be reachable
// from the device and the other from the host — confirm against the
// registration code before relying on it.
struct ncclPeerRegIpcAddr {
uintptr_t* devPeerRmtAddrs;
uintptr_t* hostPeerRmtAddrs;
};
struct ncclReg {
@@ -34,7 +40,10 @@ struct ncclReg {
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
// collnet reg
void* collnetHandle;
struct ncclProxyConnector* proxyconn;
struct ncclProxyConnector* collnetProxyconn;
// general ipc reg
struct ncclPeerRegIpcAddr regIpcAddrs;
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
};
struct ncclRegCache {
+3 -1
Melihat File
@@ -15,7 +15,6 @@ typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, voi
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
#define CUPFN(symbol) pfn_##symbol
// Check CUDA PFN driver calls
@@ -68,6 +67,9 @@ DECLARE_ROCM_PFN_EXTERN(hsa_init);
DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
extern int ncclCuMemEnable();
extern int ncclCuMemHostEnable();
ncclResult_t rocmLibraryInit(void);
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
+29 -18
Melihat File
@@ -1,26 +1,37 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
#include "nccl.h"
#include "comm.h"
typedef void* ncclShmHandle_t;
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
struct ncclShmemCollBuff {
volatile size_t *cnt[2];
volatile void *ptr[2];
int round;
size_t maxTypeSize;
struct shmLegacyIpc {
char shmSuffix[7];
ncclShmHandle_t handle;
size_t shmSize;
};
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
// Handle-based description of a shareable buffer; it is the `shmci` alternative
// to struct shmLegacyIpc inside struct shmIpcDesc's union.
struct shmCuIpc {
// Two handle representations sharing the same storage.
// NOTE(review): which member is valid presumably depends on the handle type
// the allocation was exported with — confirm in the implementation.
union {
CUmemFabricHandle handle;
CUmemGenericAllocationHandle data;
};
// NOTE(review): presumably the same value as the `tpProxyRank` argument of
// ncclShmAllocateShareableBuffer — confirm.
int tpProxyRank;
void *ptr;
size_t size;
};
// Descriptor for a shareable buffer, aliased as ncclShmIpcDesc_t. It holds
// exactly one of two forms in an anonymous union: the legacy form (`shmli`)
// or the handle-based form (`shmci`).
// NOTE(review): `legacy` presumably records which union member is active
// (true -> shmli, false -> shmci) — confirm in the implementation.
struct shmIpcDesc {
union
{
struct shmLegacyIpc shmli;
struct shmCuIpc shmci;
};
bool legacy;
};
typedef struct shmIpcDesc ncclShmIpcDesc_t;
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
#endif
+26
Melihat File
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHMUTILS_H_
#define NCCL_SHMUTILS_H_
#include "nccl.h"
typedef void* ncclShmHandle_t;
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
// Buffer state handed to ncclShmemAllgather (its `shmem` argument).
// NOTE(review): the two-element `cnt`/`ptr` arrays together with `round` look
// like a two-slot alternation scheme, and `maxTypeSize` like an upper bound on
// the `typeSize` argument — neither is established by this header; confirm in
// the implementation.
struct ncclShmemCollBuff {
volatile size_t *cnt[2];
volatile void *ptr[2];
int round;
size_t maxTypeSize;
};
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
#endif
+7 -7
Melihat File
@@ -33,15 +33,15 @@ static double startTimes[8];
#define TIME_START(index) do { \
counts[index]++; \
startTimes[index] = gettime(); \
} while (0);
} while (0)
#define TIME_STOP(index) do { \
times[index] += gettime() - startTimes[index]; \
} while (0);
} while (0)
#define TIME_CANCEL(index) do { \
counts[index]--; \
} while (0);
} while (0)
#define TIME_PRINT(name) do { \
printf("%s stats", name); \
@@ -50,11 +50,11 @@ static double startTimes[8];
counts[i] = 0; \
} \
printf("\n"); \
} while (0);
} while (0)
#else
#define TIME_START(index) while(0);
#define TIME_STOP(index) while(0);
#define TIME_CANCEL(index) while(0);
#define TIME_START(index) do {} while(0)
#define TIME_STOP(index) do {} while(0)
#define TIME_CANCEL(index) do {} while(0)
#define TIME_PRINT(name)
#endif
#endif
+5 -4
Melihat File
@@ -36,7 +36,7 @@ struct ncclConnector;
struct ncclComm;
#define CHANNEL_MASK_OFFSET(nranks, connIndex) (nranks * (connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0))
#define CONNECT_SIZE 128
#define CONNECT_SIZE 256
struct ncclConnect {
char data[CONNECT_SIZE];
};
@@ -77,7 +77,6 @@ struct ncclCollNetSharedRes {
void* resources;
int nChannels;
size_t buffSize;
int intraHighestTransportType;
};
struct ncclTransportComm {
@@ -95,13 +94,14 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[8];
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*canConnect)(int*, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL, bool* needsProxy=NULL);
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -113,7 +113,7 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
@@ -122,6 +122,7 @@ ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConne
ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm);
ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
-1
Melihat File
@@ -27,7 +27,6 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id);
ncclResult_t getBusId(int cudaDev, int64_t *busId);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
ncclResult_t getRandomData(void* buffer, size_t bytes);
+312 -133
Melihat File
@@ -68,7 +68,7 @@
#endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "AllGather", "AllReduce", "AllToAllPivot", "Broadcast", "Reduce", "ReduceScatter", "SendRecv"};
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
const char* ncclDevRedOpStr[ncclNumDevRedOps] = { "Sum", "Prod", "MinMax", "PreMulSum", "SumPostDiv" };
const char *ncclTypeStr[ncclNumTypes] = {"_i8", "_u8", "_i32", "_u32", "_i64", "_u64", "_f16", "_f32", "_f64", "_b16"};
@@ -196,9 +196,15 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
struct ncclBootstrapHandle handle;
NCCLCHECK(bootstrapGetUniqueId(&handle));
// ncclUniqueId and bootstrapHandle don't have the same size and alignment
// reset to 0 to avoid undefined data
memset(out, 0, sizeof(*out));
// copy to avoid alignment mismatch
memcpy(out, &handle, sizeof(handle));
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
return res;
return ncclSuccess;
}
// Prevent compiler from optimizing out these operations
@@ -337,7 +343,7 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
}
static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
CUDACHECK(cudaFreeHost(dtor->obj));
NCCLCHECK(ncclCudaHostFree(dtor->obj));
return ncclSuccess;
}
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
@@ -370,13 +376,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
* free all intra-process communicators; therefore, we only need to focus on local
* resource cleanup in commFree(). */
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
pthread_join(comm->proxyState->thread, nullptr);
PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
if (comm->proxyState->threadUDS) {
// UDS support
pthread_join(comm->proxyState->threadUDS, nullptr);;
PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
}
}
CUDACHECK(cudaMemPoolDestroy(comm->memPool));
delete[] comm->userRedOps;
free(comm->connectSend);
@@ -469,12 +477,14 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->topParentRanks);
free(comm->topParentLocalRanks);
free(comm->gproxyConn);
NCCLCHECK(ncclRegCleanup(comm));
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
commPoison(comm); // poison comm before free to avoid comm reuse.
NCCLCHECK(ncclProfilerPluginFinalize(comm));
NCCLCHECK(ncclNetFinalize(comm));
NCCLCHECK(ncclNetPluginUnload(comm));
free(comm);
@@ -560,6 +570,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
NCCLCHECK(ncclNetPluginLoad(comm));
NCCLCHECK(ncclNetInit(comm));
NCCLCHECK(ncclProfilerPluginInit(comm));
INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
if (parent && parent->config.splitShare) {
@@ -657,8 +668,28 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
}
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
do {
cudaMemPoolProps props = {};
props.allocType = cudaMemAllocationTypePinned;
props.handleTypes = cudaMemHandleTypeNone;
props.location.type = cudaMemLocationTypeDevice;
props.location.id = comm->cudaDev;
CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
uint64_t releaseThreshold = ~uint64_t(0);
CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
} while (0);
ncclIntruQueueConstruct(&comm->eventCallbackQueue);
// setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
comm->intraComm0 = comm;
comm->intraRank = 0;
comm->intraRanks = 1;
return ncclSuccess;
}
@@ -672,12 +703,16 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
ncclCommPushCudaFree(comm, devCommAndChans);
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
comm->devComm = &devCommAndChans->comm;
tmpCommAndChans.comm.rank = comm->rank;
tmpCommAndChans.comm.nRanks = nRanks;
tmpCommAndChans.comm.node = comm->node;
tmpCommAndChans.comm.nNodes = comm->nNodes;
tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
tmpCommAndChans.comm.p2pnChannelsPerPeer = comm->p2pnChannelsPerPeer;
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
@@ -824,10 +859,13 @@ static void showVersion() {
}
}
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
info->rank = comm->rank;
info->cudaDev = comm->cudaDev;
info->nvmlDev = comm->nvmlDev;
NCCLCHECK(ncclGetVersion(&info->version));
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;
info->cuMemSupport = ncclCuMemEnable();
@@ -881,6 +919,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
}
#endif
@@ -1025,7 +1064,8 @@ static int checkMNNVL(struct ncclComm* comm) {
#define TIMER_INIT_TOPO 4
#define TIMER_INIT_GRAPHS 5
#define TIMER_INIT_CONNECT 6
#define TIMERS_INIT_COUNT 7
#define TIMER_INIT_ALLOC 7
#define TIMERS_INIT_COUNT 8
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
// We use 2 AllGathers
@@ -1041,7 +1081,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph };
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
struct graphInfo {
int pattern;
@@ -1074,7 +1114,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclProxyConnector proxyConn;
int* pxnPeers = NULL;
int *topParentLocalRanks = NULL;
int tpProxyRank;
bool needsProxy = false;
bool mscclNeedsProxy = needsProxy;
@@ -1087,6 +1126,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->cuMemSupport = 1;
for (int i = 0; i < nranks; i++) {
if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
ret = ncclInvalidUsage;
goto fail;
}
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
@@ -1266,7 +1311,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
collNetChainGraph->maxChannels = ringGraph->nChannels;
memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
collNetDirectGraph->id = 2;
collNetDirectGraph->id = 4;
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
collNetDirectGraph->collNet = 1;
collNetDirectGraph->minChannels = 1;
@@ -1509,18 +1554,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
comm->collNetSupport = 0;
}
comm->collNetRegSupport = true;
for (int n=0; n<comm->nNodes; n++) {
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
comm->collNetSupport = 0;
break;
}
if (comm->nodeRanks[n].localRanks > 1) {
// As long as there is more than 1 rank on any node, we need to disable collnet reg
comm->collNetRegSupport = false;
}
}
// As long as there is more than 1 rank on any node, we need to disable collnet reg
comm->collNetRegSupport = (comm->maxLocalRanks == 1);
}
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
@@ -1567,6 +1602,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
comm->topParentLocalRanks = topParentLocalRanks;
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail);
// Launch proxy service thread, after this, the proxy calls can be used.
if (parent && parent->config.splitShare) {
comm->proxyState = parent->sharedRes->proxyState;
@@ -1574,6 +1610,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
} else {
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
}
NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
timers[TIMER_INIT_CONNECT] = clockNano();
do { // Build p2p schedule
@@ -1661,6 +1698,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Connect Trees
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
// Connect PAT only for communicators with 1 GPU per node
if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
// Setup NVLS
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
@@ -1672,12 +1712,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
if (comm->collNetSupport > 0) {
ncclCollNetSetup(comm, parent, graphs);
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
}
}
// Connect to local net proxy
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN
@@ -1685,8 +1726,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int nranks;
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) {
tpProxyRank = comm->topParentRanks[pxnPeers[r]];
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
}
}
@@ -1791,17 +1831,20 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_COMMINIT_FUNCNAME_LEN 128
struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base;
struct ncclComm* comm;
struct ncclComm** newcomm;
int cudaDev;
// For ncclCommInitRank
int nranks, myrank;
ncclUniqueId commId;
int nranks, myrank, nId;
ncclUniqueId* commId;
// for ncclCommSplit
struct ncclComm* parent;
int color, key;
// name of the function calling
char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
};
struct ncclCommFinalizeAsyncJob {
@@ -1811,30 +1854,31 @@ struct ncclCommFinalizeAsyncJob {
NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
// One rank's split request. commGetSplitInfo allocates an array of these with
// one slot per parent rank, fills its own slot with its color and key, and
// exchanges the whole array with a single bootstrapAllGather whose element size
// is sizeof(commSplitInfo) — so the layout must be identical on every rank.
typedef struct{
int key;
int color;
} commSplitInfo;
static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
int* colors = NULL;
int* keys = NULL;
int nRanks = 0, myRank = 0;
ncclResult_t ret = ncclSuccess;
NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail);
commSplitInfo* info = NULL;
NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail);
// Compute nRanks, my rank and the ranks (of the original comm) before and after me
colors[parent->rank] = color;
keys[parent->rank] = key;
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail);
info[parent->rank].color = color;
info[parent->rank].key = key;
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail);
// Negative color does not create a new comm. Return now.
if (color == NCCL_SPLIT_NOCOLOR) goto exit;
memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);
for (int i = 0; i < parent->nRanks; i++) {
if (colors[i] != color) continue;
if (info[i].color != color) continue;
// Find where to insert this rank
int insert = 0;
while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++;
while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++;
// Shift ranks by one after insert
for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
// Insert our rank
@@ -1850,8 +1894,7 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par
*myRankRet = myRank;
exit:
free(colors);
free(keys);
free(info);
return ret;
fail:
goto exit;
@@ -1861,7 +1904,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
ncclComm_t comm = job->comm;
#ifdef ENABLE_MSCCLPP
ncclUniqueId origUniqueId = job->commId;
ncclUniqueId origUniqueId = *job->commId;
#endif
ncclResult_t res = ncclSuccess;
int archMajor, archMinor;
@@ -1869,7 +1912,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int cudaDev = job->cudaDev;
int* parentRanks = NULL;
int cudaArch;
uint64_t timers[TIMERS_INIT_COUNT];
double sum_timers = 0;
uint64_t timers[TIMERS_INIT_COUNT] = {0};
unsigned long long commIdHash;
int64_t stackSize;
hipDeviceProp_t devProp;
@@ -1903,31 +1948,40 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
}
timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
if (job->parent) {
NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color
ncclUniqueId tmpId;
memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
// debug info, no commId was used
commIdHash = 0;
} else {
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// obtain a unique hash using the first commId
comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
commIdHash = hashUniqueId(job->commId[0]);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
}
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
comm->cudaArch = cudaArch;
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
}
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
@@ -1938,10 +1992,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm->mscclppCompatible = true;
comm->mscclpp_threshold = job->parent->mscclpp_threshold;
comm->mscclpp_comm = job->parent->mscclpp_comm;
auto& mscclppUniqueId = mscclpp_uniqueIdMap[origUniqueId];
mscclpp_uniqueIdMap[job->commId] = mscclppUniqueId;
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(job->commId);
ncclCommToUniqueIdMap[comm] = job->commId;
auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];
mscclpp_uniqueIdMap[*job->commId] = mscclppUniqueId;
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(*job->commId);
ncclCommToUniqueIdMap[comm] = *job->commId;
}
}
else
@@ -1953,8 +2007,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950");
if (comm->mscclppCompatible) {
bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
bool mapContainsId = (mscclpp_uniqueIdMap.count(*job->commId) > 0);
auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];
if (comm->localRank == 0 && !mapContainsId) {
NCCLCHECKGOTO(mscclpp_ncclGetUniqueId(&mscclppUniqueId), res, fail);
TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(mscclppUniqueId));
@@ -1963,7 +2017,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &mscclppUniqueId, sizeof(mscclppUniqueId)), res, fail);
unsigned long long mscclppUniqueIdHash; (void)mscclppUniqueIdHash;
TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)hashUniqueId(mscclppUniqueId)));
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(job->commId);
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(*job->commId);
comm->mscclpp_threshold = rcclParamMscclppThreshold();
INFO(NCCL_INIT, "MSCCL++: Enabled! Msg size threshold=%zu", comm->mscclpp_threshold);
@@ -1971,12 +2025,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(mscclpp_ncclCommInitRank(&(comm->mscclpp_comm), job->nranks, mscclppUniqueId, job->myrank), res, fail);
TRACE_CALL("mscclpp_ncclCommInitRank (*comm=%p, nranks=%d, commId=hash:0x%llx, myrank=%d)", comm->mscclpp_comm, job->nranks, mscclppUniqueIdHash, job->myrank);
mscclpp_commToUniqueIdMap[comm->mscclpp_comm] = mscclppUniqueId;
ncclCommToUniqueIdMap[comm] = job->commId;
if (rcclParamMscclppForceEnabled()) {
comm->mscclppForceEnable = true;
} else {
comm->mscclppForceEnable = false;
}
ncclCommToUniqueIdMap[comm] = *job->commId;
if (rcclParamMscclppForceEnabled()) {
comm->mscclppForceEnable = true;
} else {
comm->mscclppForceEnable = false;
}
} else {
WARN("MSCCL++: Cannot enable MSCCL++ on %s architecture", devProp.gcnArchName);
}
@@ -2002,23 +2056,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
if (job->parent) {
/* unlink child abort flag. */
__atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)",
job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
} else {
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
// the name for the replay tool is ncclCommInitRank for all the variations
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
}
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx localSize %zi used %ld bytes on core %d - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId), maxLocalSizeBytes, allocTracker[comm->cudaDev].totalAllocSize, sched_getcpu());
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx localSize %zi used %ld bytes on core %d - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId), maxLocalSizeBytes, allocTracker[comm->cudaDev].totalAllocSize, sched_getcpu());
}
INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9,
timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9,
(timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9);
sum_timers = 0.0;
for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
sum_timers += (timers[it] / 1e9);
INFO(NCCL_INIT | NCCL_PROFILE,
"Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
"connections %.2f, rest %.2f)",
job->funcName, comm->rank, comm->nRanks,
timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
exit:
if (job->newcomm) {
/* assign it to user pointer. */
@@ -2203,17 +2259,24 @@ fail:
goto exit;
}
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) {
ncclResult_t res = ncclSuccess;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob *job = NULL;
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
}
// Destructor for a comm-init async job: releases the job's commId array
// first, then the job object itself.
static void ncclCommInitJobFree(void* _job) {
  struct ncclCommInitRankAsyncJob* initJob = (struct ncclCommInitRankAsyncJob*)_job;
  free(initJob->commId);
  free(initJob);
}
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) {
if (nId <= 0 || nId > nranks) {
WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks);
return ncclInvalidArgument;
}
ncclResult_t res = ncclSuccess;
const char* commIdEnv = NULL;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob* job = NULL;
// first call ncclInit, this will setup the environment
NCCLCHECKGOTO(ncclInit(), res, fail);
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, showVersion);
@@ -2241,19 +2304,37 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
*newcomm = comm;
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->nId = nId;
job->comm = comm;
job->nranks = nranks;
job->commId = commId; // C++ struct assignment
job->myrank = myrank;
job->cudaDev = cudaDev;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName);
// need to copy the commIds to allow async commInit and to avoid alignment issues when casting from ncclUniqueId to ncclBootstrapHandle
// ncclUniqueIds and ncclBootstrapHandle don't have the same alignment requirements.
// Therefore the array of Ids coming from the user might not be properly aligned to be cast into a ncclBootstrapHandle
// copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue
NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
commIdEnv = ncclGetEnv("NCCL_COMM_ID");
if (commIdEnv && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv);
if (nId > 1) {
INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId");
job->nId = 1;
}
// start the bootstrap root before bootstrapping, use only the first handle
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
}
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
exit:
return ncclGroupErrCheck(res);
fail:
if (comm) {
free(comm->abortFlag);
if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev);
if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
free(comm->abortFlagRefCount);
free(comm);
}
@@ -2285,7 +2366,7 @@ ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
return ncclSuccess;
}
@@ -2295,6 +2376,7 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
int totalnDev;
int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
int oldDev = 0;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@@ -2304,6 +2386,7 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
rocmLibraryInit();
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
if (ndev < 0) {
WARN("Invalid device count requested : %d", ndev);
@@ -2317,7 +2400,8 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
for (int i = 0; i < ndev; ++i) {
/* invalid device check. */
if (devlist[i] < 0 || devlist[i] >= totalnDev) {
ret = ncclUnhandledCudaError;
WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev);
ret = ncclInvalidArgument;
goto fail;
}
@@ -2338,13 +2422,18 @@ ncclResult_t ncclCommInitAll_impl(ncclComm_t* comms, int ndev, const int* devlis
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
int dev = devlist ? devlist[i] : i;
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
}
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
fail:
exit:
cudaSetDevice(oldDev);
free(gpuFlags);
return ret;
fail:
goto exit;
}
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
@@ -2359,7 +2448,6 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
@@ -2367,13 +2455,46 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
NCCLCHECK(ncclGroupStartInternal());
rocmLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
goto exit;
}
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr = NULL;
NCCLCHECK(ncclGroupStartInternal());
rocmLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
@@ -2400,13 +2521,25 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
if (comm->initState == ncclSuccess) {
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail);
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret);
}
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
}
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
// And keep polling until all graphs referencing us die.
while (comm->persistentRefs != 0) {
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
}
while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue);
if (cb->fn(comm, cb) != ncclSuccess) {
WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb);
}
}
}
if ((ret = ncclProxyStop(comm)) != ncclSuccess) {
@@ -2484,14 +2617,15 @@ ncclResult_t ncclCommFinalize_impl(ncclComm_t comm) {
/* launch async thread to finalize comm. */
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
return ret;
fail:
free(job);
if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
goto exit;
}
@@ -2538,13 +2672,15 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
nextIntraComm = nextIntraComm->intraNext;
if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
// We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK.
// coverity[pass_freed_arg]
WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
}
}
}
}
return ret;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
@@ -2596,12 +2732,11 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
NCCLCHECK(ncclCommEnsureReady(comm));
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return res;
fail:
free(job);
goto exit;
}
@@ -2612,15 +2747,6 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
return ncclSuccess;
}
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
// Ask anything that might still be running on the device to quit
if (comm->childAbortFlag != nullptr) {
__atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
@@ -2631,30 +2757,61 @@ ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
comm->destroyFlag = 1;
/* init thread must be joined before we destroy the comm,
* and we should ignore the init error here. */
ncclCommEnsureReady(comm);
(void)ncclCommEnsureReady(comm);
// once the comm is ready, we can access ranks etc
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return ncclSuccess;
fail:
free(job);
goto exit;
}
// Payload recorded with the CommSplit NVTX range. ncclCommSplit_impl fills it
// from the parent communicator (rank, nRanks, cudaDev) and the split
// arguments (color, key), in exactly this member order.
struct NvtxParamsCommSplit {
int rank;
int nranks;
int cudaDev;
int color;
int key;
};
// NVTX payload schema describing NvtxParamsCommSplit, one entry per member.
// Every entry after the first gives the member's position with offsetof, so
// this table must be kept in sync with the struct above.
// NOTE(review): the first entry gives no explicit offset; presumably the
// omitted trailing fields default to 0, which is where `rank` sits. Confirm
// against the nvtxPayloadSchemaEntry_t definition.
constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
};
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
struct ncclCommInitRankAsyncJob *job = NULL;
struct ncclComm* childComm = NCCL_COMM_NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
int oldDev;
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECK(ncclGroupStartInternal());
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail);
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
*newcomm = NCCL_COMM_NULL;
if (color == NCCL_SPLIT_NOCOLOR) {
@@ -2694,10 +2851,12 @@ ncclResult_t ncclCommSplit_impl(ncclComm_t comm, int color, int key, ncclComm_t
job->color = color;
job->key = key;
job->cudaDev = comm->cudaDev;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
exit:
ncclGroupErrCheck(res);
cudaSetDevice(oldDev);
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
return res;
fail:
@@ -2800,7 +2959,7 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
int flag;
int dcnt;
int mcSupport = 0;
@@ -2814,12 +2973,18 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
if (mcSupport) {
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
// Query device to see if FABRIC handle support is available
flag = 0;
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleTypes = ncclCuMemHandleType;
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
memprop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
flag = 0;
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
@@ -2828,14 +2993,25 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
mcprop.size = size;
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
mcprop.numDevices = dcnt;
mcprop.handleTypes = ncclCuMemHandleType;
mcprop.handleTypes = requestedHandleTypes;
mcprop.flags = 0;
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
/* only size needs to be aligned to mcGran */
ALIGN_SIZE(size, mcGran);
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
/* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
}
} else {
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
}
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
/* Map the virtual address range to the physical allocation */
@@ -2855,6 +3031,9 @@ ncclResult_t ncclMemAlloc_impl(void **ptr, size_t size) {
fallback:
#endif
// Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
// we want CUDA to return an error to the caller.
// coverity[var_deref_model]
CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
exit:
@@ -2893,7 +3072,7 @@ fallback:
CUDACHECKGOTO(cudaFree(ptr), ret, fail);
exit:
cudaSetDevice(saveDevice);
CUDACHECK(cudaSetDevice(saveDevice));
return ret;
fail:
goto exit;
+4
Melihat File
@@ -53,6 +53,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
return ncclInvalidArgument;
}
// ncclMaxRedOp < info->op will always be false due to the sizes of
// the datatypes involved, and that's by design. We keep the check though
// just as a reminder.
// coverity[result_independent_of_operands]
if (info->op < 0 || ncclMaxRedOp < info->op) {
WARN("%s : invalid reduction operation %d", info->opName, info->op);
return ncclInvalidArgument;
+26 -2
Melihat File
@@ -11,7 +11,7 @@
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
// Handle type used for cuMemCreate()
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
@@ -49,6 +49,14 @@ int ncclCuMemEnable() {
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
}
// Reports whether the cuMem host-allocation path is enabled.
// When built against CUDART_VERSION >= 12020 the answer comes from the
// CuMemHostEnable parameter; older toolkits always report 0.
int ncclCuMemHostEnable() {
#if CUDART_VERSION >= 12020
  return ncclParamCuMemHostEnable();
#else
  return 0;
#endif
}
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
@@ -81,6 +89,7 @@ DECLARE_CUDA_PFN(cuMemRelease);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN(cuMemSetAccess);
DECLARE_CUDA_PFN(cuMemUnmap);
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
/* ncclMemAlloc/Free */
DECLARE_CUDA_PFN(cuPointerGetAttribute);
#if CUDA_VERSION >= 11070
@@ -107,7 +116,7 @@ bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 12000
#define LOAD_SYM(symbol, ignore) do { \
cudaDriverEntryPointQueryResult driverStatus; \
cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
if (!ignore) { \
@@ -157,6 +166,7 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemRetainAllocationHandle, 1);
LOAD_SYM(cuMemSetAccess, 1);
LOAD_SYM(cuMemUnmap, 1);
LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
/* ncclMemAlloc/Free */
LOAD_SYM(cuPointerGetAttribute, 1);
#if CUDA_VERSION >= 11070
@@ -208,6 +218,20 @@ static void initOnceFunc() {
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
/* To use cuMem* for host memory allocation, we need to create a context on each
* visible device. This is a workaround needed in CUDA 12.3, which is fixed in 12.4. */
if (ncclCuMemSupported && ncclCuMemHostEnable()) {
int deviceCnt, saveDevice;
cudaGetDevice(&saveDevice);
cudaGetDeviceCount(&deviceCnt);
for (int i = 0; i < deviceCnt; ++i) {
cudaSetDevice(i);
cudaFree(NULL);
}
cudaSetDevice(saveDevice);
}
#endif
initResult = ret;
return;
error:
+10 -13
Melihat File
@@ -41,6 +41,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
close(fd);
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
@@ -66,7 +67,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl");
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
@@ -186,20 +187,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
if (sendFd != -1) {
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
}
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
+4
Melihat File
@@ -102,6 +102,10 @@ ncclResult_t ncclNvmlEnsureInitialized() {
for(Symbol sym: symbols) {
*sym.ppfn = dlsym(libhandle, sym.name);
}
// Coverity complains that we never dlclose this object, but that's
// deliberate, since we want the loaded object to remain in memory until
// the process terminates, so that we can use its code.
// coverity[leaked_storage]
}
#endif
+19 -9
Melihat File
@@ -37,7 +37,7 @@ void setEnvFile(const char* fileName) {
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1023,s));
envVar[s] = '\0';
envVar[std::min(1023,s)] = '\0';
s++;
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
@@ -48,17 +48,28 @@ void setEnvFile(const char* fileName) {
fclose(file);
}
void initEnv() {
static void initEnvFunc() {
char confFilePath[1024];
const char * userDir = userHomeDir();
if (userDir) {
sprintf(confFilePath, "%s/.rccl.conf", userDir);
const char* userFile = getenv("NCCL_CONF_FILE");
if (userFile && strlen(userFile) > 0) {
snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
setEnvFile(confFilePath);
} else {
const char* userDir = userHomeDir();
if (userDir) {
snprintf(confFilePath, sizeof(confFilePath), "%s/.rccl.conf", userDir);
setEnvFile(confFilePath);
}
}
sprintf(confFilePath, "/etc/rccl.conf");
snprintf(confFilePath, sizeof(confFilePath), "/etc/rccl.conf");
setEnvFile(confFilePath);
}
void initEnv() {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnvFunc);
}
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&mutex);
@@ -80,8 +91,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
pthread_mutex_unlock(&mutex);
}
const char *ncclGetEnv(const char *name) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnv);
const char* ncclGetEnv(const char* name) {
initEnv();
return getenv(name);
}
+502 -93
Melihat File
@@ -1,115 +1,524 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "param.h"
#include "checks.h"
#include "comm.h"
#include "enqueue.h"
#include "utils.h"
#include "proxy.h"
#include "profiler.h"
//#define PROFILE_PROXY 1
#ifdef PROFILE_PROXY
#include "timer.h"
#include "alloc.h"
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
static int profilerPluginRefCount;
static void* profilerPluginLib;
static ncclProfiler_t* ncclProfiler;
static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
struct ncclProxyProfileEvent {
double timestamp[6];
uint64_t opCount;
int peer;
int step;
uint16_t channel;
uint8_t type; // send / recv
uint8_t opIndex;
};
#define MAX_STR_LEN 256
#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"
struct ncclProxyProfileEvent* profilingEvents = NULL;
int profilingIndex = 0;
double profilingStart = 0;
#define MAX_EVENTS 200000
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
if (profilingEvents == NULL) {
NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
profilingStart = gettime();
static void* tryOpenLib(char* name, int *err, char* errStr) {
if (nullptr == name || strlen(name) == 0) {
return nullptr;
}
struct ncclProxyProfileEvent* event = NULL;
if (state%8 == 0) {
if (profilingIndex == MAX_EVENTS) return ncclSuccess;
args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
if (state == ncclProxyProfileBegin) {
// Proxy operation information
event->opCount = args->opCount;
event->channel = args->subs[sub].channelId;
event->peer = args->subs[sub].peer;
event->type = args->pattern;
event->step = step;
event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
} else event->peer = -state;
if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
name = nullptr;
}
void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = 0;
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
}
return handle;
}
// Post-processes a failed tryOpenLib() attempt.
//
// When the failure was "file not found" (openErr == ENOENT) the library name is
// appended, preceded by a space, to the list of names that could not be found.
// Any other failure is logged with the dlerror() text in openErrStr.
//
//   openErr      error code reported by tryOpenLib()
//   openErrStr   dlerror() text captured by tryOpenLib()
//   nameList     current write position inside the "could not find" buffer
//   nameListLen  in/out: bytes still available at nameList
//   name         library name that was tried
//
// Returns the new write position inside the buffer (unchanged when nothing was appended).
static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
  if (openErr == ENOENT) {
    if (*nameListLen > 0) {
      int written = snprintf(nameList, *nameListLen, " %s", name);
      if (written < 0) written = 0;
      // snprintf returns the length it wanted to write; on truncation only
      // *nameListLen - 1 characters (plus the terminator) were actually stored.
      // Clamp so the cursor never leaves the buffer and the remaining length
      // never becomes negative (which would turn into a huge size_t next call).
      if (written >= *nameListLen) written = *nameListLen - 1;
      nameList += written;
      *nameListLen -= written;
    }
    return nameList;
  }
  INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
  return nameList;
}
static void* openProfilerPluginLib(char* couldNotFindNames, int len) {
int openErr;
void *pluginLib;
char profilerPluginLibName[PATH_MAX];
char openErrStr[MAX_STR_LEN + 1] = { 0 };
const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
if (envProfilerPluginName && strlen(envProfilerPluginName)) {
snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
} else {
event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
}
// Timestamp
event->timestamp[state%8] = gettime()-profilingStart;
return nullptr;
}
// Values stored in profilerPluginStatus.
enum {
profilerPluginLoadFailed = -1, // a load attempt failed; ncclProfilerPluginLoad() returns without retrying
profilerPluginLoadReady = 0, // no plugin loaded (initial state, and state after the last unload)
profilerPluginLoadSuccess = 1, // library opened and plugin symbol resolved
};
static int profilerPluginStatus = profilerPluginLoadReady;
// pid of the process that loaded the plugin; copied into proxy ops by ncclProfilerAddPidToProxyOp().
static pid_t pid;
// Multiplier for the buffer collecting library names that could not be found (MAX_PLUGIN_LOAD * PATH_MAX bytes).
#define MAX_PLUGIN_LOAD 2
// Loads the profiler plugin library on first use and takes a reference on it.
//
// The load status is only examined while profilerLock is held, so concurrent
// callers cannot race with a load or unload in progress:
//   - a previous failure is sticky: nothing is retried and no reference is taken;
//   - an already loaded plugin only gets its reference count incremented;
//   - otherwise the library is opened, NCCL_PROFILER_PLUGIN_SYMBOL is resolved,
//     and the pid of the loading process is recorded.
//
// Always returns ncclSuccess: a missing or broken plugin just leaves the
// profiler disabled (ncclProfiler stays NULL).
static ncclResult_t ncclProfilerPluginLoad(void) {
  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
  pthread_mutex_lock(&profilerLock);
  if (profilerPluginLoadFailed == profilerPluginStatus) {
    goto exit;
  }
  if (profilerPluginLoadSuccess == profilerPluginStatus) {
    ++profilerPluginRefCount;
    goto exit;
  }

  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
  if (profilerPluginLib == nullptr) {
    if (strlen(couldNotFindNames)) {
      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
    }
    goto fail;
  }

  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
  if (ncclProfiler == nullptr) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
    goto fail;
  }

  ++profilerPluginRefCount;
  profilerPluginStatus = profilerPluginLoadSuccess;

  // Store the pid of the process loading the profiler.
  // This is attached to the proxyOp event descriptor
  // so the plugin can figure out if the parent event
  // is in the same address space or not
  pid = getpid();

exit:
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
fail:
  if (profilerPluginLib) {
    dlclose(profilerPluginLib);
    // Do not keep a handle to a library that has just been closed.
    profilerPluginLib = nullptr;
  }
  profilerPluginStatus = profilerPluginLoadFailed;
  goto exit;
}
// Drops one reference on the profiler plugin and closes the library when the
// last reference goes away, returning the load status to "ready".
//
// Two situations are tolerated explicitly:
//   - no reference is held (e.g. the load failed earlier): the count is left
//     at zero instead of going negative;
//   - ncclProfiler was reset to NULL after the library was loaded (the plugin
//     init path does this when the plugin's init callback fails): the library
//     is still closed, but the plugin name is not dereferenced.
//
// Always returns ncclSuccess.
static ncclResult_t ncclProfilerPluginUnload(void) {
  pthread_mutex_lock(&profilerLock);
  if (profilerPluginRefCount > 0 && 0 == (--profilerPluginRefCount)) {
    INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler ? ncclProfiler->name : "(disabled)");
    if (profilerPluginLib) dlclose(profilerPluginLib);
    profilerPluginLib = nullptr;
    ncclProfiler = nullptr;
    profilerPluginStatus = profilerPluginLoadReady;
  }
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
}
void ncclProfilingDump() {
static int dumpDone = 0;
if (dumpDone) return;
dumpDone = 1;
const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
if (!str) { free(profilingEvents); return; }
FILE* f = fopen(str, "w");
fprintf(f, "[\n");
#define ENABLE_TIMER 0
#include "timer.h"
for (int i=0; i<profilingIndex; i++) {
struct ncclProxyProfileEvent* e = profilingEvents+i;
const int sendrecv = e->peer >= 0;
const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
profilingEventStr[-(e->peer/8)];
// Self-timing of the profiler entry points, compiled in only when ENABLE_TIMER is non-zero.
// Every timed event <e> has a call counter <e>Count and a pair <e>Ts:
// <e>Ts[0] holds the start time of the call in progress, <e>Ts[1] the accumulated elapsed time.
#if ENABLE_TIMER
static int64_t elapsedCount;
static int64_t initCount, finalizeCount;
static int64_t groupStartCount, groupStopCount;
static int64_t taskStartCount, taskStopCount;
static int64_t proxyOpStartCount, proxyOpStopCount;
static int64_t proxyStepStartCount, proxyStepStopCount;
static int64_t proxyCtrlStartCount, proxyCtrlStopCount;
static int64_t proxyOpRecordCount, proxyStepRecordCount, proxyCtrlRecordCount;
static double elapsedTs[2];
static double initTs[2], finalizeTs[2];
static double groupStartTs[2], groupStopTs[2];
static double taskStartTs[2], taskStopTs[2];
static double proxyOpStartTs[2], proxyOpStopTs[2];
static double proxyStepStartTs[2], proxyStepStopTs[2];
static double proxyCtrlStartTs[2], proxyCtrlStopTs[2];
static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2];
// Counts one more occurrence of `event` and remembers its start time.
#define TIME_START_EVENT(event) do { \
(event ## Count)++; \
(event ## Ts)[0] = gettime(); \
} while(0)
// Adds the time elapsed since the matching TIME_START_EVENT to the event's running total.
#define TIME_STOP_EVENT(event) do { \
double val = gettime() - (event ## Ts)[0]; \
(event ## Ts)[1] += val; \
} while(0)
// Prints, for every event that occurred at least once, "total/count = average" on one line prefixed by `name`.
#define TIME_PRINT_EVENTS(name) do { \
printf("%s ", name); \
if (elapsedCount) printf("[elapsed] %g/%ld = %g ", elapsedTs[1], elapsedCount, elapsedTs[1]/elapsedCount); \
if (initCount) printf("[init] %g/%ld = %g ", initTs[1], initCount, initTs[1]/initCount); \
if (finalizeCount) printf("[finalize] %g/%ld = %g ", finalizeTs[1], finalizeCount, finalizeTs[1]/finalizeCount); \
if (groupStartCount) printf("[groupStart] %g/%ld = %g ", groupStartTs[1], groupStartCount, groupStartTs[1]/groupStartCount); \
if (groupStopCount) printf("[groupStop] %g/%ld = %g ", groupStopTs[1], groupStopCount, groupStopTs[1]/groupStopCount); \
if (taskStartCount) printf("[taskStart] %g/%ld = %g ", taskStartTs[1], taskStartCount, taskStartTs[1]/taskStartCount); \
if (taskStopCount) printf("[taskStop] %g/%ld = %g ", taskStopTs[1], taskStopCount, taskStopTs[1]/taskStopCount); \
if (proxyOpStartCount) printf("[proxyOpStart] %g/%ld = %g ", proxyOpStartTs[1], proxyOpStartCount, proxyOpStartTs[1]/proxyOpStartCount); \
if (proxyOpStopCount) printf("[proxyOpStop] %g/%ld = %g ", proxyOpStopTs[1], proxyOpStopCount, proxyOpStopTs[1]/proxyOpStopCount); \
if (proxyStepStartCount) printf("[proxyStepStart] %g/%ld = %g ", proxyStepStartTs[1], proxyStepStartCount, proxyStepStartTs[1]/proxyStepStartCount); \
if (proxyStepStopCount) printf("[proxyStepStop] %g/%ld = %g ", proxyStepStopTs[1], proxyStepStopCount, proxyStepStopTs[1]/proxyStepStopCount); \
if (proxyCtrlStartCount) printf("[proxyCtrlStart] %g/%ld = %g ", proxyCtrlStartTs[1], proxyCtrlStartCount, proxyCtrlStartTs[1]/proxyCtrlStartCount); \
if (proxyCtrlStopCount) printf("[proxyCtrlStop] %g/%ld = %g ", proxyCtrlStopTs[1], proxyCtrlStopCount, proxyCtrlStopTs[1]/proxyCtrlStopCount); \
if (proxyOpRecordCount) printf("[proxyOpRecord] %g/%ld = %g ", proxyOpRecordTs[1], proxyOpRecordCount, proxyOpRecordTs[1]/proxyOpRecordCount); \
if (proxyStepRecordCount) printf("[proxyStepRecord] %g/%ld = %g ", proxyStepRecordTs[1], proxyStepRecordCount, proxyStepRecordTs[1]/proxyStepRecordCount); \
if (proxyCtrlRecordCount) printf("[proxyCtrlRecord] %g/%ld = %g", proxyCtrlRecordTs[1], proxyCtrlRecordCount, proxyCtrlRecordTs[1]/proxyCtrlRecordCount); \
printf("\n"); \
} while(0)
#else
// Timing disabled: the macros expand to empty statements.
#define TIME_START_EVENT(event) do {} while(0)
#define TIME_STOP_EVENT(event) do {} while(0)
#define TIME_PRINT_EVENTS(name) do {} while(0)
#endif
if (sendrecv) {
int state = ncclProxyProfileBegin;
const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
static int eActivationMask; // Set by profiler
static int eActivationMaskGroup; // Cached for current group
while (state<ncclProxyProfileEnd) {
if (e->timestamp[state]) {
const char* name = stateStr[state];
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
name, i, e->channel, e->timestamp[state]);
state++;
while (e->timestamp[state] == 0) state++;
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
name, i, e->channel, e->timestamp[state]);
}
}
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
} else {
if (e->peer == -ncclProxyProfileAppend) {
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
typeStr, i, e->timestamp[0], e->opCount);
} else {
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
typeStr, i, e->timestamp[0]);
}
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
typeStr, i, e->timestamp[1]);
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
TIME_START_EVENT(elapsed);
TIME_START_EVENT(init);
ncclProfilerPluginLoad();
if (__builtin_expect(ncclProfiler != NULL, 0)) {
int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);
if (err) {
WARN("Profiler init failed with error (%d). Continue without profiler.", err);
ncclProfiler = NULL;
}
}
fprintf(f, "{} ]\n");
fclose(f);
free(profilingEvents);
TIME_STOP_EVENT(init);
return ncclSuccess;
}
// Finalizes the profiler for `comm`: invokes the plugin's finalize callback on
// the communicator's profiler context when a plugin is active, then releases
// this communicator's reference on the plugin library. Also closes the
// "finalize" and "elapsed" self-timers and prints the timing summary.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {
  TIME_START_EVENT(finalize);

  ncclProfiler_t* const plugin = ncclProfiler;
  if (__builtin_expect(plugin != NULL, 0)) {
    plugin->finalize(comm->profilerContext);
  }
  // The reference is dropped whether or not a plugin was active.
  ncclProfilerPluginUnload();

  TIME_STOP_EVENT(finalize);
  TIME_STOP_EVENT(elapsed);
  TIME_PRINT_EVENTS("Profiler");
  return ncclSuccess;
}
// Starts the group event for a kernel plan.
// The plugin-controlled activation mask is sampled once here and cached in
// eActivationMaskGroup, which the task start/stop functions read afterwards.
// A group event is opened only when a plugin is active and the cached mask
// enables at least one of: collectives, p2p, proxy ops, proxy steps.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStart);
  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
  const int groupKinds = ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep;
  if (__builtin_expect(ncclProfiler != NULL, 0) && (eActivationMaskGroup & groupKinds)) {
    ncclProfilerEventDescr_v1_t groupDescr = { 0 };
    groupDescr.type = ncclProfileGroup;
    ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &groupDescr);
  }
  TIME_STOP_EVENT(groupStart);
  return ncclSuccess;
}
// Stops the group event of a kernel plan, if a plugin is active and the plan
// actually has a group event handle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (plan->groupEventHandle) {
      ncclProfiler->stopEvent(plan->groupEventHandle);
    }
  }
  TIME_STOP_EVENT(groupStop);
  return ncclSuccess;
}
// Starts one profiler event per collective task and per p2p task of a plan.
// Events are created only when a plugin is active, the plan has a group event
// (used as parent of every task event) and the cached group activation mask
// enables proxy ops, proxy steps or collectives. Each task also receives a
// copy of the cached mask. Always returns ncclSuccess.
// NOTE(review): the gating mask does not include ncclProfileP2p, although the
// group event is also opened for it — confirm whether that is intended.
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStart);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    const int taskKinds = ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl;
    if (plan->groupEventHandle && (eActivationMaskGroup & taskKinds)) {
      // Collective tasks.
      for (struct ncclTaskColl* coll = ncclIntruQueueHead(&plan->collTaskQueue); coll; coll = coll->next) {
        ncclProfilerEventDescr_t collDescr = { 0 };
        collDescr.type = ncclProfileColl;
        collDescr.parentObj = plan->groupEventHandle;
        collDescr.rank = plan->comm->rank;
        collDescr.coll.name = plan->comm->commName;
        collDescr.coll.commHash = plan->comm->commHash;
        // Per-function sequence number; advanced for every collective reported.
        collDescr.coll.seqNumber = plan->comm->seqNumber[coll->func]++;
        collDescr.coll.func = coll->func;
        collDescr.coll.sendBuff = coll->sendbuff;
        collDescr.coll.recvBuff = coll->recvbuff;
        collDescr.coll.count = coll->count;
        collDescr.coll.root = coll->root;
        collDescr.coll.datatype = coll->datatype;
        collDescr.coll.op = coll->opHost;
        collDescr.coll.trafficBytes = coll->trafficBytes;
        collDescr.coll.nMaxChannels = coll->nMaxChannels;
        collDescr.coll.nWarps = coll->nWarps;
        collDescr.coll.algo = coll->algorithm;
        collDescr.coll.proto = coll->protocol;
        collDescr.coll.isCollnet = coll->isCollnet;
        collDescr.coll.isNvls = coll->isNvls;
        ncclProfiler->startEvent(plan->comm->profilerContext, &coll->eventHandle, &collDescr);
        // Propagate the group's activation mask to the collective task.
        coll->eActivationMask = eActivationMaskGroup;
      }
      // Point-to-point tasks.
      for (struct ncclTaskP2p* p2p = ncclIntruQueueHead(&plan->p2pTaskQueue); p2p; p2p = p2p->next) {
        ncclProfilerEventDescr_t p2pDescr = { 0 };
        p2pDescr.type = ncclProfileP2p;
        p2pDescr.parentObj = plan->groupEventHandle;
        p2pDescr.rank = plan->comm->rank;
        p2pDescr.p2p.name = plan->comm->commName;
        p2pDescr.p2p.commHash = plan->comm->commHash;
        p2pDescr.p2p.func = p2p->func;
        p2pDescr.p2p.buff = p2p->buff;
        p2pDescr.p2p.count = p2p->count;
        p2pDescr.p2p.datatype = p2p->datatype;
        p2pDescr.p2p.peer = p2p->root;
        ncclProfiler->startEvent(plan->comm->profilerContext, &p2p->eventHandle, &p2pDescr);
        // Propagate the group's activation mask to the p2p task.
        p2p->eActivationMask = eActivationMaskGroup;
      }
    }
  }
  TIME_STOP_EVENT(taskStart);
  return ncclSuccess;
}
// Stops the profiler events of every collective and p2p task of a plan.
// Uses the same gating as the task start path: active plugin, existing group
// event, and a cached group mask enabling proxy ops, proxy steps or
// collectives. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    const int taskKinds = ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl;
    if (plan->groupEventHandle && (eActivationMaskGroup & taskKinds)) {
      for (struct ncclTaskColl* coll = ncclIntruQueueHead(&plan->collTaskQueue); coll; coll = coll->next) {
        ncclProfiler->stopEvent(coll->eventHandle);
      }
      for (struct ncclTaskP2p* p2p = ncclIntruQueueHead(&plan->p2pTaskQueue); p2p; p2p = p2p->next) {
        ncclProfiler->stopEvent(p2p->eventHandle);
      }
    }
  }
  TIME_STOP_EVENT(taskStop);
  return ncclSuccess;
}
// Starts a proxy-op profiler event for send sub-operation `s` of `args`.
// The event is created only when a plugin is active and the sub-operation's
// activation mask enables proxy ops or proxy steps; its parent is the task
// event recorded in the sub-operation. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  const int opKinds = ncclProfileProxyStep | ncclProfileProxyOp;
  if (__builtin_expect(ncclProfiler != NULL, 0) && (subArgs->eActivationMask & opKinds)) {
    ncclProfilerEventDescr_t opDescr = { 0 };
    opDescr.type = ncclProfileProxyOp;
    opDescr.parentObj = subArgs->taskEventHandle;
    opDescr.rank = subArgs->rank;
    opDescr.proxyOp.pid = args->pid;
    opDescr.proxyOp.channelId = subArgs->channelId;
    opDescr.proxyOp.peer = subArgs->peer;
    opDescr.proxyOp.nSteps = subArgs->nsteps;
    opDescr.proxyOp.chunkSize = args->chunkSize;
    opDescr.proxyOp.isSend = 1;  // send direction
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &opDescr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
// Starts a proxy-op profiler event for receive sub-operation `s` of `args`.
// The event is created only when a plugin is active and the sub-operation's
// activation mask enables proxy ops or proxy steps; its parent is the task
// event recorded in the sub-operation. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  const int opKinds = ncclProfileProxyStep | ncclProfileProxyOp;
  if (__builtin_expect(ncclProfiler != NULL, 0) && (subArgs->eActivationMask & opKinds)) {
    ncclProfilerEventDescr_t opDescr = { 0 };
    opDescr.type = ncclProfileProxyOp;
    opDescr.parentObj = subArgs->taskEventHandle;
    opDescr.rank = subArgs->rank;
    opDescr.proxyOp.pid = args->pid;
    opDescr.proxyOp.channelId = subArgs->channelId;
    opDescr.proxyOp.peer = subArgs->peer;
    opDescr.proxyOp.nSteps = subArgs->nsteps;
    opDescr.proxyOp.chunkSize = args->chunkSize;
    opDescr.proxyOp.isSend = 0;  // receive direction
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &opDescr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
// Stops the proxy-op event of sub-operation `s`, if a plugin is active and the
// sub-operation has one, and clears the handle so it is not stopped twice.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfiler->stopEvent(subArgs->opEventHandle);
      subArgs->opEventHandle = NULL;
    }
  }
  TIME_STOP_EVENT(proxyOpStop);
  return ncclSuccess;
}
// Starts one proxy-step event for each step in [stepLo, stepHi) of send
// sub-operation `s`. Requires an active plugin, an open proxy-op event on the
// sub-operation (used as parent) and ncclProfileProxyStep in its activation
// mask. Handles are stored in the slot step % NCCL_STEPS.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      ncclProfilerEventDescr_t stepDescr = { 0 };
      stepDescr.type = ncclProfileProxyStep;
      stepDescr.parentObj = subArgs->opEventHandle;
      stepDescr.rank = subArgs->rank;
      stepDescr.proxyStep.step = cur;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[cur%NCCL_STEPS], &stepDescr);
      cur++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
// Starts one proxy-step event for each step in [stepLo, stepHi) of receive
// sub-operation `s`. Requires an active plugin, an open proxy-op event on the
// sub-operation (used as parent) and ncclProfileProxyStep in its activation
// mask. Handles are stored in the slot step % NCCL_STEPS.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      ncclProfilerEventDescr_t stepDescr = { 0 };
      stepDescr.type = ncclProfileProxyStep;
      stepDescr.parentObj = subArgs->opEventHandle;
      stepDescr.rank = subArgs->rank;
      stepDescr.proxyStep.step = cur;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[cur%NCCL_STEPS], &stepDescr);
      cur++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
// Stops the proxy-step events for steps [stepLo, stepHi) of sub-operation `s`.
// Only slots (step % NCCL_STEPS) holding a handle are stopped, and each slot
// is cleared afterwards. Does nothing without an active plugin.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    for (uint64_t cur = stepLo; cur < stepHi; ++cur) {
      const uint64_t slot = cur%NCCL_STEPS;
      if (!subArgs->stepEventHandles[slot]) continue;  // no event open for this step
      ncclProfiler->stopEvent(subArgs->stepEventHandles[slot]);
      subArgs->stepEventHandles[slot] = NULL;
    }
  }
  TIME_STOP_EVENT(proxyStepStop);
  return ncclSuccess;
}
// Starts a proxy control event in `profilerContext` and lets the plugin store
// the handle through `eHandle`. When no plugin is active, or the current
// activation mask does not enable ncclProfileProxyCtrl, *eHandle is set to
// NULL instead. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle) {
  TIME_START_EVENT(proxyCtrlStart);
  bool started = false;
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    // Unlike group/task events, control events re-read the mask on every call,
    // so the profiling mode may change from one event to the next.
    const int ctrlMask = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
    if (ctrlMask & ncclProfileProxyCtrl) {
      ncclProfilerEventDescr_t ctrlDescr = { 0 };
      ctrlDescr.type = ncclProfileProxyCtrl;
      ncclProfiler->startEvent(profilerContext, eHandle, &ctrlDescr);
      started = true;
    }
  }
  if (!started) {
    *eHandle = NULL;
  }
  TIME_STOP_EVENT(proxyCtrlStart);
  return ncclSuccess;
}
// Stops a proxy control event. A NULL handle, or the absence of an active
// plugin, makes this a no-op. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
  TIME_START_EVENT(proxyCtrlStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (eHandle) {
      ncclProfiler->stopEvent(eHandle);
    }
  }
  TIME_STOP_EVENT(proxyCtrlStop);
  return ncclSuccess;
}
// Records state `eState` on the proxy-op event of sub-operation `s`, passing
// the step count and transferred size to the plugin. No-op when no plugin is
// active or the sub-operation has no proxy-op event. Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyOpRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfilerEventStateArgs_t stateArgs = { 0 };
      stateArgs.proxyOp.steps = steps;
      stateArgs.proxyOp.transSize = transSize;
      ncclProfiler->recordEventState(subArgs->opEventHandle, eState, &stateArgs);
    }
  }
  TIME_STOP_EVENT(proxyOpRecord);
  return ncclSuccess;
}
// Records state `eState` on every open proxy-step event for steps
// [stepLo, stepHi) of sub-operation `s`. Requires an active plugin and an open
// proxy-op event on the sub-operation; no state arguments are passed to the
// plugin. Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyStepRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) && subArgs->opEventHandle) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      const uint64_t slot = cur%NCCL_STEPS;
      if (subArgs->stepEventHandles[slot]) {
        ncclProfiler->recordEventState(subArgs->stepEventHandles[slot], eState, 0);
      }
      cur++;
    }
  }
  TIME_STOP_EVENT(proxyStepRecord);
  return ncclSuccess;
}
// Records state `eState` on a proxy control event, reporting the number of
// appended proxy ops. Nothing is recorded unless a plugin is active, the
// handle is non-NULL and the current activation mask enables
// ncclProfileProxyCtrl. Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyCtrlRecord);
  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle) {
    // The mask is only sampled once the cheaper checks have passed.
    if (__atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
      ncclProfilerEventStateArgs_t stateArgs = { 0 };
      stateArgs.proxyCtrl.appendedProxyOps = appended;
      ncclProfiler->recordEventState(eHandle, eState, &stateArgs);
    }
  }
  TIME_STOP_EVENT(proxyCtrlRecord);
  return ncclSuccess;
}
// Stamps a proxy op with the pid recorded when the profiler plugin was loaded.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {
  struct ncclProxyOp* const proxyOp = op;
  proxyOp->pid = pid;
  return ncclSuccess;
}
#else
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
void ncclProfilingDump() {}
#endif
+4
Melihat File
@@ -172,6 +172,10 @@ int ncclCuMemEnable() {
return 0;
}
// Always returns 0.
// NOTE(review): presumably reports that the cuMem host allocation path is
// disabled in this build — confirm against the callers.
int ncclCuMemHostEnable() {
  const int hostCuMemEnabled = 0;
  return hostCuMemEnabled;
}
ncclResult_t rocmLibraryInit() {
pthread_once(&initOnceControl, initOnceFunc);
return initResult;
+7 -6
Melihat File
@@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "shm.h"
#include "shmutils.h"
#include "comm.h"
#include "checks.h"
#include <sys/types.h>
@@ -75,7 +75,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
goto fail;
}
} else {
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
}
retry_fallocate:
@@ -90,7 +90,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
}
INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
} else {
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
}
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
@@ -114,7 +114,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
}
if (devShmPtr) {
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
}
@@ -129,7 +129,7 @@ fail:
shmPath, shmSize, strerror(errno), errno);
if (tmphandle) {
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
ncclShmClose((ncclShmHandle_t)tmphandle);
(void)ncclShmClose((ncclShmHandle_t)tmphandle);
tmphandle = NULL;
}
hptr = NULL;
@@ -182,7 +182,7 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
ncclResult_t ret = ncclSuccess;
int curRound = shmem->round;
int curRound;
size_t mycnt;
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
@@ -190,6 +190,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
goto exit;
}
curRound = shmem->round;
memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
/* sync among local ranks */
mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
+13 -7
Melihat File
@@ -289,6 +289,7 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
sin6.sin6_scope_id = 0; // should be global scope, set to 0
} else {
WARN("Net : unsupported IP family");
freeaddrinfo(p);
return ncclInvalidArgument;
}
@@ -413,7 +414,7 @@ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress*
static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
socklen_t socklen = sizeof(union ncclSocketAddress);
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
if (sock->fd != -1) {
sock->state = ncclSocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
@@ -506,8 +507,9 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
} else if (ret < 0) {
WARN("socketPollConnect poll() failed with error %s", strerror(errno));
return ncclRemoteError;
} else {
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
} else if (ret != 1 || (pfd.revents & POLLOUT) == 0) {
WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events");
return ncclSystemError;
}
/* check socket status */
@@ -734,12 +736,12 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
// [RCCL] Runtime socket options
if (rcclParamSocketReuseAddr()) {
int opt = 1;
SYSCHECKGOTO(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), ret, fail);
SYSCHECKGOTO(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt", ret, fail);
}
int lingerParam = (int)rcclParamSocketLinger();
if (lingerParam > -1) {
linger linger_opt = { 1, lingerParam };
SYSCHECKGOTO(setsockopt(sock->fd, SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt)), ret, fail);
SYSCHECKGOTO(setsockopt(sock->fd, SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt)), "setsockopt", ret, fail);
}
} else {
memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
@@ -748,13 +750,17 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
/* Set socket as non-blocking if async or if we need to be able to abort */
if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
int flags;
EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
}
exit:
return ret;
fail:
if (sock->fd != -1) {
close(sock->fd);
sock->fd = -1;
}
goto exit;
}
+2
Melihat File
@@ -77,6 +77,8 @@ static void* tryOpenLib(const char* name, int* err, char* errStr) {
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = '\0';
// "handle" and "name" won't be NULL at the same time.
// coverity[var_deref_model]
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
+9 -12
Melihat File
@@ -65,15 +65,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
return ncclSuccess;
}
// Hashes the first `n` characters of `string` with DJB2a:
// start from 5381, then for each character hash = (hash * 33) ^ character.
// Returns 5381 when n <= 0. The character is XOR-ed in as a plain `char`,
// exactly as the shift-and-add formulation does.
uint64_t getHash(const char* string, int n) {
  uint64_t hash = 5381;
  int idx = 0;
  while (idx < n) {
    hash = (hash * 33) ^ string[idx];
    ++idx;
  }
  return hash;
}
static uint64_t hostHashValue = 0;
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
@@ -83,7 +75,7 @@ uint64_t getHash(const char* string, int n) {
* This string can be overridden by using the NCCL_HOSTID env var.
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
uint64_t getHostHash(void) {
static void getHostHashOnce() {
char hostHash[1024];
const char *hostId;
@@ -103,8 +95,8 @@ uint64_t getHostHash(void) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
fclose(file);
}
fclose(file);
}
// Make sure the string is terminated
@@ -112,7 +104,12 @@ uint64_t getHostHash(void) {
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
return getHash(hostHash, strlen(hostHash));
hostHashValue = getHash(hostHash, strlen(hostHash));
}
uint64_t getHostHash(void) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, getHostHashOnce);
return hostHashValue;
}
/* Generate a hash of the unique identifying string for this process
+18
Melihat File
@@ -268,6 +268,23 @@ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *new
/*! @endcond */
/*! @} */
/*! @brief Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
@details Allows the use of more than one ncclUniqueId (up to one per rank),
indicated by nId, to accelerate the init operation.
The number of ncclUniqueIds and their order must be the same for every rank.
@return Result code. See @ref rccl_result_code for more details.
@param[out] newcomm Pointer to new communicator
@param[in] nranks Total number of ranks participating in this communicator
@param[in] myrank Current rank
@param[in] nId Number of unique IDs
@param[in] commIds List of unique IDs
@param[in] config Config file for new communicator. May be NULL to inherit from comm */
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
/*! @cond include_hidden */
ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
/*! @endcond */
/*! @defgroup rccl_api_errcheck Error Checking Calls
@details API calls that check for errors
@{ */
@@ -277,6 +294,7 @@ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *new
@return String containing description of result code.
@param[in] result Result code to get description for */
/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
/*! @cond include_hidden */
const char* pncclGetErrorString(ncclResult_t result);
+5 -4
Melihat File
@@ -356,6 +356,8 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = '\0';
// "handle" and "name" won't be NULL at the same time.
// coverity[var_deref_model]
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
@@ -423,11 +425,10 @@ static int netPluginStatus = netPluginLoadReady;
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
if (netPluginLoadFailed == netPluginStatus) {
return ncclSuccess;
}
pthread_mutex_lock(&netPluginLock);
if (netPluginLoadFailed == netPluginStatus) {
goto exit;
}
if (netPluginLoadSuccess == netPluginStatus) {
++netPluginRefCount;
goto exit;
+226 -79
Melihat File
@@ -10,16 +10,20 @@
#include "info.h"
#include "collectives.h"
#include "socket.h"
#include "shm.h"
#include "shmutils.h"
#include "profiler.h"
#define ENABLE_TIMER 0
#include "timer.h"
#include "profiler.h"
#include "transport.h"
#include <sys/syscall.h>
#include <assert.h>
#include <unistd.h>
#include <sys/time.h>
#include <sched.h>
void* ncclProxyServiceUDS(void* _args);
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -67,8 +71,10 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
return ncclInternalError;
}
memcpy(elem->respBuff, respBuff, respSize);
free(respBuff);
if (respSize > 0) {
memcpy(elem->respBuff, respBuff, respSize);
free(respBuff);
}
elem->done = true;
elem->res = res;
return ncclSuccess;
@@ -373,12 +379,17 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
sub->nsteps = op->nsteps;
sub->nbytes = op->nbytes;
sub->offset = 0;
sub->peer = op->root;
sub->peer = op->peer;
sub->reg = op->reg;
sub->sendMhandle = op->sendMhandle;
sub->recvMhandle = op->recvMhandle;
sub->sendbuff = op->sendbuff;
sub->recvbuff = op->recvbuff;
sub->eActivationMask = op->eActivationMask;
sub->taskEventHandle = op->taskEventHandle;
sub->rank = op->rank;
args->pid = op->pid;
args->profilerContext = op->profilerContext;
args->nsubs = subIndex+1;
if (subIndex) {
if ((args->sliceSteps != op->sliceSteps) ||
@@ -544,6 +555,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
if (justInquire) *justInquire = true;
else {
op->peer = peer;
NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op));
}
return ncclSuccess;
@@ -614,6 +626,64 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
} break;
case ncclPatternPatUp: {
  // Run full algorithm to count the number of steps for each peer.
  // nstepsSend/nstepsRecv are scratch counters (one slot per dimension) that
  // are released on every path out of this case, including error paths.
  // Every local is declared before the first NCCLCHECKGOTO so that no jump to
  // the cleanup label bypasses an initialization.
  ncclResult_t patRes = ncclSuccess;
  int *nstepsSend = NULL, *nstepsRecv = NULL;
  const int rank = comm->rank, nranks = comm->nRanks;
  const int ndims = log2Up(nranks);
  const ssize_t size = op->nbytes/comm->nRanks;
  int last = 0;
  PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
  NCCLCHECKGOTO(ncclCalloc(&nstepsSend, ndims), patRes, exit_pat_up);
  NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, ndims), patRes, exit_pat_up);
  while (last == 0) {
    int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
    size_t inpIx, outIx;
    algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
    // Only posted operations count as a proxy step for that dimension.
    if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
    if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
  }
  for (int i=0; i<ndims; i++) {
    if (nstepsSend[i]) {
      // Send peer for dimension i sits 2^i ranks ahead (modulo nranks).
      int sendPeer = (rank + (1<<i)) % nranks;
      op->nsteps = nstepsSend[i];
      NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), patRes, exit_pat_up);
    }
    if (nstepsRecv[i]) {
      // Recv peer for dimension i sits 2^i ranks behind (modulo nranks).
      int recvPeer = (rank - (1<<i) + nranks) % nranks;
      op->nsteps = nstepsRecv[i];
      NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), patRes, exit_pat_up);
    }
  }
exit_pat_up:
  free(nstepsSend);
  free(nstepsRecv);
  // Propagate the first failure (if any) only after the scratch arrays are released.
  NCCLCHECK(patRes);
} break;
case ncclPatternPatDown: {
  // Run full algorithm to count the number of steps for each peer.
  // nstepsSend/nstepsRecv are scratch counters (one slot per dimension) that
  // are released on every path out of this case, including error paths.
  // Every local is declared before the first NCCLCHECKGOTO so that no jump to
  // the cleanup label bypasses an initialization.
  ncclResult_t patRes = ncclSuccess;
  int *nstepsSend = NULL, *nstepsRecv = NULL;
  const int rank = comm->rank, nranks = comm->nRanks;
  const int ndims = log2Up(nranks);
  const ssize_t size = op->nbytes/comm->nRanks;
  int last = 0;
  PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
  NCCLCHECKGOTO(ncclCalloc(&nstepsSend, ndims), patRes, exit_pat_down);
  NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, ndims), patRes, exit_pat_down);
  while (last == 0) {
    int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
    size_t inpIx, outIx;
    algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
    // Only posted operations count as a proxy step for that dimension.
    if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
    if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
  }
  for (int i=0; i<ndims; i++) {
    if (nstepsSend[i]) {
      // Send peer for dimension i sits 2^i ranks behind (modulo nranks).
      int sendPeer = (rank - (1<<i) + nranks) % nranks;
      op->nsteps = nstepsSend[i];
      NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), patRes, exit_pat_down);
    }
    if (nstepsRecv[i]) {
      // Recv peer for dimension i sits 2^i ranks ahead (modulo nranks).
      int recvPeer = (rank + (1<<i)) % nranks;
      op->nsteps = nstepsRecv[i];
      NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), patRes, exit_pat_down);
    }
  }
exit_pat_down:
  free(nstepsSend);
  free(nstepsRecv);
  // Propagate the first failure (if any) only after the scratch arrays are released.
  NCCLCHECK(patRes);
} break;
case ncclPatternSend:
case ncclPatternRecv: {
if (op->root == comm->rank) return ncclSuccess;
@@ -685,9 +755,9 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->opsPool == NULL) return ncclInternalError;
struct ncclProxyOpsPool* pool = state->opsPool;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
if (state->nextOps != -1) goto process_nextops;
void* eHandle;
// If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
// to be available. Exit, continue progress, and come back later.
if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
@@ -695,10 +765,11 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->active == NULL) {
pthread_mutex_lock(&pool->mutex);
while (pool->nextOps == -1 && !state->stop) {
struct ncclProxyArgs profArgs; // Only used for profiling purposes
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
pthread_cond_wait(&pool->cond, &pool->mutex);
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
ncclProfilerStopProxyCtrlEvent(eHandle);
}
if (state->stop) { // We might have been woken up to stop.
pthread_mutex_unlock(&pool->mutex);
@@ -712,7 +783,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->nextOps == -1) return ncclInternalError;
process_nextops:
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
TIME_START(2);
int freeOp[NCCL_MAX_LOCAL_RANKS];
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
@@ -748,6 +820,10 @@ process_nextops:
if (freeOp[i] == -1) continue;
int newFree = freeOp[i];
int oldFree = pool->freeOps[i];
// Coverity gets confused by the complex code structure here. The previous "for" loop ensures that freeOpEnd[i]
// is initialized so long as freeOp[i] is initialized (is not -1). In the current loop we filter out uninitialized
// freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
// coverity[uninit_use:FALSE]
pool->ops[freeOpEnd[i]].next = oldFree;
if (oldFree == -1) {
// Nothing for the main thread to consume, we can set it.
@@ -763,8 +839,8 @@ process_nextops:
}
}
}
profArgs.opCount = *added;
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd);
ncclProfilerStopProxyCtrlEvent(eHandle);
TIME_STOP(2);
return ncclSuccess;
}
@@ -787,6 +863,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
WARN("Unable to create thread context due to old driver, disabling.");
createThreadContext = 0;
goto exit;
}
}
}
@@ -796,15 +873,17 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
createThreadContext = 0;
goto exit;
}
} else {
if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
return 0;
goto exit;
}
return 1;
}
return 1;
}
exit:
#endif
return 0;
}
@@ -816,12 +895,14 @@ NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
void* ncclProxyProgress(void *proxyState_) {
struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_;
if (setProxyThreadContext(proxyState)) {
INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev);
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
}
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
struct ncclProxyProgressState* state = &proxyState->progressState;
state->nextOps = -1;
const int sig = ncclParamProxyDumpSignal();
@@ -838,9 +919,7 @@ void* ncclProxyProgress(void *proxyState_) {
* ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
* frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
int proxyOpAppendCounter = 0;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
while ((state->stop == 0 || (state->stop == 1 && state->active)) &&
__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) {
while (state->stop == 0 || (state->stop == 1 && state->active)) {
int idle = 1;
ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
if (ret != ncclSuccess) {
@@ -848,8 +927,11 @@ void* ncclProxyProgress(void *proxyState_) {
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
continue;
}
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
void* eHandle;
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
ncclProfilerStopProxyCtrlEvent(eHandle);
if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
int added = 0;
proxyOpAppendCounter = 0;
@@ -889,7 +971,7 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
struct ncclProxyProgressState* state = &proxyState->progressState;
if (!state->thread) {
pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
PTHREADCHECK(pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState), "pthread_create");
ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
}
return ncclSuccess;
@@ -904,7 +986,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
state->stop = 1;
pthread_cond_signal(&state->opsPool->cond);
pthread_mutex_unlock(&state->opsPool->mutex);
pthread_join(state->thread, NULL);
PTHREADCHECK(pthread_join(state->thread, NULL), "pthread_join");
}
// Free off any memory allocated for the proxy arg pools
@@ -914,7 +996,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
state->pools = next;
}
ncclProfilingDump();
TIME_PRINT("Proxy");
return ncclSuccess;
}
@@ -991,23 +1072,17 @@ struct ncclProxyInitResp {
char devShmPath[6]; // "XXXXXX" - May or may not be set
};
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) {
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn) {
struct ncclSocket* sock;
int ready, proxyRank = -1;
int ready;
struct ncclProxyState* sharedProxyState = comm->proxyState;
int tpProxyRank = comm->topParentRanks[proxyRank];
// Keep one connection per local rank
for (int i = 0; i < comm->localRanks; ++i) {
/* find the proxy rank in comm. */
if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
proxyRank = comm->localRankToRank[i];
break;
}
}
proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
// Keep one connection per local rank
proxyConn->connection = NULL;
proxyConn->tpRank = tpProxyRank;
proxyConn->rank = proxyRank;
if (sharedProxyState->peerSocks == NULL) {
NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
@@ -1049,68 +1124,93 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
}
}
proxyConn->initialized = true;
INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
return ncclSuccess;
}
// UDS support
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int* reqFd, int *respFd) {
ncclResult_t res = ncclSuccess;
struct ncclIpcSocket ipcSock = { 0 };
void *opId;
NCCLCHECK(getRandomData(&opId, sizeof(opId)));
int reqFdtmp = -1;
int rank = comm->topParentLocalRanks[comm->localRank];
struct ncclProxyState* sharedProxyState = comm->proxyState;
uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, respFd, opId);
// cuMem: Create a UDS socket to receive the response
NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
if (reqFd) {
reqFdtmp = *reqFd;
} else {
// give a dummy fd for the other side of UDS socket
NCCLCHECK(ncclIpcSocketGetFd(&ipcSock, &reqFdtmp));
}
ncclIpcHdr hdr;
hdr.type = type;
hdr.rank = rank;
hdr.reqSize = reqSize;
hdr.respSize = respSize;
hdr.opId = opId;
assert(reqSize <= sizeof(hdr.data));
memcpy(&hdr.data, reqBuff, reqSize);
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), reqFdtmp, proxyConn->tpRank, pidHash), res, error);
NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
return res;
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", proxyConn->tpRank, pidHash, res);
return res;
}
// cuMem API support
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) {
ncclResult_t ret = ncclSuccess;
// Request the allocation of a UDS fd for the handle
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
if (comm->gproxyConn[proxyRank].initialized == false) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error);
}
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error);
// We have now received the converted fd over UDS
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess);
return ret;
error:
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", comm->topParentRanks[proxyRank], *(uint64_t*)handle, ret);
return ret;
}
// Asks the proxy behind proxyConn, over the UDS channel, for its view of a
// local file descriptor.
//   comm      - communicator issuing the request
//   proxyConn - connector identifying the target proxy (tpRank is used for logging)
//   localFd   - descriptor in this process, handed to the UDS call as the request fd
//   rmtFd     - out: receives the sizeof(int) response written by the UDS call
// Returns the result of ncclProxyCallBlockingUDS. On failure *rmtFd is left
// untouched by this function and is not read.
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd) {
  ncclResult_t ret = ncclSuccess;
  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, proxyConn, ncclProxyMsgQueryFd, NULL, 0, (void*)rmtFd, sizeof(int), &localFd, NULL), ret, fail);
  // We have now received the converted fd over UDS.
  // Logged on the success path only: after a failed call *rmtFd may never have been written.
  INFO(NCCL_PROXY, "UDS: ClientQueryFd localFd %d tpRank %d remote fd %d sameProcess %d", localFd, proxyConn->tpRank, *rmtFd, proxyConn->sameProcess);
exit:
  return ret;
fail:
  WARN("ncclProxyClientQueryFdBlocking call to tpRank %d localFd %d failed : %d", proxyConn->tpRank, localFd, ret);
  goto exit;
}
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock;
@@ -1120,7 +1220,6 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
if (sharedProxyState->peerSocks == NULL) return ncclInternalError;
sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
if (sock == NULL) return ncclInternalError;
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
@@ -1296,6 +1395,22 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
return ncclSuccess;
}
// Service-side handler for a QueryFd request: opens a temporary IPC socket and
// sends rmtFd back to the requesting rank.
//   proxyState - proxy state supplying tpRank and the abort flag for the socket
//   rank       - destination rank for the reply
//   opId       - request identifier; its value is used as the destination hash,
//                and hash^1 as the hash of the temporary reply socket
//   rmtFd      - descriptor number as seen by this process; sent both as the
//                int payload and as the fd argument of ncclIpcSocketSendMsg
// Returns the first init/send error (previously swallowed), an error from
// closing the socket, or ncclSuccess. Without CUDART_VERSION >= 11030 it
// always returns ncclInternalError.
static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) {
#if CUDART_VERSION >= 11030
  struct ncclIpcSocket ipcSock = { 0 };
  uint64_t hash = (uint64_t) opId;
  ncclResult_t ret = ncclSuccess;
  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
exit:
  // The socket is closed on every path; a close failure is reported in its own right.
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
  // Report the init/send outcome instead of an unconditional success.
  return ret;
#else
  return ncclInternalError;
#endif
}
// cuMem API support
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
#if CUDART_VERSION >= 11030
@@ -1315,7 +1430,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
// We can now safely close the exported fd
(void) close(fd);
SYSCHECK(close(fd), "close");
return ret;
#else
return ncclInternalError;
@@ -1382,30 +1497,37 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
}
static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) {
ncclResult_t ret = ncclSuccess;
struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* asyncOp;
NCCLCHECK(ncclCalloc(&asyncOp, 1));
asyncOp->type = type;
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)), ret, fail);
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)), ret, fail);
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)), ret, fail);
if (asyncOp->reqSize) {
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECKGOTO(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
NCCLCHECKGOTO(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
}
// Store opId for completion response
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)), ret, fail);
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
if (asyncOp->respSize) NCCLCHECKGOTO(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize), ret, fail);
asyncProxyOpEnqueue(peer, asyncOp);
(*asyncOpCount)++;
NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
return ncclSuccess;
exit:
return ret;
fail:
if (asyncOp->reqBuff) free(asyncOp->reqBuff);
if (asyncOp->respBuff) free(asyncOp->respBuff);
free(asyncOp);
goto exit;
}
#include <poll.h>
@@ -1425,6 +1547,12 @@ static bool proxyMatchOpType(int type) {
}
}
// Values taken by the `stop` variable of the ncclProxyService main loop.
enum {
PROXY_RUNNING = 0, // initial value of `stop`; the service loop keeps running
PROXY_STOP = 1, // set when a ncclProxyMsgStop message is received from a peer
PROXY_ABORT = 2 // set when proxyState->abortFlag is observed to be non-zero
};
void* ncclProxyService(void* _args) {
struct ncclProxyState* proxyState = (struct ncclProxyState*) _args;
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
@@ -1435,6 +1563,8 @@ void* ncclProxyService(void* _args) {
}
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
// Prepare poll descriptor
struct ncclProxyConnectionPool connectionPool;
connectionPool.pools = NULL;
@@ -1456,13 +1586,13 @@ void* ncclProxyService(void* _args) {
int maxnpeers = 0;
int npeers = 0;
int stop = 0;
int stop = PROXY_RUNNING;
int asyncOpCount = 0;
while (stop == 0 || (stop == 1 && npeers > 0)) {
while (stop == PROXY_RUNNING || npeers > 0) {
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
* connections. Need to wait until all other related comms call abort and safely exit
* together, or we could face segmentation fault. */
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1;
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = PROXY_ABORT;
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
int ret;
do {
@@ -1504,10 +1634,14 @@ void* ncclProxyService(void* _args) {
if (pollfds[s].fd == -1) continue;
// Progress all ops for this ncclProxyLocalPeer
if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
ncclProxyAsyncOp* op = peer->asyncOps;
while (op != nullptr) {
ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
type = op->type;
// Coverity gets confused here by complex code structure. Yes, connectionPool.pools gets dereferenced, and
// while calling proxyProgressAsync() connectionPool.pools is NULL, but that changes before it's dereferenced.
// coverity[var_deref_model:FALSE]
res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
if (res == ncclSuccess || res == ncclInProgress) {
op = opnext;
@@ -1524,14 +1658,15 @@ void* ncclProxyService(void* _args) {
int closed;
res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
closeConn = 1;
} else if (closed) {
INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
closeConn = 1;
} else if (res == ncclSuccess) { // We received something from the sock
if (type == ncclProxyMsgStop) {
stop = 1;
stop = PROXY_STOP;
closeConn = 1;
} else if (type == ncclProxyMsgClose) {
closeConn = 1;
@@ -1548,12 +1683,13 @@ void* ncclProxyService(void* _args) {
closeConn = 1;
}
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
closeConn = 1;
}
if (closeConn) {
ncclSocketClose(sock);
(void)ncclSocketClose(sock);
if (op != nullptr) {
asyncProxyOpDequeue(peer, op);
@@ -1570,10 +1706,10 @@ void* ncclProxyService(void* _args) {
WARN("[Proxy Service] proxyDestroy failed");
}
for (int s=0; s<maxnpeers; s++) {
ncclSocketClose(&peers[s].sock);
(void)ncclSocketClose(&peers[s].sock);
}
ncclProxyFreeConnections(&connectionPool, proxyState);
ncclSocketClose(proxyState->listenSock);
(void)ncclSocketClose(proxyState->listenSock);
free(proxyState->listenSock);
proxyOpsFree(proxyState);
return NULL;
@@ -1583,12 +1719,17 @@ void* ncclProxyService(void* _args) {
// Process a request on the UDS socket
static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
ncclIpcHdr hdr;
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL));
int rmtFd = -1;
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
if (hdr.type == ncclProxyMsgGetFd) {
// cuMem API support
uint64_t handle = *(uint64_t*)hdr.data;
INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
} else if (hdr.type == ncclProxyMsgQueryFd) {
INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
}
return ncclInternalError;
@@ -1600,11 +1741,13 @@ void* ncclProxyServiceUDS(void* _args) {
struct pollfd pollfds[1];
if (setProxyThreadContext(proxyState)) {
INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev);
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
}
INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
WARN("[Proxy Service UDS] Get listenSock fd fails");
return NULL;
@@ -1623,7 +1766,7 @@ void* ncclProxyServiceUDS(void* _args) {
}
// Check for stop/abort
if (proxyState->stop || *proxyState->abortFlag) break;
if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) || __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE)) break;
if (pollfds[0].revents) {
// A request was seen on the UDS fd
@@ -1668,14 +1811,16 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
proxyState->dmaBufSupport = comm->dmaBufSupport;
proxyState->ncclNet = comm->ncclNet;
proxyState->ncclCollNet = comm->ncclCollNet;
proxyState->profilerContext = comm->profilerContext;
proxyState->directMode = comm->directMode;
memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
PTHREADCHECK(pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState), "pthread_create");
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
// UDS support
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
PTHREADCHECK(pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState), "pthread_create");
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
}
return ncclSuccess;
@@ -1688,17 +1833,17 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
if (comm->proxyState->threadUDS) {
// UDS support
comm->proxyState->stop = 1;
__atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
}
if (sharedProxyState->peerAddresses) {
if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
struct ncclSocket sock;
int type = ncclProxyMsgStop;
ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
if (ncclSocketConnect(&sock) == ncclSuccess) {
ncclSocketSend(&sock, &type, sizeof(int));
(void)ncclSocketSend(&sock, &type, sizeof(int));
}
ncclSocketClose(&sock);
(void)ncclSocketClose(&sock);
}
if (sharedProxyState->peerSocks) {
@@ -1716,7 +1861,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
}
}
int type = ncclProxyMsgClose;
ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
(void)ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
}
}
@@ -1730,13 +1875,15 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
assert(sharedProxyState->refCount == 0);
free(sharedProxyState->peerAddresses);
free(sharedProxyState->peerAddressesUDS);
free(sharedProxyState->peerSocks);
free(sharedProxyState->proxyOps);
free(sharedProxyState->sharedDevMems);
expectedProxyResponseFree(sharedProxyState);
free(sharedProxyState);
if (sharedProxyState) {
assert(sharedProxyState->refCount == 0);
free(sharedProxyState->peerAddresses);
free(sharedProxyState->peerAddressesUDS);
free(sharedProxyState->peerSocks);
free(sharedProxyState->proxyOps);
free(sharedProxyState->sharedDevMems);
expectedProxyResponseFree(sharedProxyState);
free(sharedProxyState);
}
return ncclSuccess;
}
+21 -4
Melihat File
@@ -30,8 +30,8 @@ ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
struct ncclRegCache* cache = &comm->regCache;
int netCount;
NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
int netCount = 0;
if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
if (netCount == 0) return ncclSuccess;
ncclResult_t ret = ncclSuccess;
@@ -109,7 +109,11 @@ ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, s
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
if (!ncclParamLocalRegister()) return ncclSuccess;
if (!ncclParamLocalRegister()) {
*handle = NULL;
return ncclSuccess;
}
INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
struct ncclRegCache* cache = &comm->regCache;
uintptr_t pageSize = cache->pageSize;
uintptr_t addr = (uintptr_t)data & -pageSize;
@@ -198,6 +202,10 @@ ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
struct ncclReg* reg = (struct ncclReg*)handle;
struct ncclRegCache* cache = &comm->regCache;
int slot;
int saveDev;
if (handle == NULL) goto exit;
CUDACHECK(cudaGetDevice(&saveDev));
CUDACHECK(cudaSetDevice(comm->cudaDev));
for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
if (slot == cache->population) {
WARN("Deregister: Could not find handle");
@@ -210,10 +218,19 @@ ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
reg->regAddr = (CUdeviceptr)NULL;
}
if (reg->state & COLLNET_REG_COMPLETE) {
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
}
if (reg->state & IPC_REG_COMPLETE) {
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
if (reg->ipcInfos[i])
NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
}
free(reg);
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
cache->population -= 1;
CUDACHECK(cudaSetDevice(saveDev));
exit:
return ncclSuccess;
}
+78 -54
Melihat File
@@ -40,7 +40,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclTransport *transport = ncclTransports[t];
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo));
if (ret) {
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
@@ -87,20 +87,43 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
#include <sys/time.h>
// Examines every unordered pair of local ranks and reports two properties:
//   *intraNodeP2pSupport - true unless there is exactly one local rank or
//                          ncclTransports[0] cannot connect some examined pair
//   *directMode          - true when some examined pair shares both hostHash
//                          and pidHash
// If a canConnect call fails, its error is returned and neither output is written.
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) {
  // With a single local rank there is no pair to examine, so the loops below
  // are a no-op and support is reported as false.
  bool p2pEverywhere = (comm->localRanks != 1);
  bool sharedProcess = false;

  for (int a = 0; a < comm->localRanks; ++a) {
    for (int b = a + 1; b < comm->localRanks; ++b) {
      struct ncclPeerInfo* first = comm->peerInfo + comm->localRankToRank[a];
      struct ncclPeerInfo* second = comm->peerInfo + comm->localRankToRank[b];

      int reachable = 0;
      NCCLCHECK(ncclTransports[0]->canConnect(&reachable, comm, NULL, first, second));
      if (!reachable) p2pEverywhere = false;

      if (first->pidHash == second->pidHash && first->hostHash == second->hostHash) {
        sharedProcess = true;
      }

      // Both answers are final at this point; this leaves the inner loop only.
      if (sharedProcess && !p2pEverywhere) break;
    }
  }

  *directMode = sharedProcess;
  *intraNodeP2pSupport = p2pEverywhere;
  return ncclSuccess;
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/, bool* needsProxy/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_UNDEFINED; // track highest transport type
bool needsProxyResult = false;
struct ncclConnect** data; // Store intermediate send/recvData structs for connect
struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel
struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
int done = 0;
int maxPeers = ncclParamConnectRoundMaxPeers();
NCCLCHECK(ncclCalloc(&data, maxPeers));
NCCLCHECK(ncclCalloc(&recvData, maxPeers));
NCCLCHECK(ncclCalloc(&sendData, maxPeers));
struct timeval timeStart, timeLast;
gettimeofday(&timeStart, NULL);
@@ -110,6 +133,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int count = 0;
int num = MAXCHANNELS/64;
NCCLCHECK(ncclCalloc(&data, maxPeers));
NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
@@ -135,7 +162,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
}
//if ((recvMask.masks[0]) || (sendMask.masks[0])) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
if (count) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
if (count) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
recvData[p] = data[p];
int sendChannels = 0, recvChannels = 0;
@@ -189,8 +216,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int sendPeer = (comm->rank + j) % comm->nRanks;
/*uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];*/
struct channelMasks recvMask = comm->connectRecv[recvPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)];
struct channelMasks sendMask = comm->connectSend[sendPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)];
struct channelMasks recvMask = comm->connectRecv[recvPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)];
struct channelMasks sendMask = comm->connectSend[sendPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)];
int p = j-(done+1);
int sendDataOffset = 0;
@@ -198,11 +225,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
for (int c=0; c<MAXCHANNELS; c++) {
TIME_START(3);
//if (sendMask & (1UL<<c)) {
if (sendMask.masks[c/64] & (1UL<<(c%64))) {
if (sendMask.masks[c/64] & (1UL<<(c%64))) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
@@ -211,17 +238,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
allChannelsConnected = false;
}
}
sendDataOffset++;
}
TIME_STOP(3);
// Start with recv channels
TIME_START(4);
//if (recvMask & (1UL<<c)) {
if (recvMask.masks[c/64] & (1UL<<(c%64))) {
if (recvMask.masks[c/64] & (1UL<<(c%64))) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
@@ -230,33 +258,34 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
allChannelsConnected = false;
}
}
recvDataOffset++;
}
TIME_STOP(4);
}
count = 0;
count = 0;
for (int j = 0; j < num; j++) {
if ((recvMask.masks[j]) || (sendMask.masks[j])) {
count++;
}
}
//if (sendMask.masks[0] || recvMask.masks[0]) {
if (count) {
if (count) {
free(data[p]);
data[p] = NULL;
}
}
if (ncclParamReportConnectProgress() && comm->rank == 0) {
if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
struct timeval now;
gettimeofday(&now, NULL);
if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6;
float remaining = elapsed*(comm->nRanks-done)/done;
float remaining = elapsed*(comm->nRanks-done)/done;
printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d ",
timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60);
fflush(stdout);
timeReported = true;
timeLast = now; // struct copy;
timeLast = now; // struct copy;
}
}
}
@@ -273,50 +302,47 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (elapsed > 1.0) INFO(NCCL_PROFILE, "timings: rank %d nranks %d P2p connect done in %.2f", comm->rank, comm->nRanks, elapsed);
if (timeReported) {
printf("\rP2p connect done in %d:%02d \n",
((int)elapsed)/60, ((int)elapsed)%60);
((int)elapsed)/60, ((int)elapsed)%60);
fflush(stdout);
}
}
/* We need to sync ranks here since some ranks might run too fast after connection setup
* and start to destroy the connection after returning from this function; however, the
* others might still be trying to connect and import the buffer. No sync can lead to invalid
* shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */
* and start to destroy the connection after returning from this function; however, the
* others might still be trying to connect and import the buffer. No sync can lead to invalid
* shmem/cuda buffer. In addition, we also clear all connect masks and free each connectInfo array */
for (int i = 1; i < comm->nRanks; i++) {
int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
int flag = 0;
for (int j = 0; j < MAXCHANNELS/64; j++) {
if (recvPeer != sendPeer) {
if (comm->connectSend[sendPeer].masks[j] != 0UL)
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectRecv[recvPeer].masks[j] != 0UL)
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectSend[sendPeer].masks[j] != 0UL)
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectRecv[recvPeer].masks[j] != 0UL)
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectSend[sendPeer].masks[j] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectRecv[recvPeer].masks[j] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectSend[sendPeer].masks[j] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectRecv[recvPeer].masks[j] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
} else {
if (comm->connectSend[sendPeer].masks[j] != 0UL || comm->connectRecv[recvPeer].masks[j] != 0UL) {
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
}
}
comm->connectRecv[recvPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)].masks[j] = comm->connectSend[sendPeer+CHANNEL_MASK_OFFSET(comm->nRanks, connIndex)].masks[j] = 0UL;
}
}
free(data);
free(sendData);
free(recvData);
if (highestTransportType != NULL) *highestTransportType = highestType;
if (needsProxy != NULL) *needsProxy = needsProxyResult;
TIME_PRINT("P2P Setup/Connect");
exit:
for(int i=0; i<maxPeers; ++i){
if(data[i]) free(data[i]);
}
free(data);
if (sendData) free(sendData);
if (recvData) free(recvData);
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
return ret;
@@ -328,8 +354,8 @@ extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
int fail = 1;
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int nMasters = comm->nNodes;
@@ -350,24 +376,23 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
ncclResult_t res;
struct ncclConnect myConnect = { 0 };
struct {
int isMaster;
ncclConnect connect;
} *allConnects = NULL;
ncclConnect *masterConnects = NULL;
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == collNetRecv) { // recv side: AllGather
// all ranks must participate
NCCLCHECK(ncclCalloc(&allConnects, nranks));
NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), ret, cleanup);
allConnects[rank].isMaster = isMaster;
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), ret, cleanup);
// consolidate
int c = 0;
for (int r = 0; r < nranks; r++) {
@@ -381,21 +406,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// connect
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), ret, cleanup);
struct ncclDevChannelPeer* devRoot;
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), ret, cleanup);
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), ret, cleanup);
}
if (isMaster && type == collNetRecv) {
memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
}
fail = 0;
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
return fail;
return ret != ncclSuccess;
}
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
+11 -28
Melihat File
@@ -19,15 +19,15 @@ int64_t ncclParamGdrCopySyncEnable();
int64_t ncclParamGdrCopyFlushEnable();
// Connect information for the CollNet receive side. recvSetup obtains a
// pointer to this struct by casting the generic `struct ncclConnect*`
// connectInfo buffer it is given.
struct collNetRecvConnectInfo {
int rank;
int nranks;
collNetHandle_t collNetHandle;
};
// Compile-time guard: the struct must not exceed CONNECT_SIZE bytes.
static_assert(sizeof(collNetRecvConnectInfo) <= CONNECT_SIZE, "Collnet Recv Connect info is too large");
// Connect information for the CollNet send side. sendProxyConnect reads it
// from the connectInfos array of its request (entry args->rank).
struct collNetSendConnectInfo {
// One handle slot per protocol (NCCL_NUM_PROTOCOLS entries).
void* mhandles[NCCL_NUM_PROTOCOLS];
void* reqFifo;
};
// Compile-time guard: the struct must not exceed CONNECT_SIZE bytes.
static_assert(sizeof(collNetSendConnectInfo) <= CONNECT_SIZE, "Collnet Send Connect info is too large");
#define COLLNET_GROUP_NSUBS 8
#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
@@ -138,7 +138,7 @@ struct recvResources {
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
@@ -157,15 +157,14 @@ struct setupReq {
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
int proxyRank;
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -178,7 +177,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
int proxyRank;
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
@@ -187,8 +186,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
static_assert(sizeof(collNetRecvConnectInfo) <= sizeof(struct ncclConnect), "Collnet Recv Connect info is too big");
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
@@ -447,6 +446,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big");
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
@@ -1045,7 +1045,7 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
if (handle) {
regRecord->state |= COLLNET_REG_COMPLETE;
regRecord->proxyconn = proxyconn;
regRecord->collnetProxyconn = proxyconn;
*outHandle = regRecord->collnetHandle = handle;
*outRegBufFlag = 1;
}
@@ -1097,7 +1097,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
record->size = buffSize;
*outHandle = record->mhandle = handle;
*outRegBufFlag = 1;
ncclIntruQueueEnqueue(cleanupQueue, &record->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
*nCleanupQueueElts += 1;
exit:
@@ -1220,23 +1220,6 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) {
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED };
comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
for (int i = 0; i < comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
}
if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType)
comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType;
} else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) {
// reuse previous shared intraHighestTransportType
comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType;
}
INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);
exit:
+23
Melihat File
@@ -34,3 +34,26 @@ exit:
fail:
goto exit;
}
/*
 * Connect this rank to the peers located at power-of-two rank distances
 * (1, 2, 4, ... while the distance stays below comm->nRanks).
 *
 * For each distance the function registers, on every channel, one receive
 * peer and one send peer, runs ncclTransportP2pSetup on the
 * NCCL_ALGO_TREE graph, then registers the same two peers with the roles
 * swapped and runs the setup again.
 *
 * Does nothing and returns ncclSuccess when comm is NULL or holds a single
 * rank. Otherwise returns ncclSuccess, or the result of the first failing
 * call.
 */
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;

  // Nothing to connect without a communicator or without any peer.
  if (comm == NULL || comm->nRanks <= 1) return ret;

  for (int distance = 1; distance < comm->nRanks; distance <<= 1) {
    // Peers 'distance' ranks away in each direction, wrapping around the ring of ranks.
    int peerAhead = (comm->rank + distance) % comm->nRanks;
    int peerBehind = (comm->rank + comm->nRanks - distance) % comm->nRanks;

    // ReduceScatter direction
    for (int ch = 0; ch < comm->nChannels; ch++) {
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &peerAhead, 1, &peerBehind, 0), ret, fail);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);

    // AllGather direction: same peers, roles swapped
    for (int ch = 0; ch < comm->nChannels; ch++) {
      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, ch, 1, &peerBehind, 1, &peerAhead, 0), ret, fail);
    }
    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
  }
  INFO(NCCL_INIT, "Connected binomial trees");

exit:
  return ret;
fail:
  goto exit;
}
+118 -105
Melihat File
@@ -12,10 +12,11 @@
#include "proxy.h"
#include "collectives.h"
#include "gdrwrap.h"
#include "shm.h"
#include "shmutils.h"
#include "p2p.h"
#include "profiler.h"
#include "transport.h"
#include "shm.h"
#include "graph.h"
#include "graph/topo.h"
#if defined(ENABLE_NPKIT)
@@ -47,7 +48,7 @@ static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too
#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
(NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
(mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
(mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
(((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
@@ -72,9 +73,8 @@ struct connectMapMem{
char* cpuPtr;
int size;
ncclIpcDesc ipcDesc;
char shmPath[PATH_MAX];
ncclShmHandle_t attachHandle;
ncclShmHandle_t createHandle;
ncclShmIpcDesc_t attachDesc;
ncclShmIpcDesc_t createDesc;
};
struct connectMap {
@@ -154,13 +154,13 @@ struct recvNetResources {
};
/* Determine if two peers can communicate with NET */
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#else
if (info1->hostHash == info2->hostHash) {
// If on the same host, check intra-node net is not disabled.
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, ret));
}
#endif
return ncclSuccess;
@@ -190,12 +190,11 @@ struct setupReq {
static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args);
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int tpProxyRank;
send->conn.shared = req.shared = (graph || mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
send->conn.shared = req.shared = (graph || connIndex == 0 || mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
req.curr_hdp_reg = 0;
@@ -212,8 +211,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
send->conn.curr_hdp_reg = req.curr_hdp_reg;
}
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
@@ -226,7 +224,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm->ncclNet->name, req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
}
*((int*)connectInfo) = tpProxyRank;
*((int*)connectInfo) = comm->topParentRanks[proxyRank];
return ncclSuccess;
}
@@ -239,13 +237,13 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
recv->conn.shared = req.shared = (graph || mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
recv->conn.shared = req.shared = (graph || connIndex == 0 || mscclIsCaller()) ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
req.netDev = -1;
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank = myInfo->rank, tpProxyRank;
int proxyRank = myInfo->rank;
int64_t netId;
if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netId, &req.netDev));
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
@@ -259,8 +257,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
}
// We don't support PXN on receive yet
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
@@ -271,26 +268,24 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
return ncclSuccess;
}
static ncclResult_t netMapShm(struct connectMapMem* mem) {
mem->cpuPtr = NULL;
mem->gpuPtr = NULL;
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle));
static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) {
NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc));
return ncclSuccess;
}
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle));
static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) {
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr));
return ncclSuccess;
}
static ncclResult_t netDumpMap(struct connectMap* map) {
printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_DEVMEM;
printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
@@ -361,23 +356,23 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
}
}
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL;
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_DEVMEM].size,
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
map->mems[NCCL_NET_MAP_DEVMEM].size,
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
}
if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
if (*sharedDevMemPtr == NULL) {
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL;
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
sharedDevMemPtr));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
sharedDevMemPtr));
}
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
@@ -435,7 +430,7 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne
// Use recv connector as unique identifier
opId = recv;
INFO(NCCL_PROXY, "recvConnect ncclProxyCallAsync opId=%p &recv->proxyConn=%p connectInfo=%p",
opId, &recv->proxyConn, connectInfo);
opId, &recv->proxyConn, connectInfo);
netRecvConnectArgs args = {0};
args.proxyRank = *((int*)connectInfo);
NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId));
@@ -496,24 +491,19 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
if (map) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
if (map->sameProcess && map->cudaDev == cudaDev) {
// Our own GPU, so it wasn't mapped in
free(map);
return ncclSuccess;
}
if (!map->sameProcess || ncclCuMemEnable()) {
if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
if (map->cudaDev != cudaDev && map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
if (!map->sameProcess) {
NCCLCHECK(ncclShmIpcClose(&map->mems[NCCL_NET_MAP_HOSTMEM].attachDesc));
}
free(map);
}
@@ -552,7 +542,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
if (cuda && state->cudaBuff == NULL) {
if (sameProcess == 0 || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, 0, &state->ipcDesc, (void**)&state->cudaBuff));
} else {
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
@@ -572,7 +562,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
}
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL;
if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
return ncclSuccess;
}
@@ -588,7 +578,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan
static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
if (peer == NULL) NCCLCHECK(ncclInternalError;)
if (peer == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
if (state->size == 0) NCCLCHECK(ncclInternalError);
if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
@@ -811,8 +801,8 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (resources->shared == 0) {
if (!map->sameProcess || ncclCuMemEnable()) {
ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
@@ -834,7 +824,11 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
} else {
NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM));
void* sendMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
void* recvMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
memset(sendMem, 0, sizeof(struct ncclSendMem));
memset(recvMem, 0, sizeof(struct ncclRecvMem));
}
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
@@ -999,8 +993,8 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
if (ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
#if defined(HIP_UNCACHED_MEMORY)
#if defined(HIP_CONTIGUOUS_MEMORY)
@@ -1094,7 +1088,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
if (resources->map.sameProcess) {
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc));
}
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
@@ -1183,7 +1177,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
// Set step base for next op
resources->step = sub->base + sub->nsteps;
sub->posted = sub->transmitted = sub->done = 0;
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
ncclProfilerStartSendProxyOpEvent(s, args);
if (sub->reg && sub->nbytes > 0) {
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
} else {
@@ -1206,6 +1200,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
// Post buffers to the GPU
if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (resources->shared) {
if (!sub->reg) {
@@ -1221,9 +1216,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
} else sub->posted += args->sliceSteps;
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
}
ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
args->idle = 0;
continue;
}
@@ -1272,7 +1266,11 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
args->hdp_flushed = *recvTail;
*resources->curr_hdp_reg = 1;
}
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
// Data is ready, try to send.
// Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
// since size is a plain integer.
// coverity[use_invalid:FALSE]
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
@@ -1294,7 +1292,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
sub->transmitted += args->sliceSteps;
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
sub->transSize += size;
args->idle = 0;
continue;
}
@@ -1354,48 +1354,52 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
g_npkit_net_poll_cnt = 0;
#endif
#endif
if (sub->reg) {
if (size < sub->nbytes) {
sub->recvbuff += size;
sub->nbytes -= size;
// Do one more step (at least)
sub->nsteps++;
} else {
// Signal the GPU the send is complete and it can return.
connFifo[sub->base%NCCL_STEPS].size = -1;
}
if (sub->reg) {
if (size < sub->nbytes) {
sub->recvbuff += size;
sub->nbytes -= size;
// Do one more step (at least)
sub->nsteps++;
} else {
// Signal the GPU the send is complete and it can return.
connFifo[sub->base%NCCL_STEPS].size = -1;
}
// Make sure size is reset to -1 before we update the head.
if (sub->reg == 0) connFifo[buffSlot].size = -1;
__sync_synchronize();
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
}
// Make sure size is reset to -1 before we update the head.
if (sub->reg == 0) connFifo[buffSlot].size = -1;
__sync_synchronize();
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
if (resources->shared == 0) {
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
if (sub->reg) {
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
} else {
*sendHead = sub->base + sub->done;
}
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
if (resources->shared == 0) {
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
if (sub->reg) {
// We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.
if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps;
} else {
*sendHead = sub->base + sub->done;
}
args->idle = 0;
if (sub->done == sub->nsteps) {
if (sub->reg && sub->nbytes > 0) {
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
}
args->done++;
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
}
args->idle = 0;
if (sub->done == sub->nsteps) {
if (sub->reg && sub->nbytes > 0) {
NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle));
}
args->done++;
}
}
}
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
if (args->done == args->nsubs) {
for (int s=0; s<args->nsubs; s++) {
ncclProfilerStopProxyOpEvent(s, args);
}
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
}
@@ -1439,7 +1443,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
resources->step = sub->base + sub->nsteps;
sub->posted = sub->received = sub->transmitted = sub->done = 0;
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
ncclProfilerStartRecvProxyOpEvent(s, args);
if (sub->reg && sub->nbytes > 0) {
// Register buffer
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
@@ -1464,6 +1468,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
struct ncclProxySubArgs* sub = subGroup + i;
if (sub->posted < sub->nsteps) {
if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
if (sub->reg) maxDepth = 1;
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
@@ -1520,7 +1525,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
#endif
sub->posted += args->sliceSteps;
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
}
args->idle = 0;
}
@@ -1579,7 +1585,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
}
sub->received += args->sliceSteps;
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
sub->transSize += sizes[i];
ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
if (step < sub->nsteps) {
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
if (resources->useGdr) needFlush |= resources->needFlush;
@@ -1654,7 +1662,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
struct ncclProxySubArgs* sub = subGroup + i;
sub->transmitted += args->sliceSteps;
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
if (step < sub->nsteps) {
__sync_synchronize();
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1692,7 +1701,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
}
sub->done += args->sliceSteps;
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
args->idle = 0;
if (sub->done == sub->nsteps) {
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1708,6 +1718,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
for (int s=0; s<args->nsubs; s++) {
ncclProfilerStopProxyOpEvent(s, args);
}
}
}
return ncclSuccess;
+273 -133
Melihat File
@@ -52,6 +52,11 @@ struct alignas(64) ncclIbMergedDev {
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
int speed;
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no
};
// Error statistics embedded in IB device and communicator structures.
struct ncclIbStats {
// Count of fatal errors recorded so far; written and read through the
// __atomic builtins by the ncclIbStats* helpers.
int fatalErrorCount;
};
static int ncclNIbDevs = -1;
@@ -72,6 +77,7 @@ struct alignas(64) ncclIbDev {
struct ncclIbMrCache mrCache;
int ar; // ADAPTIVE_ROUTING
struct ibv_port_attr portAttr;
struct ncclIbStats stats;
};
#define MAX_IB_DEVS 32
@@ -83,7 +89,7 @@ static int ncclIbRelaxedOrderingEnabled = 0;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1);
NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
@@ -93,6 +99,32 @@ NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0);
NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
// Resets the fatal-error counter of `stat` to zero.
// Always returns ncclSuccess.
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
  int* fatalCounter = &stat->fatalErrorCount;
  // Relaxed ordering: only the counter value itself is published here.
  __atomic_store_n(fatalCounter, 0, __ATOMIC_RELAXED);
  return ncclSuccess;
}
// Records one more fatal error on `stat` by atomically bumping its counter.
static void ncclIbStatsFatalError(struct ncclIbStats* stat){
  int* fatalCounter = &stat->fatalErrorCount;
  // The previous value is not needed; only the increment matters.
  (void)__atomic_fetch_add(fatalCounter, 1, __ATOMIC_RELAXED);
}
// Reports whether a fatal error has been recorded on `stat`.
//   stat     : statistics block whose fatalErrorCount is inspected
//   funcName : name of the calling function, included in the warning
// Returns ncclSystemError (after logging a warning) when the IbAsyncEvents
// parameter is non-zero and the counter is non-zero; ncclSuccess otherwise.
static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) {
  // When the parameter is zero the counter is not even read.
  if (!ncclParamIbAsyncEvents()) {
    return ncclSuccess;
  }
  if (__atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED) == 0) {
    return ncclSuccess;
  }
  // NOTE(review): this message ends with '\n', unlike the other WARN calls
  // visible in this file -- confirm whether the extra newline is intended.
  WARN("communicator encountered a fatal error (detected in %s)\n", funcName);
  return ncclSystemError;
}
// Records a fatal error against the statistics block referenced by the
// queue pair's user context.
// NOTE(review): assumes qp->qp_context was set to a struct ncclIbStats* when
// the QP was created -- confirm against the ncclIbCreateQp callers.
static void ncclIbQpFatalError(struct ibv_qp* qp) {
  struct ncclIbStats* qpStats = (struct ncclIbStats*)qp->qp_context;
  ncclIbStatsFatalError(qpStats);
}
// Records a fatal error against the statistics block referenced by the
// completion queue's user context.
// NOTE(review): assumes cq->cq_context was set to a struct ncclIbStats* when
// the CQ was created -- confirm against the ncclIbInitCommDevBase callers.
static void ncclIbCqFatalError(struct ibv_cq* cq) {
  struct ncclIbStats* cqStats = (struct ncclIbStats*)cq->cq_context;
  ncclIbStatsFatalError(cqStats);
}
// Records a fatal error in the statistics block embedded in the device.
static void ncclIbDevFatalError(struct ncclIbDev* dev) {
  struct ncclIbStats* devStats = &dev->stats;
  ncclIbStatsFatalError(devStats);
}
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -101,9 +133,53 @@ static void* ncclIbAsyncThreadMain(void* args) {
struct ibv_async_event event;
if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; }
char *str;
struct ibv_cq* cq = event.element.cq; // only valid if CQ error
struct ibv_qp* qp = event.element.qp; // only valid if QP error
struct ibv_srq* srq = event.element.srq; // only valid if SRQ error
if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
if (event.event_type != IBV_EVENT_COMM_EST)
WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str);
switch (event.event_type) {
case IBV_EVENT_DEVICE_FATAL:
// the above is device fatal error
WARN("NET/IB : %s:%d async fatal event: %s", dev->devName, dev->portNum, str);
ncclIbDevFatalError(dev);
break;
case IBV_EVENT_CQ_ERR:
// the above is a CQ fatal error
WARN("NET/IB : %s:%d async fatal event on CQ (%p): %s", dev->devName, dev->portNum, cq, str);
ncclIbCqFatalError(cq);
break;
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_QP_REQ_ERR:
case IBV_EVENT_QP_ACCESS_ERR:
// the above are QP fatal errors
WARN("NET/IB : %s:%d async fatal event on QP (%p): %s", dev->devName, dev->portNum, qp, str);
ncclIbQpFatalError(qp);
break;
case IBV_EVENT_SRQ_ERR:
// SRQ are not used in NCCL
WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str);
break;
case IBV_EVENT_PATH_MIG_ERR:
case IBV_EVENT_PORT_ERR:
case IBV_EVENT_PATH_MIG:
case IBV_EVENT_PORT_ACTIVE:
case IBV_EVENT_SQ_DRAINED:
case IBV_EVENT_LID_CHANGE:
case IBV_EVENT_PKEY_CHANGE:
case IBV_EVENT_SM_CHANGE:
case IBV_EVENT_QP_LAST_WQE_REACHED:
case IBV_EVENT_CLIENT_REREGISTER:
case IBV_EVENT_SRQ_LIMIT_REACHED:
// the above are non-fatal
WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str);
break;
case IBV_EVENT_COMM_EST:
break;
default:
WARN("NET/IB : %s:%d unknown event type (%d)", dev->devName, dev->portNum, event.event_type);
break;
}
// acknowledgment needs to happen last to avoid use-after-free
if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
}
return NULL;
@@ -143,11 +219,11 @@ static void* envIbAddrRange(sa_family_t af, int* mask) {
char addrString[128] = { 0 };
snprintf(addrString, 128, "%s", env);
char *addrStrPtr = addrString;
char *maskStrPtr = strstr(addrString, "/") + 1;
char *maskStrPtr = strstr(addrString, "/");
if (NULL == maskStrPtr) {
return NULL;
}
*(maskStrPtr - 1) = '\0';
*(maskStrPtr++) = '\0';
if (inet_pton(af, addrStrPtr, ret) == 0) {
WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
@@ -245,12 +321,14 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum,
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1) {
WARN("NET/IB: open failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
return ncclSystemError;
}
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1) {
WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
return ncclSystemError;
}
@@ -423,7 +501,7 @@ int ncclIbFindMatchingDev(int dev) {
}
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclResult_t ret;
ncclResult_t ret = ncclSuccess;
if (ncclParamIbDisable()) return ncclInternalError;
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
@@ -503,11 +581,12 @@ build_ib_list:
ncclIbDevs[ncclNIbDevs].pdRefs = 0;
ncclIbDevs[ncclNIbDevs].pd = NULL;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
// Enable ADAPTIVE_ROUTING by default on IB networks
// But allow it to be overloaded by an env parameter
@@ -517,9 +596,9 @@ build_ib_list:
TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs);
PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
int mergedDev = ncclNMergedIbDevs;
if (mergeNics) {
@@ -594,15 +673,16 @@ build_ib_list:
line[2047] = '\0';
char addrline[SOCKET_NAME_MAXLEN+1];
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
}
pthread_mutex_unlock(&ncclIbLock);
}
return ncclSuccess;
exit:
return ret;
fail:
if(ncclSuccess != wrap_ibv_free_device_list(devices)){WARN("NET/IB : Unable to free device list");}
pthread_mutex_unlock(&ncclIbLock);
return ret;
goto exit;
}
ncclResult_t ncclIbDevices(int* ndev) {
@@ -618,19 +698,19 @@ RCCL_PARAM(ForceEnableGdrdma, "FORCE_ENABLE_GDRDMA", -1);
// Returns :
// ncclSuccess : GDR works
// ncclSystemError : no module or module loaded but not supported by GPU
ncclResult_t ncclIbGdrSupport() {
static int moduleLoaded = -1;
#define KNL_MODULE_LOADED(a) ((access(a, F_OK) == -1) ? 0 : 1)
static int ncclIbGdrModuleLoaded = 0; // 1 = true, 0 = false
static void ibGdrSupportInitOnce() {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
if (rcclParamForceEnableGdrdma() == 1) {
// RCCL_FORCE_ENABLE_GDRDMA=1 enables GPU-NIC RDMA only from RCCL-side
// Requires support from NIC driver modules
// Use ONLY for debugging!
moduleLoaded = 1;
INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
ncclIbGdrModuleLoaded = 1;
INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting ncclIbGdrModuleLoaded = 1");
}
if (moduleLoaded == -1) {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
if (ncclIbGdrModuleLoaded == 0) {
// Check for `memory_peers` directory containing `amdkfd/version`
// This `memory_peers` directory is created by NIC-GPU driver interaction
// On Linux kernel 5.15.0 (e.g. Ubuntu 22.04), `memory_peers` is created under `/sys/kernel/mm/`
@@ -645,25 +725,25 @@ ncclResult_t ncclIbGdrSupport() {
while (memory_peers_paths[i]) {
if (access(memory_peers_paths[i], F_OK) == 0) {
moduleLoaded = 1;
ncclIbGdrModuleLoaded = 1;
INFO(NCCL_INIT,"Found %s", memory_peers_paths[i]);
break;
} else {
moduleLoaded = 0;
ncclIbGdrModuleLoaded = 0;
}
++i;
}
char strValue[MAX_STR_LEN];
NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue);
if (strncmp("Hyper-V UEFI Release", strValue, 20) == 0) {
int roMode = ncclParamIbPciRelaxedOrdering();
NCCLCHECK(ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue));
ncclTopoGetStrFromSys("/proc/sys/kernel", "numa_balancing", strValue);
if (strcmp(strValue, "1") == 0 && roMode == 0)
moduleLoaded = 0;
ncclIbGdrModuleLoaded = 0;
}
if (moduleLoaded == 0) {
if (ncclIbGdrModuleLoaded == 0) {
// Check for `ib_register_peer_memory_client` symbol in `/proc/kallsyms`
// if your system uses native OS ib_peer module
char buf[256];
@@ -676,56 +756,74 @@ ncclResult_t ncclIbGdrSupport() {
while (fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, "t ib_register_peer_memory_client") != NULL ||
strstr(buf, "T ib_register_peer_memory_client") != NULL) {
moduleLoaded = 1;
ncclIbGdrModuleLoaded = 1;
INFO(NCCL_INIT,"Found ib_register_peer_memory_client in /proc/kallsyms");
break;
}
}
}
}
}
#else
// Check for the nv_peer_mem module being loaded
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
// Also support the new nv_mem_nc module
(access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 0 : 1;
// Check for the nv_peer_mem module being loaded
ncclIbGdrModuleLoaded = KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem/version") ||
KNL_MODULE_LOADED("/sys/kernel/mm/memory_peers/nv_mem_nc/version") ||
KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version");
#endif
}
if (moduleLoaded == 0) {
INFO(NCCL_INIT,"GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol");
}
ncclResult_t ncclIbGdrSupport() {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, ibGdrSupportInitOnce);
if (!ncclIbGdrModuleLoaded)
return ncclSystemError;
}
return ncclSuccess;
}
static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
// Probes DMA-BUF registration support for the merged device selected by the
// thread-local ibDmaSupportInitDev and stores the verdict in that merged
// device's dmaBufSupported field: 1 when every underlying device passes the
// probe, -1 when any probe or setup step fails. Takes no arguments and
// returns nothing; the input comes from ibDmaSupportInitDev.
static void ibDmaBufSupportInitOnce(){
ncclResult_t res;
// Select the merged device this thread asked to initialize
struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev;
// Test each real device behind the merged device
int dev_fail = 0;
NCCLCHECKGOTO(rocmLibraryInit(), res, failure);
for (int i = 0; i < mergedDev->ndevs; i++) {
int ibDev = mergedDev->devs[i];
struct ibv_pd* pd;
struct ibv_context* ctx = ncclIbDevs[ibDev].context;
// A temporary protection domain is allocated only for the probe below
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
// Test kernel DMA-BUF support with a dummy call (fd=-1)
(void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
// errno is inspected right after the probe, before any other call is made
dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
// stop the search and goto failure
if (dev_fail) goto failure;
}
// Every device passed the probe
mergedDev->dmaBufSupported = 1;
return;
failure:
// Reached on an unsupported device or when any checked call above fails
mergedDev->dmaBufSupported = -1;
return;
}
// Detect whether DMA-BUF support is present in the kernel
// Returns :
// ncclSuccess : DMA-BUF support is available
// ncclSystemError : DMA-BUF is not supported by the kernel
ncclResult_t ncclIbDmaBufSupport(int dev) {
static int dmaBufSupported = -1;
if (dmaBufSupported == -1) {
ncclResult_t res;
NCCLCHECKGOTO(rocmLibraryInit(), res, failure);
struct ibv_pd* pd;
struct ibv_context* ctx;
struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
struct oncewrap {
pthread_once_t once = PTHREAD_ONCE_INIT;
};
static oncewrap onces[MAX_IB_DEVS];
// init the device only once
ibDmaSupportInitDev = dev;
pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
// Test each dev
for (int i = 0; i < mergedDev->ndevs; i++) {
int ibDev = mergedDev->devs[i];
ctx = ncclIbDevs[ibDev].context;
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
// Test kernel DMA-BUF support with a dummy call (fd=-1)
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
}
}
if (dmaBufSupported == 0) return ncclSystemError;
return ncclSuccess;
failure:
dmaBufSupported = 0;
int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported;
if (dmaBufSupported == 1) return ncclSuccess;
return ncclSystemError;
}
@@ -923,16 +1021,19 @@ struct alignas(32) ncclIbNetCommBase {
// Track necessary remDevInfo here
int nRemDevs;
struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
// statistics about the comm
struct ncclIbStats stats;
};
struct ncclIbSendComm {
struct ncclIbNetCommBase base;
// Start with fifo and ibv structs as they have alignment restrictions
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
// Each dev correlates to a mergedIbDev
struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
struct ncclIbRemSizesFifo remSizesFifo;
uint64_t fifoHead;
int ar; // Use adaptive routing when all merged devices have it enabled
@@ -986,8 +1087,7 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI
req->events[devIndex]++;
req->devBases[devIndex] = base;
}
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) {
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) {
base->ibDevN = ibDevN;
ncclIbDev* ibDev = ncclIbDevs + ibDevN;
pthread_mutex_lock(&ibDev->lock);
@@ -1004,8 +1104,8 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base
pthread_mutex_unlock(&ibDev->lock);
// CQ is sized to accommodate the max SQ + RQ WQE completions. If each SQ WQE could be signaled, then,
// for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ.
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 3*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
// for each QP, there can be 2*MAX_REQUESTS completions for SQ and MAX_REQUESTS completions for RQ.
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 3*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0));
return ncclSuccess;
}
@@ -1024,9 +1124,10 @@ returning:
return res;
}
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) {
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
struct ibv_qp_init_attr qpInitAttr;
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
qpInitAttr.qp_context = qp_context;
qpInitAttr.send_cq = base->cq;
qpInitAttr.recv_cq = base->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
@@ -1071,22 +1172,22 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint
} else {
// pick LID if the subnet prefixes are the same, FLID if they are not
if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) ==
ncclIbExtractLocalSubnetPrefix(info->gid.global.subnet_prefix)) {
ncclIbExtractLocalSubnetPrefix(info->gid.global.subnet_prefix)) {
qpAttr.ah_attr.is_global = 0;
qpAttr.ah_attr.dlid = info->lid;
} else {
uint16_t flid = ncclIbExtractFlid(&info->gid);
uint16_t flid = ncclIbExtractFlid(&info->gid);
if (flid == 0) {
WARN("Warning: remote FLID configured as zero even when endpoints are on different subnets, using dlid as fallback");
qpAttr.ah_attr.dlid = info->lid;
} else {
} else {
qpAttr.ah_attr.dlid = ncclIbExtractFlid(&info->gid);
}
}
qpAttr.ah_attr.is_global = 1;
qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->gid.global.subnet_prefix;
qpAttr.ah_attr.grh.dgid.global.interface_id = info->gid.global.interface_id;
qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex;
qpAttr.ah_attr.grh.hop_limit = 255;
qpAttr.ah_attr.grh.hop_limit = 255;
}
}
qpAttr.ah_attr.sl = ncclParamIbSl();
@@ -1110,6 +1211,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
}
ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
ncclResult_t ret = ncclSuccess;
struct ncclIbListenComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
@@ -1117,14 +1219,20 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
handle->magic = NCCL_SOCKET_MAGIC;
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
NCCLCHECK(ncclSocketListen(&comm->sock));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
*listenComm = comm;
return ncclSuccess;
exit:
return ret;
fail:
(void)ncclSocketClose(&comm->sock);
free(comm);
goto exit;
}
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
ncclResult_t ret = ncclSuccess;
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
struct ncclIbCommStage* stage = &handle->stage;
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
@@ -1139,16 +1247,18 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
WARN("Error: trying to connect already connected sendComm");
return ncclInternalError;
}
stage->buffer = NULL;
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
NCCLCHECKGOTO(ncclIbStatsInit(&comm->base.stats), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
stage->comm = comm;
stage->state = ncclIbCommStateConnect;
NCCLCHECK(ncclSocketConnect(&comm->base.sock));
NCCLCHECKGOTO(ncclSocketConnect(&comm->base.sock), ret, fail);
ib_connect_check:
/* since ncclSocketConnect is async, we must check if connection is complete */
NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready));
NCCLCHECKGOTO(ncclSocketReady(&comm->base.sock, &ready), ret, fail);
if (!ready) return ncclSuccess;
// IB Setup
@@ -1162,7 +1272,7 @@ ib_connect_check:
comm->ar = 1; // Set to 1 for logic
for (int i = 0; i < mergedDev->ndevs; i++) {
int ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base));
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail);
comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
}
@@ -1175,13 +1285,17 @@ ib_connect_check:
for (int q = 0; q < comm->base.nqps; q++) {
ncclIbSendCommDev* commDev = comm->devs + devIndex;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
comm->base.qps[q].devIndex = devIndex;
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
// Query ece capabilities (enhanced connection establishment)
NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
if (ncclParamIbEceEnable()) {
// Query ece capabilities (enhanced connection establishment)
NCCLCHECKGOTO(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
} else {
meta.qpInfo[q].ece_supported = 0;
}
devIndex = (devIndex + 1) % comm->base.ndevs;
}
@@ -1196,13 +1310,13 @@ ib_connect_check:
devInfo->lid = ibDev->portAttr.lid;
devInfo->ibv_dev_index = commDev->base.ibDevN;
// Prepare my fifo
NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
devInfo->fifoRkey = commDev->fifoMr->rkey;
// Pack local GID info
devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid), ret, fail);
devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id;
@@ -1214,7 +1328,7 @@ ib_connect_check:
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x",
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev",
dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid,
devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey);
devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey);
}
} else { // RoCE
for (int q = 0; q < comm->base.nqps; q++) {
@@ -1232,12 +1346,12 @@ ib_connect_check:
stage->state = ncclIbCommStateSend;
stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)));
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
memcpy(stage->buffer, &meta, sizeof(meta));
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset), ret, fail);
if (stage->offset != sizeof(meta)) return ncclSuccess;
stage->state = ncclIbCommStateConnecting;
@@ -1247,7 +1361,7 @@ ib_send:
ib_connect:
struct ncclIbConnectionMetadata remMeta;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset), ret, fail);
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
@@ -1280,7 +1394,7 @@ ib_connect:
}
for (int i=0; i < comm->base.ndevs; i++) {
NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
}
comm->base.nRemDevs = remMeta.ndevs;
@@ -1295,10 +1409,10 @@ ib_connect:
struct ibv_qp* qp = comm->base.qps[q].qp;
if (remQpInfo->ece_supported)
NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false));
NCCLCHECK(ncclIbRtsQp(qp));
NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
}
if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
@@ -1316,20 +1430,24 @@ ib_connect:
stage->offset = 0;
ib_send_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset), ret, fail);
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
stage->state = ncclIbCommStateStart;
*sendComm = comm;
return ncclSuccess;
exit:
if (stage->buffer) free(stage->buffer);
stage->state = ncclIbCommStateStart;
return ret;
fail:
free(comm);
goto exit;
}
NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1);
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
ncclResult_t ret = ncclSuccess;
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
@@ -1346,22 +1464,23 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
}
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
NCCLCHECKGOTO(ncclIbStatsInit(&rComm->base.stats), ret, fail);
stage->comm = rComm;
stage->state = ncclIbCommStateAccept;
NCCLCHECK(ncclSocketInit(&rComm->base.sock));
NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock));
NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail);
NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail);
ib_accept_check:
NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready));
NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail);
if (!ready) return ncclSuccess;
struct ncclIbConnectionMetadata remMeta;
stage->state = ncclIbCommStateRecv;
stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)));
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail);
ib_recv:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail);
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
/* copy back the received info */
@@ -1392,10 +1511,10 @@ ib_recv:
for (int i = 0; i < rComm->base.ndevs; i++) {
rCommDev = rComm->devs + i;
ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail);
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
}
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1420,23 +1539,26 @@ ib_recv:
// Local ibDevN
ibDevN = rComm->devs[devIndex].base.ibDevN;
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
qp->devIndex = devIndex;
devIndex = (devIndex + 1) % rComm->base.ndevs;
// Set the ece (enhanced connection establishment) on this QP before RTR
if (remMeta.qpInfo[q].ece_supported) {
NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
// Coverity suspects a copy-paste error below due to the use of remMeta in one argument and meta in another.
// However, this has been confirmed to be intentional.
// coverity[copy_paste_error]
NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
// Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
// Store this in our own qpInfo for returning to the requestor
if (meta.qpInfo[q].ece_supported)
NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
}
bool override_tc = (q == 0) ? true : false;
NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc));
NCCLCHECK(ncclIbRtsQp(qp->qp));
NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail);
}
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
@@ -1450,7 +1572,7 @@ ib_recv:
// Retain remote fifo info and prepare my RDMA ops
rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
rComm->remFifo.addr = remMeta.fifoAddr;
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
@@ -1458,20 +1580,20 @@ ib_recv:
if (rComm->flushEnabled) {
if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached));
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached), ret, fail);
#else
NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained));
NCCLCHECKGOTO(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained), ret, fail);
#endif
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.gpuMr, rCommDev->base.pd, rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.gpuMr, rCommDev->base.pd, rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
} else {
rCommDev->gpuFlush.gpuFlushGpuMem = nullptr;
rCommDev->gpuFlush.gpuMr = nullptr;
}
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail);
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
rCommDev->gpuFlush.sge.length = 1;
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rCommDev->gpuFlush.qp));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
struct ncclIbDevInfo devInfo;
devInfo.lid = ibDev->portAttr.lid;
devInfo.link_layer = ibDev->portAttr.link_layer;
@@ -1479,8 +1601,8 @@ ib_recv:
devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id;
devInfo.mtu = ibDev->portAttr.active_mtu;
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false));
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail);
}
// Fill Handle
@@ -1496,7 +1618,7 @@ ib_recv:
meta.devs[i].mtu = remMeta.devs[i].mtu;
// Prepare sizes fifo
NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey;
}
meta.fifoAddr = (uint64_t)rComm->sizesFifo;
@@ -1511,30 +1633,36 @@ ib_recv:
stage->state = ncclIbCommStateSend;
stage->offset = 0;
if (stage->buffer) free(stage->buffer);
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)));
if (stage->buffer) {
free(stage->buffer);
stage->buffer = NULL;
}
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)), ret, fail);
memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata));
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset), ret, fail);
if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess;
stage->offset = 0;
stage->state = ncclIbCommStatePendingReady;
ib_recv_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset), ret, fail);
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
*recvComm = rComm;
exit:
/* reset lComm stage */
if (stage->buffer) free(stage->buffer);
stage->state = ncclIbCommStateStart;
stage->offset = 0;
stage->comm = NULL;
stage->buffer = NULL;
return ncclSuccess;
return ret;
fail:
free(rComm);
goto exit;
}
ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) {
@@ -1627,16 +1755,21 @@ struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, in
/* DMA-BUF support */
// Registers `data` on every device of the communicator `comm`.
// Loops over base->ndevs, calls ncclIbRegMrDmaBufInternal once per device and stores each
// result in slot i of a heap-allocated ncclIbMrHandle, which is handed back through *mhandle.
// size, type, offset and fd are forwarded unchanged to ncclIbRegMrDmaBufInternal.
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped; the
// back-to-back NCCLCHECK / NCCLCHECKGOTO calls and the two return statements are presumably
// the removed and the added version of the same statement - confirm against the real file.
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
ncclResult_t ret = ncclSuccess;
// A zero-length registration is guarded by this assertion only.
assert(size > 0);
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
// NOTE(review): the malloc result is used below (mhandleWrapper->mrs) without a NULL check.
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
for (int i = 0; i < base->ndevs; i++) {
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i));
// Presumably stores the error code in `ret` and jumps to `fail` when the call fails.
NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail);
}
// Hand the wrapper to the caller as an opaque handle.
*mhandle = (void*) mhandleWrapper;
return ncclSuccess;
exit:
return ret;
fail:
// Only the wrapper itself is released here.
// NOTE(review): registrations made by earlier loop iterations are not undone on this
// path - confirm whether that is intended.
free(mhandleWrapper);
goto exit;
}
ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
@@ -1779,9 +1912,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
comm->wrs[r].wr.rdma.remote_addr += chunkSize;
TRACE(NCCL_VERBS, "Posted send wr_id=%lu, wr_indx=%d, qp_num=%d, src_nic=%d, dst_nic=%d, dlid=%d, opcode=%d, send_flags=%d, imm_data=%d, remote_addr=%lx, rkey=%x, length=%d, lkey=%x",
comm->wrs[r].wr_id, r, qp->qp->qp_num, comm->devs[qp->devIndex].base.ibDevN , comm->base.remDevs[qp->remDevIdx].ibv_dev_index, comm->base.remDevs[qp->remDevIdx].lid,
comm->wrs[r].opcode, comm->wrs[r].send_flags, comm->wrs[r].imm_data, comm->wrs[r].wr.rdma.remote_addr,
comm->wrs[r].wr.rdma.rkey,comm->wrs[r].sg_list ? comm->wrs[r].sg_list->length : 0, comm->wrs[r].sg_list ? comm->wrs[r].sg_list->lkey : 0);
comm->wrs[r].wr_id, r, qp->qp->qp_num, comm->devs[qp->devIndex].base.ibDevN , comm->base.remDevs[qp->remDevIdx].ibv_dev_index, comm->base.remDevs[qp->remDevIdx].lid,
comm->wrs[r].opcode, comm->wrs[r].send_flags, comm->wrs[r].imm_data, comm->wrs[r].wr.rdma.remote_addr,
comm->wrs[r].wr.rdma.rkey,comm->wrs[r].sg_list ? comm->wrs[r].sg_list->length : 0, comm->wrs[r].sg_list ? comm->wrs[r].sg_list->lkey : 0);
}
// Select the next qpIndex
@@ -1795,6 +1928,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
@@ -1964,6 +2098,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
@@ -2059,10 +2194,13 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
return ncclSuccess;
}
#define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
while (1) {
NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__));
if (r->events[0] == 0 && r->events[1] == 0) {
TRACE(NCCL_NET, "r=%p done", r);
*done = 1;
@@ -2118,7 +2256,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d",
ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
#endif
if (req->type == NCCL_NET_IB_REQ_SEND) {
if (req && req->type == NCCL_NET_IB_REQ_SEND) {
for (int j = 0; j < req->nreqs; j++) {
struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
if ((sendReq->events[i] <= 0)) {
@@ -2140,6 +2278,9 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
req->events[i]--;
}
}
// Once the IB fatal event is reported in the async thread, we want to propagate this error
// to communicator and prevent further polling to reduce error pollution.
NCCLCHECK(ncclIbStatsCheckFatalCount(&ncclIbDevs[r->devBases[i]->ibDevN].stats,__func__));
}
}
@@ -2226,5 +2367,4 @@ ncclNet_t ncclNetIb = {
ncclIbCloseListen,
NULL /* getDeviceMr */,
NULL /* irecvConsumed */
};
};
+36 -18
Melihat File
@@ -73,22 +73,27 @@ ncclResult_t ncclNetSocketDevices(int* ndev) {
}
// Reads the link speed of network interface `devName` from /sys/class/net/<devName>/speed
// and stores it in *speed. When the file cannot be opened or read, or the parsed value is
// not positive, *speed falls back to 10000 (logged as "10 Gbps").
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped; the
// two `fd` declarations, the two read() calls and the two return statements are presumably
// the removed and the added version of the same code - confirm against the real file.
static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
ncclResult_t ret = ncclSuccess;
// Start from 0 so that every failure below ends in the default-speed branch.
*speed = 0;
char speedPath[PATH_MAX];
// NOTE(review): sprintf is unbounded; this relies on devName being short enough for PATH_MAX.
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
int fd = open(speedPath, O_RDONLY);
int fd = -1;
// Presumably runs open() and stores its result in `fd` - confirm the SYSCHECKSYNC definition.
SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
if (fd != -1) {
// The literal's length bounds the read (sizeof - 1 bytes).
// NOTE(review): whitespace inside the literal may have been collapsed in this listing.
char speedStr[] = " ";
if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
int n;
// Allow this to silently fail
n = read(fd, speedStr, sizeof(speedStr)-1);
if (n > 0) {
// Base 0 lets strtol pick the radix from the text's prefix.
*speed = strtol(speedStr, NULL, 0);
}
close(fd);
}
if (*speed <= 0) {
INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
*speed = 10000;
}
return ncclSuccess;
// Close the descriptor only if the open above produced one.
if (fd != -1) SYSCHECK(close(fd), "close");
return ret;
}
ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
@@ -235,19 +240,24 @@ void* persistentSocketThread(void *args_) {
}
ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
ncclResult_t ret = ncclSuccess;
int nSocksPerThread = ncclParamSocketNsocksPerThread();
int nThreads = ncclParamSocketNthreads();
if (nThreads > MAX_THREADS) {
WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
nThreads = MAX_THREADS;
}
int fd = -1;
int nSocks;
if (nThreads == -2 || nSocksPerThread == -2) {
// Auto-detection
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
// Coverity is wrong. NULL second argument to realpath() is OK by POSIX.1-2008.
// coverity[alias_transfer:FALSE]
char* rPath = realpath(vendorPath, NULL);
int fd = open(rPath, O_RDONLY);
fd = open(rPath, O_RDONLY);
free(rPath);
if (fd == -1) {
// Could not find device vendor. This is handled silently so
@@ -257,9 +267,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
}
char vendor[7];
strncpy(vendor, "0x0000", 7);
int len;
SYSCHECKVAL(read(fd, vendor, 6), "read", len);
SYSCHECK(close(fd), "close");
SYSCHECKGOTO(read(fd, vendor, 6), "read", ret, fail);
if (strcmp(vendor, "0x1d0f") == 0) { // AWS
autoNt = 2;
autoNs = 8;
@@ -271,7 +279,7 @@ end:
if (nThreads == -2) nThreads = autoNt;
if (nSocksPerThread == -2) nSocksPerThread = autoNs;
}
int nSocks = nSocksPerThread * nThreads;
nSocks = nSocksPerThread * nThreads;
if (nSocks > MAX_SOCKETS) {
nSocksPerThread = MAX_SOCKETS/nThreads;
WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
@@ -280,28 +288,38 @@ end:
*ns = nSocks;
*nt = nThreads;
if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
return ncclSuccess;
exit:
if (fd != -1) close(fd);
return ret;
fail:
goto exit;
}
// Creates a listening socket for device index `dev` and fills the caller-provided handle.
// Steps visible here: reject a `dev` outside [0, ncclNetIfs), zero the handle, allocate a
// ncclNetSocketListenComm, initialise and listen on a socket bound to
// ncclNetSocketDevs[dev].addr, copy the socket address into handle->connectAddr, and obtain
// the socket/thread counts from ncclNetSocketGetNsockNthread, which are mirrored into the handle.
// On success the new comm is returned through *listenComm.
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped; each
// NCCLCHECK line followed by an NCCLCHECKGOTO line for the same call, and the two return
// statements, are presumably the removed and the added version - confirm against the real file.
ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
return ncclInternalError;
}
ncclResult_t ret = ncclSuccess;
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
memset(handle, 0, sizeof(struct ncclNetSocketHandle));
// The handle is written into a caller buffer, so it must fit in NCCL_NET_HANDLE_MAXSIZE.
static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
struct ncclNetSocketListenComm* comm;
// Allocation happens before any goto-guarded call, so `comm` is set whenever `fail` is reached.
NCCLCHECK(ncclCalloc(&comm, 1));
handle->magic = NCCL_SOCKET_MAGIC;
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
NCCLCHECK(ncclSocketListen(&comm->sock));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
// Presumably each NCCLCHECKGOTO stores the error in `ret` and jumps to `fail` on failure.
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
NCCLCHECKGOTO(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads), ret, fail);
// Copy the counts into the handle alongside the connect address.
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
comm->dev = dev;
*listenComm = comm;
return ncclSuccess;
exit:
return ret;
fail:
// Error cleanup: close the socket (result deliberately ignored) and release the comm.
(void)ncclSocketClose(&comm->sock);
free(comm);
goto exit;
}
ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
@@ -437,7 +455,7 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
res->comm = comm;
pthread_mutex_init(&res->threadLock, NULL);
pthread_cond_init(&res->threadCond, NULL);
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
}
struct ncclNetSocketTask* r = queue->tasks+queue->next;
@@ -482,7 +500,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->ctrlSock, &addr);
NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
ncclSocketToString(&addr, line), data, r->size);
@@ -579,7 +597,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
res->stop = 1;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
pthread_join(comm->helperThread[i], NULL);
PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join");
}
free(res->threadTaskQueue.tasks);
}
+46 -30
Melihat File
@@ -26,7 +26,7 @@ struct localRegData {
intptr_t offset;
};
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
@@ -71,28 +71,31 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
// Imports the multicast handle shared by `rank` into *mcHandle.
// The path depends on ncclCuMemHandleType:
//  - POSIX file descriptor: obtain an fd via ncclProxyClientGetFdBlocking, import it with
//    cuMemImportFromShareableHandle, then close the fd;
//  - fabric handle: import shareableHandle directly with cuMemImportFromShareableHandle;
//  - anything else: copy sizeof(CUmemGenericAllocationHandle) bytes from shareableHandle.
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped; the
// paired CHECK / CHECKGOTO calls, the two close() forms and the two return statements are
// presumably the removed and the added version of the same statement - confirm against the
// real file.
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
CUmemAllocationHandleType type = ncclCuMemHandleType;
// Function-scope fd so that the `fail` label can close it.
int fd = -1;
ncclResult_t ret = ncclSuccess;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
// NOTE(review): as listed, this inner declaration shadows the function-scope fd above.
int fd = -1;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
int tpProxyRank = comm->topParentRanks[rank];
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
// The two calls below differ in the rank argument: tpProxyRank versus rank.
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, rank, shareableHandle, &fd), ret, fail);
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
// The fd is passed by value, cast through uintptr_t to the void* the API takes.
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
(void) close(fd);
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type), ret, fail);
SYSCHECK(close(fd), "close");
} else {
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type), ret, fail);
} else {
// No import call on this path: the handle bytes are copied as-is.
memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
}
}
return ncclSuccess;
exit:
return ret;
fail:
// Close the descriptor if one was obtained before the failure.
if (fd != -1) close(fd);
goto exit;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) {
@@ -100,7 +103,7 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
if (size) CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
@@ -117,14 +120,18 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
// Release the UC memory and mapping
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
CUCHECK(cuMemRelease(*ucHandle));
if (ucptr) {
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
CUCHECK(cuMemRelease(*ucHandle));
}
// Release the MC memory and mapping
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
CUCHECK(cuMemRelease(*mcHandle));
if (mcptr) {
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
CUCHECK(cuMemRelease(*mcHandle));
}
return ncclSuccess;
}
@@ -191,7 +198,9 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
size_t size = *sizePtr;
size_t originSize = size;
size_t ucgran, mcgran;
int allocMcHandle = 0;
*ucptr = *mcptr = NULL;
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
mcprop.numDevices = comm->localRanks;
mcprop.handleTypes = ncclCuMemHandleType;
@@ -203,10 +212,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
if (comm->localRank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
allocMcHandle = 1;
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail);
allocMcHandle = 1;
}
CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail);
@@ -226,6 +237,8 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
// intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
// Bind physical memory to the Multicast group
// NB: It will block until all ranks have been added to the Group
CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
@@ -239,6 +252,7 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
exit:
return ret;
fail:
if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle));
goto exit;
}
@@ -350,10 +364,10 @@ setup:
struct ncclNvlsSharedRes* resources = NULL;
int nHeads = comm->channels[0].nvls.nHeads;
int nChannels = comm->nChannels;
size_t memSize = 16;
size_t memSize = 64;
size_t creditSize = nChannels * 2 * memSize * nHeads;
int nvlsStepSize = comm->nvlsChunkSize;
NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
comm->nvlsResources->inited = false;
comm->nvlsResources->refCount = 1;
@@ -466,7 +480,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
if (!comm->MNNVL && resources->nvlsShmemHandle)
NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));
if (resources->ucCredit && resources->mcCredit) {
if (resources->ucCredit || resources->mcCredit) {
NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
}
@@ -490,7 +504,6 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
char shareableHandle[NVLS_HANDLE_SIZE];
CUmemGenericAllocationHandle mcHandle;
size_t minSize = SIZE_MAX;
bool localRegBufUsed = false;
struct localRegData* regData = NULL;
cudaPointerAttributes attr;
size_t ucgran, mcgran;
@@ -500,7 +513,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
if (userBuff) {
NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
if (regRecord) {
CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail);
if (attr.type == cudaMemoryTypeDevice) {
size_t regSize = regRecord->pages * comm->regCache.pageSize;
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
@@ -508,7 +521,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
mcprop.handleTypes = ncclCuMemHandleType;
mcprop.flags = 0;
mcprop.size = regSize;
CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
memset(&ucprop, 0, sizeof(CUmemAllocationProp));
ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -517,7 +530,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
ucprop.requestedHandleTypes = ncclCuMemHandleType;
CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);
if (regSize % mcgran == 0) {
regRecord->regSize = regSize;
} else {
@@ -560,6 +573,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
}
CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
// Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
// (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
// coverity[var_deref_op]
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
// Create a VA for the NVLS
@@ -584,15 +600,13 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
}
}
localRegBufUsed = true;
*regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
*regUsed = true;
exit:
if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
*regUsed = localRegBufUsed;
free(regData);
return ret;
fail:
localRegBufUsed = false;
*regUsed = false;
goto exit;
}
@@ -862,19 +876,21 @@ exit:
}
if (recvRecord) {
// Yes, this is dead code. That's fine...
// coverity[dead_error_begin]
ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
free(recvRecord);
}
} else {
if (sendRecord) {
*outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
*nCleanupQueueEltsAdded += 1;
}
if (recvRecord) {
*outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
*nCleanupQueueEltsAdded += 1;
}
+509 -71
Melihat File
@@ -11,8 +11,10 @@
#include "shm.h"
#include "graph.h"
#include "graph/topo.h"
#include "shmutils.h"
#include "p2p.h"
#include "transport.h"
#include <assert.h>
// Kind of p2p connection recorded in p2pResources::type. p2pSendSetup sets
// P2P_CUMEM on the cuMem path and P2P_IPC on the legacy CUDA IPC path.
// NOTE(review): P2P_DIRECT and P2P_INTERMEDIATE are assigned outside the
// visible code — presumably same-process direct access and relay through an
// intermediate rank; confirm.
enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
@@ -22,16 +24,28 @@ struct ncclP2pBuff {
ncclIpcDesc ipcDesc;
};
// Request sent with ncclProxyMsgSetup by p2pSendSetup/p2pRecvSetup; the proxy
// setup handlers reject any request whose size is not sizeof(struct ncclP2pRequest).
struct ncclP2pRequest {
// Size of the buffer to allocate (sendSize / recvSize in the setup functions)
size_t size;
// Number of extra cuMemRetainAllocationHandle calls made by
// ncclP2pAllocateShareableBuffer. The setup functions add one for each of
// {peer, self} that shares a PID with the proxy rank but uses a different cudaDev.
int refcount;
};
// Connection information a rank fills in during p2p setup and the peer reads
// during p2p connect (cast from struct ncclConnect).
// NOTE(review): this listing is a rendered diff; shmName/shmSize and desc
// appear to be the pre- and post-change forms of the CE-memcpy field(s) —
// confirm against the merged file.
struct p2pConnectInfo {
// Rank whose buffer is described; set to the intermediate rank on the indirect path
int rank;
// Non-zero selects NCCL_IPC_READ, zero selects NCCL_IPC_WRITE on the send connector
int read;
// Buffer description returned by the proxy setup call and mapped by p2pMap
struct ncclP2pBuff p2pBuff;
// Used by CE memcpy
char shmName[7];
int shmSize;
// Shared-memory descriptor copied from the proxy info and imported by p2pRecvConnect
ncclShmIpcDesc_t desc;
};
// The struct is exchanged inside a connect buffer, so it must fit in CONNECT_SIZE
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
// Export information for a user buffer, sent to the peer's proxy with
// ncclProxyMsgRegister by ncclIpcLocalRegisterBuffer.
struct p2pIpcExpInfo {
// Exported handle: devIpc (legacy CUDA IPC), memHandle (same process), or
// cuDesc.handle (shareable cuMem handle), depending on the export path taken
ncclIpcDesc ipcDesc;
// True when the buffer was exported through legacy CUDA IPC
bool legacyIpcCap;
// File descriptor returned by ncclProxyClientQueryFdBlocking for the
// exported fd (POSIX file-descriptor handle type only)
int impFd;
// Size of the underlying allocation as reported by cuMemGetAddressRange
size_t size;
// Offset of the registered address from the allocation base address
uintptr_t offset;
};
struct p2pShm {
struct ncclSendMem sendMem;
struct ncclRecvMem recvMem;
@@ -40,9 +54,7 @@ struct p2pShmProxyInfo {
// Shared memory between proxy and receiving GPU
struct p2pShm* shm;
struct p2pShm* devShm;
char shmName[7];
int shmSize;
ncclShmHandle_t handle;
ncclShmIpcDesc_t desc;
// Intermediate step for sender
struct ncclRecvMem* ceRecvMem;
@@ -65,13 +77,16 @@ struct p2pResources {
struct ncclRecvMem* recvDevMem;
};
void* sendMemIpc;
int sendMemSameProc;
void* recvMemIpc;
int recvMemSameProc;
// CE memcpy support
struct p2pShmProxyInfo proxyInfo;
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
ncclShmHandle_t handle;
ncclShmIpcDesc_t desc;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
};
@@ -108,7 +123,7 @@ static void initCeOperation();
extern int64_t ncclParamMNNVLEnable();
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
initCeOperation();
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
if (!info1->hasFineGrain || !info2->hasFineGrain) {
@@ -118,8 +133,8 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#endif
// MNNVL support
if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
if (comm->MNNVL && info1->hostHash != info2->hostHash) {
NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
if (*ret) return ncclSuccess;
}
@@ -131,7 +146,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check topology / p2p level.
int intermediateRank;
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
if (*ret == 0) return ncclSuccess;
if (intermediateRank != -1) {
if (useMemcpy) *ret = 0;
@@ -140,7 +155,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check if NET would work better
int useNet = 0;
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
if (useNet) {
*ret = 0;
return ncclSuccess;
@@ -164,7 +179,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
cudaDev1, info1->busId, cudaDev2, info2->busId);
*ret = 0;
return ncclSuccess;
}
@@ -195,7 +210,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
if (p2p == 0) {
INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
cudaDev1, info1->busId, cudaDev2, info2->busId);
*ret = 0;
return ncclSuccess;
}
@@ -210,7 +225,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
} while (0)
// cuMem API support
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDesc *ipcDesc, void **ptr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
CUmemAllocationHandleType type = ncclCuMemHandleType;
@@ -224,6 +239,10 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
} else {
CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
}
if (refcount) {
memcpy(&ipcDesc->memHandle, &handle, sizeof(handle));
for (int r = 0; r < refcount; ++r) CUCHECK(cuMemRetainAllocationHandle(&handle, *ptr));
}
#else
return ncclInternalError;
#endif
@@ -250,7 +269,7 @@ ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
return ncclSuccess;
}
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
// cuMem API support
@@ -258,16 +277,25 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
CUmemAllocationHandleType type = ncclCuMemHandleType;
CUmemGenericAllocationHandle handle;
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
CUmemAllocationProp prop = {};
size_t granularity = 0;
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = type;
prop.location.id = comm->cudaDev;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// UDS fd support
int fd = -1;
// Send cuMem handle to remote for conversion to an fd
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, peer, &cuDesc->data, &fd));
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, peer);
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
(void) close(fd);
SYSCHECK(close(fd), "close");
} else {
CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
}
@@ -308,7 +336,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
int p2p;
// Queries the topology to see if the GPUs are Ampere and
// connected via NVLink, if so we enable P2P Read by default
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
int readEnable = ncclParamP2pReadEnable();
if (readEnable != -2) *read = readEnable;
@@ -328,24 +356,25 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
#if CUDART_VERSION >= 11030
// cuMem API support
if (ncclCuMemEnable()) {
// Allow direct access to the remote buffer from the local GPU
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = myInfo->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
}
#if CUDART_VERSION >= 11030
// for intra-process ranks, we should map memHandle of the peers to increase refcount.
// Otherwise, if peers abort and free the buffer, the rank can suffer invalid access.
NCCLCHECK(ncclCuMemAllocAddr(devMem, &p2pBuff->ipcDesc.memHandle, p2pBuff->size));
CUCHECK(cuMemRelease(p2pBuff->ipcDesc.memHandle));
*ipcPtr = *devMem;
#endif
} else {
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
}
} else {
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
}
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
// Different PID
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, peerInfo->rank, p2pBuff->size, &p2pBuff->ipcDesc, devMem));
*ipcPtr = *devMem;
}
return ncclSuccess;
@@ -355,7 +384,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct p2pResources* resources;
int tpProxyRank;
struct ncclP2pRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead, intermediateRank;
@@ -398,12 +427,12 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
resources->type = P2P_CUMEM;
const char *MNNVL = comm->MNNVL ? "MNNVL" : "CUMEM";
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%x] -> %d[%x] via P2P/CUMEM%s%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
channelId, connIndex, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, MNNVL, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);;
} else {
// Legacy CUDA IPC
resources->type = P2P_IPC;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
}
send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
@@ -412,18 +441,21 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
info->rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
comm->peerInfo[intermediateRank].busId, useReadStr, comm, comm->nRanks);
}
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
req.size = sendSize;
req.refcount = 0;
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
if (useMemcpy) {
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
memcpy(&info->desc, &resources->proxyInfo.desc, sizeof(ncclShmIpcDesc_t));
} else {
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
resources->sendMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
}
return ncclSuccess;
@@ -433,7 +465,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
struct p2pResources* resources;
int tpProxyRank;
struct ncclP2pRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead, intermediateRank;
@@ -472,11 +504,15 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
info->rank = intermediateRank;
}
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
req.size = recvSize;
req.refcount = 0;
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
resources->recvMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
return ncclSuccess;
}
@@ -487,6 +523,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
resources->recvMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -527,17 +564,14 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct ncclSendMem* remDevMem = NULL;
if (useMemcpy) {
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
// Attach to peer's SHM segment
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
} else {
NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
resources->sendMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
struct ncclRecvMem* devMem = resources->recvDevMem;
recv->conn.tail = &devMem->tail;
@@ -566,8 +600,21 @@ ncclResult_t p2pSendFree(struct ncclConnector* send) {
if (resources) {
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
if (resources->sendMemIpc) {
if (resources->sendMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
}
}
if (resources->recvMemIpc) {
if (resources->recvMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
}
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
@@ -583,14 +630,27 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
if (resources) {
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
if (resources->sendMemIpc) {
if (resources->sendMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
}
}
if (resources->recvMemIpc) {
if (resources->recvMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
}
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
NCCLCHECK(ncclShmIpcClose(&resources->desc));
}
}
free(resources);
@@ -602,6 +662,9 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
if (useMemcpy) {
// CE memcpy support
struct p2pShmProxyInfo* proxyInfo;
size_t shmSize;
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
connection->transportResources = proxyInfo;
@@ -611,24 +674,19 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained));
#endif
char shmPath[PATH_MAX];
shmPath[0] = '\0';
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
// Create a SHM segment for the peer to attach to
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
} else {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
int size = req->size;
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
@@ -645,11 +703,12 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
}
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
int size = req->size;
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
@@ -683,7 +742,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
if (useMemcpy) {
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (proxyInfo) {
NCCLCHECK(ncclShmClose(proxyInfo->handle));
NCCLCHECK(ncclShmIpcClose(&proxyInfo->desc));
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
@@ -784,11 +843,390 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
return ncclSuccess;
}
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
ncclResult_t ret = ncclSuccess;
struct ncclReg *regRecord = NULL;
struct ncclIpcRegInfo* newInfo = NULL;
uintptr_t* peerRmtAddrs = NULL;
bool legacyIpcCap = false;
size_t baseSize = 0;
void* baseAddr = NULL;
bool needUpdate = false;
*regBufFlag = 0;
*offsetOut = 0;
*peerRmtAddrsOut = NULL;
if (comm && userbuff && buffSize > 0 && nPeers > 0) {
NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
if (regRecord) {
// buffer was registered by users, we need to start to register or reuse it
int peerLocalRank;
for (int p = 0; p < nPeers; p++) {
int peerRank = peerRanks[p];
peerLocalRank = comm->rankToLocalRank[peerRank];
if (regRecord->ipcInfos[peerLocalRank]) {
// We already have IPC info for peerLocalRank, no need to register it, we can reuse it
*regBufFlag = 1;
INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
} else {
// Register buffer with peerLocalRank
struct ncclProxyConnector* proxyConn = NULL;
struct p2pIpcExpInfo ipcInfo;
if (baseAddr == NULL) {
CUDACHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
CUDACHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
}
if (comm->gproxyConn[peerRank].initialized == false)
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
proxyConn = &comm->gproxyConn[peerRank];
ipcInfo.legacyIpcCap = legacyIpcCap;
// Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
// get the CUDA legacy mem handle, or through cuMem*.
if (ipcInfo.legacyIpcCap) {
// legacy export
if (comm->directMode) goto fail;
CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
} else if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
CUmemGenericAllocationHandle handle;
if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
// if cuMem* export fails, retry legacy export
if (comm->directMode) goto fail;
CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
ipcInfo.legacyIpcCap = true;
} else {
// cuMem* export to file descriptor or fabric handle
if (proxyConn->sameProcess) {
memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
} else {
if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
int expFd = -1;
CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
SYSCHECKGOTO(close(expFd), "close", ret, fail);
} else {
// Allow this to silently fail for cases where the user buff cannot be registered
if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
goto fail;
}
}
}
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
}
#endif
} else {
// nothing works, just return
goto fail;
}
void* rmtRegAddr = NULL;
ipcInfo.size = baseSize;
ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
// Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
// and get the remote register address back.
if (proxyConn)
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
if (rmtRegAddr) {
NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
assert(regRecord->ipcInfos[peerLocalRank] == NULL);
regRecord->state |= IPC_REG_COMPLETE;
newInfo->peerRank = peerRank;
newInfo->baseAddr = baseAddr;
newInfo->impInfo.rmtRegAddr = rmtRegAddr;
newInfo->impInfo.offset = ipcInfo.offset;
newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
newInfo->ipcProxyconn = proxyConn;
regRecord->ipcInfos[peerLocalRank] = newInfo;
if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
}
regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
needUpdate = true;
*regBufFlag = 1;
INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
}
}
}
if (*regBufFlag) {
if (type == NCCL_IPC_COLLECTIVE) {
// for collective, store registered remote buffers into dev memory for future reference
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
if (needUpdate)
NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
}
peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
} else {
assert(nPeers == 1);
// p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
}
*offsetOut = (uintptr_t)userbuff - regRecord->addr;
*peerRmtAddrsOut = peerRmtAddrs;
}
}
}
exit:
return ret;
fail:
*regBufFlag = 0;
*offsetOut = 0;
*peerRmtAddrsOut = NULL;
if (newInfo) free(newInfo);
goto exit;
}
// Deferred-cleanup record for IPC buffer registrations. Records are enqueued by the
// registration code with base.fn set to cleanupIpc(), which later releases whatever the
// record describes and frees the record itself.
struct ncclIpcCleanupCallback {
// Callback header. cleanupIpc() casts the ncclCommCallback pointer straight back to this
// struct, so this member has to stay first.
struct ncclCommCallback base;
// Discriminator for the union below: true -> regIpcAddrs is valid, false -> regInfo is valid.
bool isAddrs;
union {
// A single per-peer registration; cleanupIpc() passes it to ncclIpcDeregBuffer().
struct ncclIpcRegInfo regInfo;
// Host and device arrays of peer remote addresses; cleanupIpc() frees both arrays.
struct ncclPeerRegIpcAddr regIpcAddrs;
};
};
// Cleanup callback for ncclIpcCleanupCallback records.
//
// comm: communicator the registration belongs to (forwarded to ncclIpcDeregBuffer).
// cb:   pointer to the `base` member of a heap-allocated ncclIpcCleanupCallback.
//
// When isAddrs is set the record owns the host and device peer-address arrays and both are
// freed; otherwise the record describes one peer registration which is deregistered.
// The record itself is always freed, including when the release step reports an error, so a
// failing callback does not leak it. Returns the first error encountered, or ncclSuccess.
static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
  ncclResult_t ret = ncclSuccess;
  struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
  if (obj->isAddrs) {
    // free(NULL) is a no-op, so the host array needs no guard.
    free(obj->regIpcAddrs.hostPeerRmtAddrs);
    if (obj->regIpcAddrs.devPeerRmtAddrs)
      NCCLCHECKGOTO(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs), ret, exit);
  } else {
    NCCLCHECKGOTO(ncclIpcDeregBuffer(comm, &obj->regInfo), ret, exit);
  }
exit:
  free(obj);
  return ret;
}
// Register a user buffer with one or more local peers for use inside a CUDA graph.
//
// comm               communicator performing the registration.
// userbuff, buffSize user buffer; the whole allocation containing it is exported.
// peerRanks, nPeers  ranks to register with (must be exactly one peer unless type is
//                    NCCL_IPC_COLLECTIVE).
// type               NCCL_IPC_COLLECTIVE stores every peer's remote address in a device array;
//                    otherwise the single peer's remote address is returned directly.
// regBufFlag         out: 1 if at least one peer returned a remote address, 0 otherwise.
// offsetOut          out: offset of userbuff inside its base allocation.
// peerRmtAddrsOut    out: device array of remote addresses (collective) or the remote address
//                    itself cast to uintptr_t* (p2p).
// cleanupQueuePtr    ncclIntruQueue of ncclCommCallback receiving the cleanup records for
//                    non-legacy registrations; legacy ones go to comm->legacyRegCleanupQueue.
// nCleanupQueueElts  optional counter incremented once per per-peer record queued.
//
// Paths that "goto fail" without setting ret (direct mode with legacy IPC, no usable export
// mechanism) return ncclSuccess with *regBufFlag == 0, i.e. the buffer is simply not registered.
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
  ncclResult_t ret = ncclSuccess;
  struct ncclProxyConnector* proxyConn = NULL;
  struct p2pIpcExpInfo ipcInfo;
  void* baseAddr = nullptr;
  size_t baseSize = 0;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
  uintptr_t* peerRmtAddrs = NULL;
  struct ncclIpcCleanupCallback* addrsRecord = NULL;

  *regBufFlag = 0;
  CUDACHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
  CUDACHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
  if (type == NCCL_IPC_COLLECTIVE) {
    // collective needs host memory array to hold all remote buffer addrs.
    // We need to put this into graph release queue
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
    addrsRecord->base.fn = cleanupIpc;
    addrsRecord->isAddrs = true;
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
  } else {
    assert(nPeers == 1);
    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register
    // peer one by one so nPeers must be 1
  }

  for (int p = 0; p < nPeers; ++p) {
    int peerRank = peerRanks[p];
    if (comm->gproxyConn[peerRank].initialized == false)
      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
    proxyConn = &comm->gproxyConn[peerRank];
    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through
    // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
    if (ipcInfo.legacyIpcCap) {
      if (comm->directMode) goto fail;
      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
    } else if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
      // cuMem* export
      CUmemGenericAllocationHandle handle;
      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
        // cuMem* export not possible for this allocation: fall back to the legacy handle
        if (comm->directMode) goto fail;
        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
        ipcInfo.legacyIpcCap = true;
      } else {
        if (proxyConn->sameProcess) {
          // Same process: the proxy can use the generic allocation handle as is.
          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
        } else {
          if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
            // Export to a file descriptor and have the proxy convert it to a descriptor
            // valid in its own process. (The sameProcess case was already handled above,
            // so no further sameProcess test is needed here.)
            int expFd = -1;
            CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
            NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
            SYSCHECKGOTO(close(expFd), "close", ret, fail);
          } else {
            CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
          }
        }
        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
      }
#endif
    } else {
      goto fail;
    }

    void* rmtRegAddr = NULL;
    ipcInfo.size = baseSize;
    ipcInfo.offset = 0;
    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
    if (rmtRegAddr) {
      struct ncclIpcCleanupCallback* record;
      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
      record->base.fn = cleanupIpc;
      record->isAddrs = false;
      record->regInfo.peerRank = peerRank;
      record->regInfo.baseAddr = baseAddr;
      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
      record->regInfo.impInfo.offset = 0;
      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
      record->regInfo.ipcProxyconn = proxyConn;
      // store the remote address into host addr array
      if (type == NCCL_IPC_COLLECTIVE)
        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
      else
        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
      *regBufFlag = 1;
      if (ipcInfo.legacyIpcCap)
        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
      else
        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
    }
  }

  if (type == NCCL_IPC_COLLECTIVE) {
    // allocate the dev addr array and copy all previously stored addrs into it.
    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    // Both arrays hold comm->localRanks entries, so copy exactly that many; copying
    // comm->nRanks entries would run past the end of both whenever nRanks > localRanks.
    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
    if (ipcInfo.legacyIpcCap)
      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
    else
      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
  }
  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
  *peerRmtAddrsOut = peerRmtAddrs;

exit:
  return ret;
fail:
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  if (addrsRecord) {
    // Every "goto fail" happens before addrsRecord is enqueued, so it is still owned here
    // and would otherwise leak together with its address arrays.
    if (addrsRecord->regIpcAddrs.devPeerRmtAddrs) (void)ncclCudaFree(addrsRecord->regIpcAddrs.devPeerRmtAddrs);
    free(addrsRecord->regIpcAddrs.hostPeerRmtAddrs);
    free(addrsRecord);
  }
  goto exit;
}
// Undo one IPC buffer registration: send a blocking ncclProxyMsgDeregister request, carrying
// the import info recorded at registration time, over the proxy connection stored in regInfo.
// Returns the proxy call's error if it fails, ncclSuccess otherwise.
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo) {
  struct ncclProxyConnector* proxy = regInfo->ipcProxyconn;
  void* remoteAddr = regInfo->impInfo.rmtRegAddr;

  // No response payload is expected for a deregistration.
  NCCLCHECK(ncclProxyCallBlocking(comm, proxy, ncclProxyMsgDeregister,
                                  &regInfo->impInfo, sizeof(struct ncclIpcImpInfo),
                                  NULL, 0));
  INFO(NCCL_REG, "rank %d - IPC deregistered buffer %p peer %d ipc remote buffer %p", comm->rank, regInfo->baseAddr, regInfo->peerRank, remoteAddr);
  return ncclSuccess;
}
// Proxy-side handler for ncclProxyMsgRegister: import the buffer described by the request and
// return the address at which the requester's buffer is reachable from this proxy.
//
// reqBuff/reqSize    a struct p2pIpcExpInfo filled by the requesting rank.
// respBuff/respSize  receives one void*: the mapped address plus ipcExpInfo->offset, or NULL
//                    when the import failed.
// done               always set to 1 when the function leaves through the exit label.
//
// Returns ncclSuccess or the error of the first failing import step.
static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
  void* regAddr = NULL;
  ncclResult_t ret = ncclSuccess;
  bool mapped = false;
  bool imported = false;
  CUmemGenericAllocationHandle handle;

  assert(sizeof(struct p2pIpcExpInfo) == reqSize);
  assert(sizeof(void*) == respSize);

  // request peer passes all necessary buffer info to import. The proxy thread would register
  // the buffer locally and return register addr back
  if (ipcExpInfo->legacyIpcCap) {
    // legacy import
    CUDACHECKGOTO(cudaIpcOpenMemHandle(&regAddr, ipcExpInfo->ipcDesc.devIpc, cudaIpcMemLazyEnablePeerAccess), ret, fail);
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  } else {
#if CUDART_VERSION >= 11030
    // cuMem import
    if (connection->sameProcess) {
      // if proxy is same process as request peer, we just need to map the handle.
      memcpy(&handle, &ipcExpInfo->ipcDesc.memHandle, sizeof(CUmemGenericAllocationHandle));
    } else {
      if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)(uintptr_t)ipcExpInfo->impFd, ncclCuMemHandleType), ret, fail);
        SYSCHECKGOTO(close(ipcExpInfo->impFd), "close", ret, fail);
      } else {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)&ipcExpInfo->ipcDesc.cuDesc, ncclCuMemHandleType), ret, fail);
      }
    }
    imported = true;
    CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&regAddr, ipcExpInfo->size, /* alignment */ 0, /* addr */ 0, /* flags */ 0), ret, fail);
    CUCHECKGOTO(cuMemMap((CUdeviceptr)regAddr, ipcExpInfo->size, /* offset */ 0, handle, /* flags */ 0), ret, fail);
    mapped = true;
    // Allow access by the local GPU
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = proxyState->cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail);
    // Only apply the offset once nothing can fail anymore: the fail path needs the base address.
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
#endif
  }
  INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);

exit:
  memcpy(respBuff, (void*)&regAddr, sizeof(void*));
  *done = 1;
  return ret;
fail:
  if (!ipcExpInfo->legacyIpcCap) {
#if CUDART_VERSION >= 11030
    // Best-effort teardown. The driver calls are made without a returning check macro so
    // that a failing step neither skips the remaining steps, nor replaces the original
    // error in ret, nor bypasses the exit block that writes the response and sets *done.
    if (mapped) (void)CUPFN(cuMemUnmap((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (regAddr) (void)CUPFN(cuMemAddressFree((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (imported) (void)CUPFN(cuMemRelease(handle));
#endif
  }
  regAddr = NULL;
  goto exit;
}
// Proxy-side handler for ncclProxyMsgDeregister: release a mapping created by p2pProxyRegister.
//
// reqBuff/reqSize  a struct ncclIpcImpInfo (remote address, offset, legacy flag).
// done             set to 1 on every path.
//
// Returns ncclSuccess, or the error reported by the release call.
static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  struct ncclIpcImpInfo* impInfo = (struct ncclIpcImpInfo*)reqBuff;
  ncclResult_t ret = ncclSuccess;

  assert(sizeof(struct ncclIpcImpInfo) == reqSize);

  // rmtRegAddr was handed out as base + offset; strip the offset to get the address to release.
  void* mappedBase = (void*)((uintptr_t)impInfo->rmtRegAddr - impInfo->offset);

  if (impInfo->legacyIpcCap) {
    CUDACHECKGOTO(cudaIpcCloseMemHandle(mappedBase), ret, exit);
  } else if (connection->sameProcess) {
    NCCLCHECKGOTO(ncclCuMemFreeAddr(mappedBase), ret, exit);
  } else {
    NCCLCHECKGOTO(ncclCudaFree(mappedBase), ret, exit);
  }

exit:
  *done = 1;
  return ret;
}
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister }
};
static void initCeOperation() {
+313 -101
Melihat File
@@ -5,35 +5,58 @@
************************************************************************/
#include "comm.h"
#include "shmutils.h"
#include "shm.h"
#include "transport.h"
struct shmConnectInfo {
char shmName[7];
int shmSize;
#define SHM_PATH_MAX 128
#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
// Pair of pointers to one shared-memory buffer. Both are filled in by
// ncclShmAllocateShareableBuffer() in the proxy setup handlers; the setup functions then use
// hptr as the resource's hostMem and dptr as its devHostMem.
struct shmBuffInfo {
void *hptr;
void *dptr;
};
// Connect blob produced by the proxy setup handlers and stored in the ncclConnect slot
// (its size is bounded by the static_asserts against CONNECT_SIZE / sizeof(ncclConnect)).
struct shmConnectInfo {
// IPC descriptor of the buffer; the connect functions pass it to
// ncclShmImportShareableBuffer() to map the remote side.
ncclShmIpcDesc_t desc;
// Pointers to the same buffer as returned to the rank that requested the allocation.
struct shmBuffInfo buf;
};
static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
struct shmSendResources {
int remShmSize;
struct ncclRecvMem* remHostMem;
struct ncclRecvMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
ncclShmIpcDesc_t remDesc;
struct ncclSendMem* hostMem;
struct ncclSendMem* devHostMem;
ncclShmHandle_t hostHandle;
};
struct shmRecvResources {
int remShmSize;
struct ncclSendMem* remHostMem;
struct ncclSendMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
ncclShmIpcDesc_t remDesc;
struct ncclRecvMem* hostMem;
struct ncclRecvMem* devHostMem;
ncclShmHandle_t hostHandle;
};
// Per-connection proxy state for the SHM transport. It is allocated by the proxy setup
// handlers and stored in connection->transportResources. It also doubles as the
// request/response message of the proxy connect handlers.
// NOTE: the connect callers aggregate-initialize the first five members positionally
// ({ NULL, NULL, fifo, sendMem, recvMem }), so their order must not change.
struct shmProxyInfo {
// Allocated with ncclCudaHostCalloc() in the proxy connect handlers.
struct ncclRecvMem* ceRecvMem;
// Device buffer of buffSizes[NCCL_PROTO_SIMPLE] bytes allocated in the proxy connect
// handlers; the requester installs it as its NCCL_PROTO_SIMPLE buffer.
char* devFifo;
// Copied from the connect request (the requester's NCCL_PROTO_SIMPLE buffer pointer).
char* shmFifo;
// Copied from the connect request.
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
// used by progress only
uint64_t step;
// Stream and per-step events created in the proxy connect handlers and destroyed in the
// proxy free handlers.
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
// ipc desc
// Filled by ncclShmAllocateShareableBuffer() during proxy setup and closed with
// ncclShmIpcClose() in the proxy free handlers.
ncclShmIpcDesc_t desc;
};
// Request message sent with ncclProxyMsgSetup to the SHM proxy setup handlers.
struct shmRequest {
// Number of bytes of shared memory to allocate.
size_t size;
// Set by the setup functions when both peers have the same hostHash and pidHash;
// forwarded as the `legacy` argument of ncclShmAllocateShareableBuffer().
bool legacy;
};
#define SHM_SEND_SIDE 1
@@ -48,14 +71,14 @@ static int shmLocality = 0;
static void initCeOperation();
/* Determine two peers can communicate with SHM */
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 0;
initCeOperation();
if (ncclParamShmDisable() == 1) return ncclSuccess;
int useNet = 0;
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
if (useNet) return ncclSuccess;
// Same host?
@@ -76,22 +99,29 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct
/* Create and return connect structures for this peer to connect to me */
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct shmSendResources* resources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
size_t shmSize = sizeof(struct ncclSendMem);
struct shmRequest req;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char shmPath[PATH_MAX];
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclSendMem);
if (shmLocality == SHM_SEND_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
req.size = shmSize;
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
req.legacy = true;
else
req.legacy = false;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct", comm, comm->nRanks);
return ncclSuccess;
@@ -99,52 +129,43 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct shmRecvResources* resources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
size_t shmSize = sizeof(struct ncclRecvMem);
struct shmRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char shmPath[PATH_MAX];
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclRecvMem);
if (shmLocality == SHM_RECV_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
req.size = shmSize;
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
req.legacy = true;
else
req.legacy = false;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
return ncclSuccess;
}
struct shmProxyInfo {
struct ncclRecvMem* ceRecvMem;
char* devFifo;
char* shmFifo;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
// used by progress only
uint64_t step;
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
};
/* Connect to this peer */
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
char* buff;
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = buff;
buff += comm->buffSizes[p];
@@ -157,9 +178,6 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.connFifo = resources->devRemHostMem->connFifo;
}
if (useMemcpySend) {
int tpProxyRank;
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -177,14 +195,11 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char* buff;
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = buff;
buff += comm->buffSizes[p];
@@ -194,7 +209,6 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -210,8 +224,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
static ncclResult_t shmSendFree(struct ncclConnector* send) {
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
free(resources);
send->transportResources = NULL;
}
@@ -221,8 +234,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
free(resources);
recv->transportResources = NULL;
}
@@ -230,51 +242,76 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
}
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
ncclResult_t ret = ncclSuccess;
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
proxyInfo->shmFifo = reqInfo->shmFifo;
proxyInfo->sendMem = reqInfo->sendMem;
proxyInfo->recvMem = reqInfo->recvMem;
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
}
connection->proxyAppendPtr = &connection->proxyAppend;
connection->transportResources = proxyInfo;
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, respSize);
return ncclSuccess;
*done = 1;
exit:
return ret;
fail:
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
free(proxyInfo);
goto exit;
}
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
ncclResult_t ret = ncclSuccess;
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
proxyInfo->shmFifo = reqInfo->shmFifo;
proxyInfo->sendMem = reqInfo->sendMem;
proxyInfo->recvMem = reqInfo->recvMem;
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
}
connection->proxyAppendPtr = &connection->proxyAppend;
connection->transportResources = proxyInfo;
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, respSize);
return ncclSuccess;
*done = 1;
exit:
return ret;
fail:
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
free(proxyInfo);
goto exit;
}
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
if (useMemcpySend) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
}
NCCLCHECK(ncclShmIpcClose(&resources->desc));
free(connection->transportResources);
connection->transportResources = NULL;
}
@@ -285,12 +322,15 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
if (useMemcpyRecv) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
}
NCCLCHECK(ncclShmIpcClose(&resources->desc));
free(connection->transportResources);
connection->transportResources = NULL;
}
@@ -413,12 +453,37 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
return ncclSuccess;
}
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
};
// Proxy-side setup for the SHM send connector: allocate the shareable buffer requested by the
// client and return its descriptor and pointers.
//
// reqBuff/reqSize    a struct shmRequest (size and legacy flag).
// respBuff/respSize  a struct shmConnectInfo to fill (desc plus host/device pointers).
//
// On success the new shmProxyInfo is stored in connection->transportResources.
// Returns ncclInternalError on a message size mismatch, or the allocation error; in the
// latter case the partially built shmProxyInfo is freed instead of being leaked.
static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECKGOTO(ncclCalloc(&proxyInfo, 1), ret, fail);
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  // The proxy keeps its own copy of the descriptor; the response carries a second copy.
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;

exit:
  return ret;
fail:
  // Not yet published in transportResources, so nobody else will free it.
  free(proxyInfo);
  goto exit;
}
// Proxy-side setup for the SHM recv connector: allocate the shareable buffer requested by the
// client and return its descriptor and pointers.
//
// reqBuff/reqSize    a struct shmRequest (size and legacy flag).
// respBuff/respSize  a struct shmConnectInfo to fill (desc plus host/device pointers).
//
// On success the new shmProxyInfo is stored in connection->transportResources.
// Returns ncclInternalError on a message size mismatch, or the allocation error; in the
// latter case the partially built shmProxyInfo is freed instead of being leaked.
static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECKGOTO(ncclCalloc(&proxyInfo, 1), ret, fail);
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  // The proxy keeps its own copy of the descriptor; the response carries a second copy.
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;

exit:
  return ret;
fail:
  // Not yet published in transportResources, so nobody else will free it.
  free(proxyInfo);
  goto exit;
}
static void initCeOperation() {
static int init = 0;
@@ -427,12 +492,10 @@ static void initCeOperation() {
useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
if (useMemcpySend) {
shmTransport.send.proxyConnect = shmSendProxyConnect;
shmTransport.send.proxyFree = shmSendProxyFree;
shmTransport.send.proxyProgress = shmSendProxyProgress;
}
if (useMemcpyRecv) {
shmTransport.recv.proxyConnect = shmRecvProxyConnect;
shmTransport.recv.proxyFree = shmRecvProxyFree;
shmTransport.recv.proxyProgress = shmRecvProxyProgress;
}
shmLocality = ncclParamShmLocality();
@@ -443,3 +506,152 @@ static void initCeOperation() {
init = 1;
}
}
/*
 * Allocate a host buffer of `size` bytes and fill `desc` with what is needed to
 * share it.
 *
 * tpProxyRank  rank recorded in the descriptor on the POSIX-fd cuMem path;
 *              values below -1 are rejected.
 * size         requested buffer size in bytes.
 * legacy       when true, always use the mmap (ncclShmOpen) path.
 * desc         output descriptor; must not be NULL.
 * hptr         receives the host pointer; must not be NULL.
 * dptr         optional; on the cuMem path it receives the same address as *hptr,
 *              on the mmap path it is handed to ncclShmOpen().
 *
 * Returns ncclInvalidArgument for bad arguments, otherwise the result of the
 * underlying allocation calls.
 */
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
    return ncclInvalidArgument;
  }

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !legacy) {
    // cuMem API support
    CUmemAllocationHandleType handleType = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle allocHandle;
    NCCLCHECK(ncclCuMemHostAlloc(hptr, &allocHandle, size));
    if (handleType != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, allocHandle, handleType, 0));
    } else {
      // Return the native cuMem handle for later Export/Import via UDS
      memcpy(&desc->shmci.data, &allocHandle, sizeof(allocHandle));
      desc->shmci.tpProxyRank = tpProxyRank;
    }
    desc->shmci.size = size;
    desc->shmci.ptr = *hptr;
    if (dptr) *dptr = *hptr;
    desc->legacy = false;
    INFO(NCCL_SHM, "CUMEM allocated shareable buffer %p size %zi", desc->shmci.ptr, desc->shmci.size);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */

  // mmap-backed path: used when cuMem host allocations are unavailable or the
  // caller asked for the legacy mechanism.
  char shmPath[SHM_PATH_MAX] = { '\0' };
  desc->shmli.shmSize = size;
  NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle));
  // Keep only the part of the path that follows the fixed "/dev/shm/nccl-" prefix.
  memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
  desc->legacy = true;
  INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr);
  return ncclSuccess;
}
/*
 * Map a buffer described by `desc` into this process.
 *
 * comm     communicator, used on the POSIX-fd cuMem path to ask the proxy
 *          identified by desc->shmci.tpProxyRank for a file descriptor.
 * desc     descriptor of the buffer to import.
 * hptr     receives the host pointer of the mapping.
 * dptr     optional; on the cuMem path it receives the same address as *hptr,
 *          on the mmap path it is handed to ncclShmOpen().
 * descOut  receives the local handle/pointer and the `legacy` flag, for a later
 *          ncclShmIpcClose().
 *
 * Returns ncclInvalidArgument when comm, desc, hptr or descOut is NULL,
 * otherwise the result of the underlying import/mapping calls.
 */
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
  if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
    WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
    return ncclInvalidArgument;
  }

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    // cuMem API support
    CUdeviceptr hostptr = 0;
    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle handle;
    int cudaDev;
    CUdevice currentDev;
    CUmemAccessDesc accessDesc = {};
    int cpuNumaNodeId;
    size_t granularity;
    size_t size = desc->shmci.size;
    CUmemAllocationProp prop = {};

    // Import and map the remote memory descriptor to the local GPU
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      // UDS fd support
      int fd = -1;
      // Send cuMem handle to remote for conversion to an fd
      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
      // NOTE(review): if CUCHECK returns early on failure, fd is left open here -- confirm.
      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
      (void) close(fd);
    } else {
      CUCHECK(cuMemImportFromShareableHandle(&handle, &desc->shmci.handle, type));
    }

    // Get cpu numa id
    CUDACHECK(cudaGetDevice(&cudaDev));
    CUCHECK(cuDeviceGet(&currentDev, cudaDev));
    CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
    if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;

    // Get granularity
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.requestedHandleTypes = type;
    prop.location.id = cpuNumaNodeId;
    CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    // The mapping length is the descriptor size rounded by ALIGN_SIZE to the granularity.
    ALIGN_SIZE(size, granularity);

    // Reserve and map address
    CUCHECK(cuMemAddressReserve(&hostptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
    CUCHECK(cuMemMap(hostptr, size, /* offset */ 0, handle, /* flags */ 0));

    // Allow access by the local GPU
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    // Allow access by the local numa
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    accessDesc.location.id = cpuNumaNodeId;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    descOut->shmci.ptr = *hptr = (void *)hostptr;
    descOut->legacy = false;
    if (dptr) *dptr = (void *)hostptr;
    // granularity is a size_t, so it is printed with %zu rather than %ld.
    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %zu", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */

  // mmap-backed path: rebuild the shared-memory path from the suffix carried in
  // the descriptor. The descriptor originates from a peer, so both the write
  // into shmPath and the read of the suffix are bounded.
  char shmPath[SHM_PATH_MAX];
  snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%.*s", (int)sizeof(desc->shmli.shmSuffix), desc->shmli.shmSuffix);
  NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
  descOut->legacy = true;
  INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
  return ncclSuccess;
}
/*
 * Release the local resources behind an SHM IPC descriptor.
 *
 * A NULL descriptor is accepted and treated as a no-op. Descriptors not marked
 * `legacy` are freed through ncclCuMemHostFree() when cuMem host allocations are
 * enabled; everything else is closed through ncclShmClose().
 */
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc) {
  if (desc == NULL) return ncclSuccess;

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    NCCLCHECK(ncclCuMemHostFree(desc->shmci.ptr));
    return ncclSuccess;
  }
#endif

  NCCLCHECK(ncclShmClose(desc->shmli.handle));
  return ncclSuccess;
}
// SHM transport descriptor: the transport name, the shmCanConnect callback, then
// one callback table for the send side and one for the recv side.
// The proxy setup and proxy free callbacks are installed statically here.
// initCeOperation() additionally assigns send/recv proxyConnect and
// proxyProgress at runtime when the corresponding memcpy mode is enabled.
// NOTE(review): presumably the remaining NULL entries include those
// proxyConnect/proxyProgress slots -- confirm against the ncclTransport
// declaration, which is not visible in this chunk.
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
// Send-side callbacks.
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, shmSendProxySetup, NULL, shmSendProxyFree, NULL },
// Recv-side callbacks.
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, shmRecvProxySetup, NULL, shmRecvProxyFree, NULL }
};
+1 -1
Melihat File
@@ -29,7 +29,7 @@
// Generic stack-size limit.
// NOTE(review): the unit and the consumer of this value are not visible in this chunk.
#define MAX_STACK_SIZE 480
#ifdef ENABLE_LL128
// gfx90a-specific limit, used only when ENABLE_LL128 is defined.
// NOTE(review): two different values (320, then 360) are defined back-to-back
// below. This looks like the removed and added sides of a one-line diff hunk
// rendered without +/- markers -- confirm that 360 is the intended value.
#define MAX_STACK_SIZE_gfx90a 320
#define MAX_STACK_SIZE_gfx90a 360
#else
// When ENABLE_LL128 is not defined, gfx90a uses the generic limit.
#define MAX_STACK_SIZE_gfx90a MAX_STACK_SIZE
#endif