2
0
Add scalable init API
 * Add new ncclCommInitRankScalable to allow for passing multiple
   unique IDs to the init function.
 * Spreads the load onto multiple bootstrap roots, allowing for
   constant bootstrap time.
 * Requires multiple ranks to create a unique ID, and the CPU-side
   ID exchange code to call allgather[v] instead of broadcast.

Accelerate init bootstrap operations
 * Reduce the number of calls to allgather.
 * Allow roots to reply early to ranks when information is already
   available.
 * Add an option to use ncclNet instead of sockets to perform
   bootstrap allgather operations.

Add PAT algorithms for Allgather and ReduceScatter
 * Parallel Aggregated Trees, variation of Bruck algorithm.
 * Logarithmic number of network steps for small sizes at scale.
 * Only supports one rank per node at the moment.

Add support for registered buffers for intra-node communication.
 * Allow registered user buffers to be accessed directly intra-node
 * Avoids extra copies in algorithms which permit it, saving
   memory bandwidth and helping with compute overlap.

Add profiler plugin API
 * New plugin API for profiling
 * Supports various levels of profiling, with a hierarchy.

Asynchronous graph allocation
 * Make calls to cudaMalloc and cudaMemcpy during graph allocation
   asynchronous.
 * Significantly speeds up graph capture.

Use fatal IB asynchronous events to stop network operation
 * Avoids many other error messages
 * Only fatal errors are affected; potentially transient errors
   (e.g. port down) do not cause an immediate stop.

Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node
 * P2P would cause a significant performance degradation when using
   many GPUs, and therefore many interleaved data flows.
 * Disable P2P through the CPU when we have 3+ GPUs per node; keep it
   enabled when we only have 2 GPUs.

Improve the init logs to report the real NCCL function.
 * Make the log report ncclCommInitRank or ncclCommSplit, rather than
   the generic ncclCommInitRankFunc.

Add a parameter to set the location of the user configuration file.
 * Add NCCL_CONF_FILE environment variable to set where the user's
   configuration file resides.

Increase default IB timeout
 * Increase IB timeout value from 18 to 20.
 * Should help avoid fatal errors on large RoCE systems.

Add new check for nvidia peermem
 * On Linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer
   present; check for /sys/module/nvidia_peermem/version instead.

Fix old performance regression when mixing small and large operations.
 * Improves distribution of work on channels.

Fix crash when NUMA IDs are equal to -1.
 * Can happen when a NIC is a virtual NIC, or when Linux doesn't
   know which NUMA node a device is attached to.
 * Issue NVIDIA/nccl-tests#233

Fix tree graph search when NCCL_CROSS_NIC is set to 1.
 * Would force NCCL to use the balanced_tree pattern, thereby
   disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch.
 * Would also try to use alternate rings even though it was not
   needed.

Compiler tweaks and fixes
 * PR #1177
 * PR #1228

Fix stack smash
 * PR #1325

Fixes for multi-node NVLink + IB operation

Coverity fixes and comments.
Este cometimento está contido em:
Sylvain Jeaugey
2024-09-10 05:57:10 -07:00
ascendente 178b6b7590
cometimento 68b542363f
88 ficheiros modificados com 7119 adições e 1965 eliminações
+16
Ver ficheiro
@@ -0,0 +1,16 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Location of the NCCL build tree whose include/ directory is added to the include path.
NCCL_HOME := ../../build
# Include paths: NCCL build headers, CUDA headers (CUDA_HOME comes from the environment
# or the make command line) and the local nccl/ directory.
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
# Name of the shared object produced by this Makefile; also used as its soname.
PLUGIN_SO := libnccl-profiler.so
# Default goal: build the plugin shared object.
default: $(PLUGIN_SO)
# Compile and link all plugin sources in one step into a position-independent
# shared library with debug information.
$(PLUGIN_SO): plugin.c event.c print_event.c
$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
# Remove the built shared object.
clean:
rm -f $(PLUGIN_SO)
+30
Ver ficheiro
@@ -0,0 +1,30 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "event.h"
// Report whether the task-event queue of group g holds no events.
// Returns 1 when the queue head is NULL, 0 otherwise.
int taskEventQueueEmpty(struct group* g) {
  if (g->eventHead != NULL) return 0;
  return 1;
}
// Append event at the tail of the singly linked task-event queue of group g.
// The event's next pointer is always reset, so any previous link is discarded.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  event->next = NULL;
  if (g->eventHead == NULL) {
    // empty queue: the new event becomes the head
    g->eventHead = event;
  } else {
    // non-empty queue: link the new event behind the current tail
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
// Peek at the first event of the task-event queue of group g without removing it.
// Returns NULL when the queue is empty.
struct taskEventBase* taskEventQueueHead(struct group* g) {
  struct taskEventBase* first = g->eventHead;
  return first;
}
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
struct taskEventBase* tmp = g->eventHead;
g->eventHead = g->eventHead->next;
if (g->eventHead == NULL) g->eventTail = NULL;
return tmp;
}
+167
Ver ficheiro
@@ -0,0 +1,167 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef EVENT_H_
#define EVENT_H_
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>
#include "profiler.h"
// Capacity of the per-collective send/recv proxyOp arrays.
#define MAX_CHANNELS 32
// Capacity of the per-proxyOp ring of proxyStep events.
#define MAX_STEPS 16
// First enumerator of each send/recv state range in ncclProfilerEventState_v1_t.
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)
// Number of enumerators in each send/recv state range (inclusive of both ends).
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)
// Map a state enumerator to a zero-based index inside its range.
// The argument is parenthesized so that expression arguments expand correctly.
#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET)
#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET)
#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)
// Array sizes large enough for either the send or the recv state range.
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)
#define MAX_COMM_CLIQUES (32 * 8)
// forward declaration: proxyStep points back to its owning proxyOp
struct proxyOp;
// Event object for one network transfer step of a proxy operation.
// Instances live inside the step[] array of struct proxyOp.
struct proxyStep {
uint8_t type; // type of event: network transfer
int step; // network transfer id in given channel
int isSend; // send/recv channel operation
// timestamp of each recorded state transition, indexed with
// PROXY_STEP_SEND_STATE_IDX / PROXY_STEP_RECV_STATE_IDX depending on isSend
double timestamp[MAX_PROXY_STEP_STATES];
// start/stop times, stored as gettime() minus the profiler start time
double startTs;
double stopTs;
// proxy operation this step belongs to
struct proxyOp* parent;
};
// Event object for one proxy operation (send or recv on a channel).
// Stored either inside its parent collective/p2p event or, for operations
// whose pid differs from the profiler's pid, in the shared detach pool.
struct proxyOp {
uint8_t type; // type of event: proxy operation
uint8_t channelId; // channel id for this proxy operation
// pid taken from the event descriptor (pid of the originating process)
pid_t pid;
// rank taken from the event descriptor
int rank;
int peer; // peer rank for this proxy operation
int nSteps; // total number of network transfers for this proxy operation
int chunkSize; // chunk size for this proxy operation
int isSend; // send/recv channel operation
size_t transSize; // transfer data size for this proxy operation
// per-state record, indexed with PROXY_OP_SEND_STATE_IDX / PROXY_OP_RECV_STATE_IDX
struct {
int steps; // completed steps for this proxy operation state
double timestamp;
} states[MAX_PROXY_OP_STATES];
// start/stop times, stored as gettime() minus the profiler start time
double startTs;
double stopTs;
int stepCount; // last processed network operation for this proxy operation
struct proxyStep step[MAX_STEPS]; // array of network transfer events
struct taskEventBase* parent; // parent event p2p/collective
};
// forward declarations for types defined further down in this header
struct group;
struct context;
// Event object for a proxy control event (state of the proxy progress loop).
struct proxyCtrl {
// event type tag; must stay the first member because handles are
// dispatched by reading their first byte
uint8_t type;
struct context* ctx; // profiler context
// start/stop times, stored as gettime() minus the profiler start time
double startTs;
double stopTs;
// last state recorded for this event (an ncclProfilerEventState_v1_t value)
int state;
int appended; // appended proxy operations
};
// task level event base structure
// Embedded as the first member of struct collective and struct p2p so that a
// pointer to either can be treated as a pointer to this base and linked into
// a group's event queue.
struct taskEventBase {
uint8_t type; // event type: collective/p2p
int rank; // rank of the operation in NCCL communicator
const char* name; // FIXME: unused
uint64_t commHash; // communicator identifier
uint8_t func; // ncclFunc*
int refCount; // number of references for this operation
struct group* parent; // parent event group
struct taskEventBase* next; // next top level event in group
// start/stop times, stored as gettime() minus the profiler start time
double startTs;
double stopTs;
};
// Event object for a host collective call. All fields after base are copied
// from the coll member of the event descriptor when the event is started.
struct collective {
struct taskEventBase base; // base structure for this event
uint64_t seqNumber; // sequence number for this collective in communicator
// user buffers and element count as passed in the descriptor
void const* sendBuff;
void* recvBuff;
size_t count;
size_t trafficBytes;
int root;
uint8_t datatype;
uint8_t nMaxChannels;
uint8_t algo;
uint8_t proto;
int op;
int nWarps;
int isCollnet;
int isNvls;
// proxy operations are stored per channel id; the arrays hold MAX_CHANNELS entries
struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
};
// Event object for a host point-to-point call.
struct p2p {
struct taskEventBase base; // base structure for this event
// NOTE(review): the start-event code visible in plugin.c only fills base.func;
// this duplicate field does not appear to be written there - confirm it is needed
uint8_t func;
// user buffer, element count, datatype and peer rank copied from the descriptor
void const* buff;
size_t count;
uint8_t datatype;
int peer;
// single proxy operation attached to this p2p event
struct proxyOp op;
};
// Event object for a group of task events (collectives and p2ps).
struct group {
// event type tag; must stay the first member because handles are
// dispatched by reading their first byte
uint8_t type;
struct context* ctx; // profiler context
// value of the context's group pool index at the time this event was allocated
int groupId;
// set to 1 when the group starts and incremented for every child task event;
// the group slot is returned to the pool when the count drops from 1 to 0
int refCount;
struct taskEventBase* eventHead; // queue head for task events
struct taskEventBase* eventTail; // queue tail for task events
// start/stop times, stored as gettime() minus the profiler start time
double startTs;
double stopTs;
struct group* next; // next group event in queue
};
// arrays for different event objects
// Each pool is used as a ring: a new event takes the id returned by an atomic
// increment of <pool>Index and lives in slot id % pool size. For the group,
// collective and p2p pools the event is only granted while
// (id - <pool>Base) is below the pool size; <pool>Base is advanced as events
// are released.
struct context {
// NOTE(review): the *PoolSize members are not referenced by the plugin.c code
// visible here, which uses file-scope size variables instead - confirm
int groupPoolSize;
int groupPoolBase;
int groupPoolIndex;
struct group* groupPool;
int collPoolSize;
int collPoolBase;
int collPoolIndex;
struct collective* collPool;
int p2pPoolSize;
int p2pPoolBase;
int p2pPoolIndex;
struct p2p* p2pPool;
int proxyCtrlPoolSize;
int proxyCtrlPoolBase;
int proxyCtrlPoolIndex;
struct proxyCtrl* proxyCtrlPool;
};
// Task-event queue API: a singly linked FIFO of taskEventBase objects owned by a group.
// Returns non-zero when the queue of g has no events.
int taskEventQueueEmpty(struct group* g);
// Appends event at the tail of the queue of g; event->next is reset to NULL.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// Returns the first event of the queue of g without removing it (NULL when empty).
struct taskEventBase* taskEventQueueHead(struct group* g);
// Removes and returns the first event of the queue of g.
// The caller in plugin.c checks taskEventQueueEmpty() before calling this.
struct taskEventBase* taskEventQueueDequeue(struct group* g);
#endif
+15
Ver ficheiro
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COMMON_H_
#define COMMON_H_
// Log levels accepted by the debug logger callback.
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Subsystem flags; each value is a distinct bit so they can be OR-ed together,
// and NCCL_ALL has every bit set.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
// printf-style logging callback: level, subsystem flags, source file and line
// of the call site, then a format string with its variadic arguments.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#endif
+19
Ver ficheiro
@@ -0,0 +1,19 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
/* Error type for plugins */
/* ncclSuccess is 0; every other enumerator is a distinct non-zero error code. */
typedef enum { ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;
#endif
+18
Ver ficheiro
@@ -0,0 +1,18 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#include <stdint.h>
#include <stdlib.h>
#include "common.h"
#include "err.h"
#include "profiler_v1.h"
#endif // end include guard
+150
Ver ficheiro
@@ -0,0 +1,150 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_V1_H_
#define NCCL_PROFILER_V1_H_
#include <stdint.h>
// Event type identifiers. Each type is a single bit so that several types can
// be combined into the activation bitmask returned by the plugin's init().
// ncclProfileNumEvents is the count of event types, not a bit flag.
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileNumEvents = ( 6),
};
// Event descriptor passed to startEvent(). The type field selects which member
// of the anonymous union is meaningful.
// NOTE(review): this struct uses size_t and pid_t while the header itself only
// includes <stdint.h>; it relies on the including file for those types.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// attributes of a host collective call (type == ncclProfileColl)
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
// attributes of a host point-to-point call (type == ncclProfileP2p)
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
// attributes of a proxy operation (type == ncclProfileProxyOp)
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// attributes of a proxy step (type == ncclProfileProxyStep)
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
// State transitions reported through recordEventState().
// The enumerators are implicitly numbered and grouped in contiguous ranges
// (proxyOp send, proxyOp recv, proxyStep, proxyCtrl); the example plugin
// derives array indices from the first and last enumerator of each range,
// so the order must not change.
typedef enum {
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup,
ncclProfilerProxyCtrlAppend,
ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_v1_t;
// Optional attribute updates passed along with a state transition in
// recordEventState(). Which member applies depends on the event type.
typedef union {
// used with proxy operation events
struct {
size_t transSize;
int steps;
} proxyOp;
// used with proxy control events
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 of the profiler plugin interface: a name plus the table of
// callbacks the plugin implements.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_v1_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
// Unversioned aliases that map the generic profiler type names onto the v1 definitions.
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v1_t ncclProfiler_t;
#endif
+21
Ver ficheiro
@@ -0,0 +1,21 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_
/* Data types */
/* Several enumerators are aliases sharing one value (e.g. ncclInt8/ncclChar,
 * ncclFloat32/ncclFloat); values run contiguously from 0 to 9. */
typedef enum { ncclInt8 = 0, ncclChar = 0,
ncclUint8 = 1,
ncclInt32 = 2, ncclInt = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6, ncclHalf = 6,
ncclFloat32 = 7, ncclFloat = 7,
ncclFloat64 = 8, ncclDouble = 8,
ncclBfloat16 = 9,
} ncclDataType_t;
#endif
+492
Ver ficheiro
@@ -0,0 +1,492 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include <pthread.h>
#include <string.h>
#include <linux/limits.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <x86intrin.h>
#include "event.h"
#include "print_event.h"
// Marks a symbol as hidden so it is not exported from the plugin shared object.
#define __hidden __attribute__ ((visibility("hidden")))
static int initialized; // initialization counter for profiler
static double startTime; // profiler start time
// Default number of slots in each per-context event pool; each can be
// overridden through an NCCL_PROFILE_*_POOL_SIZE environment variable at init.
static int groupPoolSize = 16;
static int collPoolSize = 16;
static int p2pPoolSize = 1024;
static int proxyCtrlPoolSize = 16;
// Detach pool: process-wide pool of proxyOp events whose pid differs from
// the profiler's pid. Base/Index/Done implement the same ring scheme as the
// per-context pools, with Done counting completed events.
static int detachPoolSize = 128;
static int detachPoolBase;
static int detachPoolIndex;
static int detachPoolDone;
static struct proxyOp* detachPool;
// Timestamp counter frequency in cycles per microsecond; set by calibrate().
static double freq = -1;
// Measure the timestamp counter frequency and store it in freq.
// The counter and the wall clock are sampled before and after a short busy
// loop; freq is the ratio of elapsed cycles to elapsed microseconds.
__hidden void calibrate() {
  struct timeval wall;
  gettimeofday(&wall, NULL);
  uint64_t cycles = __rdtsc();
  // start from the negated first sample so that adding the second sample
  // below yields the elapsed time in microseconds
  double elapsedUs = - wall.tv_sec*1e6 - wall.tv_usec;
  // busy loop that only exists to let some time pass between the two samples
  uint64_t sink = 0ULL;
  int iter = 0;
  while (iter < 10000) {
    sink += __rdtsc();
    iter++;
  }
  gettimeofday(&wall, NULL);
  cycles = __rdtsc() - cycles;
  elapsedUs += wall.tv_sec*1e6 + wall.tv_usec;
  freq = cycles / elapsedUs;
}
// Return the current timestamp counter value converted to microseconds using
// the frequency measured by calibrate().
__hidden double gettime(void) {
  uint64_t now = __rdtsc();
  return now / freq;
}
// Serializes the one-time global setup performed in exampleProfilerInit().
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
// Pid of the process that initialized the profiler first; proxyOp events
// carrying a different pid are stored in the detach pool.
static pid_t pid;
// Read a pool size from environment variable name.
// Returns fallback when the variable is unset or does not parse to a positive
// value; pool sizes are used as modulo divisors, so zero must never be accepted.
static int envPoolSize(const char* name, int fallback) {
  const char* str = getenv(name);
  if (str == NULL) return fallback;
  int value = atoi(str);
  return (value > 0) ? value : fallback;
}

// Initialize the profiler plugin and create a profiler context.
//  - context        : receives the newly allocated profiler context on success
//  - eActivationMask: written by the first caller only, with the value of
//                     NCCL_PROFILE_EVENT_MASK or collectives + p2p by default
// Returns ncclSuccess, or ncclSystemError when an allocation fails. On failure
// nothing is leaked and the global initialization counter is rolled back.
__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) {
  struct context* ctx = NULL;

  pthread_mutex_lock(&lock);
  if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) {
    // first thread initializes event mask, environment and detach pool
    const char* maskStr = getenv("NCCL_PROFILE_EVENT_MASK");
    __atomic_store_n(eActivationMask, maskStr ? atoi(maskStr) : (ncclProfileColl | ncclProfileP2p), __ATOMIC_RELAXED);

    groupPoolSize = envPoolSize("NCCL_PROFILE_GROUP_POOL_SIZE", groupPoolSize);
    collPoolSize = envPoolSize("NCCL_PROFILE_COLL_POOL_SIZE", collPoolSize);
    p2pPoolSize = envPoolSize("NCCL_PROFILE_P2P_POOL_SIZE", p2pPoolSize);
    proxyCtrlPoolSize = envPoolSize("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE", proxyCtrlPoolSize);
    detachPoolSize = envPoolSize("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE", detachPoolSize);

    // detach pool is used to store PXN proxyOps and is shared among threads
    detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool));
    if (detachPool == NULL) {
      // undo the counter increment so that a later call retries the global setup
      __atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED);
      pthread_mutex_unlock(&lock);
      return ncclSystemError;
    }

    // Pid of the process initializing the profiler first.
    // This is compared against the pid of proxyOp events
    // to figure out if they have a parent event in this
    // process address space.
    pid = getpid();

    // calibrate and start timer
    calibrate();
    startTime = gettime();
  }
  pthread_mutex_unlock(&lock);

  // pre-allocate memory for event object pools in dedicated profiler context
  ctx = (struct context *)calloc(1, sizeof(*ctx));
  if (ctx == NULL) goto fail;
  ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool));
  if (ctx->groupPool == NULL) goto fail;
  ctx->collPool = (struct collective *)calloc(collPoolSize, sizeof(*ctx->collPool));
  if (ctx->collPool == NULL) goto fail;
  ctx->p2pPool = (struct p2p *)calloc(p2pPoolSize, sizeof(*ctx->p2pPool));
  if (ctx->p2pPool == NULL) goto fail;
  ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool));
  if (ctx->proxyCtrlPool == NULL) goto fail;

  *context = ctx;
  return ncclSuccess;

fail:
  // cleanup resources owned by this (partially built) context
  if (ctx) {
    free(ctx->proxyCtrlPool);
    free(ctx->p2pPool);
    free(ctx->collPool);
    free(ctx->groupPool);
    free(ctx);
  }
  // The detach pool is shared with every other live context: drop our
  // reference and only release the pool when no other user remains.
  pthread_mutex_lock(&lock);
  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
    free(detachPool);
    detachPool = NULL;
  }
  pthread_mutex_unlock(&lock);
  return ncclSystemError;
}
// Finalize a profiler context: print the most recent events of each pool,
// release the context, and let the last context also print and release the
// shared detach pool.
//  - context: profiler context created by exampleProfilerInit()
// When NCCL_PROFILE_DUMP_FILE is set, output goes to
// "<dump>-<hostname>-<tid>.txt"; if that file cannot be named or opened the
// events are passed to printEvent() with a NULL stream, exactly as when the
// variable is unset. Always returns ncclSuccess.
__hidden ncclResult_t exampleProfilerFinalize(void* context) {
  FILE* fh = NULL;
  char filename[PATH_MAX] = { 0 };
  char hostname[64] = { 0 };
  // keep the last byte untouched so hostname stays NUL-terminated even if truncated
  gethostname(hostname, sizeof(hostname) - 1);
  const char* dump = getenv("NCCL_PROFILE_DUMP_FILE");
  if (dump) {
    // bounded formatting: skip the dump instead of overflowing filename
    int len = snprintf(filename, sizeof(filename), "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid));
    if (len > 0 && (size_t)len < sizeof(filename)) fh = fopen(filename, "w");
    if (fh) fprintf(fh, "[\n");
  }

  // print last N groups/collectives/p2ps
  struct context* ctx = (struct context *)context;
  int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0;
  int end = ctx->groupPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->groupPool[i%groupPoolSize]);
  }

  start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0;
  end = ctx->proxyCtrlPoolIndex;
  for (int i = start; i < end; i++) {
    printEvent(fh, &ctx->proxyCtrlPool[i%proxyCtrlPoolSize]);
  }

  free(ctx->groupPool);
  free(ctx->collPool);
  free(ctx->p2pPool);
  free(ctx->proxyCtrlPool);
  free(ctx);

  // last thread cleans up shared detach pool
  if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) {
    start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0;
    end = detachPoolIndex;
    for (int i = start; i < end; i++) {
      printEvent(fh, &detachPool[i%detachPoolSize]);
    }
    free(detachPool);
  }

  if (fh) {
    fprintf(fh, "{}]\n");
    fclose(fh);
  }

  return ncclSuccess;
}
__hidden void updateEvent(void* handle);
__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) {
*eHandle = NULL;
struct context* ctx = (struct context *)context;
if (eDescr->type == ncclProfileGroup) {
struct group* event;
int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) {
// if there are available group events grab one
event = &ctx->groupPool[groupId%groupPoolSize];
while (!taskEventQueueEmpty(event)) {
struct taskEventBase* base = taskEventQueueDequeue(event);
if (base->type == ncclProfileColl) {
struct collective* c = (struct collective *)base;
// reset event proxyOps & proxySteps
memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS);
// release collective events in the group and return them to the collective pool
__atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED);
} else if (base->type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)base;
// reset event proxyOp and proxySteps
memset(&p->op, 0, sizeof(struct proxyOp));
// release p2p events in the group and return them to the p2p pool
__atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED);
}
}
} else {
// else drop this event
__atomic_fetch_sub(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileGroup;
__atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED);
event->ctx = ctx;
event->groupId = groupId;
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "GroupStart");
} else if (eDescr->type == ncclProfileColl) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct collective* event;
int collId = __atomic_fetch_add(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
if ((collId - __atomic_load_n(&ctx->collPoolBase, __ATOMIC_RELAXED)) < collPoolSize) {
// if there are available collective events grab one
event = &ctx->collPool[collId%collPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->collPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->base.type = ncclProfileColl;
event->base.rank = eDescr->rank;
event->base.name = eDescr->coll.name;
event->base.commHash = eDescr->coll.commHash;
event->base.func = eDescr->coll.func;
event->base.startTs = gettime() - startTime;
event->base.parent = parent;
event->seqNumber = eDescr->coll.seqNumber;
event->sendBuff = eDescr->coll.sendBuff;
event->recvBuff = eDescr->coll.recvBuff;
event->count = eDescr->coll.count;
event->root = eDescr->coll.root;
event->datatype = eDescr->coll.datatype;
event->op = eDescr->coll.op;
event->trafficBytes = eDescr->coll.trafficBytes;
event->nMaxChannels = eDescr->coll.nMaxChannels;
event->nWarps = eDescr->coll.nWarps;
event->algo = eDescr->coll.algo;
event->proto = eDescr->coll.proto;
event->isCollnet = eDescr->coll.isCollnet;
event->isNvls = eDescr->coll.isNvls;
*eHandle = event;
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
// increment the group ref counter so the event will staty open
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "CollStart");
} else if (eDescr->type == ncclProfileP2p) {
// the parent might be null if we run out of events
struct group* parent = (struct group *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
struct p2p* event;
int p2pId = __atomic_fetch_add(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
if ((p2pId - __atomic_load_n(&ctx->p2pPoolBase, __ATOMIC_RELAXED)) < p2pPoolSize) {
// if there are available p2p events grab one
event = &ctx->p2pPool[p2pId%p2pPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&ctx->p2pPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->base.type = ncclProfileP2p;
event->base.rank = eDescr->rank;
event->base.name = eDescr->p2p.name;
event->base.commHash = eDescr->p2p.commHash;
event->base.func = eDescr->p2p.func;
event->base.next = parent->eventHead;
event->base.startTs = gettime() - startTime;
event->base.parent = parent;
event->buff = eDescr->p2p.buff;
event->count = eDescr->p2p.count;
event->datatype = eDescr->p2p.datatype;
event->peer = eDescr->p2p.peer;
*eHandle = event;
// increment the group ref counter so the event will staty open
taskEventQueueEnqueue(parent, (struct taskEventBase *)event);
__atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "P2pStart");
} else if (eDescr->type == ncclProfileProxyCtrl) {
int proxyCtrlId = __atomic_fetch_add(&ctx->proxyCtrlPoolIndex, 1, __ATOMIC_RELAXED);
struct proxyCtrl* event = &ctx->proxyCtrlPool[proxyCtrlId%proxyCtrlPoolSize];
event->type = ncclProfileProxyCtrl;
event->ctx = ctx;
event->startTs = gettime() - startTime;
*eHandle = event;
} else if (eDescr->type == ncclProfileProxyOp) {
// the eventBase might be null if we run out of events
struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj;
if (eventBase == NULL) return ncclSuccess;
if (eDescr->proxyOp.pid != pid) {
// PXN captured proxyOp events
struct proxyOp* event;
int detachId = __atomic_fetch_add(&detachPoolIndex, 1, __ATOMIC_RELAXED);
if ((detachId - detachPoolBase) < detachPoolSize) {
// if there are available detached proxyOp events grab one
event = &detachPool[detachId%detachPoolSize];
} else {
// else drop this event
__atomic_fetch_sub(&detachPoolIndex, 1, __ATOMIC_RELAXED);
return ncclSuccess;
}
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->startTs = gettime() - startTime;
event->parent = NULL;
*eHandle = event;
debugEvent(event, "PxnProxyOpStart");
return ncclSuccess;
}
if (eventBase->type == ncclProfileColl) {
struct collective* parent = (struct collective *)eDescr->parentObj;
struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId];
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "ProxyOpStart");
} else { // ncclProfileP2p
struct p2p* parent = (struct p2p *)eDescr->parentObj;
struct proxyOp* event = &parent->op;
event->type = ncclProfileProxyOp;
event->channelId = eDescr->proxyOp.channelId;
event->pid = eDescr->proxyOp.pid;
event->rank = eDescr->rank;
event->peer = eDescr->proxyOp.peer;
event->nSteps = eDescr->proxyOp.nSteps;
event->chunkSize = eDescr->proxyOp.chunkSize;
event->isSend = eDescr->proxyOp.isSend;
event->parent = eventBase;
event->startTs = gettime() - startTime;
*eHandle = event;
__atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED);
debugEvent(event, "ProxyOpStart");
}
} else if (eDescr->type == ncclProfileProxyStep) {
// the parent might be null if we run out of events
struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj;
if (parent == NULL) return ncclSuccess;
int s = parent->stepCount++ % MAX_STEPS;
struct proxyStep* event = &parent->step[s];
event->type = ncclProfileProxyStep;
event->step = eDescr->proxyStep.step;
event->isSend = parent->isSend;
event->parent = parent;
event->startTs = gettime() - startTime;
*eHandle = event;
debugEvent(event, "ProxyStepStart");
}
return ncclSuccess;
}
// Record the completion of the event behind handle and propagate it upwards.
// The event kind is read from the first byte of the object. Collective and
// p2p events drop one reference and, when the last one is gone, are stamped
// and reported to their parent group; a group that loses its last reference
// is stamped and its slot returned to the group pool. Proxy operations are
// stamped and either reported to their parent or, when they belong to another
// pid, accounted against the detach pool. Steps and control events are only
// stamped.
void updateEvent(void* handle) {
  switch (*(uint8_t *)handle) {
    case ncclProfileGroup: {
      struct group* grp = (struct group *)handle;
      int prevRefs = __atomic_fetch_sub(&grp->refCount, 1, __ATOMIC_RELAXED);
      if (prevRefs == 1) {
        grp->stopTs = gettime() - startTime;
        // return group event to the pool
        __atomic_fetch_add(&grp->ctx->groupPoolBase, 1, __ATOMIC_RELAXED);
      }
      debugEvent(grp, "GroupStop");
      break;
    }
    case ncclProfileColl: {
      struct collective* coll = (struct collective *)handle;
      int lastRef = (__atomic_fetch_sub(&coll->base.refCount, 1, __ATOMIC_RELAXED) == 1);
      if (lastRef) coll->base.stopTs = gettime() - startTime;
      debugEvent(coll, "CollStop");
      // the last reference also releases one reference on the parent group
      if (lastRef) updateEvent(coll->base.parent);
      break;
    }
    case ncclProfileP2p: {
      struct p2p* sendrecv = (struct p2p *)handle;
      int lastRef = (__atomic_fetch_sub(&sendrecv->base.refCount, 1, __ATOMIC_RELAXED) == 1);
      if (lastRef) sendrecv->base.stopTs = gettime() - startTime;
      debugEvent(sendrecv, "P2pStop");
      // the last reference also releases one reference on the parent group
      if (lastRef) updateEvent(sendrecv->base.parent);
      break;
    }
    case ncclProfileProxyOp: {
      struct proxyOp* op = (struct proxyOp *)handle;
      op->stopTs = gettime() - startTime;
      if (op->pid == pid) {
        updateEvent(op->parent);
      } else {
        // only for proxyOps that don't have a parent collective/p2p (i.e., PXN)
        int done = 1 + __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED);
        if (done == detachPoolSize) {
          // reset the event completed (done) counter
          __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED);
          // update the base pointer to the top of the pool
          int top = __atomic_load_n(&detachPoolIndex, __ATOMIC_RELAXED);
          __atomic_store_n(&detachPoolBase, top, __ATOMIC_RELAXED);
        }
      }
      debugEvent(op, "ProxyOpStop");
      break;
    }
    case ncclProfileProxyStep: {
      struct proxyStep* stepEv = (struct proxyStep *)handle;
      stepEv->stopTs = gettime() - startTime;
      debugEvent(stepEv, "ProxyStepStop");
      break;
    }
    case ncclProfileProxyCtrl: {
      struct proxyCtrl* ctrl = (struct proxyCtrl *)handle;
      ctrl->stopTs = gettime() - startTime;
      debugEvent(ctrl, "ProxyCtrlStop");
      break;
    }
    default:
      // unknown event type: nothing to update
      break;
  }
}
// Stop the event behind eHandle.
//  - eHandle: handle returned by exampleProfilerStartEvent(); may be NULL when
//             the event was dropped, in which case nothing happens
// Group and collective events are only stamped here: being stopped by NCCL
// core means they were submitted/enqueued, not completed, so they stay open.
// Every other event kind is handed to updateEvent(). Always returns ncclSuccess.
__hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) {
  if (eHandle != NULL) {
    switch (*(uint8_t *)eHandle) {
      case ncclProfileGroup: {
        struct group* grp = (struct group *)eHandle;
        grp->stopTs = gettime() - startTime;
        break;
      }
      case ncclProfileColl: {
        struct collective* coll = (struct collective *)eHandle;
        coll->base.stopTs = gettime() - startTime;
        break;
      }
      default:
        updateEvent(eHandle);
        break;
    }
  }
  return ncclSuccess;
}
/*
 * Profiler "record event state" callback.
 *
 * eHandle:    event handle; may be NULL, in which case nothing is recorded.
 * eState:     state being entered by the event.
 * eStateArgs: extra state arguments; read for ProxyOp events and for the
 *             ProxyCtrl "append end" state.
 * Always returns ncclSuccess.
 */
__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) {
  // a NULL handle means we ran out of events when the event was started
  if (!eHandle) return ncclSuccess;

  debugEvent(eHandle, "RecordEventState");

  const uint8_t kind = *(uint8_t *)eHandle;

  if (kind == ncclProfileProxyOp) {
    struct proxyOp* op = (struct proxyOp *)eHandle;
    // slot of this state in the per-op state table (send and recv use different mappings)
    const int slot = op->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState);
    const int prevSteps = op->states[slot].steps;
    // a repeated SendRemFifoWait with an unchanged step count carries no new information
    if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == prevSteps) return ncclSuccess;
    op->states[slot].steps = eStateArgs->proxyOp.steps;
    op->states[slot].timestamp = gettime() - startTime;
    op->transSize = eStateArgs->proxyOp.transSize;
    return ncclSuccess;
  }

  if (kind == ncclProfileProxyStep) {
    struct proxyStep* stepEvent = (struct proxyStep *)eHandle;
    const int slot = stepEvent->isSend ? PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState);
    stepEvent->timestamp[slot] = gettime() - startTime;
    return ncclSuccess;
  }

  if (kind == ncclProfileProxyCtrl) {
    struct proxyCtrl* ctrl = (struct proxyCtrl *)eHandle;
    if (eState == ncclProfilerProxyCtrlAppendEnd) {
      ctrl->appended = eStateArgs->proxyCtrl.appendedProxyOps;
    }
    ctrl->state = eState;
  }

  return ncclSuccess;
}
// Profiler plugin descriptor object: the plugin name followed by the
// callbacks implemented in this file.
// This is a positional initializer, so the entries must stay in the member
// order of ncclProfiler_v1_t (declared outside this file chunk).
ncclProfiler_v1_t ncclProfiler_v1 = {
"Example-profiler",
exampleProfilerInit,
exampleProfilerStartEvent,
exampleProfilerStopEvent,
exampleProfilerRecordEventState,
exampleProfilerFinalize,
};
+277
Ver ficheiro
@@ -0,0 +1,277 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <stdio.h>
#include "profiler.h"
#include "event.h"
#include "print_event.h"
#define __hidden __attribute__ ((visibility("hidden")))

/*
 * Maps a collective/p2p function id to the name used in the trace output.
 * Unknown ids yield "Unknown" (never NULL), so the result can always be
 * passed to a "%s" conversion.
 */
__hidden const char* ncclFuncToString(int func) {
  switch(func) {
  case 0:
    return "ncclBroadcast";
  case 1:
    return "ncclReduce";
  case 2:
    return "ncclAllGather";
  case 3:
    return "ncclReduceScatter";
  case 4:
    return "ncclAllReduce";
  case 5:
    return "ncclSendRecv";
  case 6:
    return "ncclSend";
  case 7:
    return "ncclRecv";
  default:
    return "Unknown";
  }
}

/*
 * Maps an algorithm id to the name used in the trace output.
 * Id 6 is the PAT algorithm (NCCL_ALGO_PAT). Unknown ids yield "Unknown";
 * every path returns a valid string.
 */
__hidden const char* ncclAlgoToString(int algo) {
  switch(algo) {
  case 0:
    return "Tree";
  case 1:
    return "Ring";
  case 2:
    return "CollnetDirect";
  case 3:
    return "CollnetChain";
  case 4:
    return "Nvls";
  case 5:
    return "NvlsTree";
  case 6:
    return "Pat";
  default:
    return "Unknown";
  }
}

/*
 * Maps a protocol id to the name used in the trace output.
 * Unknown ids yield "Unknown"; every path returns a valid string.
 */
__hidden const char* ncclProtoToString(int proto) {
  switch(proto) {
  case 0:
    return "LL";
  case 1:
    return "LL128";
  case 2:
    return "Simple";
  default:
    return "Unknown";
  }
}
// FIXME: chrome tracing asynchronous events (used by the printers below) allow event nesting for events that have the
// same id and category. It appears that nesting more than three events causes issues. Therefore, every event is given
// an increasing id and a category that matches the type of event (GROUP, COLL, P2P, PROXY, NET)
// Per-thread id used for the "id" field of group records; advanced by
// printGroupEventTrailer() once the matching end record has been written.
static __thread int groupId;

/*
 * Writes the begin ("ph": "b") record of a group event to fh, using the
 * current per-thread group id and the event's start timestamp.
 */
__hidden void printGroupEventHeader(FILE* fh, struct group* event) {
  const int traceId = groupId;  // read only; the trailer increments it
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"groupId\": %d}},\n",
          "Group", traceId, getpid(), 1, event->startTs, event->groupId);
}
/*
 * Writes the end ("ph": "e") record of a group event to fh with the event's
 * stop timestamp, then advances the per-thread group id so that the next
 * group gets a fresh id.
 */
__hidden void printGroupEventTrailer(FILE* fh, struct group* event) {
  const int traceId = groupId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          "Group", traceId, getpid(), 1, event->stopTs);
}
// Per-thread id used for the "id" field of collective records; advanced by
// printCollEventTrailer().
static __thread int collId;

/*
 * Writes the begin ("ph": "b") record of a collective event to fh. The record
 * is named after the collective function and carries the sequence number,
 * communicator hash, rank, count, datatype, algorithm, protocol and
 * nMaxChannels as args.
 */
__hidden void printCollEventHeader(FILE* fh, struct collective* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const char* algoName = ncclAlgoToString(event->algo);
  const char* protoName = ncclProtoToString(event->proto);
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, "
          "\"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n",
          funcName, collId, getpid(), 1, event->base.startTs,
          event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype,
          algoName, protoName, event->nMaxChannels);
}
/*
 * Writes the end ("ph": "e") record of a collective event to fh with the
 * event's stop timestamp, then advances the per-thread collective id.
 */
__hidden void printCollEventTrailer(FILE* fh, struct collective* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const int traceId = collId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          funcName, traceId, getpid(), 1, event->base.stopTs);
}
// Per-thread id used for the "id" field of p2p records; advanced by
// printP2pEventTrailer().
static __thread int p2pId;

/*
 * Writes the begin ("ph": "b") record of a point-to-point event to fh. The
 * record is named after the function and carries the communicator hash, rank,
 * peer, count and datatype as args.
 */
__hidden void printP2pEventHeader(FILE* fh, struct p2p* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, "
          "\"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n",
          funcName, p2pId, getpid(), 1, event->base.startTs,
          event->base.commHash, event->base.rank, event->peer, event->count, event->datatype);
}
/*
 * Writes the end ("ph": "e") record of a point-to-point event to fh with the
 * event's stop timestamp, then advances the per-thread p2p id.
 */
__hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) {
  const char* funcName = ncclFuncToString(event->base.func);
  const int traceId = p2pId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          funcName, traceId, getpid(), 1, event->base.stopTs);
}
static __thread int proxyOpId;
__hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) {
if (event->isSend) {
int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted);
int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait);
int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted);
int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
} else {
int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted);
int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived);
int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted);
int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n",
"Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp);
}
}
/*
 * Writes the end ("ph": "e") record of a proxy operation ("Send" or "Recv")
 * to fh with the event's stop timestamp, then advances the per-thread
 * proxy-op id.
 */
__hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) {
  const char* direction = event->isSend ? "Send" : "Recv";
  const int traceId = proxyOpId++;
  fprintf(fh,
          "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", "
          "\"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          direction, traceId, getpid(), 1, event->stopTs);
}
static __thread int proxyStepId;
__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) {
if (event->isSend) {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"SendWait", proxyStepId++, getpid(), 1, event->stopTs);
} else {
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n",
"RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step);
fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
"RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs);
}
}
// Per-thread id used for the "id" field of proxy control records; advanced
// once per printed control event.
static __thread int proxyCtrlId;

/*
 * Prints a proxy control event as a begin/end record pair named after the
 * event's state: "Idle" (Idle/Active), "Sleep" (Sleep/Wakeup) or "Append"
 * (Append/AppendEnd). An AppendEnd event additionally reports the number of
 * appended proxy ops in its args.
 *
 * A state matching none of the above is printed as "Unknown"; previously the
 * name pointer was left uninitialized in that case and then passed to "%s".
 */
__hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) {
  const char* str = "Unknown";
  if (event->state == ncclProfilerProxyCtrlIdle || event->state == ncclProfilerProxyCtrlActive) {
    str = "Idle";
  } else if (event->state == ncclProfilerProxyCtrlSleep || event->state == ncclProfilerProxyCtrlWakeup) {
    str = "Sleep";
  } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) {
    str = "Append";
  }

  if (event->state == ncclProfilerProxyCtrlAppendEnd) {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n",
            str, proxyCtrlId, getpid(), 1, event->startTs, event->appended);
  } else {
    fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
            str, proxyCtrlId, getpid(), 1, event->startTs);
  }
  // the end record reuses the id of the begin record, then the counter moves on
  fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n",
          str, proxyCtrlId++, getpid(), 1, event->stopTs);
}
//#define DEBUG_EVENTS
/*
 * Debug helper: when DEBUG_EVENTS is defined, appends a textual dump of the
 * event behind eHandle, labelled with tag, to the file "EventDebug-<pid>".
 * Without DEBUG_EVENTS the function does nothing.
 *
 * A NULL handle or a failure to open the dump file is ignored silently, so
 * enabling the debug output can never crash the profiled application.
 */
void debugEvent(void* eHandle, const char* tag) {
#ifdef DEBUG_EVENTS
  if (eHandle == NULL) return;

  char filename[64] = { 0 };
  snprintf(filename, sizeof(filename), "EventDebug-%d", getpid());
  FILE* fh = fopen(filename, "a+");
  if (fh == NULL) return;  // best effort: no dump if the file cannot be opened

  // every event struct starts with its one-byte type tag
  uint8_t type = *(uint8_t *)eHandle;
  if (type == ncclProfileGroup) {
    struct group* event = (struct group *)eHandle;
    fprintf(fh, "Group event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->refCount, __ATOMIC_RELAXED));
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileColl) {
    struct collective* event = (struct collective *)eHandle;
    fprintf(fh, "Collective event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", (void *)event->base.parent);
    // only channels whose embedded proxy op slot is in use are listed
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, (void *)&event->send[i]);
    for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, (void *)&event->recv[i]);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileP2p) {
    struct p2p* event = (struct p2p *)eHandle;
    fprintf(fh, "P2p event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED));
    fprintf(fh, " parent = %p\n", (void *)event->base.parent);
    fprintf(fh, " op = %p\n", (void *)&event->op);
    fprintf(fh, " startTs = %f\n", event->base.startTs);
    fprintf(fh, " stopTs = %f\n", event->base.stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyOp) {
    struct proxyOp* event = (struct proxyOp *)eHandle;
    fprintf(fh, "ProxyOp event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " channel = %d\n", event->channelId);
    fprintf(fh, " parent = %p\n", (void *)event->parent);
    fprintf(fh, " rank = %d\n", event->rank);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  } else if (type == ncclProfileProxyStep) {
    struct proxyStep* event = (struct proxyStep *)eHandle;
    fprintf(fh, "ProxyStep event %p tag = %s {\n", (void *)event, tag);
    fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv");
    fprintf(fh, " parent = %p\n", (void *)event->parent);
    fprintf(fh, " startTs = %f\n", event->startTs);
    fprintf(fh, " stopTs = %f\n", event->stopTs);
    fprintf(fh, "}\n");
  }
  fclose(fh);
#else
  // debug dump compiled out
  (void)eHandle;
  (void)tag;
#endif
}
void printEvent(FILE* fh, void* handle) {
if (handle == NULL || fh == NULL) return;
uint8_t type = *(uint8_t *)handle;
if (type == ncclProfileGroup) {
struct group* g = (struct group *)handle;
printGroupEventHeader(fh, g);
struct taskEventBase* base = taskEventQueueHead(g);
while (base) {
struct taskEventBase* next = base->next;
printEvent(fh, base);
base = next;
}
printGroupEventTrailer(fh, g);
} else if (type == ncclProfileColl) {
struct collective* c = (struct collective *)handle;
printCollEventHeader(fh, c);
for (int i = 0; i < MAX_CHANNELS; i++) {
printEvent(fh, &c->send[i]);
printEvent(fh, &c->recv[i]);
}
printCollEventTrailer(fh, c);
} else if (type == ncclProfileP2p) {
struct p2p* p = (struct p2p *)handle;
printP2pEventHeader(fh, p);
printEvent(fh, &p->op);
printP2pEventTrailer(fh, p);
} else if (type == ncclProfileProxyOp) {
struct proxyOp* p = (struct proxyOp *)handle;
printProxyOpEventHeader(fh, p);
for (int i = 0; i < MAX_STEPS; i++) {
printEvent(fh, &p->step[i]);
}
printProxyOpEventTrailer(fh, p);
} else if (type == ncclProfileProxyStep) {
struct proxyStep* p = (struct proxyStep *)handle;
printProxyStepEvent(fh, p);
} else if (type == ncclProfileProxyCtrl) {
struct proxyCtrl* p = (struct proxyCtrl *)handle;
printProxyCtrlEvent(fh, p);
}
return;
}
+13
Ver ficheiro
@@ -0,0 +1,13 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef PRINT_EVENT_H_
#define PRINT_EVENT_H_

#include <stdio.h>  // FILE, used in the printEvent() prototype below

// Debug aid: dumps the event behind eHandle, labelled with tag, when the
// implementation is built with DEBUG_EVENTS; a no-op otherwise.
void debugEvent(void* eHandle, const char* tag);

// Prints the event behind handle, including its nested events, to fh.
// Does nothing when either argument is NULL.
void printEvent(FILE* fh, void* handle);

#endif
+2 -1
Ver ficheiro
@@ -27,7 +27,7 @@ typedef enum {
ncclNumFuncs = 8
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
@@ -35,6 +35,7 @@ typedef enum {
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_ALGO_PAT 6
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
+7
Ver ficheiro
@@ -10,6 +10,7 @@ VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
ASAN ?= 0
UBSAN ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
@@ -93,6 +94,12 @@ LDFLAGS += -fsanitize=address -static-libasan
NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan
endif
ifneq ($(UBSAN), 0)
CXXFLAGS += -fsanitize=undefined
LDFLAGS += -fsanitize=undefined -static-libubsan
NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
+2 -2
Ver ficheiro
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_MINOR := 23
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
+712 -238
Ver ficheiro
A apresentação das diferenças no ficheiro foi suprimida por ser demasiado grande Carregar diff
+1
Ver ficheiro
@@ -59,6 +59,7 @@ const char* ncclAlgoToString(int algo) {
case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
case NCCL_ALGO_NVLS: return "NVLS";
case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
case NCCL_ALGO_PAT: return "PAT";
default: return "Unknown";
}
}
+21 -11
Ver ficheiro
@@ -19,7 +19,7 @@ static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
static std::chrono::steady_clock::time_point ncclEpoch;
@@ -122,7 +122,7 @@ static void ncclDebugInit() {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
while (ncclDebugFileEnv[c] != '\0' && (dfn - debugFn) < PATH_MAX) {
if (ncclDebugFileEnv[c++] != '%') {
*dfn++ = ncclDebugFileEnv[c-1];
continue;
@@ -132,16 +132,24 @@ static void ncclDebugInit() {
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
dfn += snprintf(dfn, PATH_MAX + 1 - (dfn - debugFn), "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = ncclDebugFileEnv[c-1];
if ((dfn - debugFn) < PATH_MAX) {
*dfn++ = ncclDebugFileEnv[c-1];
}
break;
}
if ((dfn - debugFn) > PATH_MAX) {
// snprintf wanted to overfill the buffer: set dfn to the end
// of the buffer (for null char) and it will naturally exit
// the loop.
dfn = debugFn + PATH_MAX;
}
}
*dfn = '\0';
if (debugFn[0] != '\0') {
@@ -181,9 +189,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
tid = syscall(SYS_gettid);
}
int cudaDev;
int cudaDev = 0;
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
(void)cudaGetDevice(&cudaDev);
}
char buffer[1024];
@@ -207,11 +215,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
// vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
// vsnprintf may return len >= sizeof(buffer) in the case of a truncated output.
// Rewind len so that we can replace the final \0 by \n
if (len > sizeof(buffer)) len = sizeof(buffer)-1;
buffer[len++] = '\n';
if (len) fwrite(buffer, 1, len, ncclDebugFile);
if (len >= sizeof(buffer)) len = sizeof(buffer)-1;
if (len) {
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+45 -14
Ver ficheiro
@@ -23,8 +23,11 @@ namespace {
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
/////////////// begin AllGather steps ///////////////
@@ -46,7 +49,7 @@ namespace {
rankDest = ringRanks[nranks-j];
offset = dataOffset + rankDest * count;
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
// Make final copy from buffer to dest.
@@ -54,7 +57,7 @@ namespace {
offset = dataOffset + rankDest * count;
// Final wait/copy.
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
}
@@ -81,6 +84,31 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128
}
};
// AllGather specialization for the PAT algorithm with the Simple protocol.
// run() repeatedly asks a PatAGAlgorithm object for the next operation and
// executes it through Primitives::patCopy until the algorithm reports the
// last operation.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
// tid/nthreads are forwarded to the Primitives constructor; work describes the collective.
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
using Proto = ProtoSimple<1, 1>;
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
size_t count, channelOffset, channelCount, chunkCount;
// fills count/channelOffset/channelCount/chunkCount for this channel (output parameters)
ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount);
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
// No peer lists are passed here (NULL, NULL); the primitives are constructed in primsModePatAg mode.
// NOTE(review): presumably the PAT mode sets up its own connections inside Primitives — confirm there.
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg);
// The algorithm object is given the chunk size in bytes, NCCL_STEPS, the [channelOffset, channelOffset+channelCount)
// element range of this channel, the per-rank count, the chunk count, and this rank's position among nranks.
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
int last = 0;
while (!last) {
int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
size_t inpIx, outIx;
// getNextOp fills in all operation parameters and sets last on the final operation
patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend);
}
}
};
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -165,7 +193,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
__device__ __forceinline__ void operator()(
int tid, int tn, int slice, int maxSliceSize,
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
) {
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -203,19 +231,22 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
int outIsDst = (inPlace && rank == ncclShmem.comm.rank) ? 0 : 1;
reduceCopy<ncclCollUnroll(), RedOp, T,
if (nSrcs != 0 && outIsDst+nDsts != 0) {
reduceCopy<ncclCollUnroll(), RedOp, T,
/*MultimemSrcs,MinSrcs,MaxSrcs=*/0,1,1,
/*MultimemDsts=*/0, 0+MinDsts, 1+MaxDsts,
/*PreOpSrcs=*/0>
(tid, tn, 0, nullptr, false,
/*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* {
return (char*)srcPtrs[src] + railAllOffset;
return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset;
},
/*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* {
return d < outIsDst ? outbuf + userOneBeg
: work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg
: (char*)dstPtrs[d-outIsDst] + railAllOffset;
},
delta);
}
railAllOffset += delta;
node += 1;
}
@@ -281,15 +312,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
__syncwarp();
} else {
// Phase 2: Recv network -> deposit output + send to bcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, nullptr,
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff,
/*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*BcastSendNotRecv=*/true> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat, work->direct, 0);
}
}
return;
@@ -299,15 +330,15 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
tn = nWarps3*WARP_SIZE;
if (tid < tn) {
// Phase 3: Recv bcast -> deposit output
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, tn, direct->heads+1, nullptr, nullptr, nullptr,
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff,
/*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*BcastSendNotRecv=*/false> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/0>(scat);
prims.template process</*Recv=*/1, /*Send=*/0>(scat, 0, work->direct);
}
return;
}
+77 -38
Ver ficheiro
@@ -23,8 +23,11 @@ namespace {
int nelem;
int chunk;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
ssize_t remCount = channelCount - elemOffset;
@@ -41,7 +44,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j = 2; j < nranks; ++j) {
@@ -49,7 +52,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
@@ -58,7 +61,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*postOp=*/true);
// k-2 steps: copy to next GPU
for (int j = 1; j < nranks - 1; ++j) {
@@ -66,7 +69,7 @@ namespace {
chunkOffset = chunk * chunkCount;
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
// Make final copy from buffer to dest.
@@ -75,7 +78,7 @@ namespace {
offset = gridOffset + elemOffset + chunkOffset;
nelem = (int)min(chunkCount, remCount - chunkOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
@@ -90,34 +93,34 @@ namespace {
int nelem;
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0> prims
(tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
if (tree->up == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
prims.directRecvReduceCopy(offset, offset, nelem, /*postOp=*/true);
}
}
else if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.send(offset, nelem);
prims.directSend(offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
}
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
(tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
if (tree->up == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -129,14 +132,14 @@ namespace {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
}
@@ -164,11 +167,11 @@ namespace {
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg, 0, 0, 0, work);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvReduceCopySend(offset, offset, nelem, /*doPost=*/true);
prims.directRecvReduceCopyDirectSend(offset, offset, nelem, /*doPost=*/true);
}
}
else if (tid < nthreadsSplit) {
@@ -180,40 +183,46 @@ namespace {
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
// FanAsymmetric<n, 1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
}
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
// Coverity reports that the callee treats &tree->up as an array. However, due to the use of
// FanAsymmetric<1, n>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth);
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
if (tree->down[0] == -1) {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
}
else {
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
nelem = min(chunkCount, channelCount - elemOffset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
}
@@ -264,9 +273,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -276,12 +285,15 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
}
// Coverity complains about a possible overrun inside the destructor of "prims", but that's actually
// a false positive.
// coverity[overrun-call:FALSE]
} else if (tid >= tidStartReduce && direct->out != -1) {
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -323,6 +335,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_P
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
if (hasDn) {
// Recv from network, broadcast
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
@@ -382,7 +397,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
ssize_t offset;
int nelem;
int remCount = channelCount%(nvls->nHeads*chunkSize);
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));
int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T));
if (tid < tidEndScatter) {
// Scatter
@@ -456,6 +471,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -467,6 +485,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
@@ -479,6 +500,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
// Coverity complains about a possible overrun inside the class below, but that's actually
// a false positive.
// coverity[identity_transfer:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -564,6 +588,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
// FanAsymmetric<3, 1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
@@ -579,6 +606,9 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
// Coverity reports that the callee treats &treeUp as an array. However, due to the use of
// FanAsymmetric<1, 3>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
@@ -639,21 +669,21 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
}
}
} else {
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvReduceSend(offset, nelem);
prims.directRecvReduceDirectSend(offset, offset, nelem);
}
}
}
@@ -668,40 +698,49 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
}
__syncwarp();
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recv(offset, nelem, /*postOp*/true);
prims.directRecv(offset, offset, nelem, /*postOp*/true);
}
}
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid * int(chunkSize);
int nelem = min(chunkSize, size - offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
prims.directRecvCopyDirectSend(offset, nelem, /*postOp*/true);
}
}
} else {
// Coverity reports that the callee treats &send as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex, work);
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecv(offset, nelem);
prims.directRecv(offset, offset, nelem);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
}
+9 -6
Ver ficheiro
@@ -24,8 +24,11 @@ namespace {
T *inputBuf = (T*)work->sendbuff;
T *outputBuf = (T*)work->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work);
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
offset = gridOffset + elemOffset;
@@ -33,14 +36,14 @@ namespace {
if (rank == root) {
if (inputBuf == outputBuf) {
prims.send(offset, nelem);
prims.directSend(offset, offset, nelem);
} else {
prims.copySend(offset, offset, nelem);
prims.directCopySend(offset, offset, nelem);
}
} else if (nextRank == root) {
prims.recv(offset, nelem);
prims.directRecv(offset, offset, nelem);
} else {
prims.recvCopySend(offset, nelem);
prims.directRecvCopyDirectSend(offset, nelem);
}
}
}
+11 -5
Ver ficheiro
@@ -97,7 +97,7 @@ __device__ inline void barrier_sync_aligned(int name, int nThreads) {
__device__ inline bool barrier_red_or(bool vote, int name) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred p, %2, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -106,7 +106,7 @@ __device__ inline bool barrier_red_or(bool vote, int name) {
}
__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred p, %2, %3, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -115,7 +115,7 @@ __device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
}
__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
int ans;
asm("{ .reg .pred p;"
asm volatile("{ .reg .pred p;"
" setp.ne.s32 p, %1, 0;"
" barrier.red.or.pred.aligned p, %2, p; "
" selp.s32 %0, 1, 0, p; }"
@@ -137,9 +137,9 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
int offset = 16*tid;
if (offset < bytes) {
uint64_t a=0, b=0;
asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
asm volatile("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset) : "memory");
uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b) : "memory");
}
}
@@ -300,6 +300,9 @@ struct RunWorkBatch {
if (work->nWarps != workPrev->nWarps) __syncthreads();
}
int subtn = work->nWarps*WARP_SIZE;
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
}
}
@@ -348,6 +351,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
default:
{ int subtid = tid - 2*WARP_SIZE;
int subtn = tn - 2*WARP_SIZE;
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
} break;
}
+17 -1
Ver ficheiro
@@ -69,6 +69,8 @@ __device__ __forceinline__ void reduceCopyPacks(
minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
#pragma unroll
for (int d=0; d < MinDsts; d++)
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind;
// We dictate loop termination condition according to whether partial hunks
@@ -93,13 +95,17 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll (MinSrcs-1 + !(MinSrcs-1))
for (int s=1; s < MinSrcs; s++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
BytePack<BytePerPack> tmp[Unroll];
// coverity[dead_error_line]
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
// coverity[dead_error_line]
tmp[u] = applyLoadMultimem<RedFn, BytePerPack>(redFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
@@ -108,6 +114,7 @@ __device__ __forceinline__ void reduceCopyPacks(
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// coverity[dead_error_line]
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
@@ -116,6 +123,8 @@ __device__ __forceinline__ void reduceCopyPacks(
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
uintptr_t src = cvta_to_global(srcPtrFn(s)) + threadBytesBehind;
BytePack<BytePerPack> tmp[Unroll];
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
@@ -125,6 +134,8 @@ __device__ __forceinline__ void reduceCopyPacks(
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
@@ -139,7 +150,10 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll (MinDsts + !MinDsts)
for (int d=0; d < MinDsts; d++) {
#pragma unroll Unroll
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
for (int u=0; u < Unroll; u++) {
// coverity[dead_error_condition]
if (d < MultimemDsts) {
multimem_st_global(minDsts[d], acc[u]);
} else {
@@ -161,6 +175,8 @@ __device__ __forceinline__ void reduceCopyPacks(
#pragma unroll
for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
#pragma unroll
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
threadBytesBehind += nWarps*BytePerHunk;
threadBytesAhead -= nWarps*BytePerHunk;
+7 -4
Ver ficheiro
@@ -7,7 +7,7 @@ all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","Send
all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"]
all_protos = ["LL","LL128","SIMPLE"]
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"]
all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]
################################################################################
# The first command line argument is the path to the directory to generate and
@@ -74,11 +74,11 @@ else:
################################################################################
algos_of_coll = {
"AllGather": ["RING","COLLNET_DIRECT","NVLS"],
"AllReduce": all_algos,
"AllGather": ["RING","COLLNET_DIRECT","NVLS","PAT"],
"AllReduce": ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"],
"Broadcast": ["RING"],
"Reduce": ["RING"],
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS"],
"ReduceScatter": ["RING","COLLNET_DIRECT","NVLS","PAT"],
"SendRecv": [None]
}
@@ -253,6 +253,9 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
cudart, _ = required_cuda(*kfn)
sym = paste("_", "ncclDevKernel", *kfn)
if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
# __global__ below gets removed by the host compiler, which results in
# Coverity diagnosing an inconsistency in the declaration specifiers.
out("// coverity[declaration]\n")
out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
if cudart != 0: out("#endif\n")
out("\n")
+4 -2
Ver ficheiro
@@ -19,10 +19,10 @@
inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) {
#if __CUDA_ARCH__ >= 700
asm volatile("ld.relaxed.gpu.u64 {%0}, [%1];"
: "=l"(v) : "l"(ptr));
: "=l"(v) : "l"(ptr) : "memory");
#else
asm volatile("ld.volatile.global.u64 {%0}, [%1];"
: "=l"(v) : "l"(ptr));
: "=l"(v) : "l"(ptr) : "memory");
#endif
}
@@ -226,6 +226,8 @@ inline __device__ void ncclNetDeviceUnpackInner(
int PPW = ppw(nbytes, nw);
// Coverity reports a potential overflow but in reality PPW is tiny so there's no need to store it in an uint64_t.
// coverity[overflow_before_widen]
for (uint64_t meta_s = w * PPW; meta_s < meta_cnt; meta_s += nw * PPW) {
uint64_t iter_meta_cnt = meta_cnt - meta_s;
+26 -23
Ver ficheiro
@@ -11,28 +11,28 @@
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(ptr));
: "=l"(v0), "=l"(v1) : "l"(ptr) : "memory");
}
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(ptr));
:: "l"(v0), "l"(v1), "l"(ptr) : "memory");
}
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
uint64_t* shmemAsmPtr;
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr) : "memory");
return shmemAsmPtr;
}
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
: "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr) : "memory");
}
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
:: "l"(v0), "l"(v1), "l"(shmemAsmPtr) : "memory");
}
template<typename T>
@@ -48,20 +48,20 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
// Produce 4 bytes of sub-register type by reading 2 4-byte
// aligned values and shifting.
uint32_t lo, hi;
asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0));
asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1));
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0) : "memory");
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1) : "memory");
tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
}
}
else if(sizeof(T) == 4) {
#pragma unroll
for(int e=0; e < 4; e++)
asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e));
asm volatile("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e) : "memory");
}
else /*sizeof(T)==8*/ {
#pragma unroll
for(int e=0; e < 2; e++)
asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e));
asm volatile("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e) : "memory");
}
v0 = tmp8[0];
v1 = tmp8[1];
@@ -146,6 +146,9 @@ struct BytePackOf<BytePack<0>> {
// Returns `value` viewed as its BytePackOf<T>::Pack type: the value is written
// into one member of an anonymous union and the other member is returned.
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
  using Pack = typename BytePackOf<T>::Pack;
  union { Pack packed; T raw; };
  // Coverity recommends the use of std::move here but, given that T is a POD
  // scalar, a plain copy will be just as efficient.
  // coverity[copy_assignment_call]
  raw = value;
  return packed;
}
@@ -183,7 +186,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
asm volatile("ld." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
@@ -191,7 +194,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
asm("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr)); \
asm volatile("ld.volatile." #space "." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : #addr_reg_ty(addr) : "memory"); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
@@ -212,7 +215,7 @@ template<> __device__ __forceinline__ void st_relaxed_gpu_global<0>(uintptr_t ad
template<> \
__device__ __forceinline__ BytePack<bytes> ld_relaxed_gpu_global<bytes>(uintptr_t addr) { \
data_cxx_ty tmp; \
asm("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr)); \
asm volatile("ld." PTX_relaxed_gpu ".global." #data_ptx_ty " %0, [%1];" : "="#data_reg_ty(tmp) : "l"(addr) : "memory"); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
@@ -242,18 +245,18 @@ DEFINE_ld_st__size(8, uint64_t, b64, l)
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
asm volatile("ld." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
asm("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr)); \
asm volatile("ld.volatile." #space ".v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : #addr_reg_ty(addr) : "memory"); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
asm("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
asm volatile("st." #space ".v2.b64 [%0], {%1,%2};" :: #addr_reg_ty(addr), "l"(value.u64[0]), "l"(value.u64[1]) : "memory"); \
}
DEFINE_ld_st_16__space(global, uintptr_t, l)
DEFINE_ld_st_16__space(shared, uint32_t, r)
@@ -262,7 +265,7 @@ DEFINE_ld_st_16__space(shared, uint32_t, r)
template<>
__device__ __forceinline__ BytePack<16> ld_relaxed_gpu_global<16>(uintptr_t addr) {
BytePack<16> ans;
asm("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr));
asm volatile("ld." PTX_relaxed_gpu ".global.v2.b64 {%0,%1}, [%2];" : "=l"(ans.u64[0]), "=l"(ans.u64[1]) : "l"(addr) : "memory");
return ans;
}
template<>
@@ -277,33 +280,33 @@ __device__ __forceinline__ void st_relaxed_gpu_global<16>(uintptr_t addr, BytePa
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.relaxed.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#else
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_gpu_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.relaxed.gpu.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#else
asm("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.volatile.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#endif
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
#if __CUDA_ARCH__ >= 700
asm("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.acquire.sys.global.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#else
asm("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("ld.volatile.sys.global.u64 %0, [%1]; membar.gl;" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
#endif
return ans;
}
+9 -3
Ver ficheiro
@@ -115,19 +115,25 @@ struct PrimitivesWithoutDirect {
// Forwards to sendFromOutput() on this object viewed as RealPrimitives
// (presumably the CRTP-derived class -- confirm against the struct header),
// passing outIx and eltN through unchanged.
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
  RealPrimitives* impl = static_cast<RealPrimitives*>(this);
  impl->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
__device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
// Forwards to copySend() on this object viewed as RealPrimitives; inpIx,
// outIx, eltN and postOp are all passed through unchanged.
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
  RealPrimitives* impl = static_cast<RealPrimitives*>(this);
  impl->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
__device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
__device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
// Forwards to recvReduceSend(inpIx, eltN) on this object viewed as
// RealPrimitives. Note that outIx and postOp are accepted here but are not
// passed on to the callee.
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
  RealPrimitives* impl = static_cast<RealPrimitives*>(this);
  impl->recvReduceSend(inpIx, eltN);
}
// Forwards to recvReduceCopySend() on this object viewed as RealPrimitives;
// inpIx, outIx, eltN and postOp are all passed through unchanged.
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
  RealPrimitives* impl = static_cast<RealPrimitives*>(this);
  impl->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
};
#include "prims_simple.h"
+33 -14
Ver ficheiro
@@ -101,7 +101,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
uint32_t data1, flag1, data2, flag2;
int spins = 0;
do {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory");
if (checkAbort(spins, 0)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
@@ -112,9 +112,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
#pragma unroll
for (int i=BeginIx; i < MaxRecv; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (i < fan.nrecv()) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
}
}
}
@@ -123,7 +125,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
uint32_t flag = recvFlag(i);
int spins = 0;
while (line[i].flag1 != flag || line[i].flag2 != flag) {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory");
if (checkAbort(spins, 0)) break;
}
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
@@ -131,7 +133,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
}
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
}
static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T);
@@ -145,13 +147,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
uint64_t u8;
};
if(sizeof(U) == 1)
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
asm volatile("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
else if(sizeof(U) == 2)
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
asm volatile("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src) : "memory");
else if(sizeof(U) == 4)
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
asm volatile("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src) : "memory");
else
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
asm volatile("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src) : "memory");
return elt;
}
@@ -165,13 +167,13 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
};
elt = val;
if(sizeof(U) == 1)
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
asm volatile("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
else if(sizeof(U) == 2)
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
asm volatile("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2) : "memory");
else if(sizeof(U) == 4)
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
asm volatile("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4) : "memory");
else
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
asm volatile("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8) : "memory");
}
struct DataLoader {
@@ -194,6 +196,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
else {
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if(i==0 || i < eltN)
elt[i] = load(src + i);
}
@@ -218,6 +222,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (i==0 || i < eltN)
//store(dst+i, elt[i]);
dst[i] = elt[i];
@@ -261,6 +267,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
if (RECV) {
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
#pragma unroll MaxRecv
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = applyReduce(redOp, peerData, data);
@@ -271,6 +279,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
// Send : inter-node, then intra-node, then local
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
storeLL(sendPtr(i)+offset, data, sendFlag(i));
storeLL(sendPtr(0)+offset, data, sendFlag(0));
@@ -288,6 +298,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
postRecv();
}
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
incSend(i, offset);
incSend(0, offset);
@@ -324,8 +336,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
bool userBufReg=false, int stepSize_=0
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -334,16 +346,23 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
// coverity[dead_error_line]
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
// happen given the two "while" loops just above.
// coverity[var_deref_model:FALSE]
loadRecvSync();
// coverity[var_deref_model:FALSE]
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
+9 -1
View file
@@ -234,6 +234,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
}
}
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
@@ -272,6 +274,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
/************************ Send **************************/
if (SEND) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
@@ -365,7 +369,7 @@ public:
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
bool userBufReg=false, int stepSize_=0
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
@@ -383,7 +387,11 @@ public:
nsend++;
}
this->fan = Fan(nrecv, nsend);
// Coverity reports recvConn and sendConn being possibly NULL at this point but that won't actually
// happen given the two "while" loops just above.
// coverity[var_deref_model:FALSE]
loadRecvSync();
// coverity[var_deref_model:FALSE]
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
+456 -225
View file
@@ -7,6 +7,12 @@
#include "network/unpack/unpack.h"
#include <cassert>
// Peer-selection mode taken by the Simple-protocol Primitives constructor
// (its `mode` argument, which defaults to primsModeDefault).
// The two Pat* values both put the primitives in PatMode, where each thread
// handles the peer at distance 2^(tid%32) from the local rank; they differ
// only in which direction is used for receiving and which for sending.
// NOTE(review): "Rs"/"Ag" presumably stand for ReduceScatter/AllGather — confirm.
enum primsMode {
primsModeDefault = 0, // connect to the ranks listed in recvPeers/sendPeers
primsModePatRs = 1, // PAT: receive from (rank - delta), send to (rank + delta)
primsModePatAg = 2 // PAT: send to (rank - delta), receive from (rank + delta)
};
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
@@ -14,21 +20,25 @@ class Primitives<
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
static constexpr int RoleWaitRecv = 0x04, // 0x1 0x2 are free to use
static constexpr int RoleInput = 0x01,
RoleOutput = 0x02,
RoleWaitRecv = 0x04,
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
UserBufferMode = 0x80,
NetRegMode = 0x80,
ConnFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
// 0x800 is free to use
PatMode = 0x800,
NvlsMinPolling = 0x1000,
NetDeviceUnpack = 0x2000,
AnyNetDeviceUnpack = 0x4000,
NvlsDirectRead = 0x8000,
NvlsDirectWrite = 0x10000;
NvlsDirectWrite = 0x10000,
IpcWrite = 0x20000,
IpcRead = 0x40000;
const int tid, tidInBlock;
const int nthreads;
int nworkers;
@@ -38,13 +48,15 @@ class Primitives<
int flags;
int group;
uint64_t step;
struct ncclConnInfo* conn = NULL;
struct ncclConnFifo* connFifo = NULL;
T* connEltsFifo;
T* directBuff;
T* directBuff = NULL;
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
int connStepSize; // Connection step size
void* netDeviceHandle;
uint64_t accSize; // Accumulated size. Used by PAT operations
// Don't use barrier 0 as it's used by the final sync
__device__ void barrier() {
@@ -95,7 +107,7 @@ class Primitives<
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
asm volatile("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)) : "memory");
return ans;
}
#endif
@@ -107,8 +119,10 @@ class Primitives<
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
int spins = 0;
@@ -125,28 +139,30 @@ class Primitives<
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
if (flags & UserBufferMode) {
if (flags & NetRegMode) {
// Do nothing
} else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T);
} else if (isSendNotRecv && DirectSend) {
if (flags & (DirectWrite | NvlsDirectWrite)) {
if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) {
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
} else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & (DirectRead | NvlsDirectRead)) {
if (flags & (DirectRead | NvlsDirectRead | IpcRead)) {
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
} else if ((flags & DirectWrite) || (flags & IpcWrite)) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
}
else {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
}
if (flags & NetDeviceUnpack) {
@@ -182,7 +198,7 @@ class Primitives<
int slice = 0;
int offset = 0;
if (tid < nworkers && offset < nelem && ((flags & UserBufferMode) == 0)) {
if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) {
// Worker-only loop for non-empty slices. Non-workers and empty slices are
// processed in the loop following this if block. The benefit of splitting
// the loop like this is we pull two branches out of the critical path.
@@ -234,7 +250,7 @@ class Primitives<
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]
/* NVLS can have srcs[0] == dsts[0], but we cannot enter this "if branch",
* so we need to check whether MultimemSrcs and MultimemDsts are 0. */
&& MultimemSrcs == 0 && MultimemDsts == 0) {
&& MultimemSrcs == 0 && MultimemDsts == 0 && !Src) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
@@ -250,7 +266,7 @@ class Primitives<
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
} else {
} else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) {
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
reduceCopy<Unroll, RedOp, T,
@@ -265,6 +281,8 @@ class Primitives<
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
} while (slice < SlicePerChunk && offset < nelem);
}
@@ -310,12 +328,13 @@ public:
}
template<int Recv, int Send, typename Fn>
__device__ __forceinline__ void process(Fn &&fn) {
__device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) {
#pragma unroll 1
for (int slice=0; slice < SlicePerChunk; slice++) {
if (tid < nworkers) {
int nsend, nrecv;
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
@@ -326,19 +345,53 @@ public:
if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) {
int offset = loadInt(&connFifo[step%NCCL_STEPS].offset);
ptrs[index] = connEltsFifo + offset/sizeof(T);
} else if (Direct && fn.work->regUsed) {
if (isSendNotRecv) {
if (flags & (DirectWrite | IpcWrite)) {
ptrs[index] = directBuff;
} else if (flags & (DirectRead | IpcRead)) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
} else {
if (flags & (DirectRead | IpcRead)) {
ptrs[index] = directBuff;
} else if (flags & (DirectWrite | IpcWrite)) {
if (Send)
ptrs[index] = directBuff; // send to next from my output buffer
else
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
subBarrier();
fn.template operator()<SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend>
(tid, nworkers, slice, stepSize*StepPerSlice,
fan.nrecv(), ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes);
if (Recv == 0 || ncclShmem.groups[group].srcs[0] == nullptr) {
nrecv = 0;
} else {
nrecv = fan.nrecv();
}
if (Send == 0 || ncclShmem.groups[group].dsts[0] == nullptr) {
nsend = 0;
} else {
nsend = fan.nsend();
}
fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend >
(tid, nworkers, slice, stepSize * StepPerSlice,
nrecv, ncclShmem.groups[group].srcs,
nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag);
}
barrier();
int32_t dstSize = 0;
if (flags & Send*RolePostSend) {
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
dstSize = ncclShmem.groups[group].dstSizes[index];
ncclShmem.groups[group].dstSizes[index] = 0;
if (flags & ConnFifoEnabled) connFifo[step%NCCL_STEPS].size = dstSize*sizeof(T);
@@ -421,99 +474,97 @@ private:
}
}
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
// handle must be a device ptr
netDeviceHandle = conn->netDeviceHandle.handle;
// Cache the handle
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
flags |= NetDeviceUnpack;
}
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
connStepPtr = conn->head;
*connStepPtr = step; // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->connFifo != nullptr) {
flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
} else if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectRead;
// Loads this thread's receive-side connection state from peer->recv[connIndex].
//
// The connection pointer is stored in the `conn` member, `step` is initialised
// from the connection (rounded up to a multiple of SlicePerChunk*StepPerSlice),
// and role-specific state plus mode bits in `flags` are derived from it.
//
//   peer      - channel peer whose recv[] connections are read
//   connIndex - which entry of peer->recv to use
//   direct    - bitmask tested against NCCL_DIRECT_READ; any non-zero value
//               also selects the read flavour when connIndex == 1
//   regFlag   - non-zero when registered user buffers are in use; gates the
//               whole direct/IPC/NVLS flag selection below
//
// The function itself does not check the thread's role before touching `conn`
// and `step`; only the head/tail handling is conditional on
// RolePostRecv / RoleWaitRecv.
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
conn = &peer->recv[connIndex];
if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
// handle must be a device ptr
netDeviceHandle = conn->netDeviceHandle.handle;
// Cache the handle
ncclNetDeviceUnpackSetup(netDeviceHandle, group, index);
flags |= NetDeviceUnpack;
}
// Start from the step saved on the connection, aligned to a whole chunk.
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
// The posting thread tracks the head pointer and publishes the rounded step.
connStepPtr = conn->head;
*connStepPtr = step; // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
// In PatMode the shared recvConns[] slot is not written.
if ((flags & PatMode) == 0) ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
// The waiting thread polls the tail pointer; cache its current value.
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
// Step size is stored in bytes on the connection; convert to elements of T.
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->connFifo != nullptr) {
// A connection FIFO takes precedence: no direct/IPC flag is set in that case.
flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
} else if (Direct && regFlag) {
// User buffers have been registered
// Exactly one of the three branches below (IPC, direct, NVLS) may set a flag.
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
if (P2p) {
// P2p: follow the connection's own flags; write wins if NCCL_IPC_WRITE is set.
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
} else if (connIndex == 1 && direct) {
flags |= IpcRead;
} else {
// Otherwise follow the caller's request: read if asked for, else write.
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
// Same decision structure as the IPC branch, with the Direct* flags.
if (P2p) {
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
} else if (connIndex == 1 && direct) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
/* NVLS direct */
flags |= NvlsDirectRead;
}
}
}
}
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) {
conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
connFifo = conn->connFifo;
if (connFifo != nullptr) flags |= ConnFifoEnabled;
connFifo = conn->connFifo;
if (connFifo != nullptr) flags |= ConnFifoEnabled;
if (flags & RolePostSend) {
connStepPtr = conn->tail;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (connFifo == nullptr && Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL) && e != nullptr && e->regUsed) {
/* NVLS direct */
flags |= NvlsDirectWrite;
if (flags & RolePostSend) {
connStepPtr = conn->tail;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
if (flags & RoleWaitSend) {
if ((flags & PatMode) == 0) ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
connStepSize = conn->stepSize/sizeof(T);
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (connFifo == nullptr && Direct && regFlag) {
// User buffers have been registered
if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) {
if (P2p) {
flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead;
} else if (connIndex == 1 && direct) {
flags |= IpcRead;
} else {
flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
if (P2p) {
flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead;
} else if (connIndex == 1 && direct) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite;
}
} else if ((conn->flags & NCCL_NVLS_MIN_POLL)) {
/* NVLS direct */
flags |= NvlsDirectWrite;
}
}
}
@@ -523,7 +574,8 @@ private:
__device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,
bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
@@ -531,33 +583,71 @@ private:
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync =
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
index = -1;
int peer = -1;
flags = 0;
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
index = -1;
if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers
int nrecv=0, nsend=0;
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_line]
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
// coverity[dead_error_line]
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
int peer = 0;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
constexpr int ThreadPerSync =
MaxSend >= 16 || MaxRecv >= 16 ? 32 : // NVLS may have an arity > 8. In that case increase the size of the groups
MaxSend >= 8 || MaxRecv >= 8 ? 16 :
8; // Allows for all roles (WaitRecv/WaitSend/PostRecv/PostSend) within a single warp
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
assert(2*(nrecv+nsend) <= nthreads); // Ensure no thread is assigned more than one role.
// Coverity assumes that index will equal tid based on the line below, but it doesn't consider the setting
// of flags. This results in multiple false positive overruns being reported here and in all_reduce.h.
// Unfortunately, we've been unsuccessful in trying to silence them with a single directive here so
// instead it's being done at the callers.
// coverity[assignment:FALSE]
if (tid < nrecv) { flags |= RoleWaitRecv; index = tid; }
// Yes, for some template arguments this code will be unreachable. That's fine.
// coverity[dead_error_begin]
else if (tid < nrecv+nsend) { flags |= RoleWaitSend; index = tid-nrecv; }
else if (nthreads-nsend <= tid) { flags |= RolePostSend; index = tid-(nthreads-nsend); }
else if (nthreads-nrecv-nsend <= tid) { flags |= RolePostRecv; index = tid-(nthreads-nrecv-nsend); }
if (userBufReg) flags |= UserBufferMode;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
} else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n
flags |= PatMode;
accSize = 0;
int nranks = ncclShmem.comm.nRanks;
int rank = ncclShmem.comm.rank;
// A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer.
index = tid % 32;
uint32_t delta = 1 << index;
const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv};
int block = tid / 32;
if (block < 4 && delta < nranks) {
int role = roles[block];
if (mode == primsModePatRs) {
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks;
if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks;
} else if (mode == primsModePatAg) {
if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks;
if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks;
}
flags |= role;
} else if (tid == 128) {
flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation
}
}
// Coverity thinks that index could be -1 here but that's not actually the case.
// coverity[negative_returns:FALSE]
if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg);
// coverity[negative_returns:FALSE]
if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg);
if (netReg) flags |= NetRegMode;
if (barrierAny(flags & NetDeviceUnpack)) {
flags |= AnyNetDeviceUnpack;
@@ -569,18 +659,14 @@ private:
}
}
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
// coverity[negative_returns:FALSE]
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? e->regUsed : ipcReg), peer);
}
__device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
if ((flags & UserBufferMode) && (flags & RoleWaitSend)) {
if (flags & (RolePostSend|RolePostRecv)) conn->step = step;
if ((flags & NetRegMode) && (flags & RoleWaitSend)) {
// Make sure we wait until the proxy has sent data before we return.
// We don't want the next CUDA kernel to overwrite the send buffer which
// was accessed directly.
@@ -599,97 +685,111 @@ private:
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) {
if (tid==0) {
ncclShmem.groups[group].userInput = (void*)inputBuf;
ncclShmem.groups[group].userOutput = (void*)outputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->coll.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
if (slot) {
while (*slot != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot) {
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
*slot = nullptr;
} else {
/* slot is NULL, it must be regUsed == 1 */
directBuff = (T*)e->dnOutputs[index];
}
}
if (Direct && sendProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
if (slot && argSlot0 && argSlot1) {
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
}
if (Direct && recvAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot && argSlot0 && argSlot1) {
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
if (Direct && ipcReg) {
bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite);
bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite);
bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer
if (recvProvider) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
if (slot) {
T* exchgPtr;
directBuff = (T*)outputBuf;
while (*slot != nullptr && !checkAbort(spins));
if (P2p) {
exchgPtr = (T*)outputBuf;
} else {
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
}
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
*slot = reinterpret_cast<void*>(exchgPtr);
}
}
if (sendAcceptor) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void* ptr;
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot) {
directBuff = reinterpret_cast<T*>(ptr);
*slot = nullptr;
} else {
directBuff = (T*)work->dnOutputs[index];
}
}
if (sendProvider) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange + 1;
// Wait for consumer to consume previous value before trampling it.
if (slot && argSlot0 && argSlot1) {
T* exchgPtr;
while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
if (P2p) {
exchgPtr = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
} else {
int localPeer = ncclShmem.comm.rankToLocalRank[peer];
if (MaxRecv == 0)
exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]);
else
exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]);
}
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1) << 32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1) << 32) | (uint32_t)(redOpArg >> 32);
*slot = reinterpret_cast<T*>(exchgPtr);
}
}
if (recvAcceptor) {
int spins = 0;
void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange + 1;
void* ptr;
while (slot) {
ptr = *slot;
if (ptr != nullptr || checkAbort(spins)) break;
}
if (slot && argSlot0 && argSlot1) {
directBuff = reinterpret_cast<T*>(ptr);
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff);
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
} else {
// Coverity complains about work being possibly NULL below. However, slot
// being NULL means that the NVLS buffer is registered (regUsed == 1)
// so work can't be NULL in this code path.
// coverity[var_deref_op]
directBuff = (T*)work->dnInputs[index];
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
} else {
directBuff = (T*)e->dnInputs[index];
}
}
}
@@ -717,8 +817,8 @@ private:
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
__device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false);
@@ -737,8 +837,8 @@ private:
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
__device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false);
@@ -750,6 +850,9 @@ private:
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<1, 0, 1, 0, Input, Output>.
// The only difference from recvReduceCopy is the first template argument
// (1 here, 0 there).
// NOTE(review): the template argument order looks like
// <DirectRecv, DirectSend, Recv, Send, SrcBuf, DstBuf> — confirm against
// genericOp's declaration.
__device__ __forceinline__ void directRecvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
@@ -757,14 +860,20 @@ private:
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<1, 1, 1, 1, Input, -1>.
// Unlike directRecvReduceSend, outIx is forwarded to genericOp even though the
// last template argument is -1, and eltN is declared ssize_t rather than int.
// NOTE(review): presumably outIx is the offset used by the direct send —
// confirm against genericOp.
__device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
__device__ __forceinline__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
// Thin wrapper: forwards to genericOp<1, 1, 1, 1, Input, Output>.
// Differs from recvReduceCopyDirectSend only in the first template argument
// (1 here, 0 there); eltN is declared ssize_t rather than int.
// NOTE(review): the first two template arguments look like the
// direct-recv / direct-send switches — confirm against genericOp.
__device__ __forceinline__ void directRecvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) {
genericOp<1, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
@@ -783,4 +892,126 @@ private:
directGather(intptr_t outIx, ssize_t totalElem, int peerElem, ssize_t peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
// One step of the PAT reduce path (called from the PAT ReduceScatter runner).
//
// Up to two sources are reduced into a single destination:
//   srcs[0] - data received from the peer at connection index recvPow2
//             (dropped when recvPow2 < 0),
//   srcs[1] - either the local user input at inpIx ("new data") or the
//             destination itself (accumulate into what is already there),
//   dsts[0] - the send FIFO slot of the peer at connection index sendPow2, or
//             the local user output at outIx when sendPow2 < 0.
//
// recvOffset / sendOffset are element offsets inside the FIFO slot.
// sendStepOffset selects a send slot ahead of the current step.
// postRecv / postSend say whether this call advances `step` and posts to the
// peer on the receive / send side.
// A negative nelem is treated as 0.
__device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) {
nelem = nelem < 0 ? 0 : nelem;
T* userInput = (T*)ncclShmem.groups[group].userInput;
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
// Receive side: only the RoleWaitRecv thread whose connection index matches recvPow2.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset;
int spins = 0;
// Spin until the cached step counter reaches step + StepPerSlice, or abort is signalled.
while (connStepCache < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
if (postRecv) step += StepPerSlice;
}
// Send side: only the RoleWaitSend thread whose connection index matches sendPow2.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
int spins = 0;
// Spin until the targeted slot (step + sendStepOffset) is within NCCL_STEPS of
// the cached step counter, or abort is signalled.
while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset;
// accSize is compared against, then raised to, the end position of this write
// (slot base + sendOffset + nelem); it appears to act as a high-water mark that
// tells fresh regions from regions that already hold partial results.
if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) {
// New data, add our own data to it.
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize;
// Record the byte size of the slot contents for the FIFO consumer.
if (flags & ConnFifoEnabled)
connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
} else {
// There is already data in there, accumulate instead of writing to it.
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
}
if (postSend) step += StepPerSlice;
}
if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer
ncclShmem.groups[group].dsts[0] = userOutput + outIx;
// Same high-water-mark test as on the send side, keyed on the output offset.
if (accSize < outIx + nelem) {
// New data, add our own data to it.
ncclShmem.groups[group].srcs[1] = userInput + inpIx;
accSize = outIx + nelem;
} else {
// There is already data in there, accumulate instead of writing to it.
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0];
}
}
// The srcs/dsts pointers above are written by individual role threads; this
// barrier precedes their use by all threads in reduceCopy
// (NOTE(review): presumably a group-wide sync — confirm against barrier()).
barrier();
int nSrcs = 2;
void** srcs = ncclShmem.groups[group].srcs;
if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source
// Do no work if the kernel has been aborted.
int workSize = ncclShmem.aborted ? 0 : nelem;
reduceCopy<Unroll, RedOp, T, 0, 1, 2, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize);
barrier();
// Post to the peers only when requested; the argument tells postPeer whether
// any elements were processed (0 < nelem).
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
}
// One step of the PAT copy path: one source is copied to up to two destinations.
//
//   srcs[0] - data received from the peer at connection index recvPow2, or the
//             local user input at inpIx when recvPow2 < 0,
//   dsts[0] - the send FIFO slot of the peer at connection index sendPow2
//             (dropped when sendPow2 < 0),
//   dsts[1] - the local user output at outIx for data not seen before; otherwise
//             set equal to srcs[0], which removes it from the copy below.
//
// recvOffset / sendOffset are element offsets inside the FIFO slot.
// recvStepOffset selects a receive slot ahead of the current step.
// postRecv / postSend say whether this call advances `step` and posts to the
// peer on the receive / send side.
// A negative nelem is treated as 0.
__device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) {
nelem = nelem < 0 ? 0 : nelem;
T* userInput = (T*)ncclShmem.groups[group].userInput;
T* userOutput = (T*)ncclShmem.groups[group].userOutput;
// Receive side: only the RoleWaitRecv thread whose connection index matches recvPow2.
if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) {
ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset;
int spins = 0;
// Spin until the cached step counter covers the targeted slot
// (step + recvStepOffset + StepPerSlice), or abort is signalled.
while (connStepCache < step + recvStepOffset + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
// accSize is compared against, then raised to, the end position of this read;
// it appears to act as a high-water mark so each received region is copied to
// the user output only once.
if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) {
// New data, copy to our output buffer.
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize;
} else {
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
}
if (postRecv) step += StepPerSlice;
}
// Send side: only the RoleWaitSend thread whose connection index matches sendPow2.
if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) {
int spins = 0;
// Spin until the current slot is within NCCL_STEPS of the cached step counter,
// or abort is signalled.
while (connStepCache + NCCL_STEPS < step + StepPerSlice) {
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
}
ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset;
// The slot size is recorded, and step advanced, only on the call that posts the send.
if (postSend) {
if (flags & ConnFifoEnabled)
connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T);
step += StepPerSlice;
}
}
if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer
ncclShmem.groups[group].srcs[0] = userInput + inpIx;
// Same high-water-mark test as on the receive side, keyed on the input offset.
if (accSize < inpIx + nelem) {
// New data, copy to our output buffer.
ncclShmem.groups[group].dsts[1] = userOutput + outIx;
accSize = inpIx + nelem;
} else {
ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done
}
}
// The srcs/dsts pointers above are written by individual role threads; this
// barrier precedes their use by all threads in reduceCopy
// (NOTE(review): presumably a group-wide sync — confirm against barrier()).
barrier();
int nDsts = 2;
void** dsts = ncclShmem.groups[group].dsts;
if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest
if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done.
// Do no work if the kernel has been aborted.
int workSize = ncclShmem.aborted ? 0 : nelem;
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 2, /*PreOpSrcs*/0>
(tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false,
1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize);
barrier();
// Post to the peers only when requested; the argument tells postPeer whether
// any elements were processed (0 < nelem).
if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem);
if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem);
}
};
+3
Ver ficheiro
@@ -23,6 +23,9 @@ namespace {
size_t offset;
int nelem;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
+35 -21
Ver ficheiro
@@ -234,10 +234,10 @@ struct Apply_Reduce<FuncProd<uint8_t>, /*EltPerPack=*/4> {
uint32_t a = apack.native;
uint32_t b = bpack.native;
uint32_t ab0 = (a*b) & 0xffu;
asm("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
asm volatile("mad.lo.u32 %0, %1, %2, %0;" : "+r"(ab0) : "r"(a&0xff00u), "r"(b&0xff00u));
uint32_t ab1;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
asm("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
asm volatile("mul.hi.u32 %0, %1, %2;" : "=r"(ab1) : "r"(a&0xff0000), "r"(b&0xff0000));
asm volatile("mad.hi.u32 %0, %1, %2, %0;" : "+r"(ab1) : "r"(a&0xff000000u), "r"(b&0xff000000u));
apack.native = __byte_perm(ab0, ab1, 0x6420);
return apack;
}
@@ -260,8 +260,12 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
// Coverity recommends the use of std::move here but, given that half is a scalar,
// a plain copy will be just as efficient.
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y)))
@@ -270,6 +274,7 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncMinMax, half, 2, half2, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncMinMax, half, 1, half, __float2half(fn.isMinNotMax ? fminf(__half2float(x), __half2float(y)) : fmaxf(__half2float(x), __half2float(y))))
@@ -278,10 +283,13 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f
#if defined(__CUDA_BF16_TYPES_EXIST__)
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 1, __nv_bfloat16, fn.isMinNotMax ? __hmin(x, y) : __hmax(x, y))
// coverity[copy_constructor_call]
SPECIALIZE_REDUCE(FuncMinMax, __nv_bfloat16, 2, __nv_bfloat162, fn.isMinNotMax ? __hmin2(x, y) : __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __float2bfloat16(__bfloat162float(x) + __bfloat162float(y)))
@@ -402,6 +410,9 @@ struct FuncPreMulSum {
};
template<>
// Coverity recommends the users of this type to use std::move in certain cases but,
// given that half is a scalar, a plain copy will be just as efficient.
// coverity[moveable_type]
struct FuncPreMulSum<half> {
using EltType = half;
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
@@ -424,6 +435,9 @@ struct FuncPreMulSum<half> {
#if defined(__CUDA_BF16_TYPES_EXIST__)
template<>
// Coverity recommends the users of this type to use std::move in certain cases but,
// given that __nv_bfloat16 is a scalar, a plain copy will be just as efficient.
// coverity[moveable_type]
struct FuncPreMulSum<__nv_bfloat16> {
using EltType = __nv_bfloat16;
#if __CUDA_ARCH__ >= 800
@@ -584,9 +598,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
return ans; \
} \
};
@@ -597,13 +611,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} \
return ans; \
} \
@@ -615,12 +629,12 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
return ans; \
} \
};
@@ -631,19 +645,19 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<PackSize> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
: "l"(addr) : "memory"); \
} \
return ans; \
} \
@@ -655,9 +669,9 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
struct Apply_LoadMultimem<FuncSum<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(FuncSum<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
@@ -668,13 +682,13 @@ struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> load(FuncMinMax<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
if (fn.isMinNotMax) { \
asm("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
} else { \
asm("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
asm volatile("multimem.ld_reduce.relaxed.sys.global.max." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(2*sizeof(T)))); \
: "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \
} \
return tmp.half[(addr/sizeof(T))%2]; \
} \
+46 -10
Ver ficheiro
@@ -24,6 +24,9 @@ namespace {
uint32_t nelem;
int rankDest;
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
// coverity[callee_ptr_arith:FALSE]
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);
@@ -74,6 +77,32 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_L
}
};
// ReduceScatter using the PAT algorithm with the Simple protocol.
// The channel's share of the work is obtained from ncclCollCbdPart, then the
// operations produced by PatRSAlgorithm are executed one by one through
// Primitives::patReduce until the generator reports the last one.
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> {
  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<1, 1>;

    // Partition of the collective assigned to this channel.
    size_t count, channelOffset, channelCount, chunkCount;
    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T),
                    &count, &channelOffset, &channelCount, &chunkCount);

    // No peer lists are passed here; the primitives are created in PAT
    // reduce-scatter mode.
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
      (tid, nthreads, NULL, NULL, (T*)work->sendbuff, (T*)work->recvbuff, work->redOpArg,
       0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs);

    PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS,
                              channelOffset, channelOffset + channelCount,
                              count, chunkCount,
                              ncclShmem.comm.rank, ncclShmem.comm.nRanks);

    // The original loop tests `!last` with last == 0 on entry, so it always runs
    // at least once; a do/while expresses the same thing.
    int last = 0;
    do {
      int recvDim, sendDim;
      int recvOffset, sendOffset, sendStepOffset;
      int postRecv, postSend;
      int nelem;
      size_t inpIx, outIx;
      patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
      prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend);
    } while (!last);
  }
};
template<typename T, typename RedOp>
struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
@@ -88,7 +117,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
size_t offset;
int nelem;
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
/* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync;
* if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
* and the rest are allocated to scatter. */
const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
@@ -143,6 +172,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_S
size_t outOffset = gridOffset + elemOffset;
size_t inpOffset = outOffset + rank * count;
nelem = min(chunkCount, channelCount - elemOffset);
// Coverity complains about a possible overrun inside the method invoked below, but that's actually
// a false positive.
// coverity[overrun-call:FALSE]
prims.directRecvCopy(inpOffset, outOffset, nelem);
}
@@ -164,7 +196,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
template<int SlicePerChunk, int MinSrcs, int MaxSrcs, int MinDsts, int MaxDsts>
__device__ __forceinline__ void operator()(
int tid, int tn, int slice, int maxSliceSize,
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes
int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag
) {
static_assert(SlicePerChunk==1, "require: SlicePerChunk==1");
static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1");
@@ -199,19 +231,23 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
int delta = min(railAllEnd, railOneEnd) - (railAllBeg+railAllOffset);
int rank = ncclShmem.comm.collNetDenseToUserRank[node*nRails + rail];
ssize_t userOneBeg = rank*sizePerRank + railOneOffset;
reduceCopy<ncclCollUnroll(), RedOp, T,
if (nDsts != 0) {
reduceCopy<ncclCollUnroll(), RedOp, T,
/*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
/*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
/*PreOpSrcs=*/1>
(tid, tn, work->redOpArg, &work->redOpArg, false,
/*nSrcs=*/1+nSrcs, [=]__device__(int s) {
return s==0 ? (T*)inbuf + userOneBeg
: work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ)
? (T*)srcPtrs[s-1] + userOneBeg
: (T*)srcPtrs[s-1] + railAllOffset;
},
/*nDsts=*/1, [=]__device__(int d/*==0*/) {
return (T*)dstPtrs[dst] + railAllOffset;
},
delta);
}
railAllOffset += delta;
node += 1;
}
@@ -245,15 +281,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
int tn = nWarps1*WARP_SIZE;
if (tid < tn) {
// Phase 1: Scatter inputs to peers
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr,
work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
Scatterer</*ReduceSendNotRecv=*/true> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/0, /*Send=*/1>(scat);
prims.template process</*Recv=*/0, /*Send=*/1>(scat, NCCL_DIRECT_READ, 0);
}
return;
}
@@ -269,15 +305,15 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NC
__syncwarp();
} else {
// Phase 2: Reduce from peers + local input -> send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work);
for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
Scatterer</*ReduceSendNotRecv=*/false> scat;
scat.work = work;
scat.chunkSize = chunkSize;
scat.railGridOffset = railGridOffset;
prims.template process</*Recv=*/1, /*Send=*/1>(scat);
prims.template process</*Recv=*/1, /*Send=*/1>(scat, 0, NCCL_DIRECT_READ);
}
}
return;
+8 -5
Ver ficheiro
@@ -15,11 +15,11 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
template<typename Proto>
__device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->sendBytes;
int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8);
Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
/*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
/*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
size_t cursor = 0;
do {
int n = min(size_t(chunkSize), bytes-cursor);
@@ -31,15 +31,15 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
template<typename Proto>
__device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
size_t bytes = work->recvBytes;
int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8);
Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
/*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
/*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
/*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
size_t cursor = 0;
do {
int n = min(size_t(chunkSize), bytes-cursor);
prims.directRecv(cursor, n);
prims.directRecv(cursor, cursor, n);
cursor += n;
} while (cursor < bytes && work->recvRegistered == 0);
}
@@ -80,6 +80,9 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
(isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
}
}
// Coverity reports a possible thread divergence due to not all threads participating in the collective.
// However, the code ensures that the participation is on a per-warp basis.
// coverity[device_thread_diverged:FALSE]
uint32_t mask = __ballot_sync(~0u, hasWork);
if (lane == 0) {
shared->workSendMask = mask>>16;
+402 -123
Ver ficheiro
@@ -11,6 +11,7 @@
#include "bootstrap.h"
#include "channel.h"
#include "cudawrap.h"
#include "profiler.h"
#include "transport.h"
#include <cstring> // std::memcpy
@@ -121,6 +122,10 @@ static void addWorkBatchToPlan(
if (newBatch || extendBatch) {
if (!newBatch) batch->nextExtends = extendBatch; // Extending the previous batch.
struct ncclWorkBatchList* batchNode = ncclMemoryStackAlloc<ncclWorkBatchList>(&comm->memScoped);
// Coverity thinks that ncclIntruQueueEnqueue will access chan->workBatchQueue->tail, which might
// be NULL. But that code is guarded by chan->workBatchQueue->head not being NULL, in which
// case tail won't be NULL either.
// coverity[var_deref_model:FALSE]
ncclIntruQueueEnqueue(&chan->workBatchQueue, batchNode);
batch = &batchNode->batch;
batch->nextExtends = 0;
@@ -239,7 +244,29 @@ static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* c
return ncclSuccess;
}
static ncclResult_t registerIntraNodeBuffers(
// Decide whether buffers exchanged with `peer` over connector `conn` need
// IPC registration, and store the answer in *needReg.
//
//  - Connected connector: registration is needed only when the connection
//    flags carry one of the IPC/direct read/write bits; any other connected
//    transport (i.e. a network connection) needs none.
//  - Connector not connected yet: ask transport 0's canConnect() whether this
//    rank could reach the peer through it, and use that as the answer.
//
// Returns ncclSuccess, or propagates the canConnect() error through NCCLCHECK
// (in which case *needReg is left untouched).
static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) {
  if (conn->connected) {
    const int directMask = NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE;
    *needReg = (conn->conn.flags & directMask) != 0;
    return ncclSuccess;
  }

  // Not connected yet: predict from the topology whether transport 0 will be usable.
  struct ncclPeerInfo* remote = &comm->peerInfo[peer];
  struct ncclPeerInfo* self = &comm->peerInfo[comm->rank];
  int reachable = 0;
  NCCLCHECK(ncclTransports[0]->canConnect(&reachable, comm, graph, self, remote));
  *needReg = (reachable != 0);
  return ncclSuccess;
}
static ncclResult_t registerCollBuffers(
struct ncclComm* comm, struct ncclTaskColl* info,
void* outRegBufSend[NCCL_MAX_LOCAL_RANKS],
void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS],
@@ -250,8 +277,10 @@ static ncclResult_t registerIntraNodeBuffers(
info->regBufType = NCCL_REGULAR_BUFFER;
*regNeedConnect = true;
if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit;
#if CUDART_VERSION >= 11030
if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) {
if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit;
bool regBufUsed = false;
const void *sendbuff = info->sendbuff;
void *recvbuff = info->recvbuff;
@@ -284,60 +313,6 @@ static ncclResult_t registerIntraNodeBuffers(
}
info->regBufType = NCCL_NVLS_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT && // limited to CollNetDirect for now
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks < comm->localRanks && // only with inter-process & intra-node peers
comm->planner.persistent && 0) {
/* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
int localRank = comm->localRank;
cudaPointerAttributes sattr, rattr;
CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;
if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;
struct HandlePair {
cudaIpcMemHandle_t ipc[2]; // {send, recv}
size_t offset[2]; // {send, recv}
};
struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);
void *baseSend, *baseRecv;
size_t size;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));
// Open handles locally
for (int i=0; i < comm->localRanks; i++) {
if (i == localRank) { // Skip self
outRegBufSend[i] = nullptr;
outRegBufRecv[i] = nullptr;
} else {
for (int sr=0; sr < 2; sr++) {
// Get base address of mapping
void* base;
CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
// Get real buffer address by adding offset in the mapping
(sr == 0 ? outRegBufSend : outRegBufRecv)[i] = (char*)base + handles[i].offset[sr];
// Enqueue reminder to close memory handle
struct ncclIpcCleanupCallback* cb = (struct ncclIpcCleanupCallback*)malloc(sizeof(struct ncclIpcCleanupCallback));
cb->base.fn = cleanupIpc;
cb->ptr = base;
ncclIntruQueueEnqueue(cleanupQueue, &cb->base);
info->nCleanupQueueElts += 1;
}
}
}
info->regBufType = NCCL_IPC_REG_BUFFER;
} else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) {
size_t elementSize = ncclTypeSize(info->datatype);
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
@@ -356,27 +331,200 @@ static ncclResult_t registerIntraNodeBuffers(
}
if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) {
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
info->sendMhandle = sendHandle;
if (sendRegBufFlag) {
if (!sendRegBufFlag) {
ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts);
info->sendMhandle = sendHandle;
}
if (sendRegBufFlag && !recvRegBufFlag) {
ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts);
info->recvMhandle = recvHandle;
}
}
if (sendRegBufFlag && recvRegBufFlag) {
info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 1));
info->nMaxChannels = 1;
info->regBufType = NCCL_COLLNET_REG_BUFFER;
if (sendRegBufFlag == 1 && recvRegBufFlag == 1) {
INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize);
}
}
} else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) {
// IPC buffer registration
if (info->func == ncclFuncReduceScatter) goto exit;
if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit;
if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit;
if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit;
int peerRanks[NCCL_MAX_LOCAL_RANKS];
int nPeers = 0;
size_t elementSize = ncclTypeSize(info->datatype);
size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count);
size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count);
int regBufFlag = 0;
memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS);
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
struct ncclChannel* channel = comm->channels;
for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) {
for (int updown = 0; updown < 2; ++updown) {
int peer;
if (updown == 0)
peer = channel->collnetDirect.up[r];
else
peer = channel->collnetDirect.down[r];
if (peer != -1) {
struct ncclConnector* peerConn = &channel->peers[peer]->recv[0];
bool needReg = false;
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg));
if (needReg) {
bool found = false;
for (int p = 0; p < nPeers; ++p) {
if (peerRanks[p] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister())
ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs);
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
if (regBufFlag) {
if (ncclParamLocalRegister())
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_RING) {
struct ncclReg* recvRegRecord;
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
if (recvRegRecord == NULL) goto exit;
for (int c = 0; c < comm->nChannels; ++c) {
struct ncclChannel* channel = comm->channels + c;
for (int r = 0; r < 2; ++r) {
bool needReg = false;
int peer;
struct ncclConnector* peerConn;
// P2P transport
if (r == 0)
peer = channel->ring.prev;
else
peer = channel->ring.next;
peerConn = &channel->peers[peer]->recv[0];
NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg));
if (needReg) {
bool found = false;
for (int p = 0; p < nPeers; ++p) {
if (peerRanks[p] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister()) {
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
}
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
} else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
struct ncclReg* recvRegRecord;
NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord));
if (recvRegRecord == NULL) goto exit;
for (int c = 0; c < comm->nChannels; ++c) {
struct ncclChannel* channel = comm->channels + c;
struct ncclTree* tree = NULL;
int peers[NCCL_MAX_TREE_ARITY + 1];
if (info->algorithm == NCCL_ALGO_TREE)
tree = &channel->tree;
else
tree = &channel->collnetChain;
for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p];
peers[NCCL_MAX_TREE_ARITY] = tree->up;
for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) {
int peer = peers[p];
bool peerNeedReg = false;
struct ncclConnector* recvConn = NULL;
// P2P transport
if (peer == -1 || peer == comm->nRanks) continue;
recvConn = &channel->peers[peer]->recv[0];
NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg));
if (peerNeedReg) {
bool found = false;
for (int pindex = 0; pindex < nPeers; ++pindex) {
if (peerRanks[pindex] == peer) {
found = true;
break;
}
}
if (!found) peerRanks[nPeers++] = peer;
}
}
}
if (nPeers > 0) {
if (ncclParamLocalRegister()) {
ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs);
}
if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) {
ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &regBufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts);
}
}
if (regBufFlag) {
info->regBufType = NCCL_IPC_REG_BUFFER;
}
}
if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) {
info->nMaxChannels = 16;
}
}
fallback:
exit:
#endif
return result;
}
// Best-effort IPC registration of a send/recv user buffer with a single peer.
//
//  comm         communicator owning the registration
//  userbuff     local user buffer to register
//  peerRank     the one peer the buffer is shared with
//  size         size of the buffer in bytes
//  regFlag      out: reset to 0 on entry, then set by the registration calls
//  regAddr      out: written only when *regFlag ends up non-zero
//  cleanupQueue queue handed to the graph registration path
//
// Local registration is tried first when ncclParamLocalRegister() is set. If
// that leaves *regFlag at 0 and the planner is persistent with
// ncclParamGraphRegister() set, graph registration is tried next. The return
// codes of both registration calls are not inspected; this function always
// returns ncclSuccess and callers must look at *regFlag.
static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue) {
  uintptr_t* rmtAddrs = NULL;
  uintptr_t rmtOffset = 0;

  *regFlag = 0;

  if (ncclParamLocalRegister()) {
    ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs);
  }

  if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) {
    ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &rmtOffset, &rmtAddrs, reinterpret_cast<void*>(cleanupQueue), NULL);
  }

  if (*regFlag != 0) {
    // NOTE(review): the pointer value of rmtAddrs itself is used as the base
    // address (it is not dereferenced) -- presumably the SENDRECV registration
    // mode returns the peer's mapped address directly; confirm against the
    // register implementation.
    *regAddr = (void*)((uintptr_t)rmtAddrs + rmtOffset);
  }

  return ncclSuccess;
}
static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
static ncclResult_t getAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* task,
@@ -500,7 +648,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
void* regBufSend[NCCL_MAX_LOCAL_RANKS];
void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
bool regNeedConnect = true;
registerIntraNodeBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, &regNeedConnect);
if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) {
if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) {
@@ -517,6 +665,10 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
struct ncclDevWorkColl devWork = {};
devWork.sendbuff = (void*)task->sendbuff;
devWork.recvbuff = (void*)task->recvbuff;
devWork.sendbuffOffset = task->sendbuffOffset;
devWork.recvbuffOffset = task->recvbuffOffset;
devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs;
devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs;
devWork.root = task->root;
devWork.nWarps = task->nWarps;
devWork.redOpArg = task->opDev.scalarArg;
@@ -527,35 +679,13 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
struct ncclWorkList* workNode;
switch (task->regBufType) {
case NCCL_REGULAR_BUFFER:
case NCCL_IPC_REG_BUFFER:
case NCCL_COLLNET_REG_BUFFER:
{ workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkColl>(&comm->memScoped, 1);
workNode->workType = ncclDevWorkTypeColl;
workNode->size = sizeof(struct ncclDevWorkColl);
memcpy((void*)(workNode+1), (void*)&devWork, workNode->size);
} break;
case NCCL_IPC_REG_BUFFER:
{ struct ncclDevWorkCollReg workReg = {};
workReg.coll = devWork;
struct ncclChannel *channel0 = &comm->channels[0];
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
int peer = channel0->collnetDirect.down[i];
if (peer == -1) break;
int j = comm->rankToLocalRank[peer]; // Get intra-node slot
workReg.dnInputs[i] = regBufSend[j]; // Input buffer of leaf peer
workReg.dnOutputs[i] = regBufRecv[j]; // Output buffer of leaf peer
}
for (int i=0; i < NCCL_MAX_DIRECT_ARITY; i++) {
int peer = channel0->collnetDirect.up[i];
if (peer == -1) break;
int j = comm->rankToLocalRank[peer];
// Output buffer of root peer
workReg.upOutputs[i] = regBufRecv[j];
}
workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkCollReg>(&comm->memScoped, 1);
workNode->workType = ncclDevWorkTypeCollReg;
workNode->size = sizeof(struct ncclDevWorkCollReg);
memcpy((void*)(workNode+1), (void*)&workReg, workNode->size);
} break;
case NCCL_NVLS_REG_BUFFER:
{ struct ncclDevWorkCollReg workReg = {};
workReg.coll = devWork; // C++ struct assignment
@@ -590,6 +720,7 @@ static ncclResult_t scheduleCollTasksToPlan(
int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls]
int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls]
comm->nChannels, comm->nvlsChannels};
constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal
do {
size_t workBytes = 0;
struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue);
@@ -601,7 +732,7 @@ static ncclResult_t scheduleCollTasksToPlan(
nPlanColls += 1;
workBytes += workNode->size;
int kind = 2*task->isCollnet + task->isNvls;
trafficBytes[kind] += task->trafficBytes;
trafficBytes[kind] += std::max(MinTrafficPerChannel, task->trafficBytes);
nChannels[kind] += task->nMaxChannels;
nChannels[kind] = std::min(nChannels[kind], nMaxChannels[kind]);
task = task->next;
@@ -611,7 +742,6 @@ static ncclResult_t scheduleCollTasksToPlan(
} while (0);
int kindPrev = -1;
constexpr size_t MinTrafficPerChannel = 512;
size_t trafficPerChannel = 0;
int channelId = 0;
size_t currentTraffic = 0;
@@ -650,14 +780,16 @@ static ncclResult_t scheduleCollTasksToPlan(
for (int c=devWork->channelLo; c <= (int)devWork->channelHi; c++) {
proxyOp.channelId = c;
proxyOp.opCount = proxyOpId;
proxyOp.task.coll = task;
proxyOp.rank = comm->rank;
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
}
} else { // not task->isCollnet
constexpr size_t cellSize = 16;
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16;
int elementsPerCell = cellSize/elementSize;
size_t cells = divUp(task->count*elementSize, cellSize);
int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks);
size_t trafficPerElement = elementSize*trafficPerByte;
size_t trafficPerCell = cellSize*trafficPerByte;
size_t cellsPerChannel = std::min(cells, divUp(trafficPerChannel, trafficPerCell));
@@ -665,7 +797,7 @@ static ncclResult_t scheduleCollTasksToPlan(
if (channelId+1 == nMaxChannels[kind]) { // On last channel everything goes to "lo"
cellsLo = cells;
} else {
cellsLo = std::min(cells, (trafficPerChannel-currentTraffic)/trafficPerCell);
cellsLo = std::min(cells, divUp((trafficPerChannel-currentTraffic),trafficPerCell));
}
int nMidChannels = (cells-cellsLo)/cellsPerChannel;
size_t cellsHi = (cells-cellsLo)%cellsPerChannel;
@@ -725,12 +857,12 @@ static ncclResult_t scheduleCollTasksToPlan(
// Update the current channel and vacant traffic budget.
if (countHi != 0) {
channelId += nChannels-1;
currentTraffic = countHi*trafficPerElement;
currentTraffic = cellsHi*elementsPerCell*trafficPerElement;
} else if (nMidChannels != 0) {
channelId += nChannels;
currentTraffic = 0;
} else {
currentTraffic += countLo*trafficPerElement;
currentTraffic += cellsLo*elementsPerCell*trafficPerElement;
}
if (currentTraffic >= trafficPerChannel && channelId+1 != nMaxChannels[kind]) {
@@ -750,7 +882,12 @@ static ncclResult_t scheduleCollTasksToPlan(
}
proxyOp->channelId = c;
proxyOp->opCount = proxyOpId;
proxyOp->task.coll = task;
proxyOp->rank = comm->rank;
addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
// Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to
// determine if that's actually true but it's also not clear if that would be an issue.
// coverity[uninit_use_in_call:FALSE]
NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
}
}
@@ -790,6 +927,7 @@ static ncclResult_t scheduleCollTasksToPlan(
ncclIntruQueueDequeue(&planner->collWorkQueue);
nPlanColls -= 1;
planner->nTasksColl -= 1;
ncclIntruQueueEnqueue(&plan->collTaskQueue, task);
ncclIntruQueueEnqueue(&plan->workQueue, workNode);
plan->workBytes += workNode->size;
}
@@ -807,7 +945,8 @@ static ncclResult_t addP2pToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan,
int nChannelsMin, int nChannelsMax, int p2pRound,
int sendRank, void* sendAddr, ssize_t sendBytes,
int recvRank, void* recvAddr, ssize_t recvBytes
int recvRank, void* recvAddr, ssize_t recvBytes,
struct ncclTaskP2p** p2pTasks
) {
constexpr int connIndex = 1;
bool selfSend = (sendRank == comm->rank);
@@ -842,7 +981,8 @@ static ncclResult_t addP2pToPlan(
int chunkSize[2];
int chunkDataSize[2];
int chunkDataSize_u32fp8[2];
bool registered[2];
bool registered[2] = {false, false};
bool ipcRegistered[2] = {false, false};
for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send
if (bytes[dir] != -1) protoLL[dir] &= bytes[dir] <= thresholdLL;
@@ -866,11 +1006,29 @@ static ncclResult_t addP2pToPlan(
chunkSize[dir] = chunkDataSize[dir];
if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2;
registered[dir] = false;
if (bytes[dir] > 0 && network[dir] && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
struct ncclReg* regRecord;
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
registered[dir] = (regRecord && regRecord->nDevs);
if (network[dir]) {
if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) {
struct ncclReg* regRecord;
NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], &regRecord));
registered[dir] = regRecord && regRecord->nDevs;
}
} else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) {
int peerRank = dir ? sendRank : recvRank;
int regFlag = 0;
int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, 0);
struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers;
struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex]
: &channelPeers[peerRank]->recv[connIndex];
void* regAddr = NULL;
if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) {
// We require users registering buffers on both sides
NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], &regFlag, &regAddr, &plan->cleanupQueue));
if (regFlag) {
if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr;
else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr;
}
}
ipcRegistered[dir] = regFlag ? true : false;
}
if (bytes[dir] == -1) nChannels[dir] = 0;
@@ -900,6 +1058,7 @@ static ncclResult_t addP2pToPlan(
work->nSendChannels = nChannels[1];
work->sendProtoLL = protoLL[1];
work->sendRegistered = registered[1];
work->sendIpcReg = ipcRegistered[1];
work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1];
work->sendRank = sendRank;
work->sendAddr = sendAddr;
@@ -907,6 +1066,7 @@ static ncclResult_t addP2pToPlan(
work->nRecvChannels = nChannels[0];
work->recvProtoLL = protoLL[0];
work->recvRegistered = registered[0];
work->recvIpcReg = ipcRegistered[0];
work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0];
work->recvRank = recvRank;
work->recvAddr = recvAddr;
@@ -925,6 +1085,9 @@ static ncclResult_t addP2pToPlan(
op->pattern = dir ? ncclPatternSend : ncclPatternRecv;
op->chunkSize = chunkSize[dir];
op->reg = registered[dir];
op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0;
op->task.p2p = p2pTasks[dir];
op->rank = comm->rank;
// The following are modified per channel part in addWorkToChannels():
// op->buffer, op->nbytes, op->nsteps = ...;
}
@@ -1041,13 +1204,16 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
return ncclSuccess;
}
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes));
struct ncclTaskP2p* p2pTasks[2] = { recv, send };
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks));
if (send != nullptr) {
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
comm->planner.nTasksP2p -= 1;
}
if (recv != nullptr) {
ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
comm->planner.nTasksP2p -= 1;
}
}
@@ -1100,29 +1266,44 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
}
}
namespace {
  // Record queued on the communicator's event-callback queue by uploadWork()
  // for persistent plans. `base` is the first member, so the callback pointer
  // handed back to uploadWork_cleanup_fn() can be cast to the full record.
  struct uploadWork_cleanup_t {
    struct ncclCommEventCallback base;
    void *hostBuf;  // malloc'ed host staging copy of the work structs
  };

  // Event callback: releases the host staging buffer and destroys the event
  // stored in the record.
  // Returns ncclSuccess, or the CUDACHECK failure from cudaEventDestroy().
  //
  // NOTE(review): the record itself (allocated with ncclCalloc in
  // uploadWork()) is not freed here -- confirm that whoever invokes the
  // callback releases it, otherwise it leaks.
  ncclResult_t uploadWork_cleanup_fn(
      struct ncclComm* comm, struct ncclCommEventCallback* cb
    ) {
    struct uploadWork_cleanup_t* record = reinterpret_cast<struct uploadWork_cleanup_t*>(cb);
    // Free the host buffer first so it is released even if destroying the event fails.
    free(record->hostBuf);
    CUDACHECK(cudaEventDestroy(record->base.event));
    return ncclSuccess;
  }
}
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
size_t workBytes = plan->workBytes;
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
void* fifoBuf;
void* fifoBufHost;
uint32_t fifoCursor, fifoMask;
switch (plan->workStorageType) {
case ncclDevWorkStorageTypeArgs:
plan->kernelArgs->workBuf = nullptr;
fifoBuf = (void*)plan->kernelArgs;
fifoBufHost = (void*)plan->kernelArgs;
fifoCursor = sizeof(ncclDevKernelArgs) + batchBytes;
fifoMask = ~0u;
break;
case ncclDevWorkStorageTypeFifo:
fifoBuf = comm->workFifoBuf;
fifoBufHost = comm->workFifoBuf;
fifoCursor = comm->workFifoProduced;
fifoMask = comm->workFifoBytes-1;
waitWorkFifoAvailable(comm, fifoCursor + workBytes);
plan->kernelArgs->workBuf = comm->workFifoBufDev;
break;
case ncclDevWorkStorageTypePersistent:
ncclMemoryStackPush(&comm->memScoped);
fifoBuf = ncclMemoryStackAlloc(&comm->memScoped, workBytes, /*align=*/16);
static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
fifoBufHost = malloc(workBytes);
fifoCursor = 0;
fifoMask = ~0u;
break;
@@ -1144,7 +1325,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
// Write the channel-shared work structs.
struct ncclWorkList* workNode = ncclIntruQueueHead(&plan->workQueue);
while (workNode != nullptr) {
char* dst = (char*)fifoBuf;
char* dst = (char*)fifoBufHost;
char* src = (char*)(workNode+1);
for (int n = workNode->size; n != 0; n -= 16) {
memcpy(
@@ -1164,11 +1345,39 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
if (comm->workFifoBufGdrHandle != nullptr) wc_store_fence();
break;
case ncclDevWorkStorageTypePersistent:
NCCLCHECK(ncclCudaMalloc(&plan->workBufPersistent, workBytes));
plan->kernelArgs->workBuf = plan->workBufPersistent;
NCCLCHECK(ncclCudaMemcpy(plan->workBufPersistent, fifoBuf, workBytes));
ncclMemoryStackPop(&comm->memScoped);
break;
{ ncclResult_t result = ncclSuccess;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
void* fifoBufDev = nullptr;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Acquire deviceStream to gain access to deviceStream.cudaStream. Since the
// user's graph will be launched later, and it also acquires the deviceStream,
// it will observe this upload.
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope);
CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
plan->workBufPersistent = fifoBufDev;
plan->kernelArgs->workBuf = fifoBufDev;
CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
cudaEvent_t memcpyDone;
CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope);
CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope);
struct uploadWork_cleanup_t* cleanup;
NCCLCHECK(ncclCalloc(&cleanup, 1));
cleanup->base.fn = uploadWork_cleanup_fn;
cleanup->base.event = memcpyDone;
cleanup->hostBuf = fifoBufHost;
ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base);
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope);
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope);
finish_scope:
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
if (result != ncclSuccess) return result;
} break;
default: break;
}
return ncclSuccess;
@@ -1182,6 +1391,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue);
while (op != nullptr) {
op->profilerContext = comm->profilerContext;
op->eActivationMask = op->coll <= ncclFuncAllReduce ? op->task.coll->eActivationMask : op->task.p2p->eActivationMask;
op->taskEventHandle = op->coll <= ncclFuncAllReduce ? op->task.coll->eventHandle : op->task.p2p->eventHandle;
ncclProfilerAddPidToProxyOp(op);
uint64_t oldId = op->opCount;
// Ignoring the bottom tag bit, opCount's are zero-based within plan so
// translate them to the tip of the comm's history.
@@ -1216,8 +1430,12 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan*
}
static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
NCCLCHECK(ncclProfilerStartGroupEvent(plan));
NCCLCHECK(ncclProfilerStartTaskEvents(plan));
NCCLCHECK(uploadProxyOps(comm, plan));
NCCLCHECK(ncclProxyStart(comm));
NCCLCHECK(ncclProfilerStopTaskEvents(plan));
NCCLCHECK(ncclProfilerStopGroupEvent(plan));
if (!plan->persistent) {
// Notify main thread of our reclaiming. This will reclaim plan concurrently.
ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
@@ -1238,13 +1456,30 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback*
struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim`
if (plan->persistent) {
comm->persistentRefs -= 1;
NCCLCHECK(ncclCudaFree(plan->workBufPersistent));
if (plan->workStorageType == ncclDevWorkStorageTypePersistent) {
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
CUDACHECK(cudaFree(plan->workBufPersistent));
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
}
struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue);
while (q != nullptr) {
struct ncclProxyOp* q1 = q->enqNext;
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
q = q1;
}
struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
while (ct != nullptr) {
struct ncclTaskColl* ct1 = ct->next;
ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct);
ct = ct1;
}
struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
while (pt != nullptr) {
struct ncclTaskP2p* pt1 = pt->next;
ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt);
pt = pt1;
}
ncclResult_t result = ncclSuccess;
while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) {
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue);
@@ -1286,7 +1521,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
plan->comm = comm;
plan->reclaimer.fn = reclaimPlan;
plan->persistent = persistent;
// uploadWork() promotes ncclDevWorkStorageType[Fifo|Buf]->Args if the work can fit.
// finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit.
plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
: ncclDevWorkStorageTypeFifo;
@@ -1554,10 +1789,15 @@ static ncclResult_t updateCollCostTable(
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
// CollNetDirect is only supported for up to 8 local GPUs
if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue;
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
/* For now, NVLS allgather and reducescatter are only supported on a single node */
if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
/* PAT (tree-based) reduceScatter doesn't support scaling ops (PreMulSum / SumPostDiv) yet */
if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
&& (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
bool backup;
float time;
@@ -1601,6 +1841,8 @@ static ncclResult_t topoGetAlgoInfo(
info->protocol = protocol;
float time = minTime;
// Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case.
// coverity[check_after_sink]
if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) {
if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
WARN("Error : no algorithm/protocol available");
@@ -1610,7 +1852,7 @@ static ncclResult_t topoGetAlgoInfo(
info->protocol = backupProto;
time = backupTime;
}
if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time);
if (simInfo) simInfo->estimatedTime = time;
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time);
@@ -1653,6 +1895,7 @@ static ncclResult_t topoGetAlgoInfo(
}
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
info->nMaxChannels = nc;
info->nWarps = nt/WARP_SIZE;
return ncclSuccess;
@@ -1704,8 +1947,15 @@ static ncclResult_t calcCollChunking(
pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo;
break;
case ncclFuncReduceScatter:
pattern =
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatUp :
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
ncclPatternRing;
break;
case ncclFuncAllGather:
pattern =
info->algorithm == NCCL_ALGO_PAT ? ncclPatternPatDown :
info->algorithm == NCCL_ALGO_NVLS ? ncclPatternNvls :
info->algorithm == NCCL_ALGO_COLLNET_DIRECT ? ncclPatternCollnetDirect :
ncclPatternRing;
@@ -1729,6 +1979,8 @@ static ncclResult_t calcCollChunking(
case ncclPatternTreeUp:
case ncclPatternTreeDown:
case ncclPatternTreeUpDown:
case ncclPatternPatUp:
case ncclPatternPatDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollnetChain:
@@ -1776,13 +2028,17 @@ static ncclResult_t calcCollChunking(
int maxChunkSize = comm->nvlsChunkSize;
if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
// coverity[overflow_before_widen]
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
} else if (info->algorithm == NCCL_ALGO_NVLS_TREE) {
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow
// Use uint64_t so that concurrentOps*chunkSize*X does not overflow.
// However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits.
// coverity[overflow_before_widen]
uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads;
chunkSize = comm->nvlsChunkSize;
int maxChunkSize = (int)ncclParamNvlsTreeMaxChunkSize();
@@ -1796,14 +2052,21 @@ static ncclResult_t calcCollChunking(
int nNodes = comm->nNodes;
float ppn = comm->nRanks / (float)nNodes;
float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
// Yes, we are OK with the division on the left side of the < operator being integer.
// coverity[integer_division]
while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
// coverity[integer_division]
while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
} else if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) {
while (chunkSize*nChannels*32 > nBytes && chunkSize > 65536) chunkSize /= 2;
} else if (info->func == ncclFuncReduceScatter && info->algorithm == NCCL_ALGO_PAT) {
while (chunkSize*nChannels*16 > nBytes && chunkSize > 65536) chunkSize /= 2;
}
// Compute directFlags of work struct.
if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
// Set direct direction for broadcast-gather (read or write)
*outDirectFlags = (nBytes/nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
*outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
*outDirectFlags = 0;
}
@@ -1852,6 +2115,10 @@ static ncclResult_t calcCollChunking(
}
}
if (pattern == ncclPatternPatUp || pattern == ncclPatternPatDown) {
proxyOp->nbytes = DIVUP(nBytes, nChannels);
}
*outChunkSize = chunkSize;
return ncclSuccess;
}
@@ -1874,6 +2141,7 @@ static ncclResult_t hostToDevRedOp(
opFull->proxyOp = op;
int nbits = 8*ncclTypeSize(datatype);
if (nbits <= 0) return ncclInvalidArgument;
uint64_t allBits = uint64_t(-1)>>(64-nbits);
uint64_t signBit = allBits^(allBits>>1);
@@ -1947,8 +2215,12 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
ncclGroupCommJoin(info->comm);
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
p2p->func = info->coll;
p2p->buff = (void*)info->recvbuff;
p2p->count = info->count;
p2p->datatype = info->datatype;
p2p->root = info->root;
p2p->bytes = nBytes;
ncclIntruQueueEnqueue(
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
@@ -1996,7 +2268,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
} else {
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
ncclGroupCommJoin(info->comm);
struct ncclTaskColl* t = ncclMemoryStackAlloc<struct ncclTaskColl>(&comm->memScoped);
struct ncclTaskColl* t = ncclMemoryPoolAlloc<struct ncclTaskColl>(&comm->memPool_ncclTaskColl, &comm->memPermanent);
t->func = info->coll;
t->sendbuff = info->sendbuff;
t->recvbuff = info->recvbuff;
@@ -2026,7 +2298,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
while (true) {
if (l == nullptr) { // Got to the end, this must be a new stream.
struct ncclCudaGraph graph;
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream))
NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream));
if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) {
WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
return ncclInvalidUsage;
@@ -2075,7 +2347,7 @@ exit:
NCCLCHECK(ncclGroupEndInternal());
/* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
* so we have to check state here. */
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)); }
return ret;
fail:
if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
@@ -2093,7 +2365,8 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
int cap = 2*comm->userRedOpCapacity;
if (cap < 4) cap = 4;
ncclUserRedOp *ops = new ncclUserRedOp[cap];
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
if (comm->userRedOpCapacity > 0)
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
ops[ix].freeNext = ix + 1;
delete[] comm->userRedOps;
@@ -2109,8 +2382,10 @@ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataTyp
user->datatype = datatype;
user->opFull.op = ncclDevPreMulSum;
if (residence == ncclScalarHostImmediate) {
int size = ncclTypeSize(datatype);
if (size < 1) return ncclInternalError;
user->opFull.scalarArgIsPtr = false;
std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
std::memcpy(&user->opFull.scalarArg, scalar, size);
} else {
user->opFull.scalarArgIsPtr = true;
user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
@@ -2127,6 +2402,10 @@ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
return ncclInvalidArgument;
}
// int(ncclMaxRedOp) < int(op) will always be false due to the sizes of
// the datatypes involved, and that's by design. We keep the check though
// just as a reminder.
// coverity[result_independent_of_operands]
if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
WARN("ncclRedOpDestroy : operator is garbage.");
return ncclInvalidArgument;
+28 -23
Ver ficheiro
@@ -226,6 +226,8 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
}
}
channel->collnetDirect.nHeads = nHeads;
// nHeads should always be greater than 0.
// coverity[divide_by_zero]
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
@@ -374,20 +376,21 @@ NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
ncclResult_t ret = ncclSuccess;
int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
int nranks = comm->nRanks;
int nNodes = comm->nNodes;
int nChannels = comm->nChannels;
int minHeadNum = INT_MAX;
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS));
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
// Alternate rings to avoid crossing rails
if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
@@ -433,8 +436,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
NCCLCHECKGOTO(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext), ret, fail);
NCCLCHECKGOTO(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns), ret, fail);
// Duplicate ringPrev/ringNext for ncclBuildRings
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
@@ -459,7 +462,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
}
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -493,7 +496,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
if (comm->nChannels < comm->nvlsChannels) {
nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
}
NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
#endif
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
@@ -501,16 +504,18 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
}
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
free(ringRecv);
free(ringSend);
free(ringPrev);
free(ringNext);
free(treeToParent);
free(treeToChild0);
free(treeToChild1);
free(nvlsHeads);
return ncclSuccess;
exit:
if (ringRecv) free(ringRecv);
if (ringSend) free(ringSend);
if (ringPrev) free(ringPrev);
if (ringNext) free(ringNext);
if (treeToParent) free(treeToParent);
if (treeToChild0) free(treeToChild0);
if (treeToChild1) free(treeToChild1);
if (nvlsHeads) free(nvlsHeads);
return ret;
fail:
goto exit;
}
+51 -57
Ver ficheiro
@@ -36,13 +36,13 @@ NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0);
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
for (int i=0; i<system->nodes[baseNode->type].count; i++) baseNode->paths[baseNode->type][i].type = PATH_DIS;
}
// breadth-first search to set all paths to that node in the system
struct ncclTopoNodeList nodeList;
struct ncclTopoNodeList nextNodeList;
struct ncclTopoNodeList nextNodeList = { { 0 }, 0 };
nodeList.count = 1; nodeList.list[0] = baseNode;
nextNodeList.count = 0;
struct ncclTopoLinkList* basePath;
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
@@ -116,9 +116,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
const int linesize = 1024;
char line[linesize];
#ifdef ENABLE_TRACE
INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
INFO(NCCL_GRAPH, "Paths from %s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
#else
snprintf(line, linesize, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
snprintf(line, linesize, "%s/%lx-%lx :", topoNodeTypeStr[node->type], NCCL_TOPO_ID_SYSTEM_ID(node->id), NCCL_TOPO_ID_LOCAL_ID(node->id));
int offset = strlen(line);
#endif
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
@@ -155,14 +155,14 @@ ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
return ncclSuccess;
}
static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
// Find the closest CPU to a GPU
int minHops = 0;
int localCpu = -1;
struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
for (int c=0; c<system->nodes[CPU].count; c++) {
int hops = paths[c].count;
if (minHops == 0 || hops < minHops) {
if (hops > 0 && (minHops == 0 || hops < minHops)) {
localCpu = c;
minHops = hops;
}
@@ -193,20 +193,15 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix,
return ncclSuccess;
}
// Remove/free paths for a given type
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
// Remove links _to_ the given type
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
// Remove links _from_ the given type
for (int n=0; n<system->nodes[nodeType].count; n++) {
struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
free(node->paths[t]);
node->paths[t] = NULL;
// Remove/free all paths
static void ncclTopoRemovePaths(struct ncclTopoSystem* system) {
for (int t1=0; t1<NCCL_TOPO_NODE_TYPES; t1++) {
for (int n=0; n<system->nodes[t1].count; n++) {
struct ncclTopoNode* node = system->nodes[t1].nodes+n;
for (int t2=0; t2<NCCL_TOPO_NODE_TYPES; t2++) {
if (node->paths[t2]) free(node->paths[t2]);
node->paths[t2] = NULL;
}
}
}
}
@@ -220,6 +215,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (str) {
int disable = strtol(str, NULL, 0);
if (disable == 1) l = 0;
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable);
}
}
if (l == -1) {
@@ -241,9 +237,9 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
l = levelsOldToNew[oldLevel];
}
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
}
}
if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
*level = l >= 0 ? l : -2;
}
return ncclSuccess;
@@ -252,16 +248,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
if (read) *read = 0;
if (intermediateRank) *intermediateRank = -1;
// Get GPUs from topology
int g1, g2;
NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1));
struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) {
// GPU not found, we can't use p2p.
return ncclSuccess;
}
@@ -277,8 +273,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
}
}
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
// By default don't use P2P across CPU Host Bridges and further apart
int p2pLevel = PATH_PXB;
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
// On AMD x86 systems with at most 2 GPUs, allow P2P all the way up to PATH_SYS
if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS;
// User override
if (ncclTopoUserP2pLevel == -1)
@@ -288,16 +289,6 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
goto compare;
}
// Don't use P2P through ARM CPUs
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
p2pLevel = PATH_PXB;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
@@ -438,7 +429,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
// Check whether going through the network would be faster than going through P2P/SHM.
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net) {
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net) {
if (ncclParamNetDisableIntra() == 1) {
*net = 0;
return ncclSuccess;
@@ -446,8 +437,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_
*net = 1;
// First check the current GPU-to-GPU speed.
int g1, g2;
if (ncclTopoIdToIndex(system, GPU, id1, &g1) != ncclSuccess ||
ncclTopoIdToIndex(system, GPU, id2, &g2) != ncclSuccess) {
if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess ||
ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) {
return ncclSuccess;
}
@@ -545,7 +536,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
ncclTopoRemovePaths(system);
// Set direct paths to CPUs. We need them in many cases.
for (int c=0; c<system->nodes[CPU].count; c++) {
@@ -571,11 +562,11 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int p=0; p<system->nodes[GPU].count; p++) {
int p2p;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
NCCLCHECK(ncclGetLocalCpu(system, g, &cpu));
NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
@@ -587,10 +578,10 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
if (p == g) continue;
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, comm, NULL, srcInfo, dstInfo));
if (p2p == 0) {
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, comm, NULL, srcInfo, dstInfo));
if (shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
@@ -631,7 +622,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
if (gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(ncclGetLocalCpu(system, g, &localCpu));
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
@@ -642,11 +633,13 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
}
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
ncclResult_t ret = ncclSuccess;
int *domains;
int64_t *ids;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
int64_t *ids = NULL;
int myDomain = 0;
int ngpus = system->nodes[GPU].count;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECKGOTO(ncclCalloc(&ids, system->nodes[GPU].count), ret, fail);
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
domains[g] = g;
@@ -659,7 +652,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
for (int i=0; i<ngpus; i++) {
if (domains[i] == myDomain) continue;
struct ncclTopoNode* gpu = NULL;
@@ -670,24 +662,26 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
}
if (gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail);
}
if (system->nodes[GPU].count == comm->nRanks) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
}
exit:
free(domains);
free(ids);
return ncclSuccess;
if (ids) free(ids);
return ret;
fail:
goto exit;
}
// Tear down a topology system: release the per-node path arrays, then free
// the ncclTopoSystem allocation itself.
// NOTE(review): this listing looks like a diff with its +/- markers stripped;
// the per-type ncclTopoRemovePathType loop and the single ncclTopoRemovePaths
// call are presumably the old and new form of the same step -- confirm
// against the repository before treating both as live code.
void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
ncclTopoRemovePaths(system);
free(system);
}
+17 -11
Ver ficheiro
@@ -6,17 +6,23 @@
#include "core.h"
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
void dumpLine(int* values, int nranks, const char* prefix) {
int prefixlen = strlen(prefix);
char line[STRLENGTH+1];
line[STRLENGTH] = '\0';
memset(line, ' ', STRLENGTH);
strncpy(line, prefix, PREFIXLEN);
for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
INFO(NCCL_INIT,"%s", line);
constexpr int line_length = 128;
char line[line_length];
int num_width = snprintf(nullptr, 0, "%d", nranks-1); // safe as per "man snprintf"
int n = snprintf(line, line_length, "%s", prefix);
for (int i = 0; i < nranks && n < line_length-1; i++) {
n += snprintf(line + n, line_length - n, " %*d", num_width, values[i]);
// At this point n may be more than line_length-1, so don't use it
// for indexing into "line".
}
if (n >= line_length) {
// snprintf wanted to write more than would fit in the buffer. Assume
// line_length is at least 4 and replace the end with "..." to
// indicate that it was truncated.
snprintf(line+line_length-4, 4, "...");
}
INFO(NCCL_INIT, "%s", line);
}
ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@@ -32,7 +38,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+67 -41
Ver ficheiro
@@ -104,6 +104,9 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, link->type, &revLink));
revBw += fwBw;
}
// Coverity thinks that revLink could be NULL below. However, we access it only if revBw is non-0, and the
// logic of the code is that revBw can become non-0 only if revLink is non-NULL (see the "if" statement right above).
// coverity[var_deref_op]
if (link->bw < fwBw || (revBw && revLink->bw < revBw)) { *steps = step; return ncclSuccess; }
SUB_ROUND(link->bw, fwBw);
if (revBw) SUB_ROUND(revLink->bw, revBw);
@@ -444,6 +447,7 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
// 2. add other NETs satisfying typeInter but not already in the list.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
ncclResult_t ret = ncclSuccess;
int netCount = 0;
int localNetCount;
int* localNets;
@@ -456,8 +460,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
for (int c = 0; c<MAXCHANNELS; c++) {
int64_t netId;
NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL));
NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount));
NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL), ret, fail);
NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail);
if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break;
localNetCount++;
}
@@ -491,12 +495,15 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
}
*netCountRet = netCount;
exit:
free(localNets);
return ncclSuccess;
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
ncclResult_t ret = ncclSuccess;
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -518,6 +525,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
}
graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
int g = gpu - system->nodes[GPU].nodes;
int* nets = NULL;
if (step == backToNet) {
// first get back to NIC
if (system->nodes[NET].count) {
@@ -525,15 +533,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
int netCount;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail);
for (int i=0; i<netCount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
} else {
if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
}
// Balanced Tree : count half of the bandwidth on first two GPUs
int nextBackToNet = -1;
@@ -545,18 +555,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
graph->bwInter /= 2;
}
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail);
graph->bwInter = bwInterSave;
if (net) {
graph->inter[graph->nChannels*2+1] = net->id;
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time));
NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail);
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail);
graph->bwInter = bwInterSave;
}
}
free(nets);
}
} else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
@@ -592,23 +601,29 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
// Next path
NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return ncclSuccess;
exit:
if (nets) free(nets);
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
ncclResult_t ret = ncclSuccess;
const int bw = graph->bwInter;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netCount;
int graphFound = 0;
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail);
for (int i=0; i<netCount; i++) {
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break;
int n = nets[(graph->nChannels+i)%netCount];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->collNet && net->net.collSupport == 0) continue;
if (net->net.bw < bw) continue;
if (graph->crossNic && (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2
&& (graph->nChannels & 1) && net->id != graph->inter[(graph->nChannels-1)*2+1]) continue;
graph->inter[graph->nChannels*2] = net->id;
graph->latencyInter = net->net.latency;
@@ -624,31 +639,34 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
// NVLS search only tries to find NIC:GPU combinations to compute the heads.
if (graph->nChannels < netCount) {
int gpu;
int duplicate = 0;
NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
// check whether there is duplicate head when one GPU connects with multiple NICs
for (int gc = 0; gc < graph->nChannels; gc++) {
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
duplicate = 1;
break;
NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail);
if (gpu != -1) {
int duplicate = 0;
// check whether there is duplicate head when one GPU connects with multiple NICs
for (int gc = 0; gc < graph->nChannels; gc++) {
if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
duplicate = 1;
break;
}
}
if (!duplicate) {
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail);
graphFound = 1;
}
}
if (duplicate) continue;
if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
graphFound = 1;
}
} else {
if (graph->nChannels > 0) {
// Try to replay the last channel
int g;
NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail);
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail);
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0 && system->nodes[NVS].count == 0) {
// Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail);
if (t == -1) *time = -1;
}
@@ -660,7 +678,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (paths[g].bw > maxBw) {
maxBw = paths[g].bw;
minHops = paths[g].count;
} else if (paths[g].bw == maxBw && paths[g].count < minHops) {
} else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) {
minHops = paths[g].count;
}
}
@@ -668,7 +686,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
for (int i=0; i<system->nodes[GPU].count; i++) {
int g = (graph->nChannels+i)%system->nodes[GPU].count;
if (paths[g].bw == maxBw && paths[g].count == minHops) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail);
}
}
}
@@ -682,8 +700,11 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
}
exit:
free(nets);
return ncclSuccess;
return ret;
fail:
goto exit;
}
/* Search Patterns
@@ -1040,9 +1061,10 @@ search:
}
tmpGraph.typeInter = PATH_PIX;
if (crossNic == 2 && tmpGraph.crossNic == 0) {
if (crossNic == 2 && tmpGraph.crossNic == 0
&& (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) {
// Try again with crossNic if permitted
tmpGraph.crossNic = 1;
tmpGraph.crossNic = 2;
goto search;
}
tmpGraph.crossNic = crossNic == 1 ? 1 : 0;
@@ -1112,7 +1134,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
sprintf(line, "%2d :", c);
int offset = strlen(line);
if (system->nodes[NET].count > 0) {
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c]);
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
offset = strlen(line);
}
for (int i=0; i<ngpus; i++) {
@@ -1120,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
offset = strlen(line);
}
if (system->nodes[NET].count > 0) {
sprintf(line+offset, " %s/%lx", topoNodeTypeStr[NET], graph->inter[2*c+1]);
sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c]));
offset = strlen(line);
}
INFO(NCCL_GRAPH, "%s", line);
@@ -1129,16 +1151,20 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
}
// Dump the given graphs as XML to the file named by NCCL_GRAPH_DUMP_FILE.
// Does nothing (and returns ncclSuccess) when the environment variable is unset.
//   system  : topology the graphs were computed on
//   ngraphs : number of entries in graphs
//   graphs  : graphs to serialize
// Returns the first error reported by the XML helpers, ncclSuccess otherwise.
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
  ncclResult_t ret = ncclSuccess;
  const char* str = ncclGetEnv("NCCL_GRAPH_DUMP_FILE");
  // Declared at function scope (not inside the if) so the cleanup path below can see it.
  struct ncclXml* xml = NULL;
  if (str) {
    INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
    // Nothing is owned yet at this point, so a plain early return is fine.
    NCCLCHECK(xmlAlloc(&xml, NCCL_GRAPH_XML_MAX_NODES));
    // From here on xml is owned by this function: every failure must go through the cleanup path.
    NCCLCHECKGOTO(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml), ret, fail);
    NCCLCHECKGOTO(ncclTopoDumpXmlToFile(str, xml), ret, fail);
  }
exit:
  // free(NULL) is a no-op, so this is also correct when no dump was requested.
  free(xml);
  return ret;
fail:
  goto exit;
}
#include "comm.h"
+95 -86
Ver ficheiro
@@ -192,6 +192,7 @@ int getBcmGen(uint64_t id, int level) {
return 0;
}
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
ncclResult_t ret = ncclSuccess;
for (int s=0; s<system->nodes[PCI].count; s++) {
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
int gen = getBcmGen(pciSwitch->pci.device, 0);
@@ -217,7 +218,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
for (int s=0; s<subs; s++) {
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
int index;
NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
NCCLCHECKGOTO(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index), ret, fail);
struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
// Connect all sub PCI devices to the parent switch
for (int l=0; l<sub->nlinks; l++) {
@@ -226,7 +227,8 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
// Add link from parent PCI switch -> PCI device
if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
ret = ncclInternalError;
goto fail;
}
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
pciSwitch->nlinks++;
@@ -238,16 +240,20 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
}
}
}
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
NCCLCHECKGOTO(ncclTopoRemoveNode(system, PCI, index), ret, fail);
}
// Set subdevice to 0xffff to make sure we don't merge this switch again.
pciSwitch->pci.device |= 0xffff;
free(subSwIds);
// Restart, as system->nodes[PCI].nodes has changed.
s = 0;
continue;
fail:
free(subSwIds);
return ret;
}
}
return ncclSuccess;
return ret;
}
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
@@ -281,7 +287,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) {
sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
sprintf(line+offset, "+ %s[%2.1f] - %s/%lx-%lx", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
INFO(NCCL_GRAPH, "%s", line);
} else if (link->type != LINK_PCI || link->remNode != prevNode) {
sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
@@ -290,9 +296,9 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id));
}
INFO(NCCL_GRAPH, "%s", line);
}
@@ -720,84 +726,87 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
}
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
ncclResult_t ret = ncclSuccess;
struct ncclXml* xml;
char* mem = NULL;
int* localRanks = NULL;
int netDevCount = 0;
struct ncclXml* rankXml;
int localRank = -1, nLocalRanks = 0;
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
NCCLCHECKGOTO(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1), ret, fail);
} else {
// Try default XML topology location
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail);
}
if (xml->maxIndex == 0) {
// Create top tag
struct ncclXmlNode* top;
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
NCCLCHECKGOTO(xmlAddNode(xml, NULL, "system", &top), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION), ret, fail);
}
NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
NCCLCHECKGOTO(ncclTopoRefreshBcmP2pLinks(), ret, fail);
// Detect only the GPU managed by this process. We'll get any others through XML fusion.
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail);
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail);
if (node) {
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail);
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail);
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, n, &props));
NCCLCHECKGOTO(collNetGetProperties(comm, n, &props), ret, fail);
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail);
}
}
if (netDevCount == 0) {
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail);
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
NCCLCHECKGOTO(comm->ncclNet->getProperties(n, &props), ret, fail);
comm->netDeviceType = props.netDeviceType;
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail);
NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail);
NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail);
NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail);
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail);
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail);
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail);
// XML topo fusion.
int* localRanks;
int localRank = -1, nLocalRanks = 0;
if (comm->MNNVL) {
// MNNVL clique support
nLocalRanks = comm->clique.size;
@@ -805,7 +814,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
localRanks = comm->clique.ranks;
} else {
// Intra-node fusion. Much of the comm is not initialized yet at this point so we need to do our own calculations.
NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
NCCLCHECKGOTO(ncclCalloc(&localRanks, comm->nRanks), ret, fail);
for (int i = 0; i < comm->nRanks; i++) {
if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
if (i == comm->rank)
@@ -814,37 +823,42 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
}
}
}
char* mem;
NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
NCCLCHECKGOTO(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
NCCLCHECKGOTO(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1), ret, fail);
// nLocalRanks can't actually be 0, or we wouldn't be running at all...
// coverity[divide_by_zero]
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)), ret, fail);
if (comm->MNNVL) {
// Ensure that we have enough room when fusing topos from multiple nodes.
free(xml);
NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
xml = NULL;
NCCLCHECKGOTO(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES), ret, fail);
} else {
// In the intra-node case there's no need to enlarge the topo xml.
xml->maxIndex = 0;
free(localRanks);
}
for (int i = 0; i < nLocalRanks; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
NCCLCHECKGOTO(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0), ret, fail);
NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail);
}
free(mem);
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail);
}
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail);
exit:
if (!comm->MNNVL && localRanks) free(localRanks);
if (mem) free(mem);
free(xml);
return ncclSuccess;
return ret;
fail:
goto exit;
}
ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) {
@@ -853,6 +867,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index
int count = 0;
NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count));
struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType];
if (paths == NULL) { *localCount = 0; return ncclSuccess; }
for (int i=0; i<system->nodes[resultType].count; i++) {
if (paths[i].bw > maxBw || (paths[i].bw == maxBw && paths[i].type < minType)) {
maxBw = paths[i].bw;
@@ -891,6 +906,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
}
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
ncclResult_t ret = ncclSuccess;
int gpu;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu));
int* localNets;
@@ -898,39 +914,46 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL));
int* localGpus = NULL;
int localGpuCount;
NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL));
int net = system->nodes[GPU].nodes[gpu].gpu.dev;
int net;
NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail);
net = system->nodes[GPU].nodes[gpu].gpu.dev;
if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount);
net += channelId%(DIVUP(localNetCount,localGpuCount));
if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id;
if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev;
exit:
free(localNets);
free(localGpus);
return ncclSuccess;
if (localGpus) free(localGpus);
return ret;
fail:
goto exit;
}
// Find the GPU whose local network device, on some channel, is the NET node netId.
//   system   : topology to search
//   netId    : id of the NET node to match
//   gpuIndex : out; index into system->nodes[GPU].nodes of the first match,
//              or -1 when no GPU local to that NET node maps back to it.
// On error the ncclResult_t of the failing call is returned. If the failure comes
// from ncclTopoGetLocalNet, *gpuIndex is left untouched.
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
  ncclResult_t ret = ncclSuccess;
  int netIndex;
  NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex));
  int* localGpus = NULL;
  int localGpuCount;
  int foundGpu = -1;
  NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL));
  // Channels are the outer loop, so the match on the lowest channel wins.
  for (int c=0; c<MAXCHANNELS; c++) {
    for (int lg=0; lg<localGpuCount; lg++) {
      int g = localGpus[lg];
      struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
      int64_t id;
      // Must not return directly here: localGpus is owned by this function.
      NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail);
      if (netId == id) {
        foundGpu = g;
        goto exit;
      }
    }
  }
exit:
  // Reached on a match and on an exhausted search (foundGpu is still -1 in that case).
  *gpuIndex = foundGpu;
fail:
  // Shared cleanup for the success and error paths.
  free(localGpus);
  return ret;
}
/****************************/
@@ -948,25 +971,11 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) {
struct ncclTopoNode* cpu = NULL, *gpu = NULL;
for (int g=0; g<system->nodes[GPU].count; g++) {
if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
gpu = system->nodes[GPU].nodes+g;
// Find closer CPU
int cpuIndex = -1, minHops = 0;
for (int c=0; c<system->nodes[CPU].count; c++) {
int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
if (cpuIndex == -1 || nHops < minHops) {
cpuIndex = c;
minHops = nHops;
}
}
cpu = system->nodes[CPU].nodes+cpuIndex;
}
}
if (cpu == NULL) {
WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
return ncclInternalError;
}
int gpuIndex, cpuIndex;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex));
NCCLCHECK(ncclGetLocalCpu(system, gpuIndex, &cpuIndex));
gpu = system->nodes[GPU].nodes+gpuIndex;
cpu = system->nodes[CPU].nodes+cpuIndex;
// Query the CPU affinity set we were provided
cpu_set_t mask;
+4 -3
Ver ficheiro
@@ -30,7 +30,7 @@
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
#define NCCL_TOPO_NODE_TYPES 7
#define NCCL_TOPO_NODE_TYPES 6
#define GPU 0
#define PCI 1
#define NVS 2
@@ -103,9 +103,10 @@ struct ncclTopoLinkList {
#define NCCL_TOPO_UNDEF (-1)
// A topology node id packs a system id in the top 8 bits and a local id in the low 56 bits.
// Low 56 bits: the local-id part of a node id.
#define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff
// Extract the system id (top 8 bits). The argument is parenthesized so that
// expression arguments such as (a + b) are shifted as a whole.
#define NCCL_TOPO_ID_SYSTEM_ID(id) ((id) >> 56)
// Extract the local id (low 56 bits).
#define NCCL_TOPO_ID_LOCAL_ID(id) ((id) & NCCL_TOPO_ID_LOCAL_ID_MASK)
// Build a node id. The local id is masked so that it cannot spill into the system-id bits.
#define NCCL_TOPO_ID(systemid, localid) (((int64_t)(systemid) << 56) + ((localid) & NCCL_TOPO_ID_LOCAL_ID_MASK))
struct ncclTopoNode {
int type;
+52 -28
Ver ficheiro
@@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
// Base latency for each [algorithm][protocol] pair; protocol order is { LL, LL128, Simple }.
// One row per algorithm, in the order given by the trailing comments. Any algorithm
// index beyond the rows listed here is zero-initialized by the language.
static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
  { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring
  { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain
  { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree
@@ -64,15 +64,15 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {
#define NCCL_HW_NET 2
// Hardware latencies indexed by [link type][algorithm][protocol].
// Link types are NVLINK, PCI and NET, in that order (NET is index NCCL_HW_NET).
// Each group holds exactly one row per algorithm, in the order of the inline comments;
// a duplicated row would shift every following algorithm into the wrong slot.
// Deliberately not const: ncclTopoTuneModel overwrites the PCI Tree/Simple entry on Power CPUs.
static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
  { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 },
    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
    /* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } },
  /* PCI */
  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 },
    /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 },
    /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } },
  /* NET */
  { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 },
    /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 },
    /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } }
};
@@ -105,6 +105,15 @@ static const double perChMaxTreeBws[3][3] = {
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};
NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
// Tell whether the PAT algorithm may be used on this communicator.
// Any PAT_ENABLE setting other than 2 is returned unchanged. With the value 2
// (the default of the parameter) the answer is derived from the communicator:
// 1 when PAT is usable, 0 otherwise.
static int ncclPatEnable(struct ncclComm* comm) {
  const int setting = ncclParamPatEnable();
  if (setting == 2) {
    // PAT only supports 1 GPU per node, and does not support net device offload.
    return (comm->nNodes == comm->nRanks && comm->netDeviceType == NCCL_NET_DEVICE_HOST) ? 1 : 0;
  }
  return setting;
}
// Network post overhead in ns (1000 = 1 us)
NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
@@ -146,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
// De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount
float ppn = (float)nRanks / nNodes;
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
@@ -156,18 +165,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll == ncclFuncBroadcast && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduce && a != NCCL_ALGO_RING) continue;
if (coll == ncclFuncReduceScatter && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
&& a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
&& a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
&& a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter);
@@ -176,11 +185,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Various model refinements
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
if (a == NCCL_ALGO_PAT) busBw *= .85;
if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used
if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
@@ -208,7 +218,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
}
// Convert bus BW to algorithm BW
if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
float ratio = 1.0f;
if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps;
else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0;
@@ -222,7 +232,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p] + graphs[a]->latencyInter;
// With ppn=1 latencies are fully exposed, use the Tree network latency
float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p];
interLat += graphs[a]->latencyInter;
// Also add the flush extra latency
if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
@@ -243,11 +255,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
}
intraLat = std::max(intraLat, netOverhead);
int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
comm->latencies[coll][a][p] +=
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
if (coll == ncclFuncAllReduce) {
comm->latencies[coll][a][p] +=
2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
}
} else if (a == NCCL_ALGO_COLLNET_DIRECT) {
comm->latencies[coll][a][p] +=
2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency
@@ -258,6 +273,12 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
} else if (a == NCCL_ALGO_NVLS_TREE) {
comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
} else if (a == NCCL_ALGO_PAT) {
if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
comm->latencies[coll][a][p] = 8 // Base time
+ log2i(nNodes) * (interLat/3.5) // Log latency
+ nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
}
}
}
}
@@ -266,7 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 };
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 };
const char *protoStr = ncclGetEnv("NCCL_PROTO");
if (protoStr) {
@@ -336,23 +357,25 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (comm->rank == 0) {
char line[1024];
for (int block=0; block<2; block++) {
for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
sprintf(line, " Algorithm |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], "");
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Protocol |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
for (int ba=0; ba<3; ba++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14s |", ncclProtoStr[p]);
}
}
INFO(NCCL_TUNING, "%s", line);
sprintf(line, " Max NThreads |");
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
@@ -360,8 +383,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
for (int ba=0; ba<NCCL_NUM_ALGORITHMS/2; ba++) {
int a = block*NCCL_NUM_ALGORITHMS/2+ba;
for (int ba=0; ba<3; ba++) {
int a = block*3+ba;
if (a >= NCCL_NUM_ALGORITHMS) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
@@ -431,7 +455,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm,
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(nBytes>>6);
if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_TREE && coll == ncclFuncAllReduce && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
&& coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
+5 -2
Ver ficheiro
@@ -468,8 +468,8 @@ ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml*
return ncclInternalError;
}
// Set affinity
char cpumaskPath[] = "/sys/devices/system/node/node0000";
sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
char cpumaskPath[] = "/sys/devices/system/node/node000000";
snprintf(cpumaskPath, sizeof(cpumaskPath), "/sys/devices/system/node/node%s", numaId);
NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
}
@@ -690,6 +690,9 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
}
pciNode->parent = parent;
// Keep PCI sub devices ordered by PCI Bus ID (Issue #820)
// Coverity complains about dereferenced parent being NULL
// but this can never happen.
// coverity[var_deref_op]
int subIndex = parent->nSubs;
const char* newBusId;
NCCLCHECK(xmlGetAttrStr(pciNode, "busid", &newBusId));
+39 -19
Ver ficheiro
@@ -57,7 +57,12 @@ ncclResult_t ncclAsyncLaunch(
WARN("Blocking and nonblocking communicators are not allowed in the same group.");
ret = ncclInvalidArgument;
}
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
if (ret == ncclSuccess) {
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
} else {
// no need to undo, the job hasn't run
if (destructor) destructor(job);
}
}
return ret;
@@ -75,7 +80,7 @@ void* ncclAsyncJobMain(void* arg) {
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) {
ncclResult_t ret;
SYSCHECK(pthread_join(job->thread, NULL), "pthread_join");
PTHREADCHECK(pthread_join(job->thread, NULL), "pthread_join");
if (job->result != ncclSuccess) {
WARN("ncclAsyncJobComplete: job %p failed, job error %d", job, job->result);
}
@@ -165,6 +170,12 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
break;
}
case NCCL_ALGO_PAT: {
NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
break;
}
// Yes, this is dead code. That's fine...
// coverity[dead_error_begin]
default: {
ret = ncclInternalError;
goto fail;
@@ -301,7 +312,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
ncclKernelPlanner::Peer* tmp = comm->planner.peers;
memset(&comm->planner, 0, sizeof(comm->planner));
comm->planner.peers = tmp;
memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
}
if (!comm->config.blocking)
@@ -329,7 +340,7 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
do {
SYSCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), ret, fail);
PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail);
job = job->next;
} while (job != nullptr);
@@ -341,8 +352,9 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (state == ncclGroupJobRunning) {
jobsDone = false;
} else if (state == ncclGroupJobDone) {
if (pthread_join(job->thread, nullptr) != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
int err;
if ((err = pthread_join(job->thread, nullptr)) != 0) {
WARN("Error waiting for pthread_join: %s", strerror(err));
ret = ncclSystemError;
}
job->state = ncclGroupJobJoined;
@@ -373,13 +385,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &n
if (ret != ncclSuccess) goto fail;
}
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
exit:
return ret;
fail:
@@ -393,6 +398,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
bool *groupAbortFlag = gjob->abortFlagPtr;
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
@@ -409,7 +415,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
ncclIntruQueueEnqueue(asyncJobsMain, (struct ncclAsyncJob*)job);
struct ncclComm* next = comm->preconnectNext;
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
@@ -422,12 +428,14 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
/* Connect channels at runtime if cumem is supported */
if (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
ncclIntruQueueConstruct(&asyncCollJobs);
do {
bool needConnect = false;
bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
if (comm->cuMemSupport && needConnect) {
@@ -438,21 +446,33 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf
job->base.destructor = free;
job->base.state = ncclGroupJobRunning;
job->base.abortFlag = comm->abortFlag;
job->base.abortFlagDev = comm->abortFlagDev;
job->comm = comm;
NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
}
comm = comm->groupNext;
} while (comm);
NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
if (job->destructor) job->destructor((void*)job);
}
}
if ((!simInfo) && (groupCommHeadMain != nullptr)) {
NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
}
while (!ncclIntruQueueEmpty(asyncJobsMain)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
(void) ncclCommSetAsyncError(job->comm, ret);
if (job->destructor) job->destructor((void*)job);
}
while (groupCommHeadMain != nullptr) {
struct ncclComm* comm = groupCommHeadMain;
struct ncclComm* next = comm->groupNext;
@@ -517,7 +537,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
ncclGroupJobMainPtr = &ncclGroupJobMain;
/* make sure ncclGroupBlocking has been set. */
assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1);
if (ncclGroupBlocking == 0 && (ncclGroupCommPreconnectHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs))) {
if (ncclGroupBlocking == 0) {
/* nonblocking group */
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
@@ -539,7 +559,7 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
}
ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail);
ret = ncclInProgress;
} else {
/* blocking group */
+135 -10
Ver ficheiro
@@ -17,6 +17,11 @@
#include <stdlib.h>
#include <string.h>
#if CUDART_VERSION >= 11030
#include <cuda.h>
#include "cudawrap.h"
#endif
uint64_t clockNano(); // from utils.h with which we have a circular dependency
template<typename T>
@@ -24,6 +29,81 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
template<>
constexpr size_t ncclSizeOfT<void>() { return 1; }
#if CUDART_VERSION >= 12020
// Allocates pinned host memory through the CUDA virtual memory management
// driver API and maps it so that both the current GPU and the CPU can access it.
//   ptr     [out] receives the virtual address of the new mapping.
//   handlep [out] optional; when non-NULL receives the allocation handle.
//   size    [in]  requested size in bytes; rounded up to the minimum
//                 allocation granularity before allocating.
// Returns ncclSuccess when every step succeeds.
// NOTE(review): CUDACHECK/CUCHECK presumably return from this function on
// failure; nothing created by the earlier steps (handle, address reservation,
// mapping) is rolled back in that case — confirm callers treat this as fatal.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
  ncclResult_t result = ncclSuccess;
  size_t granularity = 0;
  CUdevice currentDev;
  CUmemAllocationProp prop = {};
  CUmemAccessDesc accessDesc = {};
  CUmemGenericAllocationHandle handle;
  int cudaDev;
  int cpuNumaNodeId = -1;
  CUmemAllocationHandleType type = ncclCuMemHandleType;

  // Find the host NUMA node associated with the current CUDA device; fall back
  // to node 0 when the driver reports a negative id.
  CUDACHECK(cudaGetDevice(&cudaDev));
  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
  CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
  if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;

  prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.requestedHandleTypes = type; // So it can be exported
  prop.location.id = cpuNumaNodeId;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);

  /* Allocate the physical memory on the selected host NUMA node */
  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
  /* Reserve a virtual address range */
  CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, granularity, 0, 0));
  /* Map the virtual address range to the physical allocation */
  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));

  /* Now allow RW access to the newly mapped memory for local GPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDesc.location.id = cudaDev;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  /* Now allow RW access to the newly mapped memory from the CPU */
  accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
  accessDesc.location.id = cpuNumaNodeId;
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));

  if (handlep) *handlep = handle;
  // size and granularity are size_t, so they are printed with %zu.
  INFO(NCCL_ALLOC, "CUMEM Host Alloc Size %zu pointer %p handle %llx numa %d dev %d granularity %zu", size, *ptr, handle, cpuNumaNodeId, cudaDev, granularity);
  return result;
}
// Unmaps and frees a host buffer identified by its mapped address `ptr`.
// A NULL pointer is accepted and treated as a successful no-op.
// The allocation handle and the mapping size are both recovered from `ptr`
// through the driver, so the caller does not need to keep them.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
CUmemGenericAllocationHandle handle;
size_t size = 0;
// Look up the handle backing this address. The release right after it drops
// the reference taken by this lookup (per the CUDA driver API, a retained
// handle must be released).
CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
CUCHECK(cuMemRelease(handle));
// Recover the size of the address range so it can be unmapped and freed.
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
TRACE(NCCL_ALLOC, "CUMEM Host Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
// Second release: presumably drops the reference held since the allocation
// was created, allowing the physical memory to be freed — confirm.
CUCHECK(cuMemRelease(handle));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
#else /* CUDART_VERSION >= 12020 */
// Fallback for builds where CUDART_VERSION < 12020: always fails.
// Logs a warning and returns ncclInternalError; none of the arguments are used.
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, void* handlep, size_t size) {
WARN("CUMEM Host is not supported prior to CUDA 12.2");
return ncclInternalError;
}
// Fallback for builds where CUDART_VERSION < 12020: always fails.
// Logs a warning and returns ncclInternalError, even for a NULL `ptr`.
static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
WARN("CUMEM Host is not supported prior to CUDA 12.2");
return ncclInternalError;
}
#endif /* CUDART_VERSION >= 12020 */
template <typename T>
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
@@ -40,24 +120,25 @@ finish:
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
return result;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
inline ncclResult_t ncclCudaHostFree(void* ptr) {
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
return ncclSuccess;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
template <typename T>
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
if (nelem > 0) {
void* p = malloc(nelem*ncclSizeOfT<T>());
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
//INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
memset(p, 0, nelem*ncclSizeOfT<T>());
*ptr = (T*)p;
*ptr = p;
} else {
*ptr = NULL;
}
@@ -67,17 +148,17 @@ ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int li
template <typename T>
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if (nelem < oldNelem) return ncclInternalError;
T* oldp = *ptr;
if (nelem < oldNelem || (oldp == NULL && oldNelem > 0)) return ncclInternalError;
if (nelem == oldNelem) return ncclSuccess;
T* oldp = *ptr;
T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
return ncclSystemError;
}
memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
free(oldp);
if (oldp && oldNelem) memcpy(p, oldp, oldNelem * ncclSizeOfT<T>());
if (oldp) free(oldp);
memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
*ptr = (T*)p;
INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
@@ -89,6 +170,40 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
#include <cuda.h>
#include "cudawrap.h"
// ncclCuMemAllocAddr maps an existing allocation into a fresh virtual address range.
//   ptr      [out] receives the address of the new mapping.
//   handleIn [in]  handle of the allocation to map; its properties are queried
//                  to obtain the allocation granularity.
//   size     [in]  number of bytes to map; rounded up to that granularity.
// Read/write access is granted to the current CUDA device only.
// NOTE(review): if a step after cuMemAddressReserve fails, the reserved range
// is not released here — CUCHECK presumably returns immediately; confirm.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
CUmemAllocationProp prop = {};
CUmemAccessDesc accessDesc = {};
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUCHECK(cuMemGetAllocationPropertiesFromHandle(&prop, *handleIn));
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
/* Map the virtual address range to the physical allocation */
CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, *handleIn, 0));
/* Now allow RW access to the newly mapped memory */
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
TRACE(NCCL_ALLOC, "CuMem Map Size %zu pointer %p handle %llx", size, *ptr, *handleIn);
return result;
}
// Unmaps `ptr` and frees its virtual address range.
// A NULL pointer is accepted and treated as a successful no-op.
// Unlike ncclCuMemHostFree, no allocation handle is released here: only the
// mapping and the address reservation are undone.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
if (ptr == NULL) return ncclSuccess;
ncclResult_t result = ncclSuccess;
size_t size = 0;
// The size of the range is recovered from the driver rather than passed in.
CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
return result;
}
static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
ncclResult_t result = ncclSuccess;
size_t granularity = 0;
@@ -106,7 +221,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
prop.requestedHandleTypes = type;
prop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
@@ -154,6 +269,15 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
return ncclInternalError;
}
// Stub that always fails: logs that CUMEM is not supported prior to CUDA 11.3
// and returns ncclInternalError. None of the arguments are used.
static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
// Stub that always fails: logs that CUMEM is not supported prior to CUDA 11.3
// and returns ncclInternalError, even for a NULL `ptr`.
static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
WARN("CUMEM not supported prior to CUDA 11.3");
return ncclInternalError;
}
#endif
template <typename T>
@@ -274,7 +398,8 @@ finish:
// and if they are shared, that could cause a crash in a child process
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
if (size > 0) {
size_t page_size = sysconf(_SC_PAGESIZE);
long page_size = sysconf(_SC_PAGESIZE);
if (page_size < 0) return ncclSystemError;
void* p;
int size_aligned = ROUNDUP(size, page_size);
int ret = posix_memalign(&p, page_size, size_aligned);
+11
Ver ficheiro
@@ -185,6 +185,8 @@ inline __host__ __device__ Int pow2Up(Int x) {
// Returns Int(1) shifted left by log2Down(x) — presumably the largest power of
// two that does not exceed x (log2Down is defined elsewhere; confirm).
template<typename Int>
inline __host__ __device__ Int pow2Down(Int x) {
// log2Down can return -1, which would make the shift count negative; this is
// tolerated because callers do not normally pass 0 as an argument.
// coverity[negative_shift]
return Int(1)<<log2Down(x);
}
@@ -274,4 +276,13 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
return u32fpDecode(x, 3);
}
// Hashes the first `n` bytes of `string` with DJB2a
// (result = result * 33 ^ byte, starting from 5381).
//   string [in] buffer to hash; it does not need to be NUL-terminated.
//   n      [in] number of bytes to read; n <= 0 yields the seed value 5381.
// Each byte is read as an unsigned value so that the result is the same on
// platforms where plain `char` is signed and on those where it is unsigned
// (a signed byte >= 0x80 would otherwise sign-extend and flip the upper bits).
inline __host__ __device__ uint64_t getHash(const char* string, int n) {
  uint64_t result = 5381;
  for (int c = 0; c < n; c++) {
    const uint64_t byte = (unsigned char)string[c];
    result = ((result << 5) + result) ^ byte;
  }
  return result;
}
#endif
+2 -2
Ver ficheiro
@@ -19,8 +19,8 @@ static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Boots
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapInit(int nHandles, void* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
+38 -23
Ver ficheiro
@@ -38,21 +38,17 @@
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
#define SYSCHECK(statement, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
WARN("Call to " name " failed: %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
#define SYSCHECKSYNC(statement, name, retval) do { \
retval = (statement); \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
@@ -60,14 +56,33 @@
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
#define SYSCHECKGOTO(statement, name, RES, label) do { \
int retval; \
SYSCHECKSYNC((statement), name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed: %s", strerror(errno)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0);
} while (0)
// Pthread calls don't set errno and never return EINTR.
// PTHREADCHECK(statement, name): evaluates `statement` once and treats any
// non-zero result as an error code. On error it emits a WARN containing
// strerror() of that code and returns ncclSystemError from the enclosing
// function. `name` must be a string literal, since it is concatenated into the
// format string. The macro declares a local named `retval` in its own scope.
#define PTHREADCHECK(statement, name) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
return ncclSystemError; \
} \
} while (0)
// PTHREADCHECKGOTO(statement, name, RES, label): same check as a pthread call
// returning its error code directly (non-zero means failure), but instead of
// returning it stores ncclSystemError into `RES` and jumps to `label`.
// `name` must be a string literal, since it is concatenated into the format
// string. `RES` is left untouched when the call succeeds.
#define PTHREADCHECKGOTO(statement, name, RES, label) do { \
int retval = (statement); \
if (retval != 0) { \
WARN("Call to " name " failed: %s", strerror(retval)); \
RES = ncclSystemError; \
goto label; \
} \
} while (0)
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
@@ -75,7 +90,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
@@ -84,7 +99,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
@@ -92,7 +107,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
} while (0)
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
@@ -101,7 +116,7 @@
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
} while (0)
// Propagate errors up
#define NCCLCHECK(call) do { \
@@ -111,7 +126,7 @@
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
} while (0)
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
@@ -120,7 +135,7 @@
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
} while (0)
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
@@ -130,7 +145,7 @@
return ncclInternalError; \
} \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
} while (!(cond))
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
uint32_t* tmpAbortFlag = (abortFlagPtr); \
@@ -140,7 +155,7 @@
goto label; \
} \
if (__atomic_load(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
} while (!(cond))
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
+486
Ver ficheiro
@@ -64,4 +64,490 @@ struct ncclConnFifo {
ssize_t size;
void* ptr;
};
#include <stdio.h>
// Step generator for the PAT (Parallel Aggregated Trees) schedule; the "RS"
// in the name presumably stands for ReduceScatter.
// The object is a small state machine: the constructor derives the aggregation
// parameters from the step size/depth and the rank count, and every call to
// getNextOp() produces the description of the next operation (which dimension
// to receive from / send to, buffer offsets, and whether to post).
// The element range [offset, end) is processed in chunks of at most
// chunkCount elements; the schedule restarts for every chunk.
template<typename T>
class PatRSAlgorithm{
  size_t offset;     // first element of the chunk currently being processed
  size_t end;        // one past the last element to process
  size_t count;      // multiplier applied to a rank index when computing inpIx
  int chunkCount;    // maximum number of elements per chunk
  int nelem;         // number of elements in the current chunk
  int rank;
  int nranks;
  int nrPow2;        // 1<<log2Up(nranks)
  int postFreq;      // offsets wrap, and posts are triggered, every postFreq steps
  int lastA;         // number of steps in the current aggregated step
  int aggFactor;
  int as; // aggregated steps
  int a; // step inside aggregated step
  int sendSkipped; // number of skipped steps during aggregation
  int recvSkipped; // number of skipped steps during aggregation
  int phase2recv; // receive offset for phase 2
  int aggDelta;
  int scale;
  int phase;

  // Number of elements in the chunk starting at `offset`: the remaining
  // element count, capped at chunkCount. The comparison is done in size_t so
  // that a remaining count that does not fit in an int is not truncated
  // (possibly to zero or a negative value) before being compared.
  __device__ __host__ int getNelem() {
    size_t remaining = end - offset;
    if (chunkCount > 0 && remaining < (size_t)chunkCount) return (int)remaining;
    return chunkCount;
  }

  // For every bit position below `max`, adds the mirrored bit weight when that
  // bit is clear in `i`; i.e. bit-reverses the complement of `i` within the
  // bit positions covered by `max`.
  __device__ __host__ int mirrorInvert(int i, int max) {
    int ret = 0;
    for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
      if ((i&mask) == 0) ret += imask;
    }
    return ret;
  }

  // Zero-based index of the lowest set bit of `i`, or `max` when `i` is 0.
  __device__ __host__ int firstBitSet(int i, int max) {
    int ffs =
#ifdef __CUDA_ARCH__
      __ffs(i);
#else
      __builtin_ffs(i);
#endif
    return ffs ? ffs-1 : max;
  }

  // Restarts the inner step counter and the skip counters, and recomputes the
  // number of inner steps for the current phase/scale.
  __device__ __host__ void resetA() {
    a = 0;
    sendSkipped = recvSkipped = 0;
    lastA = aggFactor;
    if (phase >= 2) lastA /= 2*scale;
  }

  // Restarts the whole schedule for the chunk starting at `offset`.
  __device__ __host__ void reset() {
    nelem = getNelem();
    phase = 0;
    scale = 1;
    phase2recv = 0;
    as = aggDelta - 1;
    resetA();
  }

  // Number of bits set in `i`.
  __device__ __host__ int nBitsSet(int i) {
    int nbits =
#ifdef __CUDA_ARCH__
      __popc(i);
#else
      __builtin_popcount(i);
#endif
    return nbits;
  }

  // Return 1 when only upper bits are set. For example, if nrpow2==16 we'll return 1 for 8, 12, 14, 15.
  // A number being in the form of 1111000 implies that the complementary is 0000111 meaning it's a power of 2 minus 1.
  __device__ __host__ int newPeer(int i, int pow2) {
    //printf("New peer %d/%d -> %d\n", i, pow2, nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0);
    return nBitsSet((i ^ (pow2-1)) + 1) == 1 ? 1 : 0;
  }

public:
  // Builds the schedule state.
  //   stepSize   size of one step, compared against channelSize*sizeof(T).
  //   stepDepth  each halving of this value (while > 1) allows one more
  //              doubling of aggFactor.
  //   offset,end element range handled by this object.
  //   count      multiplier applied to a rank index when computing inpIx.
  //   chunkCount maximum number of elements processed per schedule pass.
  //   rank,nranks this rank and the total number of ranks.
  // NOTE(review): the first loop divides by (end-offset)*sizeof(T)*aggFactor,
  // so it presumably relies on callers never passing an empty range — confirm.
  __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
    offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
    aggDelta = nrPow2 = (1<<log2Up(nranks));

    aggFactor = 1;
    size_t channelSize = end-offset;
    // Double aggFactor (halving aggDelta) while at least two aggregated
    // channel-sized pieces fit in one step, up to nranks/2.
    while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
      aggFactor *= 2;
      aggDelta /= 2;
    }
    postFreq = aggFactor;
    // Keep doubling aggFactor, once per halving of stepDepth; postFreq keeps
    // the value reached by the first loop.
    int d = stepDepth;
    while (d > 1 && aggFactor < nranks/2) {
      d /= 2;
      aggFactor *= 2;
      aggDelta /= 2;
    }

    reset();
  }

  // Produces the next operation of the schedule and advances the state.
  // Steps whose distance falls outside [0, nranks) are skipped internally: the
  // function loops (via `restart`) until it has a step to report.
  // Outputs:
  //   recvDim / sendDim   index of the lowest set bit of the step distance on
  //                       the receive / send side, or -1 when the operation has
  //                       no receive / send.
  //   inpIx               rank index times count, plus the current offset.
  //   outIx               the current offset (0 in phase 0).
  //   recvOffset / sendOffset  element offsets (multiples of nelem), or -1.
  //   sendStepOffset      extra step offset for the send.
  //   nelemOut            number of elements in the current chunk.
  //   postRecv / postSend 1 when the receive / send should be posted.
  //   last                1 once the final chunk's last operation is returned.
  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
restart:
    last = 0;
    nelemOut = nelem;
    outIx = offset;
    int skip = 0;
    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
    if (phase == 0) {
      // Send-only steps: nothing is received (recvDim == -1).
      int s = mirrorInvert(a, lastA)*aggDelta + as;
      if (s >= nranks) skip = 1;
      int sendDataRank = (rank + s) % nranks;
      inpIx = sendDataRank * count + offset;
      recvDim = -1;
      sendDim = 0;
      outIx = 0;
      recvOffset = -1;
      sendOffset = ((a - sendSkipped)%postFreq) * nelem;
      sendStepOffset = 0;
      if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
        postSend = 1;
      } else {
        postSend = 0;
      }
      postRecv = 0;
      if (skip) sendSkipped++;
      if (++a == lastA) {
        phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 1) {
      int s = mirrorInvert(a, lastA)*aggDelta + as;
      if (s >= nranks) skip = 1;
      recvDim = firstBitSet(s, nrPow2);
      sendOffset = ((a - sendSkipped)%postFreq)*nelem;
      recvOffset = ((a - recvSkipped)%postFreq)*nelem;
      postSend = 0;
      if (recvDim == 0) {
        if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1;
        sendStepOffset = 0;
      } else {
        sendStepOffset = (a - sendSkipped)/postFreq;
      }
      if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) {
        postRecv = 1;
      } else {
        postRecv = 0;
      }
      s -= (1<<recvDim);
      int recvDataRank = (rank + nranks + s) % nranks;
      inpIx = recvDataRank * count + offset;
      sendDim = s ? firstBitSet(s, nrPow2) : -1;
      if (sendDim == -1) {
        sendOffset = -1;
        sendStepOffset = 0;
      } else if (as - (1<<recvDim) == 0) {
        if (newPeer(a, aggFactor)) sendSkipped = a;
        int foffset = a - sendSkipped;
        sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq;
        sendOffset = (foffset%postFreq)*nelem;
      }
      // The receive distance was out of range but the send distance is valid:
      // report a send-only operation instead of skipping the step.
      if (s < nranks && skip) {
        recvDim = -1;
        recvOffset = -1;
        postRecv = 0;
        skip = 0;
      }
      if (skip || recvDim == -1) recvSkipped++;
      if (skip) sendSkipped++;
      if (++a == lastA) {
        as--;
        phase = as % 2 == 1 ? 0 : 1;
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 2) {
      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1;
      postRecv = 0;
      if (s >= nranks) skip = 1;
      recvDim = 0;
      postSend = a == lastA-1 ? 1 : 0;
      s -= 1;
      if (s < nranks && skip) {
        recvDim = -1;
        recvOffset = -1;
        skip = 0;
      } else if (!skip) {
        int foffset = phase2recv;
        phase2recv++;
        postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
        recvOffset = (foffset%postFreq) * nelem;
      }
      int recvDataRank = (rank + nranks + s) % nranks;
      inpIx = recvDataRank * count + offset;
      sendDim = s ? firstBitSet(s, nrPow2) : -1;
      int foffset = a - sendSkipped;
      postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0;
      sendStepOffset = 0;
      sendOffset = (foffset%postFreq) * nelem;
      if (skip || sendDim == -1) sendSkipped++;
      if (++a == lastA) {
        phase = 3;
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 3) {
      int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta;
      postRecv = a == lastA-1 ? 1 : 0;
      if (s >= nranks) skip = 1;
      recvDim = firstBitSet(s, nrPow2);
      postSend = 0;
      s -= (1<<recvDim);
      int foffset = a - recvSkipped;
      postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0;
      recvOffset = (foffset%postFreq) * nelem;
      int recvDataRank = (rank + nranks + s) % nranks;
      inpIx = recvDataRank * count + offset;
      sendDim = s ? firstBitSet(s, nrPow2) : -1;
      if (s < nranks && skip) {
        recvDim = -1;
        recvOffset = -1;
        postRecv = 0;
        skip = 0;
      }
      if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a;
      foffset = a - sendSkipped;
      sendStepOffset = foffset / postFreq; // Accumulate on next steps
      sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1;
      if (skip || recvDim == -1) recvSkipped++;
      if (skip) sendSkipped++;
      if (++a == lastA) {
        scale *= 2;
        phase = scale < aggFactor ? 2 : 4;
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 4) {
      // Final receive-only step for this rank's own data, then move on to the
      // next chunk (or flag the end of the range through `last`).
      recvDim = 0;
      sendDim = -1;
      inpIx = rank * count + offset;
      recvOffset = (phase2recv%postFreq) * nelem;
      sendStepOffset = 0;
      sendOffset = -1;
      postRecv = 1;
      postSend = 0;
      offset += chunkCount;
      if (offset >= end) {
        last = 1;
      } else {
        reset();
      }
      return;
    }
    goto restart;
  }
};
// Step generator for the PAT ("Parallel Aggregated Trees") all-gather schedule.
//
// The object is an iterator: construct it once for the [offset, end) element
// range, then call getNextOp() repeatedly. Every call describes exactly one
// operation (which dimension to receive from / send to, buffer indices and
// offsets, and whether to post) and sets `last` to 1 on the final operation.
//
// Internally the schedule is a small state machine:
//   phase 2 : runs with `scale` = aggFactor/2, halving `scale` after each pass
//             of `lastA` steps; switches to phase 1 once `scale` reaches 0.
//   phase 1 : runs one pass for the current aggregated step `as`; moves to
//             phase 0 when `as` is odd, otherwise advances `as` and repeats.
//   phase 0 : receive-only pass; afterwards either returns to phase 1 with the
//             next `as`, or -- when `as` wraps back to aggDelta/2 -- finishes
//             the current chunk and advances `offset` by chunkCount.
//
// NOTE(review): T is only used through sizeof(T) in the constructor.
template<typename T>
class PatAGAlgorithm{
  size_t offset;      // current element offset; advanced by chunkCount per finished chunk
  size_t end;         // iteration is complete once offset >= end
  size_t count;       // per-rank stride used to form outIx = dataRank * count + offset
  int chunkCount;     // number of elements covered by one chunk
  int nelem;          // elements in the current chunk: min(chunkCount, end-offset)
  int rank;
  int nranks;
  int nrPow2;         // 1 << log2Up(nranks)
  int postFreq;       // offsets wrap, and post flags are raised, every postFreq steps
  int lastA;          // number of `a` steps in the current pass
  int aggFactor;      // aggregation factor chosen in the constructor
  int as; // aggregated steps
  int a; // step inside aggregated step
  int aggDelta;       // nrPow2 / aggFactor
  int scale;          // phase-2 scale, halved after every phase-2 pass
  int phase;          // current state of the state machine (0, 1 or 2)

  // AS computation
  int asDim;          // log2Up(aggDelta): number of bits tracked by nextAs()
  int v;              // current value of the `as` sequence
  int bitCount[32];   // per-bit countdown until that bit of v toggles
  int bitZeroStep[32];// per-bit counter used to lengthen the "bit is zero" periods

  // Integer minimum (local helper usable from both host and device code).
  __device__ __host__ int min(int a, int b) {
    return (a<b)?a:b;
  }

  // Number of elements in the chunk starting at `offset`.
  // NOTE(review): end-offset is a size_t narrowed to int by the call; this
  // assumes the remaining range fits in an int -- confirm against callers.
  __device__ __host__ int getNelem() {
    return min(chunkCount, end-offset);
  }

  // Reverses the bits of i within the bit-width implied by max: bit k of i
  // contributes max >> (k+1) to the result. Assumes max is a power of two.
  __device__ __host__ int mirror(int i, int max) {
    int ret = 0;
    for (int mask=1, imask=max/2; mask<max; mask<<=1, imask>>=1) {
      if ((i&mask)) ret += imask;
    }
    return ret;
  }

  // Zero-based index of the lowest set bit of i, or `max` when i == 0.
  __device__ __host__ int firstBitSet(int i, int max) {
    int ffs =
#ifdef __CUDA_ARCH__
      __ffs(i);
#else
      __builtin_ffs(i);
#endif
    return ffs ? ffs-1 : max;
  }

  // Restart the inner step counter and recompute the pass length for the
  // current phase/scale.
  __device__ __host__ void resetA() {
    a = 0;
    lastA = aggFactor;
    if (phase >= 2) lastA /= 2*scale;
  }

  // Re-initialise all per-chunk state for the chunk starting at `offset`.
  __device__ __host__ void reset() {
    nelem = getNelem();
    scale = aggFactor/2;
    phase = scale ? 2 : 1;
    v = 0;
    for (int i = 0; i<asDim; i++) {
      bitCount[i] = asDim-i;
      bitZeroStep[i] = 1;
    }
    as = nextAs();
    resetA();
  }

  // Advance the `as` sequence by one element and return the new value.
  // Each of the asDim bits of v is toggled when its countdown reaches zero;
  // the countdown is reloaded with 1<<d, adjusted on the "bit cleared" side by
  // firstBitSet(bitZeroStep[d]) - 1.
  __device__ __host__ int nextAs() {
    for (int d=0; d<asDim; d++) {
      int p = 1<<d;
      bitCount[d]--;
      if (bitCount[d] == 0) {
        v ^= p;
        bitCount[d] = p;
        if ((v&p) == 0) {
          bitCount[d] += firstBitSet(bitZeroStep[d], asDim) - 1;
          if (bitCount[d] == 0) {
            // Adjusted period is empty: toggle the bit straight back.
            v ^= p;
            bitCount[d] = p;
          }
          bitZeroStep[d]++;
        }
      }
    }
    return v;
  }

public:
  // stepSize / stepDepth size the aggregation: aggFactor is doubled (and
  // aggDelta halved) while at least two channel-sized pieces of T fit in
  // stepSize, and then once more per halving of stepDepth, always bounded by
  // aggFactor < nranks/2. postFreq records aggFactor after the first loop only.
  // offset/end/count/chunkCount/rank/nranks are stored as-is.
  // NOTE(review): end == offset would make the first loop divide by zero;
  // callers are assumed to pass a non-empty range.
  __device__ __host__ PatAGAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks):
    offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) {
    aggDelta = nrPow2 = (1<<log2Up(nranks));

    aggFactor = 1;
    size_t channelSize = end-offset;
    while (stepSize / (channelSize*sizeof(T)*aggFactor) >= 2 && aggFactor < nranks/2) {
      aggFactor *= 2;
      aggDelta /= 2;
    }
    postFreq = aggFactor;
    int d = stepDepth;
    while (d > 1 && aggFactor < nranks/2) {
      d /= 2;
      aggFactor *= 2;
      aggDelta /= 2;
    }
    //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta);

    asDim = log2Up(aggDelta);
    reset();
  }

  // Produce the next operation of the schedule.
  //
  // Outputs (all written through the reference parameters):
  //   recvDim / sendDim   : dimension index to receive from / send to; -1 is
  //                         written when this op has no receive / no send.
  //   inpIx / outIx       : element indices (inpIx is `offset`, or 0 in phase 0;
  //                         outIx is dataRank * count + offset).
  //   recvOffset / sendOffset : element offsets, (step % postFreq) * nelem,
  //                         or -1 alongside a -1 dimension.
  //   recvStepOffset      : step / postFreq for the receive side (0 in phase 0
  //                         and when sendDim == 0 in phase 1).
  //   nelemOut            : number of elements of the current chunk.
  //   postRecv / postSend : 1 when the corresponding side should be posted.
  //   last                : 1 on the final operation of the whole range.
  //
  // Steps that map beyond nranks are skipped internally (the loop restarts),
  // so every return corresponds to a real operation.
  __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) {
restart:
    //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale);
    last = 0;
    nelemOut = nelem;
    inpIx = offset;
    int skip = 0;
    if (phase == 0) {
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
      int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0;
      int recvDataRank = (rank + s) % nranks;
      outIx = recvDataRank * count + offset;
      sendDim = -1;
      recvDim = 0;
      inpIx = 0;
      sendOffset = -1;
      recvOffset = (a % postFreq) * nelem;
      recvStepOffset = 0;
      postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
      postSend = 0;
      a++;
      if (nextSkip) {
        as = nextAs();
        if (as == aggDelta/2) {
          // The `as` sequence wrapped around: this chunk is complete.
          offset += chunkCount;
          if (offset >= end) {
            last = 1;
          } else {
            reset();
          }
          return;
        }
        phase = 1;
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 1) {
      int s = a*aggDelta + as;
      if (s >= nranks) skip = 1;
      sendDim = firstBitSet(s, nrPow2);
      s -= (1<<sendDim);
      int sendDataRank = (rank + nranks + s) % nranks;
      outIx = sendDataRank * count + offset;
      recvDim = s ? firstBitSet(s, nrPow2) : -1;
      sendOffset = recvOffset = (a % postFreq) * nelem;
      postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0;
      postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0;
      recvStepOffset = (sendDim == 0) ? 0 : a/postFreq;
      if (recvDim == -1) {
        recvOffset = -1;
        postRecv = 0;
      } else if (as - (1<<sendDim) == 0) {
        int foffset = (a*aggDelta) >> (recvDim+1);
        recvOffset = (foffset%postFreq)*nelem;
        postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<<recvDim) >= nranks) ? 1 : 0;
        recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq;
      }
      if (s < nranks && sendDim == 0 && skip) {
        // Don't forget to receive at least once even if we don't send afterwards
        sendDim = -1;
        sendOffset = -1;
        postSend = 0;
        skip = 0;
      }
      if (++a == lastA) {
        if (as % 2 == 1) {
          phase = 0;
        } else {
          as = nextAs();
        }
        resetA();
      }
      if (skip == 0) return;
    } else if (phase == 2) {
      int s = (2*a+1)*scale*aggDelta;
      postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0;
      postRecv = 0;
      if (s >= nranks) skip = 1;
      sendDim = firstBitSet(s, nrPow2);
      s -= (1<<sendDim);
      sendOffset = (a%postFreq) * nelem;
      recvStepOffset = a / postFreq;
      int sendDataRank = (rank + nranks + s) % nranks;
      outIx = sendDataRank * count + offset;
      recvDim = s ? firstBitSet(s, nrPow2) : -1;
      // The original code subtracted (1<<recvDim) from s here. That shifted by
      // -1 (undefined behaviour) whenever recvDim == -1, and s is not read
      // again in this phase, so the statement was dropped.
      if (recvDim == -1) {
        recvOffset = -1;
      } else {
        int foffset = (a*2*scale*aggDelta) >> (recvDim+1);
        recvOffset = (foffset%postFreq)*nelem;
        recvStepOffset = foffset / postFreq;
      }
      if (++a == lastA) {
        scale /= 2;
        phase = scale ? 2 : 1;
        resetA();
      }
      if (skip == 0) return;
    }
    goto restart;
  }
};
#endif
+98 -33
Ver ficheiro
@@ -16,6 +16,7 @@
#include "nccl_net.h"
#include "register.h"
#include "graph.h"
#include "profiler.h"
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@@ -104,6 +105,11 @@ struct ncclCommCallback {
struct ncclCommCallback* next;
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
// Queue node pairing a CUDA event with a callback. ncclCommPollEventCallbacks()
// waits on `event`, dequeues the node and, when the wait succeeded, invokes
// `fn(comm, cb)` with this node as `cb`.
struct ncclCommEventCallback {
// Intrusive link used by ncclComm::eventCallbackQueue.
struct ncclCommEventCallback* next;
// Event that gates the callback.
cudaEvent_t event;
// Callback to run once `event` has completed.
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb);
};
struct ncclSharedResources {
int refCount;
@@ -173,6 +179,54 @@ struct ncclCollnetHandleList {
struct ncclProxyConnector* proxyconn;
};
// Description of one collective task. Instances are linked through `next`
// into ncclIntruQueue lists (e.g. ncclKernelPlan::collTaskQueue).
// NOTE(review): the first group of fields presumably mirrors the arguments of
// the user's collective call; confirm against the enqueue code.
struct ncclTaskColl {
struct ncclTaskColl* next;
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
ncclRedOp_t opHost;
struct ncclDevRedOpFull opDev;
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
// The four fields below are packed as 8-bit signed bitfields.
int32_t nMaxChannels:8;
int32_t nWarps:8;
int32_t algorithm:8, protocol:8;
// Two 1-bit flags followed by a 30-bit device function id.
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
enum ncclRegBufferType regBufType;
// number of elements in planner->ipcMemQueue associated with this collective
int nCleanupQueueElts;
void* sendMhandle;
void* recvMhandle;
// index for IPC record lookup
uintptr_t sendbuffOffset;
uintptr_t recvbuffOffset;
uintptr_t* sendbuffRmtAddrs;
uintptr_t* recvbuffRmtAddrs;
// Profiler plugin
// NOTE(review): presumably the plugin's event activation bitmask and the
// handle returned by the plugin's startEvent for this task -- confirm.
int eActivationMask;
void* eventHandle;
};
// Description of one point-to-point task. Instances are linked through `next`
// into ncclIntruQueue lists (e.g. ncclKernelPlan::p2pTaskQueue).
struct ncclTaskP2p {
struct ncclTaskP2p* next;
ncclFunc_t func;
void* buff;
size_t count;
ncclDataType_t datatype;
// NOTE(review): presumably holds the peer rank of the send/recv -- confirm.
int root;
// NOTE(review): presumably count * sizeof(datatype) -- confirm.
size_t bytes;
// Profiler plugin
// NOTE(review): presumably the plugin's event activation bitmask and the
// handle returned by the plugin's startEvent for this task -- confirm.
int eActivationMask;
void* eventHandle;
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
@@ -198,40 +252,12 @@ struct ncclKernelPlan {
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
void* workBufPersistent;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> p2pTaskQueue;
struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
};
////////////////////////////////////////////////////////////////////////////////
struct ncclTaskColl {
struct ncclTaskColl* next;
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
ncclRedOp_t opHost;
struct ncclDevRedOpFull opDev;
int chunkSteps, sliceSteps;
// Computed later:
size_t trafficBytes;
int32_t nMaxChannels:8;
int32_t nWarps:8;
int32_t algorithm:8, protocol:8;
uint32_t isCollnet:1, isNvls:1;
uint32_t devFuncId:30;
enum ncclRegBufferType regBufType;
// number of elements in planner->ipcMemQueue associated with this collective
int nCleanupQueueElts;
void* sendMhandle;
void* recvMhandle;
};
struct ncclTaskP2p {
struct ncclTaskP2p* next;
void* buff;
size_t bytes;
// Profiler plugin
void* groupEventHandle;
};
////////////////////////////////////////////////////////////////////////////////
@@ -383,6 +409,8 @@ struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
struct ncclProxyConnector* gproxyConn;
struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
int netPluginLoaded;
ncclNet_t* ncclNet;
@@ -395,10 +423,12 @@ struct ncclComm {
struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
bool runtimeConn; // if dynamic connection is supported
bool directMode;
int cuMemSupport;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
const char* commName;
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
@@ -504,7 +534,7 @@ struct ncclComm {
int collNetSupport;
bool collNetRegSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
int intraHighestTransportType;
bool intraNodeP2pSupport;
int* collNetHeads;
int collNetHeadsNum;
int* collNetDenseToUserRank;
@@ -519,6 +549,8 @@ struct ncclComm {
struct ncclNvlsSharedRes* nvlsResources;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclTaskColl;
struct ncclMemoryPool memPool_ncclTaskP2p;
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -532,6 +564,13 @@ struct ncclComm {
struct ncclKernelPlanner planner;
cudaMemPool_t memPool;
// Queue of events and associated callbacks for cleaning up asynchronous work.
// Using this is preferable to using CUDA host callbacks because host callbacks
// won't allow the work following the callback to run until the callback completes,
// which comes at expense to perf.
struct ncclIntruQueue<struct ncclCommEventCallback, &ncclCommEventCallback::next> eventCallbackQueue;
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
ncclUserRedOp *userRedOps;
@@ -553,6 +592,11 @@ struct ncclComm {
int tunerPluginLoaded;
ncclTuner_t* tuner;
void *tunerContext;
// Profiler plugin
void* profilerContext;
uint64_t seqNumber[NCCL_NUM_FUNCTIONS];
// buffer registration cache
struct ncclRegCache regCache;
uint64_t endMagic;
@@ -583,6 +627,27 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
return ncclSuccess;
}
// Run the callbacks queued on comm->eventCallbackQueue, in queue order.
//
// For the entry at the head of the queue, wait on its CUDA event, dequeue the
// entry and invoke its callback. Processing stops when the queue is empty, when
// the wait reports cudaErrorNotReady, or on the first error.
//
// The thread's stream capture mode is switched to relaxed for the duration of
// the call and restored before returning.
//
// Returns ncclSuccess when every processed callback succeeded; otherwise the
// error produced by the failing callback or CUDA call. (Previously the function
// always returned ncclSuccess from the `finish` label, silently discarding the
// error stored in `result`.)
//
// NOTE(review): cudaEventSynchronize blocks until the event completes, so the
// cudaErrorNotReady branch looks unreachable; cudaEventQuery may have been
// intended. Left unchanged because it alters blocking behavior.
inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  while (true) {
    struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
    if (cb == nullptr) break;
    cudaError_t ok = cudaEventSynchronize(cb->event);
    if (ok == cudaErrorNotReady) break;
    // The entry is removed before its outcome is examined, so a failing entry
    // is not retried on the next poll.
    ncclIntruQueueDequeue(&comm->eventCallbackQueue);
    if (ok == cudaSuccess) {
      NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
    } else {
      CUDACHECKGOTO(ok, result, finish);
    }
  }
finish:
  // Restore the capture mode saved by the first exchange.
  cudaThreadExchangeStreamCaptureMode(&mode);
  return result;
}
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
+2
Ver ficheiro
@@ -13,6 +13,7 @@
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
extern int ncclCuMemHostEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
@@ -96,6 +97,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
+18 -5
Ver ficheiro
@@ -128,6 +128,8 @@ struct ncclConnInfo {
};
struct ncclProxyConnector {
bool initialized;
int rank;
int tpRank;
int tpLocalRank;
int sameProcess;
@@ -141,6 +143,8 @@ struct ncclConnector {
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
int sendMemSameProcess;
int recvMemSameProcess;
};
struct ncclRing {
@@ -225,6 +229,7 @@ struct alignas(16) ncclDevWorkP2p {
uint8_t sendProtoLL:1, recvProtoLL:1;
uint8_t sendRegistered:1, recvRegistered:1;
uint8_t sendIpcReg:1, recvIpcReg:1;
};
// Compute the subset of the data transfer corresponding to the given part index.
@@ -266,6 +271,10 @@ struct alignas(16) ncclDevWorkColl {
uint32_t root;
void* recvbuff;
void* sendbuff;
uintptr_t sendbuffOffset;
uintptr_t recvbuffOffset;
uintptr_t* sendbuffRmtAddrs;
uintptr_t* recvbuffRmtAddrs;
union {
// Continuous-byte-distribution scheduling. The lo and hi channels are of
// different size than the channels in the middle.
@@ -384,6 +393,7 @@ struct ncclDevComm {
int nNodes;
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
int isNvlink;
// Work fifo return credits
uint32_t* workConsumed/*[MAXCHANNELS]*/;
@@ -395,6 +405,7 @@ struct ncclDevComm {
// Channels, device side
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
int* rankToLocalRank;
};
struct alignas(16) ncclDevCommAndChannels {
@@ -539,11 +550,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
if (coll == ncclFuncSendRecv) break;
row += 1;
int nAlgos = 3;
int nAlgos = 4;
if (coll == ncclFuncAllGather) {
int algo1 = algo == NCCL_ALGO_RING ? 0 :
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
/*algo == NCCL_ALGO_NVLS*/ 2;
algo == NCCL_ALGO_NVLS ? 2 :
/*algo == NCCL_ALGO_PAT*/ 3;
row += algo1*NCCL_NUM_PROTOCOLS + proto;
break;
}
@@ -556,7 +568,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
}
row += nAlgos*NCCL_NUM_PROTOCOLS;
nAlgos = NCCL_NUM_ALGORITHMS;
nAlgos = 6;
if (coll == ncclFuncAllReduce) {
row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto;
break;
@@ -570,11 +582,12 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
}
row += ncclNumDevRedOps*NumTypes*nAlgos*NCCL_NUM_PROTOCOLS;
nAlgos = 3;
nAlgos = 4;
if (coll == ncclFuncReduceScatter) {
int algo1 = algo == NCCL_ALGO_RING ? 0 :
algo == NCCL_ALGO_COLLNET_DIRECT ? 1 :
/*algo == NCCL_ALGO_NVLS*/ 2;
algo == NCCL_ALGO_NVLS ? 2 :
/*algo == NCCL_ALGO_PAT*/ 3;
row += ((devRedOp*NumTypes + type)*nAlgos + algo1)*NCCL_NUM_PROTOCOLS + proto;
break;
}
+4 -3
Ver ficheiro
@@ -33,13 +33,14 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr);
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -76,7 +77,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6 // Collnet Direct
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int id; // ring : 0, tree : 1, collnet : 2, nvls : 3, collnetDirect : 4
int pattern;
int crossNic;
int collNet;
+2 -1
Ver ficheiro
@@ -50,7 +50,7 @@ typedef enum {
ncclNumFuncs = 8
} ncclFunc_t;
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
@@ -58,6 +58,7 @@ typedef enum {
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
#define NCCL_ALGO_PAT 6
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_UNDEF -1
+150
Ver ficheiro
@@ -0,0 +1,150 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#include <cstdint>
// Profiler event types. Each type is a distinct bit so that several types can
// be combined into a single bitmask (see the eActivationMask output of
// ncclProfiler_v1_t::init).
enum {
ncclProfileGroup = (1 << 0), // group event type
ncclProfileColl = (1 << 1), // host collective call event type
ncclProfileP2p = (1 << 2), // host point-to-point call event type
ncclProfileProxyOp = (1 << 3), // proxy operation event type
ncclProfileProxyStep = (1 << 4), // proxy step event type
ncclProfileProxyCtrl = (1 << 5), // proxy control event type
ncclProfileNumEvents = ( 6), // number of event types defined above (a count, not a bit flag)
};
// Event descriptor handed to ncclProfiler_v1_t::startEvent.
// The common header (type, parentObj, rank) is followed by an anonymous union
// with one member per kind of event.
// NOTE(review): `type` presumably selects which union member is valid -- confirm.
// NOTE(review): this struct uses size_t and pid_t; the header only includes
// <cstdint>, so it relies on the includer to provide those types.
typedef struct {
uint8_t type; // event type descriptor: ncclProfileColl, ...
void* parentObj; // pointer to the profiler parent object (for coll is the group)
int rank; // originating rank
union {
// Host collective call.
struct {
const char* name;
uint64_t commHash;
uint64_t seqNumber;
uint8_t func;
void const* sendBuff;
void* recvBuff;
size_t count;
int root;
uint8_t datatype;
uint32_t op;
size_t trafficBytes;
uint8_t nMaxChannels;
uint8_t nWarps;
uint8_t algo;
uint8_t proto;
int isCollnet;
int isNvls;
} coll;
// Host point-to-point call.
struct {
const char* name;
uint64_t commHash;
uint8_t func;
void* buff;
uint8_t datatype;
size_t count;
int peer;
} p2p;
// Proxy operation.
struct {
pid_t pid; // pid of the originating process
uint8_t channelId; // channel id for this proxy operation
int peer; // remote rank for send/recv
int nSteps; // number of steps for this proxy operation
int chunkSize; // amount of data transferred by this proxy operation
int isSend;
} proxyOp;
// Proxy step.
struct {
int step;
} proxyStep;
};
} ncclProfilerEventDescr_v1_t;
// State transitions that can be reported through
// ncclProfiler_v1_t::recordEventState. Enumerators are implicitly numbered
// from 0 in declaration order, so reordering them changes their values.
typedef enum {
// Proxy operation states (send side, then receive side).
ncclProfilerProxyOpSendPosted,
ncclProfilerProxyOpSendRemFifoWait,
ncclProfilerProxyOpSendTransmitted,
ncclProfilerProxyOpSendDone,
ncclProfilerProxyOpRecvPosted,
ncclProfilerProxyOpRecvReceived,
ncclProfilerProxyOpRecvTransmitted,
ncclProfilerProxyOpRecvDone,
/* Legacy proxy profiler states */
ncclProfilerProxyStepSendGPUWait,
ncclProfilerProxyStepSendWait,
ncclProfilerProxyStepRecvWait,
ncclProfilerProxyStepRecvFlushWait,
ncclProfilerProxyStepRecvGPUWait,
/* Legacy proxy control states */
ncclProfilerProxyCtrlIdle,
ncclProfilerProxyCtrlActive,
ncclProfilerProxyCtrlSleep,
ncclProfilerProxyCtrlWakeup,
ncclProfilerProxyCtrlAppend,
ncclProfilerProxyCtrlAppendEnd,
} ncclProfilerEventState_v1_t;
// Optional attribute updates passed together with a state transition to
// ncclProfiler_v1_t::recordEventState.
// NOTE(review): which member is valid presumably depends on the kind of event
// the handle refers to (proxy op vs. proxy control) -- confirm.
typedef union {
struct {
size_t transSize;
int steps;
} proxyOp;
struct {
int appendedProxyOps;
} proxyCtrl;
} ncclProfilerEventStateArgs_v1_t;
// Version 1 of the profiler plugin interface: a table of function pointers
// plus the plugin name.
typedef struct {
const char* name;
// init - initialize the profiler plugin
// Input
// - context : opaque profiler context object for separating profiler behavior across comms
// Output
// - eActivationMask: bitmask of active events set by the plugin
ncclResult_t (*init)(void** context, int* eActivationMask);
// startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
// Input
// - context: opaque profiler context object
// - eDescr : pointer to ncclProfilerEventDescr_t object
// Output
// - eHandle: return event handle for supplied event descriptor object
ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr);
// stopEvent - stop/finalize an event inside an event set
// Input
// - eHandle: handle to event object
ncclResult_t (*stopEvent)(void* eHandle);
// recordEventState - record event state transitions and event attribute updates
// Input
// - eHandle : handle to event object created through startEvent
// - eState : event state transition
// - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs);
// finalize - finalize the profiler plugin
// Input
// - context: opaque profiler context object
ncclResult_t (*finalize)(void* context);
} ncclProfiler_v1_t;
typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t;
typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t;
typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t;
typedef ncclProfiler_v1_t ncclProfiler_t;
#endif
+15 -12
Ver ficheiro
@@ -16,20 +16,23 @@
#endif
// Define all NCCL-provided static schema IDs here (avoid duplicates).
#define NVTX_SID_CommInitRank 0
#define NVTX_SID_CommInitAll 1
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_AllGather 4
#define NVTX_SID_AllReduce 5
#define NVTX_SID_Broadcast 6
#define NVTX_SID_ReduceScatter 7
#define NVTX_SID_Reduce 8
#define NVTX_SID_Send 9
#define NVTX_SID_Recv 10
#define NVTX_SID_CommInitRank 0
#define NVTX_SID_CommInitAll 1
#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_AllGather 4
#define NVTX_SID_AllReduce 5
#define NVTX_SID_Broadcast 6
#define NVTX_SID_ReduceScatter 7
#define NVTX_SID_Reduce 8
#define NVTX_SID_Send 9
#define NVTX_SID_Recv 10
#define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank
#define NVTX_SID_CommSplit 13
// Define static schema ID for the reduction operation.
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
+28 -3
Ver ficheiro
@@ -34,11 +34,36 @@ typedef union {
// Legacy CUDA IPC
cudaIpcMemHandle_t devIpc;
// cuMem API support
ncclCuDesc cuDesc;
struct {
ncclCuDesc cuDesc;
CUmemGenericAllocationHandle memHandle;
};
} ncclIpcDesc;
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
// Kind of operation an IPC buffer registration is requested for; passed as the
// `type` argument of ncclIpcLocalRegisterBuffer / ncclIpcGraphRegisterBuffer.
enum ncclIpcRegType {
NCCL_IPC_SENDRECV = 0,
NCCL_IPC_COLLECTIVE = 1
};
// Import-side information of an IPC registration; embedded in
// ncclIpcRegInfo::impInfo.
// NOTE(review): field meanings below are inferred from names only -- confirm
// against the registration code.
struct ncclIpcImpInfo {
void* rmtRegAddr; // presumably the address at which the remote buffer is mapped locally
bool legacyIpcCap; // presumably whether the legacy CUDA IPC path is used
uintptr_t offset; // presumably the offset of the user buffer within the registered range
};
// Record of one IPC buffer registration. Pointers to these records are stored
// in ncclReg::ipcInfos and passed to ncclIpcDeregBuffer.
// NOTE(review): presumably one record exists per (buffer, peer) pair -- confirm.
struct ncclIpcRegInfo {
int peerRank;
void* baseAddr;
struct ncclProxyConnector* ipcProxyconn;
struct ncclIpcImpInfo impInfo;
};
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int directMap, ncclIpcDesc *ipcDesc, void **ptr);
ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut);
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts);
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo);
#endif
+38 -20
Ver ficheiro
@@ -4,34 +4,52 @@
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PROFILER_H_
#define NCCL_PROFILER_H_
#ifndef PROFILER_H_
#define PROFILER_H_
#include "proxy.h"
#include <cuda_runtime.h>
#include "nccl_profiler.h"
enum ncclProxyProfileState {
ncclProxyProfileBegin = 0,
struct ncclProxyArgs;
struct ncclKernelPlan;
struct ncclTaskColl;
struct ncclTaskP2p;
struct ncclInfo;
struct ncclComm;
struct ncclProxyOp;
ncclProxyProfileSendGPUWait = 1,
ncclProxyProfileSendWait = 2,
// Plugin Init/Finalize Wrappers
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm);
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm);
ncclProxyProfileRecvWait = 1,
ncclProxyProfileRecvFlushWait = 2,
ncclProxyProfileRecvGPUWait = 3,
// Profiler Start/Stop Group Wrappers
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan);
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan);
ncclProxyProfileEnd = 4,
// Profiler Start/Stop Task Events Wrappers
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);
ncclProxyProfileSleep = 8,
ncclProxyProfileWakeup = 9,
// Proxy Op Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);
ncclProxyProfileIdle = 16,
ncclProxyProfileActive = 17,
// Proxy Step Start/Stop Event Wrappers
ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi);
ncclProxyProfileAppend = 24,
ncclProxyProfileAppendEnd = 25
};
// Proxy Control Start/Stop Events Wrappers
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle);
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
void ncclProfilingDump();
// Record Event Wrappers
ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState);
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);
// Profiler utility functions
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op);
#endif
+37 -4
Ver ficheiro
@@ -13,7 +13,7 @@
#include "ipcsocket.h"
#include "nccl_net.h"
#include <pthread.h>
#include "shm.h"
#include "shmutils.h"
#include "p2p.h"
typedef enum : uint8_t {
@@ -28,6 +28,8 @@ typedef enum : uint8_t {
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternPatUp,
ncclPatternPatDown,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
@@ -72,6 +74,19 @@ struct ncclProxyOp {
union ncclProxyOpSpecifics specifics;
// Profiler plugin
union {
struct ncclTaskColl* coll;
struct ncclTaskP2p* p2p;
} task;
int eActivationMask;
void* taskEventHandle;
int rank;
int peer;
pid_t pid;
void* profilerContext;
struct ncclProxyOp *enqNext;
};
@@ -100,7 +115,15 @@ struct ncclProxySubArgs {
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
void* profilingEvents[NCCL_STEPS];
// Profiler plugin
int eActivationMask;
int rank;
void* taskEventHandle;
void* opEventHandle;
void* stepEventHandles[NCCL_STEPS];
size_t transSize;
void* recvRequestsCache[NCCL_STEPS];
int recvRequestsSubCount;
};
@@ -129,6 +152,10 @@ struct ncclProxyArgs {
int idle;
// Profiler plugin
pid_t pid;
void* profilerContext;
// Element linking
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
@@ -261,6 +288,7 @@ struct ncclProxyState {
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
uint32_t* abortFlag;
bool directMode;
// Service threads
pthread_t thread;
pthread_t threadUDS;
@@ -281,6 +309,9 @@ struct ncclProxyState {
// Progress thread
struct ncclProxyProgressState progressState;
// Profiler plugin
void* profilerContext;
// Queue of expected responses from the proxy
struct ncclExpectedProxyResponse* expectedResponses;
};
@@ -332,8 +363,9 @@ enum ncclProxyMsgType {
ncclProxyMsgAbort = 7,
ncclProxyMsgStop = 8,
ncclProxyMsgGetFd = 9, // cuMem API support (UDS)
ncclProxyMsgRegister = 10,
ncclProxyMsgDeregister = 11
ncclProxyMsgQueryFd = 10,
ncclProxyMsgRegister = 11,
ncclProxyMsgDeregister = 12
};
// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
@@ -347,6 +379,7 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
// UDS support
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int rank, void *handle, int* convertedFd);
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd);
ncclResult_t ncclProxyStop(struct ncclComm* comm);
ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
+11 -2
Ver ficheiro
@@ -11,7 +11,13 @@ enum {
NVLS_REG_COMPLETE = 0x02,
NVLS_REG_POSSIBLE = 0x04,
NVLS_REG_NO_SUPPORT = 0x08,
COLLNET_REG_COMPLETE = 0x10
COLLNET_REG_COMPLETE = 0x10,
IPC_REG_COMPLETE = 0x20
};
// Pair of address arrays kept per registration (see ncclReg::regIpcAddrs).
// NOTE(review): presumably the same table of peer remote addresses held once
// in device memory and once in host memory -- confirm against register code.
struct ncclPeerRegIpcAddr {
uintptr_t* devPeerRmtAddrs;
uintptr_t* hostPeerRmtAddrs;
};
struct ncclReg {
@@ -34,7 +40,10 @@ struct ncclReg {
uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
// collnet reg
void* collnetHandle;
struct ncclProxyConnector* proxyconn;
struct ncclProxyConnector* collnetProxyconn;
// general ipc reg
struct ncclPeerRegIpcAddr regIpcAddrs;
struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
};
struct ncclRegCache {
+29 -18
Ver ficheiro
@@ -1,26 +1,37 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
#include "nccl.h"
#include "comm.h"
typedef void* ncclShmHandle_t;
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
struct ncclShmemCollBuff {
volatile size_t *cnt[2];
volatile void *ptr[2];
int round;
size_t maxTypeSize;
// Descriptor variant for a legacy shared-memory segment; one arm of the union
// in struct shmIpcDesc.
// NOTE(review): shmSuffix presumably identifies the shm file opened through
// ncclShmOpen (6 characters plus terminator) -- confirm.
struct shmLegacyIpc {
char shmSuffix[7];
ncclShmHandle_t handle;
size_t shmSize;
};
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
// Descriptor variant for a cuMem-based shareable buffer; one arm of the union
// in struct shmIpcDesc.
// NOTE(review): which member of the inner union is valid is not visible here;
// presumably it depends on the handle type in use -- confirm.
struct shmCuIpc {
union {
CUmemFabricHandle handle;
CUmemGenericAllocationHandle data;
};
int tpProxyRank;
void *ptr;
size_t size;
};
// Descriptor of a shareable host buffer, exposed as ncclShmIpcDesc_t and used
// by ncclShmAllocateShareableBuffer / ncclShmImportShareableBuffer /
// ncclShmIpcClose.
// NOTE(review): `legacy` presumably tells which union member is valid
// (true -> shmli, false -> shmci) -- confirm.
struct shmIpcDesc {
union
{
struct shmLegacyIpc shmli;
struct shmCuIpc shmci;
};
bool legacy;
};
typedef struct shmIpcDesc ncclShmIpcDesc_t;
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr);
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut);
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc);
#endif
+26
Ver ficheiro
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_SHMUTILS_H_
#define NCCL_SHMUTILS_H_
#include "nccl.h"
typedef void* ncclShmHandle_t;
ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
ncclResult_t ncclShmClose(ncclShmHandle_t handle);
ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
// State for a shared-memory collective buffer; this is the `shmem` argument
// of ncclShmemAllgather.
// NOTE(review): the two-element arrays together with `round` suggest a
// double-buffering scheme — confirm against the implementation.
struct ncclShmemCollBuff {
// Two pointers to volatile size_t counters.
volatile size_t *cnt[2];
// Two pointers to volatile data regions.
volatile void *ptr[2];
// presumably selects which of the two cnt/ptr slots is current — TODO confirm
int round;
// presumably the largest per-rank element size (in bytes) the buffer can
// hold, to be compared with the typeSize argument of ncclShmemAllgather —
// TODO confirm
size_t maxTypeSize;
};
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize);
#endif
+7 -7
Ver ficheiro
@@ -33,15 +33,15 @@ static double startTimes[8];
#define TIME_START(index) do { \
counts[index]++; \
startTimes[index] = gettime(); \
} while (0);
} while (0)
#define TIME_STOP(index) do { \
times[index] += gettime() - startTimes[index]; \
} while (0);
} while (0)
#define TIME_CANCEL(index) do { \
counts[index]--; \
} while (0);
} while (0)
#define TIME_PRINT(name) do { \
printf("%s stats", name); \
@@ -50,11 +50,11 @@ static double startTimes[8];
counts[i] = 0; \
} \
printf("\n"); \
} while (0);
} while (0)
#else
#define TIME_START(index) while(0);
#define TIME_STOP(index) while(0);
#define TIME_CANCEL(index) while(0);
#define TIME_START(index) do {} while(0)
#define TIME_STOP(index) do {} while(0)
#define TIME_CANCEL(index) do {} while(0)
#define TIME_PRINT(name)
#endif
#endif
+6 -4
Ver ficheiro
@@ -48,9 +48,10 @@ struct ncclPeerInfo {
// MNNVL support
nvmlGpuFabricInfoV_t fabricInfo;
int cuMemSupport;
int version;
};
#define CONNECT_SIZE 128
#define CONNECT_SIZE 256
// Fixed-size opaque byte buffer of CONNECT_SIZE bytes.
// presumably carries transport-specific connection data between peers —
// TODO confirm against the transport implementations.
struct ncclConnect {
char data[CONNECT_SIZE];
};
@@ -91,7 +92,6 @@ struct ncclCollNetSharedRes {
void* resources;
int nChannels;
size_t buffSize;
int intraHighestTransportType;
};
struct ncclTransportComm {
@@ -109,13 +109,14 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[8];
ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*canConnect)(int*, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
ncclResult_t ncclNvlsInit(struct ncclComm* comm);
ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -127,7 +128,7 @@ ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdevi
ncclResult_t ncclNvlsFree(struct ncclComm* comm);
enum { collNetRecv=0, collNetSend=1 };
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect);
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
@@ -136,6 +137,7 @@ ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConne
ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm);
ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
-1
Ver ficheiro
@@ -27,7 +27,6 @@ ncclResult_t busIdToInt64(const char* busId, int64_t* id);
ncclResult_t getBusId(int cudaDev, int64_t *busId);
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
ncclResult_t getRandomData(void* buffer, size_t bytes);
+300 -122
Ver ficheiro
@@ -37,7 +37,7 @@
#endif
const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree" };
const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain", "NVLS", "NVLSTree", "PAT" };
const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
@@ -101,9 +101,15 @@ NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
NCCLCHECK(ncclInit());
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
struct ncclBootstrapHandle handle;
NCCLCHECK(bootstrapGetUniqueId(&handle));
// ncclUniqueId and bootstrapHandle don't have the same size and alignment
// reset to 0 to avoid undefined data
memset(out, 0, sizeof(*out));
// copy to avoid alignment mismatch
memcpy(out, &handle, sizeof(handle));
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
return res;
return ncclSuccess;
}
// Prevent compiler from optimizing out these operations
@@ -147,7 +153,7 @@ void ncclCommPushCudaFree(struct ncclComm* comm, void* obj) {
}
static ncclResult_t ncclDestructorFnCudaHostFree(struct ncclDestructor* dtor) {
CUDACHECK(cudaFreeHost(dtor->obj));
NCCLCHECK(ncclCudaHostFree(dtor->obj));
return ncclSuccess;
}
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* obj) {
@@ -180,13 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
* free all intra-process communicators; therefore, we only need to focus on local
* resource cleanup in commFree(). */
if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {
pthread_join(comm->proxyState->thread, nullptr);
PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");
if (comm->proxyState->threadUDS) {
// UDS support
pthread_join(comm->proxyState->threadUDS, nullptr);;
PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
}
}
CUDACHECK(cudaMemPoolDestroy(comm->memPool));
delete[] comm->userRedOps;
free(comm->connectSend);
@@ -244,12 +252,14 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->topParentRanks);
free(comm->topParentLocalRanks);
free(comm->gproxyConn);
NCCLCHECK(ncclRegCleanup(comm));
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy");
commPoison(comm); // poison comm before free to avoid comm reuse.
NCCLCHECK(ncclProfilerPluginFinalize(comm));
NCCLCHECK(ncclNetFinalize(comm));
NCCLCHECK(ncclNetPluginUnload(comm));
free(comm);
@@ -328,6 +338,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
NCCLCHECK(ncclNetPluginLoad(comm));
NCCLCHECK(ncclNetInit(comm));
NCCLCHECK(ncclProfilerPluginInit(comm));
INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
if (parent && parent->config.splitShare) {
@@ -393,8 +404,28 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
}
ncclIntruQueueMpscConstruct(&comm->callbackQueue);
ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue);
comm->regCache.pageSize = sysconf(_SC_PAGESIZE);
do {
cudaMemPoolProps props = {};
props.allocType = cudaMemAllocationTypePinned;
props.handleTypes = cudaMemHandleTypeNone;
props.location.type = cudaMemLocationTypeDevice;
props.location.id = comm->cudaDev;
CUDACHECK(cudaMemPoolCreate(&comm->memPool, &props));
uint64_t releaseThreshold = ~uint64_t(0);
CUDACHECK(cudaMemPoolSetAttribute(comm->memPool, cudaMemPoolAttrReleaseThreshold, &releaseThreshold));
} while (0);
ncclIntruQueueConstruct(&comm->eventCallbackQueue);
// setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator
comm->intraComm0 = comm;
comm->intraRank = 0;
comm->intraRanks = 1;
return ncclSuccess;
}
@@ -408,12 +439,16 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail);
NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail);
ncclCommPushCudaFree(comm, devCommAndChans);
NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank);
NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail);
comm->devComm = &devCommAndChans->comm;
tmpCommAndChans.comm.rank = comm->rank;
tmpCommAndChans.comm.nRanks = nRanks;
tmpCommAndChans.comm.node = comm->node;
tmpCommAndChans.comm.nNodes = comm->nNodes;
tmpCommAndChans.comm.abortFlag = comm->abortFlagDev;
tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo);
for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) {
tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p];
}
@@ -498,10 +533,13 @@ static void showVersion() {
}
}
NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1);
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
info->rank = comm->rank;
info->cudaDev = comm->cudaDev;
info->nvmlDev = comm->nvmlDev;
NCCLCHECK(ncclGetVersion(&info->version));
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;
info->cuMemSupport = ncclCuMemEnable();
@@ -534,6 +572,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1],
info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask);
}
if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId();
}
return ncclSuccess;
@@ -677,7 +716,8 @@ static int checkMNNVL(struct ncclComm* comm) {
#define TIMER_INIT_TOPO 4
#define TIMER_INIT_GRAPHS 5
#define TIMER_INIT_CONNECT 6
#define TIMERS_INIT_COUNT 7
#define TIMER_INIT_ALLOC 7
#define TIMERS_INIT_COUNT 8
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
// We use 2 AllGathers
@@ -693,7 +733,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclTopoGraph* collNetChainGraph = &comm->graphs[NCCL_ALGO_COLLNET_CHAIN];
struct ncclTopoGraph* collNetDirectGraph = &comm->graphs[NCCL_ALGO_COLLNET_DIRECT];
struct ncclTopoGraph* nvlsGraph = &comm->graphs[NCCL_ALGO_NVLS];
struct ncclTopoGraph* graphs[] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph };
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetDirectGraph, collNetChainGraph, nvlsGraph, nvlsGraph, treeGraph };
struct graphInfo {
int pattern;
@@ -722,7 +762,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
struct ncclProxyConnector proxyConn;
int* pxnPeers = NULL;
int *topParentLocalRanks = NULL;
int tpProxyRank;
timers[TIMER_INIT_ALLGATHER] = clockNano();
// AllGather1 - begin
@@ -732,6 +771,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
comm->cuMemSupport = 1;
for (int i = 0; i < nranks; i++) {
if (comm->peerInfo[i].version != comm->peerInfo[rank].version) {
WARN("Mismatched NCCL version detected : rank %d version %d rank %d version %d",
i, comm->peerInfo[i].version, rank, comm->peerInfo[rank].version);
ret = ncclInvalidUsage;
goto fail;
}
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
if (!comm->peerInfo[i].cuMemSupport) comm->cuMemSupport = 0;
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
@@ -869,7 +914,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
collNetChainGraph->maxChannels = ringGraph->nChannels;
memset(collNetDirectGraph, 0, sizeof(struct ncclTopoGraph));
collNetDirectGraph->id = 2;
collNetDirectGraph->id = 4;
collNetDirectGraph->pattern = NCCL_TOPO_PATTERN_COLLNET_DIRECT;
collNetDirectGraph->collNet = 1;
collNetDirectGraph->minChannels = 1;
@@ -1031,18 +1076,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
comm->collNetSupport = 0;
}
comm->collNetRegSupport = true;
for (int n=0; n<comm->nNodes; n++) {
if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
comm->collNetSupport = 0;
break;
}
if (comm->nodeRanks[n].localRanks > 1) {
// As long as there is more than 1 rank on any node, we need to disable collnet reg
comm->collNetRegSupport = false;
}
}
// As long as there is more than 1 rank on any node, we need to disable collnet reg
comm->collNetRegSupport = (comm->maxLocalRanks == 1);
}
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
@@ -1085,6 +1120,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
comm->topParentLocalRanks = topParentLocalRanks;
NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail);
// Launch proxy service thread, after this, the proxy calls can be used.
if (parent && parent->config.splitShare) {
comm->proxyState = parent->sharedRes->proxyState;
@@ -1092,7 +1128,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
} else {
NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);
}
NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
timers[TIMER_INIT_CONNECT] = clockNano();
do { // Build p2p schedule
int node = comm->node;
@@ -1168,6 +1205,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
// Connect Trees
NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
// Connect PAT only for communicators with 1 GPU per node
if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail);
// Setup NVLS
NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail);
NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
@@ -1179,12 +1219,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
if (comm->collNetSupport > 0) {
ncclCollNetSetup(comm, parent, graphs);
NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
}
}
// Connect to local net proxy
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
// Then to remote ones when using PXN
@@ -1192,8 +1233,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
int nranks;
NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
for (int r=0; r<nranks; r++) {
tpProxyRank = comm->topParentRanks[pxnPeers[r]];
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
}
}
@@ -1286,17 +1326,20 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
#define NCCL_MAX_CGA_CLUSTER_SIZE 8
#define NCCL_COMMINIT_FUNCNAME_LEN 128
struct ncclCommInitRankAsyncJob {
struct ncclAsyncJob base;
struct ncclComm* comm;
struct ncclComm** newcomm;
int cudaDev;
// For ncclCommInitRank
int nranks, myrank;
ncclUniqueId commId;
int nranks, myrank, nId;
ncclUniqueId* commId;
// for ncclCommSplit
struct ncclComm* parent;
int color, key;
// name of the function calling
char funcName[NCCL_COMMINIT_FUNCNAME_LEN];
};
struct ncclCommFinalizeAsyncJob {
@@ -1306,30 +1349,31 @@ struct ncclCommFinalizeAsyncJob {
NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT);
// Per-rank (key, color) pair used when splitting a communicator.
// commGetSplitInfo allocates one entry per parent rank, fills the entry of
// the local parent rank, and exchanges the whole array with a single
// bootstrapAllGather of sizeof(commSplitInfo) bytes per rank.
typedef struct{
int key;
int color;
} commSplitInfo;
static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* nRanksRet, int* myRankRet, int* parentRanksRet) {
int* colors = NULL;
int* keys = NULL;
int nRanks = 0, myRank = 0;
ncclResult_t ret = ncclSuccess;
NCCLCHECKGOTO(ncclCalloc(&colors, parent->nRanks), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&keys, parent->nRanks), ret, fail);
commSplitInfo* info = NULL;
NCCLCHECKGOTO(ncclCalloc(&info, parent->nRanks), ret, fail);
// Compute nRanks, my rank and the ranks (of the original comm) before and after me
colors[parent->rank] = color;
keys[parent->rank] = key;
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, colors, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, keys, sizeof(int)), ret, fail);
info[parent->rank].color = color;
info[parent->rank].key = key;
NCCLCHECKGOTO(bootstrapAllGather(parent->bootstrap, info, sizeof(commSplitInfo)), ret, fail);
// Negative color does not create a new comm. Return now.
if (color == NCCL_SPLIT_NOCOLOR) goto exit;
memset(parentRanksRet, 0xff, sizeof(int) * parent->nRanks);
for (int i = 0; i < parent->nRanks; i++) {
if (colors[i] != color) continue;
if (info[i].color != color) continue;
// Find where to insert this rank
int insert = 0;
while (insert < nRanks && keys[parentRanksRet[insert]] <= keys[i]) insert++;
while (insert < nRanks && info[parentRanksRet[insert]].key <= info[i].key) insert++;
// Shift ranks by one after insert
for (int r = nRanks; r > insert; r--) parentRanksRet[r] = parentRanksRet[r - 1];
// Insert our rank
@@ -1345,8 +1389,7 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par
*myRankRet = myRank;
exit:
free(colors);
free(keys);
free(info);
return ret;
fail:
goto exit;
@@ -1361,7 +1404,9 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int cudaDev = job->cudaDev;
int* parentRanks = NULL;
int cudaArch;
uint64_t timers[TIMERS_INIT_COUNT];
double sum_timers = 0;
uint64_t timers[TIMERS_INIT_COUNT] = {0};
unsigned long long commIdHash;
timers[TIMER_INIT_TOTAL] = clockNano();
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
@@ -1379,34 +1424,42 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
}
timers[TIMER_INIT_KERNELS] = clockNano() - timers[TIMER_INIT_KERNELS];
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
if (job->parent) {
NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail);
NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail);
// Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now.
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color
ncclUniqueId tmpId;
memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits
snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color);
comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
// debug info, no commId was used
commIdHash = 0;
} else {
timers[TIMER_INIT_ALLOC] = clockNano();
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC];
// obtain a unique hash using the first commId
comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES);
commIdHash = hashUniqueId(job->commId[0]);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
timers[TIMER_INIT_BOOTSTRAP] = clockNano();
NCCLCHECKGOTO(bootstrapInit(job->nId, (struct ncclBootstrapHandle*)job->commId, comm), res, fail);
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
}
timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
comm->cudaArch = cudaArch;
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
}
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail);
if (comm->tuner) {
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext));
@@ -1420,23 +1473,25 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
if (job->parent) {
/* unlink child abort flag. */
__atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)",
job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key);
} else {
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)",
comm, comm->nRanks, (unsigned long long)hashUniqueId(job->commId), comm->rank, comm->cudaDev);
// the name for the replay tool is ncclCommInitRank for all the variations
TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev);
INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE", job->funcName,
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash);
}
if (job->parent) {
INFO(NCCL_INIT,"ncclCommSplit comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d commId 0x%llx - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key, (unsigned long long)hashUniqueId(job->commId));
} else {
INFO(NCCL_INIT,"ncclCommInitRank comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init COMPLETE",
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
}
INFO(NCCL_INIT|NCCL_PROFILE,"Init timings: rank %d nranks %d total %.2f (kernels %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, connections %.2f, rest %.2f)", comm->rank, comm->nRanks, timers[TIMER_INIT_TOTAL]/1e9,
timers[TIMER_INIT_KERNELS]/1e9, timers[TIMER_INIT_BOOTSTRAP]/1e9, timers[TIMER_INIT_ALLGATHER]/1e9, timers[TIMER_INIT_TOPO]/1e9, timers[TIMER_INIT_GRAPHS]/1e9, timers[TIMER_INIT_CONNECT]/1e9,
(timers[TIMER_INIT_TOTAL]-timers[TIMER_INIT_KERNELS]-timers[TIMER_INIT_BOOTSTRAP]-timers[TIMER_INIT_ALLGATHER]-timers[TIMER_INIT_TOPO]-timers[TIMER_INIT_GRAPHS]-timers[TIMER_INIT_CONNECT])/1e9);
sum_timers = 0.0;
for (int it = 1; it < TIMERS_INIT_COUNT; ++it)
sum_timers += (timers[it] / 1e9);
INFO(NCCL_INIT | NCCL_PROFILE,
"Init timings - %s: rank %d nranks %d total %.2f (kernels %.2f, alloc %.2f, bootstrap %.2f, allgathers %.2f, topo %.2f, graphs %.2f, "
"connections %.2f, rest %.2f)",
job->funcName, comm->rank, comm->nRanks,
timers[TIMER_INIT_TOTAL] / 1e9, timers[TIMER_INIT_KERNELS] / 1e9, timers[TIMER_INIT_ALLOC] / 1e9,
timers[TIMER_INIT_BOOTSTRAP] / 1e9, timers[TIMER_INIT_ALLGATHER] / 1e9, timers[TIMER_INIT_TOPO] / 1e9,
timers[TIMER_INIT_GRAPHS] / 1e9, timers[TIMER_INIT_CONNECT] / 1e9, timers[TIMER_INIT_TOTAL] / 1e9 - sum_timers);
exit:
if (job->newcomm) {
/* assign it to user pointer. */
@@ -1621,17 +1676,24 @@ fail:
goto exit;
}
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev, ncclConfig_t *config) {
ncclResult_t res = ncclSuccess;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob *job = NULL;
const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&commId, true), res, fail);
}
// Destructor for an async communicator-init job.
// Releases the job's own copy of the unique IDs (job->commId) and then the
// job object itself. `_job` must point to a struct ncclCommInitRankAsyncJob.
static void ncclCommInitJobFree(void* _job) {
  struct ncclCommInitRankAsyncJob* const initJob = (struct ncclCommInitRankAsyncJob*)_job;
  // The ID array is only reachable through the job, so release it first.
  free(initJob->commId);
  free(initJob);
}
static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId, ncclUniqueId* commId, int myrank, int cudaDev, ncclConfig_t *config, const char funcName[]) {
if (nId <= 0 || nId > nranks) {
WARN("improper usage of ncclCommInitRank: nId = %d, nranks=%d", nId, nranks);
return ncclInvalidArgument;
}
ncclResult_t res = ncclSuccess;
const char* commIdEnv = NULL;
ncclComm_t comm = NULL;
struct ncclCommInitRankAsyncJob* job = NULL;
// first call ncclInit, this will setup the environment
NCCLCHECKGOTO(ncclInit(), res, fail);
if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, showVersion);
@@ -1659,19 +1721,37 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
*newcomm = comm;
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->nId = nId;
job->comm = comm;
job->nranks = nranks;
job->commId = commId; // C++ struct assignment
job->myrank = myrank;
job->cudaDev = cudaDev;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", funcName);
// need to copy the commIds to allow async commInit and to avoid alignment issues when casting from ncclUniqueId to ncclBootstrapHandle
// ncclUniqueIds and ncclBootstrapHandle don't have the same alignment requirements.
// Therefore the array of Ids coming from the user might not be properly aligned to be cast into a ncclBootstrapHandle
// copying into allocated memory guarantees that the memory is properly aligned for any objects, removing that issue
NCCLCHECKGOTO(ncclCalloc(&job->commId, nId), res, fail);
memcpy(job->commId, commId, nId * NCCL_UNIQUE_ID_BYTES);
commIdEnv = ncclGetEnv("NCCL_COMM_ID");
if (commIdEnv && myrank == 0) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commIdEnv);
if (nId > 1) {
INFO(NCCL_INIT | NCCL_ENV, "NCCL_COMM_ID cannot be used with more than one ncclUniqueId");
job->nId = 1;
}
// start the bootstrap root before bootstrapping, use only the first handle
NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail);
}
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail);
exit:
return ncclGroupErrCheck(res);
fail:
if (comm) {
free(comm->abortFlag);
if (comm->abortFlagDev) ncclCudaHostFree((void*)comm->abortFlagDev);
if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev);
free(comm->abortFlagRefCount);
free(comm);
}
@@ -1703,7 +1783,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload)
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, &config));
NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__));
return ncclSuccess;
}
@@ -1713,6 +1793,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int totalnDev;
int *gpuFlags = NULL;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
int oldDev = 0;
constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"}
@@ -1722,6 +1803,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
(void)ncclCudaLibraryInit();
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
if (ndev < 0) {
WARN("Invalid device count requested : %d", ndev);
@@ -1735,7 +1817,8 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
for (int i = 0; i < ndev; ++i) {
/* invalid device check. */
if (devlist[i] < 0 || devlist[i] >= totalnDev) {
ret = ncclUnhandledCudaError;
WARN("Invalid device %d (totalnDev=%d)", devlist[i], totalnDev);
ret = ncclInvalidArgument;
goto fail;
}
@@ -1756,13 +1839,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NCCLCHECKGOTO(ncclGroupStart(), ret, fail);
for (int i=0; i<ndev; i++) {
// Ignore return codes .. we need to call ncclGroupEnd to clean up anyway
ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i, &config);
int dev = devlist ? devlist[i] : i;
CUDACHECKGOTO(cudaSetDevice(dev), ret, fail);
ncclCommInitRankDev(comms+i, ndev,1, &uniqueId, i, dev, &config, __func__);
}
NCCLCHECKGOTO(ncclGroupEnd(), ret, fail);
fail:
exit:
cudaSetDevice(oldDev);
free(gpuFlags);
return ret;
fail:
goto exit;
}
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
@@ -1777,7 +1865,6 @@ ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState) {
NCCL_API(ncclResult_t, ncclCommInitRankConfig, ncclComm_t* comm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config);
ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueId commId, int myrank, ncclConfig_t *config) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
@@ -1785,13 +1872,46 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
NCCLCHECK(ncclGroupStartInternal());
(void)ncclCudaLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev, internalConfigPtr), ret, fail);
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret);
return ret;
fail:
if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret);
goto exit;
}
NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
int cudaDev;
ncclResult_t ret = ncclSuccess;
ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
ncclConfig_t *internalConfigPtr = NULL;
NCCLCHECK(ncclGroupStartInternal());
(void)ncclCudaLibraryInit();
CUDACHECK(cudaGetDevice(&cudaDev));
NvtxParamsCommInitRank payload{myrank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload)
if (config == NULL)
internalConfigPtr = &internalConfig;
else
internalConfigPtr = config;
NCCLCHECKGOTO(ncclCommInitRankDev(newcomm, nranks, nId, commId, myrank, cudaDev, internalConfigPtr, __func__), ret, fail);
exit:
ncclGroupErrCheck(ret);
@@ -1818,13 +1938,25 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) {
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult);
if (comm->initState == ncclSuccess) {
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), ret, fail);
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->hostStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync hostStream error %d\n", comm, comm->rank, ret);
}
if ((ret = ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)) != ncclSuccess) {
WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret);
}
NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail);
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail);
// And keep polling until all graphs referencing us die.
while (comm->persistentRefs != 0) {
NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail);
}
}
while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) {
struct ncclCommCallback* cb = ncclIntruQueueDequeue(&comm->legacyRegCleanupQueue);
if (cb->fn(comm, cb) != ncclSuccess) {
WARN("Legacy IPC cleanup callback failed comm %p (rank = %d) cb %p", comm, comm->rank, cb);
}
}
}
if ((ret = ncclProxyStop(comm)) != ncclSuccess) {
@@ -1886,14 +2018,15 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) {
/* launch async thread to finalize comm. */
NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commDestroySync, NULL, free, comm), ret, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commDestroySync, NULL, free, comm), ret, fail);
exit:
ncclGroupErrCheck(ret);
NCCLCHECK(ncclGroupEndInternal());
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)) };
if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); }
return ret;
fail:
free(job);
if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret);
goto exit;
}
@@ -1940,13 +2073,15 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) {
nextIntraComm = nextIntraComm->intraNext;
if ((ret = commCleanup(curIntraComm)) != ncclSuccess) {
// We pass a freed pointer, but we don't dereference; we merely print its value, so it's OK.
// coverity[pass_freed_arg]
WARN("commReclaim: cleanup comm %p rank %d failed in destroy/abort, error %d", curIntraComm, curRank, ret);
}
}
}
}
return ret;
return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
@@ -1975,12 +2110,11 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
NCCLCHECK(ncclCommEnsureReady(comm));
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return res;
fail:
free(job);
goto exit;
}
@@ -1991,15 +2125,6 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
return ncclSuccess;
}
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
// Ask anything that might still be running on the device to quit
if (comm->childAbortFlag != nullptr) {
__atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE);
@@ -2010,30 +2135,61 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
comm->destroyFlag = 1;
/* init thread must be joined before we destroy the comm,
* and we should ignore the init error here. */
ncclCommEnsureReady(comm);
(void)ncclCommEnsureReady(comm);
// once the comm is ready, we can access ranks etc
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
struct ncclCommFinalizeAsyncJob *job = NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload)
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
job->comm = comm;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, commReclaim, NULL, free, comm), res, fail);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail);
exit:
return ncclSuccess;
fail:
free(job);
goto exit;
}
// Payload attached to the NVTX range opened by ncclCommSplit.
// CommSplitSchema locates members of this struct via offsetof(), so the
// schema must be kept in sync with any change to the fields below.
struct NvtxParamsCommSplit {
int rank;     // filled from the parent communicator's comm->rank
int nranks;   // filled from the parent communicator's comm->nRanks
int cudaDev;  // filled from the parent communicator's comm->cudaDev
int color;    // `color` argument passed to ncclCommSplit
int key;      // `key` argument passed to ncclCommSplit
};
// NVTX payload schema describing NvtxParamsCommSplit: one INT entry per
// struct member, listed in declaration order. Every entry except the first
// pins its position with offsetof().
// NOTE(review): the "Rank" entry gives no explicit offset; presumably the
// omitted field defaults to offset 0 (rank is the first member) - confirm
// against the NVTX payload extension.
constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)},
};
NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config);
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) {
struct ncclCommInitRankAsyncJob *job = NULL;
struct ncclComm* childComm = NCCL_COMM_NULL;
ncclResult_t res = ncclSuccess;
NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key};
NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload)
int oldDev;
CUDACHECK(cudaGetDevice(&oldDev));
NCCLCHECK(ncclGroupStartInternal());
NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail);
NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail);
NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail);
CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail);
/* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */
*newcomm = NCCL_COMM_NULL;
if (color == NCCL_SPLIT_NOCOLOR) {
@@ -2073,10 +2229,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc
job->color = color;
job->key = key;
job->cudaDev = comm->cudaDev;
NCCLCHECKGOTO(ncclAsyncLaunch(&job->base, ncclCommInitRankFunc, NULL, free, comm), res, fail);
snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__);
NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail);
exit:
ncclGroupErrCheck(res);
cudaSetDevice(oldDev);
(void)ncclGroupErrCheck(res);
NCCLCHECK(ncclGroupEndInternal());
return res;
fail:
@@ -2179,7 +2337,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
CUmemAccessDesc accessDesc = {};
CUmemGenericAllocationHandle handle;
int cudaDev;
int flag = 0;
int flag;
int dcnt;
int mcSupport = 0;
@@ -2193,12 +2351,18 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev));
if (mcSupport) {
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
// Query device to see if FABRIC handle support is available
flag = 0;
(void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));;
if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
memprop.requestedHandleTypes = ncclCuMemHandleType;
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
memprop.location.id = currentDev;
// Query device to see if RDMA support is available
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
flag = 0;
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1;
CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
@@ -2207,14 +2371,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
mcprop.size = size;
/* device cnt is a dummy value right now, it might affect mc granularity in the future. */
mcprop.numDevices = dcnt;
mcprop.handleTypes = ncclCuMemHandleType;
mcprop.handleTypes = requestedHandleTypes;
mcprop.flags = 0;
CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
/* only size needs to be aligned to mcGran */
ALIGN_SIZE(size, mcGran);
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) {
/* First try cuMemCreate() with FABRIC handle support and then remove if it fails */
CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0));
if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) {
requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC;
memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes;
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
}
} else {
/* Allocate the physical memory on the device */
CUCHECK(cuMemCreate(&handle, size, &memprop, 0));
}
/* Reserve a virtual address range */
CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0));
/* Map the virtual address range to the physical allocation */
@@ -2234,6 +2409,9 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) {
fallback:
#endif
// Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though:
// we want CUDA to return an error to the caller.
// coverity[var_deref_model]
CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail);
exit:
@@ -2272,7 +2450,7 @@ fallback:
CUDACHECKGOTO(cudaFree(ptr), ret, fail);
exit:
cudaSetDevice(saveDevice);
CUDACHECK(cudaSetDevice(saveDevice));
return ret;
fail:
goto exit;
+4
Ver ficheiro
@@ -53,6 +53,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
return ncclInvalidArgument;
}
// ncclMaxRedOp < info->op will always be false due to the sizes of
// the datatypes involved, and that's by design. We keep the check though
// just as a reminder.
// coverity[result_independent_of_operands]
if (info->op < 0 || ncclMaxRedOp < info->op) {
WARN("%s : invalid reduction operation %d", info->opName, info->op);
return ncclInvalidArgument;
+26 -2
Ver ficheiro
@@ -11,7 +11,7 @@
// This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage
NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2);
NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0);
// Handle type used for cuMemCreate()
CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
@@ -49,6 +49,14 @@ int ncclCuMemEnable() {
return param >= 0 ? param : (param == -2 && ncclCuMemSupported);
}
// Reports whether cuMem-based host memory allocation is enabled.
// When built against a CUDA runtime of version 12020 or newer, the answer is
// the value of the CuMemHostEnable parameter; older runtimes always get 0.
int ncclCuMemHostEnable() {
#if CUDART_VERSION >= 12020
  return ncclParamCuMemHostEnable();
#else
  return 0;
#endif
}
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
@@ -81,6 +89,7 @@ DECLARE_CUDA_PFN(cuMemRelease);
DECLARE_CUDA_PFN(cuMemRetainAllocationHandle);
DECLARE_CUDA_PFN(cuMemSetAccess);
DECLARE_CUDA_PFN(cuMemUnmap);
DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle);
/* ncclMemAlloc/Free */
DECLARE_CUDA_PFN(cuPointerGetAttribute);
#if CUDA_VERSION >= 11070
@@ -107,7 +116,7 @@ bool ncclCudaLaunchBlocking = false;
#if CUDART_VERSION >= 12000
#define LOAD_SYM(symbol, ignore) do { \
cudaDriverEntryPointQueryResult driverStatus; \
cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \
res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \
if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \
if (!ignore) { \
@@ -157,6 +166,7 @@ static ncclResult_t cudaPfnFuncLoader(void) {
LOAD_SYM(cuMemRetainAllocationHandle, 1);
LOAD_SYM(cuMemSetAccess, 1);
LOAD_SYM(cuMemUnmap, 1);
LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1);
/* ncclMemAlloc/Free */
LOAD_SYM(cuPointerGetAttribute, 1);
#if CUDA_VERSION >= 11070
@@ -208,6 +218,20 @@ static void initOnceFunc() {
// Determine whether we support the cuMem APIs or not
ncclCuMemSupported = ncclIsCuMemSupported();
#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030
/* To use cuMem* for host memory allocation, we need to create context on each
* visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */
if (ncclCuMemSupported && ncclCuMemHostEnable()) {
int deviceCnt, saveDevice;
cudaGetDevice(&saveDevice);
cudaGetDeviceCount(&deviceCnt);
for (int i = 0; i < deviceCnt; ++i) {
cudaSetDevice(i);
cudaFree(NULL);
}
cudaSetDevice(saveDevice);
}
#endif
initResult = ret;
return;
error:
+10 -13
Ver ficheiro
@@ -41,6 +41,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
if (len > (sizeof(cliaddr.sun_path) - 1)) {
WARN("UDS: Cannot bind provided name to socket. Name too large");
close(fd);
return ncclInternalError;
}
#ifndef USE_ABSTRACT_SOCKET
@@ -66,7 +67,7 @@ ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, v
// Mark socket as non-blocking
if (handle->abortFlag) {
int flags;
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(flags = fcntl(fd, F_GETFL), "fcntl");
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
@@ -186,20 +187,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp);
TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp);
if (sendFd != -1) {
TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
msg.msg_control = control_un.control;
msg.msg_controllen = sizeof(control_un.control);
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
}
cmptr = CMSG_FIRSTHDR(&msg);
cmptr->cmsg_len = CMSG_LEN(sizeof(int));
cmptr->cmsg_level = SOL_SOCKET;
cmptr->cmsg_type = SCM_RIGHTS;
memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
msg.msg_name = (void *)&cliaddr;
msg.msg_namelen = sizeof(struct sockaddr_un);
+4
Ver ficheiro
@@ -102,6 +102,10 @@ ncclResult_t ncclNvmlEnsureInitialized() {
for(Symbol sym: symbols) {
*sym.ppfn = dlsym(libhandle, sym.name);
}
// Coverity complains that we never dlclose this object, but that's
// deliberate, since we want the loaded object to remain in memory until
// the process terminates, so that we can use its code.
// coverity[leaked_storage]
}
#endif
+19 -9
Ver ficheiro
@@ -37,7 +37,7 @@ void setEnvFile(const char* fileName) {
while (line[s] != '\0' && line[s] != '=') s++;
if (line[s] == '\0') continue;
strncpy(envVar, line, std::min(1023,s));
envVar[s] = '\0';
envVar[std::min(1023,s)] = '\0';
s++;
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
@@ -48,17 +48,28 @@ void setEnvFile(const char* fileName) {
fclose(file);
}
void initEnv() {
static void initEnvFunc() {
char confFilePath[1024];
const char * userDir = userHomeDir();
if (userDir) {
sprintf(confFilePath, "%s/.nccl.conf", userDir);
const char* userFile = getenv("NCCL_CONF_FILE");
if (userFile && strlen(userFile) > 0) {
snprintf(confFilePath, sizeof(confFilePath), "%s", userFile);
setEnvFile(confFilePath);
} else {
const char* userDir = userHomeDir();
if (userDir) {
snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
setEnvFile(confFilePath);
}
}
sprintf(confFilePath, "/etc/nccl.conf");
snprintf(confFilePath, sizeof(confFilePath), "/etc/nccl.conf");
setEnvFile(confFilePath);
}
// Public entry point for loading the NCCL configuration files into the
// environment. The actual work lives in initEnvFunc; pthread_once makes sure
// it runs a single time no matter how many callers (or threads) get here.
void initEnv() {
  static pthread_once_t envLoadGuard = PTHREAD_ONCE_INIT;
  pthread_once(&envLoadGuard, initEnvFunc);
}
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) {
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&mutex);
@@ -80,8 +91,7 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
pthread_mutex_unlock(&mutex);
}
const char *ncclGetEnv(const char *name) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, initEnv);
const char* ncclGetEnv(const char* name) {
initEnv();
return getenv(name);
}
+502 -93
Ver ficheiro
@@ -1,115 +1,524 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "param.h"
#include "checks.h"
#include "comm.h"
#include "enqueue.h"
#include "utils.h"
#include "proxy.h"
#include "profiler.h"
//#define PROFILE_PROXY 1
#ifdef PROFILE_PROXY
#include "timer.h"
#include "alloc.h"
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
static int profilerPluginRefCount;
static void* profilerPluginLib;
static ncclProfiler_t* ncclProfiler;
static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
struct ncclProxyProfileEvent {
double timestamp[6];
uint64_t opCount;
int peer;
int step;
uint16_t channel;
uint8_t type; // send / recv
uint8_t opIndex;
};
#define MAX_STR_LEN 256
#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1"
struct ncclProxyProfileEvent* profilingEvents = NULL;
int profilingIndex = 0;
double profilingStart = 0;
#define MAX_EVENTS 200000
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
if (profilingEvents == NULL) {
NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
profilingStart = gettime();
static void* tryOpenLib(char* name, int *err, char* errStr) {
if (nullptr == name || strlen(name) == 0) {
return nullptr;
}
struct ncclProxyProfileEvent* event = NULL;
if (state%8 == 0) {
if (profilingIndex == MAX_EVENTS) return ncclSuccess;
args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
if (state == ncclProxyProfileBegin) {
// Proxy operation information
event->opCount = args->opCount;
event->channel = args->subs[sub].channelId;
event->peer = args->subs[sub].peer;
event->type = args->pattern;
event->step = step;
event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
} else event->peer = -state;
if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) {
name = nullptr;
}
void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = 0;
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
}
return handle;
}
// Records the outcome of a failed tryOpenLib() attempt.
//
//   openErr      error code reported by tryOpenLib()
//   openErrStr   dlerror() text captured by tryOpenLib()
//   nameList     current write position inside the "could not find" buffer
//   nameListLen  in/out: bytes still available at nameList (including the NUL)
//   name         library name that was tried
//
// When openErr is ENOENT, " <name>" is appended at nameList and the advanced
// write position is returned, with *nameListLen reduced by the bytes consumed.
// For any other error the dlopen message is logged and nameList is returned
// unchanged.
//
// The cursor is advanced by what snprintf() actually stored, not by
// strlen(name) + 1: if the name had to be truncated, advancing by the full
// length would move nameList past the end of the buffer and drive
// *nameListLen negative, which a later snprintf() would interpret as a huge
// size_t.
static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) {
  if (openErr != ENOENT) {
    INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr);
    return nameList;
  }
  // Nothing can be appended once the buffer is exhausted.
  if (*nameListLen <= 0) {
    return nameList;
  }
  int written = snprintf(nameList, *nameListLen, " %s", name);
  if (written < 0) {
    return nameList;
  }
  // snprintf() returns the untruncated length; clamp to what really fits,
  // leaving the cursor on the terminating NUL.
  if (written >= *nameListLen) {
    written = *nameListLen - 1;
  }
  nameList += written;
  *nameListLen -= written;
  return nameList;
}
static void* openProfilerPluginLib(char* couldNotFindNames, int len) {
int openErr;
void *pluginLib;
char profilerPluginLibName[PATH_MAX];
char openErrStr[MAX_STR_LEN + 1] = { 0 };
const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN");
if (envProfilerPluginName && strlen(envProfilerPluginName)) {
snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName);
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName);
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
} else {
event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so");
pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr);
if (pluginLib) {
return pluginLib;
}
couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName);
}
// Timestamp
event->timestamp[state%8] = gettime()-profilingStart;
return nullptr;
}
// Load state of the profiler plugin, stored in profilerPluginStatus.
enum {
profilerPluginLoadFailed = -1,   // set by ncclProfilerPluginLoad's fail path; Load returns immediately while in this state
profilerPluginLoadReady = 0,     // initial state, and restored by ncclProfilerPluginUnload when the last reference is dropped
profilerPluginLoadSuccess = 1,   // plugin library opened and its symbol resolved
};
static int profilerPluginStatus = profilerPluginLoadReady;
// Process id captured with getpid() when the plugin is loaded successfully.
static pid_t pid;
// Multiplier used to size the "could not find" name buffer (MAX_PLUGIN_LOAD * PATH_MAX).
#define MAX_PLUGIN_LOAD 2
// Loads the profiler plugin on first use and takes one reference on it.
//
// Always returns ncclSuccess: a missing or unusable plugin is not treated as
// an error. On failure the status becomes profilerPluginLoadFailed, so later
// calls return immediately without retrying. On success (or when the plugin
// is already loaded) profilerPluginRefCount is incremented.
static ncclResult_t ncclProfilerPluginLoad(void) {
  // Fast path: a previous attempt already failed, do not try again.
  if (profilerPluginLoadFailed == profilerPluginStatus) {
    return ncclSuccess;
  }

  char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
  pthread_mutex_lock(&profilerLock);
  // Already loaded: just take another reference.
  if (profilerPluginLoadSuccess == profilerPluginStatus) {
    ++profilerPluginRefCount;
    goto exit;
  }

  profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX);
  if (profilerPluginLib == nullptr) {
    if (strlen(couldNotFindNames)) {
      INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames);
    }
    goto fail;
  }

  ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL);
  if (ncclProfiler == nullptr) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL ".");
    goto fail;
  }

  ++profilerPluginRefCount;
  profilerPluginStatus = profilerPluginLoadSuccess;

  // Store the pid of the process loading the profiler.
  // This is attached to the proxyOp event descriptor
  // so the plugin can figure out if the parent event
  // is in the same address space or not
  pid = getpid();

exit:
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
fail:
  // Release the library and clear the file-scope pointers so that no stale
  // handle survives the dlclose().
  if (profilerPluginLib) {
    dlclose(profilerPluginLib);
    profilerPluginLib = nullptr;
  }
  ncclProfiler = nullptr;
  profilerPluginStatus = profilerPluginLoadFailed;
  goto exit;
}
// Drops one reference on the profiler plugin; the last reference closes the
// library and returns the loader to the Ready state. Always returns
// ncclSuccess.
static ncclResult_t ncclProfilerPluginUnload(void) {
  pthread_mutex_lock(&profilerLock);
  // ncclProfilerPluginFinalize calls this unconditionally, including when the
  // load failed and no reference was ever taken; only count down references
  // that actually exist so the counter cannot go negative.
  if (profilerPluginRefCount > 0 && 0 == (--profilerPluginRefCount)) {
    // ncclProfilerPluginInit resets ncclProfiler to NULL when the plugin's
    // init() fails, while the reference taken by the load is still held.
    // Do not dereference it blindly when reporting the name.
    INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler ? ncclProfiler->name : "(unknown)");
    if (profilerPluginLib) {
      dlclose(profilerPluginLib);
    }
    profilerPluginLib = nullptr;
    ncclProfiler = nullptr;
    profilerPluginStatus = profilerPluginLoadReady;
  }
  pthread_mutex_unlock(&profilerLock);
  return ncclSuccess;
}
void ncclProfilingDump() {
static int dumpDone = 0;
if (dumpDone) return;
dumpDone = 1;
const char* str = ncclGetEnv("NCCL_PROXY_PROFILE");
if (!str) { free(profilingEvents); return; }
FILE* f = fopen(str, "w");
fprintf(f, "[\n");
#define ENABLE_TIMER 0
#include "timer.h"
for (int i=0; i<profilingIndex; i++) {
struct ncclProxyProfileEvent* e = profilingEvents+i;
const int sendrecv = e->peer >= 0;
const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
profilingEventStr[-(e->peer/8)];
// Optional self-timing of the profiler entry points, compiled in only when
// ENABLE_TIMER is non-zero.
//
// For every event name <e> there is a call counter <e>Count and a two-slot
// array <e>Ts: slot [0] holds the gettime() value taken at the last
// TIME_START_EVENT(<e>), slot [1] accumulates the elapsed time added by each
// TIME_STOP_EVENT(<e>).
//
//   TIME_START_EVENT(e)     bump eCount and stamp eTs[0]
//   TIME_STOP_EVENT(e)      add (now - eTs[0]) to eTs[1]
//   TIME_PRINT_EVENTS(name) print "total/count = average" for every event
//                           whose counter is non-zero, on one line
//
// NOTE(review): the counters are int64_t but are printed with %ld; this
// assumes long is 64 bits wide - confirm for all supported targets.
#if ENABLE_TIMER
static int64_t elapsedCount;
static int64_t initCount, finalizeCount;
static int64_t groupStartCount, groupStopCount;
static int64_t taskStartCount, taskStopCount;
static int64_t proxyOpStartCount, proxyOpStopCount;
static int64_t proxyStepStartCount, proxyStepStopCount;
static int64_t proxyCtrlStartCount, proxyCtrlStopCount;
static int64_t proxyOpRecordCount, proxyStepRecordCount, proxyCtrlRecordCount;
static double elapsedTs[2];
static double initTs[2], finalizeTs[2];
static double groupStartTs[2], groupStopTs[2];
static double taskStartTs[2], taskStopTs[2];
static double proxyOpStartTs[2], proxyOpStopTs[2];
static double proxyStepStartTs[2], proxyStepStopTs[2];
static double proxyCtrlStartTs[2], proxyCtrlStopTs[2];
static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2];
#define TIME_START_EVENT(event) do { \
(event ## Count)++; \
(event ## Ts)[0] = gettime(); \
} while(0)
#define TIME_STOP_EVENT(event) do { \
double val = gettime() - (event ## Ts)[0]; \
(event ## Ts)[1] += val; \
} while(0)
#define TIME_PRINT_EVENTS(name) do { \
printf("%s ", name); \
if (elapsedCount) printf("[elapsed] %g/%ld = %g ", elapsedTs[1], elapsedCount, elapsedTs[1]/elapsedCount); \
if (initCount) printf("[init] %g/%ld = %g ", initTs[1], initCount, initTs[1]/initCount); \
if (finalizeCount) printf("[finalize] %g/%ld = %g ", finalizeTs[1], finalizeCount, finalizeTs[1]/finalizeCount); \
if (groupStartCount) printf("[groupStart] %g/%ld = %g ", groupStartTs[1], groupStartCount, groupStartTs[1]/groupStartCount); \
if (groupStopCount) printf("[groupStop] %g/%ld = %g ", groupStopTs[1], groupStopCount, groupStopTs[1]/groupStopCount); \
if (taskStartCount) printf("[taskStart] %g/%ld = %g ", taskStartTs[1], taskStartCount, taskStartTs[1]/taskStartCount); \
if (taskStopCount) printf("[taskStop] %g/%ld = %g ", taskStopTs[1], taskStopCount, taskStopTs[1]/taskStopCount); \
if (proxyOpStartCount) printf("[proxyOpStart] %g/%ld = %g ", proxyOpStartTs[1], proxyOpStartCount, proxyOpStartTs[1]/proxyOpStartCount); \
if (proxyOpStopCount) printf("[proxyOpStop] %g/%ld = %g ", proxyOpStopTs[1], proxyOpStopCount, proxyOpStopTs[1]/proxyOpStopCount); \
if (proxyStepStartCount) printf("[proxyStepStart] %g/%ld = %g ", proxyStepStartTs[1], proxyStepStartCount, proxyStepStartTs[1]/proxyStepStartCount); \
if (proxyStepStopCount) printf("[proxyStepStop] %g/%ld = %g ", proxyStepStopTs[1], proxyStepStopCount, proxyStepStopTs[1]/proxyStepStopCount); \
if (proxyCtrlStartCount) printf("[proxyCtrlStart] %g/%ld = %g ", proxyCtrlStartTs[1], proxyCtrlStartCount, proxyCtrlStartTs[1]/proxyCtrlStartCount); \
if (proxyCtrlStopCount) printf("[proxyCtrlStop] %g/%ld = %g ", proxyCtrlStopTs[1], proxyCtrlStopCount, proxyCtrlStopTs[1]/proxyCtrlStopCount); \
if (proxyOpRecordCount) printf("[proxyOpRecord] %g/%ld = %g ", proxyOpRecordTs[1], proxyOpRecordCount, proxyOpRecordTs[1]/proxyOpRecordCount); \
if (proxyStepRecordCount) printf("[proxyStepRecord] %g/%ld = %g ", proxyStepRecordTs[1], proxyStepRecordCount, proxyStepRecordTs[1]/proxyStepRecordCount); \
if (proxyCtrlRecordCount) printf("[proxyCtrlRecord] %g/%ld = %g", proxyCtrlRecordTs[1], proxyCtrlRecordCount, proxyCtrlRecordTs[1]/proxyCtrlRecordCount); \
printf("\n"); \
} while(0)
#else
// Timing disabled: the three macros expand to empty statements.
#define TIME_START_EVENT(event) do {} while(0)
#define TIME_STOP_EVENT(event) do {} while(0)
#define TIME_PRINT_EVENTS(name) do {} while(0)
#endif
if (sendrecv) {
int state = ncclProxyProfileBegin;
const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
static int eActivationMask; // Set by profiler
static int eActivationMaskGroup; // Cached for current group
while (state<ncclProxyProfileEnd) {
if (e->timestamp[state]) {
const char* name = stateStr[state];
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
name, i, e->channel, e->timestamp[state]);
state++;
while (e->timestamp[state] == 0) state++;
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
name, i, e->channel, e->timestamp[state]);
}
}
fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
} else {
if (e->peer == -ncclProxyProfileAppend) {
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
typeStr, i, e->timestamp[0], e->opCount);
} else {
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
typeStr, i, e->timestamp[0]);
}
fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
typeStr, i, e->timestamp[1]);
ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
TIME_START_EVENT(elapsed);
TIME_START_EVENT(init);
ncclProfilerPluginLoad();
if (__builtin_expect(ncclProfiler != NULL, 0)) {
int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask);
if (err) {
WARN("Profiler init failed with error (%d). Continue without profiler.", err);
ncclProfiler = NULL;
}
}
fprintf(f, "{} ]\n");
fclose(f);
free(profilingEvents);
TIME_STOP_EVENT(init);
return ncclSuccess;
}
// Tears down profiling for one communicator: lets the plugin (if any)
// finalize the communicator's profiler context, then drops this
// communicator's reference on the plugin library. Also closes the
// "finalize" and "elapsed" self-timers and prints the timing summary.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) {
  TIME_START_EVENT(finalize);

  const bool pluginActive = __builtin_expect(ncclProfiler != NULL, 0);
  if (pluginActive) ncclProfiler->finalize(comm->profilerContext);

  // The reference is released whether or not a plugin is currently active.
  ncclProfilerPluginUnload();

  TIME_STOP_EVENT(finalize);
  TIME_STOP_EVENT(elapsed);
  TIME_PRINT_EVENTS("Profiler");
  return ncclSuccess;
}
// Opens the group-level profiler event for a kernel plan.
// The plugin's activation mask is sampled once here and cached in
// eActivationMaskGroup. A group event is started only when a plugin is
// loaded and at least one of the Coll / P2p / ProxyOp / ProxyStep event
// kinds is active in that mask. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStart);

  // Sampled even when no plugin is loaded, exactly once per group.
  eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);

  const int groupTriggers = ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep;
  if (__builtin_expect(ncclProfiler != NULL, 0) && (eActivationMaskGroup & groupTriggers)) {
    ncclProfilerEventDescr_v1_t groupDescr = { 0 };
    groupDescr.type = ncclProfileGroup;
    ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &groupDescr);
  }

  TIME_STOP_EVENT(groupStart);
  return ncclSuccess;
}
// Closes the group-level profiler event of a kernel plan, provided a plugin
// is loaded and the plan actually carries a group event handle.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(groupStop);

  const bool hasOpenGroupEvent = __builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle;
  if (hasOpenGroupEvent) ncclProfiler->stopEvent(plan->groupEventHandle);

  TIME_STOP_EVENT(groupStop);
  return ncclSuccess;
}
// Starts one profiler event per task queued in the plan: first every
// collective task, then every point-to-point task. Each event is parented to
// the plan's group event, and each task records the activation mask that was
// cached for the current group.
//
// Nothing is started unless a plugin is loaded, the plan has a group event
// handle, and at least one of ProxyOp / ProxyStep / Coll is active in the
// cached mask. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStart);

  // NOTE(review): the P2P loop below is gated by this same mask, which does
  // not include ncclProfileP2p - confirm that this is intended.
  const int taskTriggers = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);

  if (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && taskTriggers) {
    // Collective tasks.
    for (struct ncclTaskColl* collTask = ncclIntruQueueHead(&plan->collTaskQueue); collTask; collTask = collTask->next) {
      ncclProfilerEventDescr_t collDescr = { 0 };
      collDescr.type = ncclProfileColl;
      collDescr.parentObj = plan->groupEventHandle;
      collDescr.rank = plan->comm->rank;
      collDescr.coll.name = plan->comm->commName;
      collDescr.coll.commHash = plan->comm->commHash;
      // Per-function sequence number; consumed (post-incremented) here.
      collDescr.coll.seqNumber = plan->comm->seqNumber[collTask->func]++;
      collDescr.coll.func = collTask->func;
      collDescr.coll.sendBuff = collTask->sendbuff;
      collDescr.coll.recvBuff = collTask->recvbuff;
      collDescr.coll.count = collTask->count;
      collDescr.coll.root = collTask->root;
      collDescr.coll.datatype = collTask->datatype;
      collDescr.coll.op = collTask->opHost;
      collDescr.coll.trafficBytes = collTask->trafficBytes;
      collDescr.coll.nMaxChannels = collTask->nMaxChannels;
      collDescr.coll.nWarps = collTask->nWarps;
      collDescr.coll.algo = collTask->algorithm;
      collDescr.coll.proto = collTask->protocol;
      collDescr.coll.isCollnet = collTask->isCollnet;
      collDescr.coll.isNvls = collTask->isNvls;
      ncclProfiler->startEvent(plan->comm->profilerContext, &collTask->eventHandle, &collDescr);

      // Remember which event kinds were active for this group.
      collTask->eActivationMask = eActivationMaskGroup;
    }

    // Point-to-point tasks.
    for (struct ncclTaskP2p* p2pTask = ncclIntruQueueHead(&plan->p2pTaskQueue); p2pTask; p2pTask = p2pTask->next) {
      ncclProfilerEventDescr_t p2pDescr = { 0 };
      p2pDescr.type = ncclProfileP2p;
      p2pDescr.parentObj = plan->groupEventHandle;
      p2pDescr.rank = plan->comm->rank;
      p2pDescr.p2p.name = plan->comm->commName;
      p2pDescr.p2p.commHash = plan->comm->commHash;
      p2pDescr.p2p.func = p2pTask->func;
      p2pDescr.p2p.buff = p2pTask->buff;
      p2pDescr.p2p.count = p2pTask->count;
      p2pDescr.p2p.datatype = p2pTask->datatype;
      p2pDescr.p2p.peer = p2pTask->root;
      ncclProfiler->startEvent(plan->comm->profilerContext, &p2pTask->eventHandle, &p2pDescr);

      // Remember which event kinds were active for this group.
      p2pTask->eActivationMask = eActivationMaskGroup;
    }
  }

  TIME_STOP_EVENT(taskStart);
  return ncclSuccess;
}
// Stop the profiler events of every collective and p2p task queued in `plan`.
// The gating mirrors ncclProfilerStartTaskEvents: a profiler must be loaded,
// the plan must hold a group event handle, and the group's mask snapshot must
// enable proxy-op, proxy-step or collective events.
// A task whose event handle is NULL is skipped, matching the handle checks
// done by the other stop hooks in this file, so the plugin never receives a
// NULL handle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
  TIME_START_EVENT(taskStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl);
    if (plan->groupEventHandle && enable) {
      struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue);
      while (ct) {
        // Only stop events that were actually handed out by the plugin
        if (ct->eventHandle) {
          ncclProfiler->stopEvent(ct->eventHandle);
        }
        ct = ct->next;
      }
      struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue);
      while (pt) {
        if (pt->eventHandle) {
          ncclProfiler->stopEvent(pt->eventHandle);
        }
        pt = pt->next;
      }
    }
  }
  TIME_STOP_EVENT(taskStop);
  return ncclSuccess;
}
// Start a proxy-op event for the send side of sub-operation `s` of `args`.
// The event is started only when a profiler is loaded and the sub's activation
// mask enables proxy-step or proxy-op events; it is parented to the sub's task
// event and its handle slot is sub->opEventHandle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      (subArgs->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp))) {
    ncclProfilerEventDescr_t descr = { 0 };
    descr.type = ncclProfileProxyOp;
    descr.parentObj = subArgs->taskEventHandle;
    descr.rank = subArgs->rank;
    descr.proxyOp.isSend = 1;  // send direction
    descr.proxyOp.pid = args->pid;
    descr.proxyOp.chunkSize = args->chunkSize;
    descr.proxyOp.channelId = subArgs->channelId;
    descr.proxyOp.peer = subArgs->peer;
    descr.proxyOp.nSteps = subArgs->nsteps;
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &descr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
// Start a proxy-op event for the receive side of sub-operation `s` of `args`.
// The event is started only when a profiler is loaded and the sub's activation
// mask enables proxy-step or proxy-op events; it is parented to the sub's task
// event and its handle slot is sub->opEventHandle. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      (subArgs->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp))) {
    ncclProfilerEventDescr_t descr = { 0 };
    descr.type = ncclProfileProxyOp;
    descr.parentObj = subArgs->taskEventHandle;
    descr.rank = subArgs->rank;
    descr.proxyOp.isSend = 0;  // receive direction
    descr.proxyOp.pid = args->pid;
    descr.proxyOp.chunkSize = args->chunkSize;
    descr.proxyOp.channelId = subArgs->channelId;
    descr.proxyOp.peer = subArgs->peer;
    descr.proxyOp.nSteps = subArgs->nsteps;
    ncclProfiler->startEvent(args->profilerContext, &subArgs->opEventHandle, &descr);
  }
  TIME_STOP_EVENT(proxyOpStart);
  return ncclSuccess;
}
// Stop the proxy-op event of sub-operation `s` of `args` and clear its handle.
// Does nothing when no profiler is loaded or the sub has no proxy-op event.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) {
  TIME_START_EVENT(proxyOpStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfiler->stopEvent(subArgs->opEventHandle);
      // Reset so a later stop or record on this sub sees no active event
      subArgs->opEventHandle = NULL;
    }
  }
  TIME_STOP_EVENT(proxyOpStop);
  return ncclSuccess;
}
// Start one proxy-step event for each step in [stepLo, stepHi) of the send
// sub-operation `s` of `args`.
// Requires a loaded profiler, an active proxy-op event on the sub (used as the
// parent) and proxy-step events enabled in the sub's activation mask. The
// handle slot for a step is stepEventHandles[step % NCCL_STEPS].
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      ncclProfilerEventDescr_t descr = { 0 };
      descr.type = ncclProfileProxyStep;
      descr.parentObj = subArgs->opEventHandle;
      descr.rank = subArgs->rank;
      descr.proxyStep.step = cur;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[cur%NCCL_STEPS], &descr);
      cur++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
// Start one proxy-step event for each step in [stepLo, stepHi) of the receive
// sub-operation `s` of `args`.
// Requires a loaded profiler, an active proxy-op event on the sub (used as the
// parent) and proxy-step events enabled in the sub's activation mask. The
// handle slot for a step is stepEventHandles[step % NCCL_STEPS].
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStart);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) &&
      subArgs->opEventHandle && (subArgs->eActivationMask & ncclProfileProxyStep)) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      ncclProfilerEventDescr_t descr = { 0 };
      descr.type = ncclProfileProxyStep;
      descr.parentObj = subArgs->opEventHandle;
      descr.rank = subArgs->rank;
      descr.proxyStep.step = cur;
      ncclProfiler->startEvent(args->profilerContext, &subArgs->stepEventHandles[cur%NCCL_STEPS], &descr);
      cur++;
    }
  }
  TIME_STOP_EVENT(proxyStepStart);
  return ncclSuccess;
}
// Stop the proxy-step events for steps in [stepLo, stepHi) of sub-operation
// `s` of `args`. Each step maps to slot step % NCCL_STEPS; slots holding a
// NULL handle are skipped and stopped slots are reset to NULL.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) {
  TIME_START_EVENT(proxyStepStop);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      const uint64_t slot = cur%NCCL_STEPS;
      cur++;
      if (!subArgs->stepEventHandles[slot]) continue;
      ncclProfiler->stopEvent(subArgs->stepEventHandles[slot]);
      subArgs->stepEventHandles[slot] = NULL;
    }
  }
  TIME_STOP_EVENT(proxyStepStop);
  return ncclSuccess;
}
// Start a proxy control event and return its handle through `eHandle`.
//
// profilerContext: plugin context passed through to startEvent.
// eHandle:         output; receives the event handle, or NULL when no event
//                  was started (no profiler loaded, or proxy control events
//                  not enabled in the current activation mask).
//
// *eHandle is set to NULL before the plugin is called, so the caller never
// reads an indeterminate handle even if the plugin's startEvent returns
// without writing it. Always returns ncclSuccess.
ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle) {
  TIME_START_EVENT(proxyCtrlStart);
  *eHandle = NULL;
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    // for proxy control events we allow profiling mode to change on a per event basis
    int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED);
    if (eActivationMaskProxy & ncclProfileProxyCtrl) {
      ncclProfilerEventDescr_t eDescr = { 0 };
      eDescr.type = ncclProfileProxyCtrl;
      ncclProfiler->startEvent(profilerContext, eHandle, &eDescr);
    }
  }
  TIME_STOP_EVENT(proxyCtrlStart);
  return ncclSuccess;
}
// Stop the proxy control event identified by `eHandle`.
// Does nothing when no profiler is loaded or `eHandle` is NULL.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
  TIME_START_EVENT(proxyCtrlStop);
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (eHandle) {
      ncclProfiler->stopEvent(eHandle);
    }
  }
  TIME_STOP_EVENT(proxyCtrlStop);
  return ncclSuccess;
}
// Record state `eState` on the proxy-op event of sub-operation `s` of `args`,
// attaching `steps` and `transSize` as the state arguments.
// Does nothing when no profiler is loaded or the sub has no proxy-op event.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyOpRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0)) {
    if (subArgs->opEventHandle) {
      ncclProfilerEventStateArgs_t stateArgs = { 0 };
      stateArgs.proxyOp.transSize = transSize;
      stateArgs.proxyOp.steps = steps;
      ncclProfiler->recordEventState(subArgs->opEventHandle, eState, &stateArgs);
    }
  }
  TIME_STOP_EVENT(proxyOpRecord);
  return ncclSuccess;
}
// Record state `eState` on the proxy-step events for steps in [stepLo, stepHi)
// of sub-operation `s` of `args`. No state arguments are passed to the plugin.
// Requires a loaded profiler and an active proxy-op event on the sub; slots
// (step % NCCL_STEPS) holding a NULL handle are skipped.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyStepRecord);
  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (__builtin_expect(ncclProfiler != NULL, 0) && subArgs->opEventHandle) {
    uint64_t cur = stepLo;
    while (cur < stepHi) {
      const uint64_t slot = cur%NCCL_STEPS;
      cur++;
      if (!subArgs->stepEventHandles[slot]) continue;
      ncclProfiler->recordEventState(subArgs->stepEventHandles[slot], eState, 0);
    }
  }
  TIME_STOP_EVENT(proxyStepRecord);
  return ncclSuccess;
}
// Record state `eState` on the proxy control event `eHandle`, passing
// `appended` as the number of appended proxy ops.
// Requires a loaded profiler, a non-NULL handle, and proxy control events
// enabled in the activation mask at the time of the call.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) {
  TIME_START_EVENT(proxyCtrlRecord);
  if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle) {
    // The mask is re-read on every call rather than cached
    if (__atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) {
      ncclProfilerEventStateArgs_t stateArgs = { 0 };
      stateArgs.proxyCtrl.appendedProxyOps = appended;
      ncclProfiler->recordEventState(eHandle, eState, &stateArgs);
    }
  }
  TIME_STOP_EVENT(proxyCtrlRecord);
  return ncclSuccess;
}
// Copy the `pid` value defined outside this function into op->pid.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) {
op->pid = pid;
return ncclSuccess;
}
#else
// No-op fallbacks on the #else side of a conditional whose #if is outside this view.
// Record hook: ignores all of its arguments and reports success.
ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
// Dump hook: does nothing.
void ncclProfilingDump() {}
#endif
+7 -6
Ver ficheiro
@@ -4,7 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "shm.h"
#include "shmutils.h"
#include "comm.h"
#include "checks.h"
#include <sys/types.h>
@@ -75,7 +75,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
goto fail;
}
} else {
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
}
retry_fallocate:
@@ -90,7 +90,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
}
INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
} else {
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", ret, fail);
}
hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
@@ -114,7 +114,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
}
if (devShmPtr) {
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
}
@@ -129,7 +129,7 @@ fail:
shmPath, shmSize, strerror(errno), errno);
if (tmphandle) {
shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
ncclShmClose((ncclShmHandle_t)tmphandle);
(void)ncclShmClose((ncclShmHandle_t)tmphandle);
tmphandle = NULL;
}
hptr = NULL;
@@ -182,7 +182,7 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) {
ncclResult_t ret = ncclSuccess;
int curRound = shmem->round;
int curRound;
size_t mycnt;
if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) {
@@ -190,6 +190,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
goto exit;
}
curRound = shmem->round;
memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize);
/* sync among local ranks */
mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL);
+11 -5
Ver ficheiro
@@ -284,6 +284,7 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char
sin6.sin6_scope_id = 0; // should be global scope, set to 0
} else {
WARN("Net : unsupported IP family");
freeaddrinfo(p);
return ncclInvalidArgument;
}
@@ -408,7 +409,7 @@ ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress*
static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
socklen_t socklen = sizeof(union ncclSocketAddress);
sock->fd = accept(sock->acceptFd, &sock->addr.sa, &socklen);
sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen);
if (sock->fd != -1) {
sock->state = ncclSocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
@@ -501,8 +502,9 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
} else if (ret < 0) {
WARN("socketPollConnect poll() failed with error %s", strerror(errno));
return ncclRemoteError;
} else {
EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
} else if (ret != 1 || (pfd.revents & POLLOUT) == 0) {
WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? "" : ", no POLLOUT events");
return ncclSystemError;
}
/* check socket status */
@@ -734,13 +736,17 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad
/* Set socket as non-blocking if async or if we need to be able to abort */
if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
int flags;
EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail);
SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail);
}
exit:
return ret;
fail:
if (sock->fd != -1) {
close(sock->fd);
sock->fd = -1;
}
goto exit;
}
+2
Ver ficheiro
@@ -77,6 +77,8 @@ static void* tryOpenLib(const char* name, int* err, char* errStr) {
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = '\0';
// "handle" and "name" won't be NULL at the same time.
// coverity[var_deref_model]
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
+9 -12
Ver ficheiro
@@ -65,15 +65,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
return ncclSuccess;
}
// Hash the first `n` bytes of `string` with the DJB2a scheme:
// start from 5381, then for every byte compute hash = (hash * 33) ^ byte.
// Returns 5381 when `n` is zero or negative (no bytes are read).
uint64_t getHash(const char* string, int n) {
  uint64_t hash = 5381;
  int pos = 0;
  while (pos < n) {
    // Multiplying by 33 is the same as (hash << 5) + hash in unsigned arithmetic
    hash = (hash * 33) ^ string[pos];
    pos++;
  }
  return hash;
}
static uint64_t hostHashValue = 0;
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
@@ -83,7 +75,7 @@ uint64_t getHash(const char* string, int n) {
* This string can be overridden by using the NCCL_HOSTID env var.
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
uint64_t getHostHash(void) {
static void getHostHashOnce() {
char hostHash[1024];
const char *hostId;
@@ -103,8 +95,8 @@ uint64_t getHostHash(void) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
fclose(file);
}
fclose(file);
}
// Make sure the string is terminated
@@ -112,7 +104,12 @@ uint64_t getHostHash(void) {
TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
return getHash(hostHash, strlen(hostHash));
hostHashValue = getHash(hostHash, strlen(hostHash));
}
uint64_t getHostHash(void) {
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, getHostHashOnce);
return hostHashValue;
}
/* Generate a hash of the unique identifying string for this process
+7
Ver ficheiro
@@ -168,6 +168,13 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
* Allows the use of more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
* The number of ncclUniqueIds and their order must be the same for every rank.
*/
ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
/* Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
+5 -4
Ver ficheiro
@@ -355,6 +355,8 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
if (nullptr == handle) {
strncpy(errStr, dlerror(), MAX_STR_LEN);
errStr[MAX_STR_LEN] = '\0';
// "handle" and "name" won't be NULL at the same time.
// coverity[var_deref_model]
if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) {
*err = ENOENT;
}
@@ -422,11 +424,10 @@ static int netPluginStatus = netPluginLoadReady;
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 };
if (netPluginLoadFailed == netPluginStatus) {
return ncclSuccess;
}
pthread_mutex_lock(&netPluginLock);
if (netPluginLoadFailed == netPluginStatus) {
goto exit;
}
if (netPluginLoadSuccess == netPluginStatus) {
++netPluginRefCount;
goto exit;
+225 -79
Ver ficheiro
@@ -8,18 +8,21 @@
#include "info.h"
#include "collectives.h"
#include "socket.h"
#include "shm.h"
#include "shmutils.h"
#include "profiler.h"
#define ENABLE_TIMER 0
#include "timer.h"
#include "profiler.h"
#include "transport.h"
#include <sys/syscall.h>
#include <assert.h>
#include <unistd.h>
#include <sys/time.h>
#include <sched.h>
enum { proxyRecv=0, proxySend=1 };
void* ncclProxyServiceUDS(void* _args);
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
@@ -67,8 +70,10 @@ static ncclResult_t expectedProxyResponseStore(struct ncclProxyState* state, voi
return ncclInternalError;
}
memcpy(elem->respBuff, respBuff, respSize);
free(respBuff);
if (respSize > 0) {
memcpy(elem->respBuff, respBuff, respSize);
free(respBuff);
}
elem->done = true;
elem->res = res;
return ncclSuccess;
@@ -360,12 +365,17 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr
sub->nsteps = op->nsteps;
sub->nbytes = op->nbytes;
sub->offset = 0;
sub->peer = op->root;
sub->peer = op->peer;
sub->reg = op->reg;
sub->sendMhandle = op->sendMhandle;
sub->recvMhandle = op->recvMhandle;
sub->sendbuff = op->sendbuff;
sub->recvbuff = op->recvbuff;
sub->eActivationMask = op->eActivationMask;
sub->taskEventHandle = op->taskEventHandle;
sub->rank = op->rank;
args->pid = op->pid;
args->profilerContext = op->profilerContext;
args->nsubs = subIndex+1;
if (subIndex) {
if ((args->sliceSteps != op->sliceSteps) ||
@@ -527,6 +537,7 @@ static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel
if (justInquire) *justInquire = true;
else {
op->peer = peer;
NCCLCHECK(ncclLocalOpAppend(comm, &connector->proxyConn, op));
}
return ncclSuccess;
@@ -588,6 +599,64 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool
NCCLCHECK(SaveProxy(comm, channel, proxySend, channel->nvls.treeDown[2], op, 0, justInquire));
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, channel->nvls.treeUp, op, 0, justInquire));
} break;
case ncclPatternPatUp: {
// Run full algorithm to count the number of steps for each peer.
int *nstepsSend, *nstepsRecv;
const int rank = comm->rank, nranks = comm->nRanks;
NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
const ssize_t size = op->nbytes/comm->nRanks;
PatRSAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
int last = 0;
while (last == 0) {
int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem;
size_t inpIx, outIx;
algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last);
if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
}
for (int i=0; i<log2Up(nranks); i++) {
if (nstepsSend[i]) {
int sendPeer = (rank + (1<<i)) % nranks;
op->nsteps = nstepsSend[i];
NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
}
if (nstepsRecv[i]) {
int recvPeer = (rank - (1<<i) + nranks) % nranks;
op->nsteps = nstepsRecv[i];
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
}
}
} break;
case ncclPatternPatDown: {
// Run full algorithm to count the number of steps for each peer.
int *nstepsSend, *nstepsRecv;
const int rank = comm->rank, nranks = comm->nRanks;
NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks)));
NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks)));
const ssize_t size = op->nbytes/comm->nRanks;
PatAGAlgorithm<char> algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks);
int last = 0;
while (last == 0) {
int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem;
size_t inpIx, outIx;
algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last);
if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++;
if (sendDim != -1 && postSend) nstepsSend[sendDim]++;
}
for (int i=0; i<log2Up(nranks); i++) {
if (nstepsSend[i]) {
int sendPeer = (rank - (1<<i) + nranks) % nranks;
op->nsteps = nstepsSend[i];
NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire));
}
if (nstepsRecv[i]) {
int recvPeer = (rank + (1<<i)) % nranks;
op->nsteps = nstepsRecv[i];
NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire));
}
}
} break;
case ncclPatternSend:
case ncclPatternRecv: {
if (op->root == comm->rank) return ncclSuccess;
@@ -657,9 +726,9 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->opsPool == NULL) return ncclInternalError;
struct ncclProxyOpsPool* pool = state->opsPool;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
if (state->nextOps != -1) goto process_nextops;
void* eHandle;
// If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
// to be available. Exit, continue progress, and come back later.
if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
@@ -667,10 +736,11 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->active == NULL) {
pthread_mutex_lock(&pool->mutex);
while (pool->nextOps == -1 && !state->stop) {
struct ncclProxyArgs profArgs; // Only used for profiling purposes
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep);
pthread_cond_wait(&pool->cond, &pool->mutex);
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup);
ncclProfilerStopProxyCtrlEvent(eHandle);
}
if (state->stop) { // We might have been woken up to stop.
pthread_mutex_unlock(&pool->mutex);
@@ -684,7 +754,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
if (state->nextOps == -1) return ncclInternalError;
process_nextops:
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend);
TIME_START(2);
int freeOp[NCCL_MAX_LOCAL_RANKS];
int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
@@ -720,6 +791,10 @@ process_nextops:
if (freeOp[i] == -1) continue;
int newFree = freeOp[i];
int oldFree = pool->freeOps[i];
// Coverity gets confused by the complex code structure here. The previous "for" loop ensures that freeOpEnd[i]
// is initialized so long as freeOp[i] is initialized (is not -1). In the current loop we filter out uninitialized
// freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
// coverity[uninit_use:FALSE]
pool->ops[freeOpEnd[i]].next = oldFree;
if (oldFree == -1) {
// Nothing for the main thread to consume, we can set it.
@@ -735,8 +810,8 @@ process_nextops:
}
}
}
profArgs.opCount = *added;
ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd);
ncclProfilerStopProxyCtrlEvent(eHandle);
TIME_STOP(2);
return ncclSuccess;
}
@@ -758,6 +833,7 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
if (CUPFN(cuCtxCreate) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
WARN("Unable to create thread context due to old driver, disabling.");
createThreadContext = 0;
goto exit;
}
}
}
@@ -767,15 +843,17 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) {
NULL, 0, CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, proxyState->cudaDev)) != CUDA_SUCCESS) {
WARN("Failed to create CUDA context on device %d", proxyState->cudaDev);
createThreadContext = 0;
goto exit;
}
} else {
if (CUPFN(cuCtxSetCurrent(proxyState->cudaCtx)) != CUDA_SUCCESS) {
WARN("Failed to set CUDA context on device %d", proxyState->cudaDev);
return 0;
goto exit;
}
return 1;
}
return 1;
}
exit:
#endif
return 0;
}
@@ -787,12 +865,14 @@ NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8);
void* ncclProxyProgress(void *proxyState_) {
struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_;
if (setProxyThreadContext(proxyState)) {
INFO(NCCL_INIT, "[Proxy Progress] Created CUDA context on device %d", proxyState->cudaDev);
INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev);
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev);
}
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
struct ncclProxyProgressState* state = &proxyState->progressState;
state->nextOps = -1;
const int sig = ncclParamProxyDumpSignal();
@@ -809,9 +889,7 @@ void* ncclProxyProgress(void *proxyState_) {
* ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the
* frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */
int proxyOpAppendCounter = 0;
struct ncclProxyArgs profArgs; // Only used for profiling purposes
while ((state->stop == 0 || (state->stop == 1 && state->active)) &&
__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0) {
while (state->stop == 0 || (state->stop == 1 && state->active)) {
int idle = 1;
ncclResult_t ret = progressOps(proxyState, state, state->active, &idle);
if (ret != ncclSuccess) {
@@ -819,8 +897,11 @@ void* ncclProxyProgress(void *proxyState_) {
INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret);
continue;
}
if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
void* eHandle;
ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle);
if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle);
if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive);
ncclProfilerStopProxyCtrlEvent(eHandle);
if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) {
int added = 0;
proxyOpAppendCounter = 0;
@@ -860,7 +941,7 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm) {
static ncclResult_t ncclProxyProgressCreate(struct ncclProxyState* proxyState) {
struct ncclProxyProgressState* state = &proxyState->progressState;
if (!state->thread) {
pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState);
PTHREADCHECK(pthread_create(&state->thread, NULL, ncclProxyProgress, proxyState), "pthread_create");
ncclSetThreadName(state->thread, "NCCL Progress%2d", proxyState->tpLocalnRanks);
}
return ncclSuccess;
@@ -875,7 +956,7 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
state->stop = 1;
pthread_cond_signal(&state->opsPool->cond);
pthread_mutex_unlock(&state->opsPool->mutex);
pthread_join(state->thread, NULL);
PTHREADCHECK(pthread_join(state->thread, NULL), "pthread_join");
}
// Free off any memory allocated for the proxy arg pools
@@ -885,7 +966,6 @@ ncclResult_t ncclProxyProgressDestroy(struct ncclProxyState* proxyState) {
state->pools = next;
}
ncclProfilingDump();
TIME_PRINT("Proxy");
return ncclSuccess;
}
@@ -962,23 +1042,17 @@ struct ncclProxyInitResp {
char devShmPath[6]; // "XXXXXX" - May or may not be set
};
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int tpProxyRank, struct ncclProxyConnector* proxyConn) {
ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn) {
struct ncclSocket* sock;
int ready, proxyRank = -1;
int ready;
struct ncclProxyState* sharedProxyState = comm->proxyState;
int tpProxyRank = comm->topParentRanks[proxyRank];
// Keep one connection per local rank
for (int i = 0; i < comm->localRanks; ++i) {
/* find the proxy rank in comm. */
if (comm->topParentRanks[comm->localRankToRank[i]] == tpProxyRank) {
proxyRank = comm->localRankToRank[i];
break;
}
}
proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
// Keep one connection per local rank
proxyConn->connection = NULL;
proxyConn->tpRank = tpProxyRank;
proxyConn->rank = proxyRank;
if (sharedProxyState->peerSocks == NULL) {
NCCLCHECK(ncclCalloc(&sharedProxyState->peerSocks, comm->sharedRes->tpNLocalRanks));
NCCLCHECK(ncclCalloc(&sharedProxyState->proxyOps, comm->sharedRes->tpNLocalRanks));
@@ -1020,68 +1094,93 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
}
}
proxyConn->initialized = true;
INFO(NCCL_NET|NCCL_PROXY, "Connected to proxy localRank %d -> connection %p", proxyConn->tpLocalRank, proxyConn->connection);
return ncclSuccess;
}
// UDS support
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, int tpRank, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int *respFd) {
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize, int* reqFd, int *respFd) {
ncclResult_t res = ncclSuccess;
struct ncclIpcSocket ipcSock = { 0 };
void *opId;
NCCLCHECK(getRandomData(&opId, sizeof(opId)));
int reqFdtmp = -1;
int rank = comm->topParentLocalRanks[comm->localRank];
struct ncclProxyState* sharedProxyState = comm->proxyState;
uint64_t pidHash = sharedProxyState->peerAddressesUDS[tpRank];
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %p opId %p",
comm, rank, tpRank, pidHash, reqSize, respSize, respFd, opId);
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, respFd, opId);
// cuMem: Create a UDS socket to receive the response
NCCLCHECK(ncclIpcSocketInit(&ipcSock, rank, (uint64_t)opId, comm->abortFlag));
if (reqFd) {
reqFdtmp = *reqFd;
} else {
// give a dummy fd for the other side of UDS socket
NCCLCHECK(ncclIpcSocketGetFd(&ipcSock, &reqFdtmp));
}
ncclIpcHdr hdr;
hdr.type = type;
hdr.rank = rank;
hdr.reqSize = reqSize;
hdr.respSize = respSize;
hdr.opId = opId;
assert(reqSize <= sizeof(hdr.data));
memcpy(&hdr.data, reqBuff, reqSize);
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), -1, tpRank, pidHash), res, error);
NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &hdr, sizeof(hdr), reqFdtmp, proxyConn->tpRank, pidHash), res, error);
NCCLCHECKGOTO(ncclIpcSocketRecvMsg(&ipcSock, respBuff, respSize, respFd), res, error);
NCCLCHECKGOTO(ncclIpcSocketClose(&ipcSock), res, error);
INFO(NCCL_PROXY, "ProxyCall UDS comm %p rank %d tpRank %d(%lx) reqSize %d respSize %d respFd %d opId %p - DONE",
comm, rank, tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
comm, rank, proxyConn->tpRank, pidHash, reqSize, respSize, (respFd ? *respFd : -1), opId);
return res;
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", tpRank, pidHash, res);
WARN("ncclProxyCallBlockingUDS call to tpRank %d(%lx) failed : %d", proxyConn->tpRank, pidHash, res);
return res;
}
// cuMem API support
// The request/response is sent out-of-band using ncclIpcSocket for this specific command
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int tpRank, void *handle, int* convertedFd) {
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, int proxyRank, void *handle, int* convertedFd) {
ncclResult_t ret = ncclSuccess;
// Request the allocation of a UDS fd for the handle
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, tpRank, ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, convertedFd), ret, error);
if (comm->gproxyConn[proxyRank].initialized == false) {
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, proxyRank, &comm->gproxyConn[proxyRank]), ret, error);
}
NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, &comm->gproxyConn[proxyRank], ncclProxyMsgGetFd, handle, sizeof(CUmemGenericAllocationHandle), NULL, 0, NULL, convertedFd), ret, error);
// We have now received the converted fd over UDS
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d", *(uint64_t*)handle, tpRank, *convertedFd);
INFO(NCCL_PROXY, "UDS: ClientGetFd handle 0x%lx tpRank %d returned fd %d sameProcess %d", *(uint64_t*)handle, comm->topParentRanks[proxyRank], *convertedFd, comm->gproxyConn[proxyRank].sameProcess);
return ret;
error:
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", tpRank, *(uint64_t*)handle, ret);
WARN("ncclProxyClientGetFd call to tpRank %d handle 0x%lx failed : %d", comm->topParentRanks[proxyRank], *(uint64_t*)handle, ret);
return ret;
}
// Send a ncclProxyMsgQueryFd request over UDS to the proxy behind proxyConn and
// wait for its answer.
//   comm      : communicator issuing the request
//   proxyConn : proxy connection identifying the target (proxyConn->tpRank)
//   localFd   : descriptor handed to ncclProxyCallBlockingUDS as the request fd
//   rmtFd     : out; receives the sizeof(int) response payload
// Returns the result of ncclProxyCallBlockingUDS(). *rmtFd is only meaningful
// when ncclSuccess is returned.
ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd) {
  ncclResult_t ret = ncclSuccess;
  // No request payload: the fd itself travels with the message, the reply is a single int.
  NCCLCHECKGOTO(ncclProxyCallBlockingUDS(comm, proxyConn, ncclProxyMsgQueryFd, NULL, 0, (void*)rmtFd, sizeof(int), &localFd, NULL), ret, fail);
  // We have now received the converted fd over UDS.
  // Logged on the success path only: on failure *rmtFd may not have been written.
  INFO(NCCL_PROXY, "UDS: ClientQueryFd localFd %d tpRank %d remote fd %d sameProcess %d", localFd, proxyConn->tpRank, *rmtFd, proxyConn->sameProcess);
exit:
  return ret;
fail:
  WARN("ncclProxyClientQueryFdBlocking call to tpRank %d localFd %d failed : %d", proxyConn->tpRank, localFd, ret);
  goto exit;
}
// Printable names of the proxy message types. The table is indexed directly with the
// message type value (see the "Failed to execute operation %s" WARN in ncclProxyService).
// NOTE(review): only 10 names are listed, while this file also handles message types such as
// ncclProxyMsgQueryFd and ncclProxyMsgRegister. Verify against the message type enum that every
// value that can reach that WARN has an entry here, otherwise the lookup reads past the end of
// the array -- TODO confirm.
const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" };
ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) {
struct ncclSocket* sock;
@@ -1091,7 +1190,6 @@ ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector
if (sharedProxyState->peerSocks == NULL) return ncclInternalError;
sock = sharedProxyState->peerSocks + proxyConn->tpLocalRank;
if (sock == NULL) return ncclInternalError;
NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
@@ -1267,6 +1365,22 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
return ncclSuccess;
}
// Answer a ncclProxyMsgQueryFd request: send rmtFd back to the requesting rank
// through a temporary UDS socket.
//   proxyState : state of this proxy (supplies tpRank and abortFlag for the socket)
//   rank       : rank the reply is addressed to
//   opId       : id of the request; its value is used as the destination hash
//   rmtFd      : sent both as the int payload and as the fd attached to the message
// Returns ncclSuccess when the reply was sent, the error of the failing socket
// operation otherwise, or ncclInternalError when built with CUDART_VERSION < 11030.
static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, void *opId, int rmtFd) {
#if CUDART_VERSION >= 11030
  struct ncclIpcSocket ipcSock = { 0 };
  uint64_t hash = (uint64_t) opId;
  ncclResult_t ret = ncclSuccess;
  // The local socket is created with hash^1 while the reply is addressed to `hash`
  // (presumably so the two socket names cannot collide -- verify in ncclIpcSocketInit).
  NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit);
  NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), rmtFd, rank, hash), ret, exit);
exit:
  NCCLCHECK(ncclIpcSocketClose(&ipcSock));
  // Propagate init/send failures instead of unconditionally reporting success.
  return ret;
#else
  return ncclInternalError;
#endif
}
// cuMem API support
static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void *opId, uint64_t handle) {
#if CUDART_VERSION >= 11030
@@ -1286,7 +1400,7 @@ static ncclResult_t proxyGetFd(struct ncclProxyState* proxyState, int rank, void
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
// We can now safely close the exported fd
(void) close(fd);
SYSCHECK(close(fd), "close");
return ret;
#else
return ncclInternalError;
@@ -1352,30 +1466,37 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclP
}
static ncclResult_t proxyServiceInitOp(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclProxyState* proxyState, int* asyncOpCount) {
ncclResult_t ret = ncclSuccess;
struct ncclSocket* sock = &peer->sock;
struct ncclProxyAsyncOp* asyncOp;
NCCLCHECK(ncclCalloc(&asyncOp, 1));
asyncOp->type = type;
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)), ret, fail);
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)), ret, fail);
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)), ret, fail);
if (asyncOp->reqSize) {
NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
NCCLCHECKGOTO(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
NCCLCHECKGOTO(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize), ret, fail);
}
// Store opId for completion response
NCCLCHECK(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)));
NCCLCHECKGOTO(ncclSocketRecv(sock, &asyncOp->opId, sizeof(asyncOp->opId)), ret, fail);
if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
if (asyncOp->respSize) NCCLCHECKGOTO(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize), ret, fail);
asyncProxyOpEnqueue(peer, asyncOp);
(*asyncOpCount)++;
NCCLCHECK(proxyProgressAsync(asyncOp, proxyState, asyncOpCount, peer, connectionPool));
return ncclSuccess;
exit:
return ret;
fail:
if (asyncOp->reqBuff) free(asyncOp->reqBuff);
if (asyncOp->respBuff) free(asyncOp->respBuff);
free(asyncOp);
goto exit;
}
#include <poll.h>
@@ -1395,6 +1516,12 @@ static bool proxyMatchOpType(int type) {
}
}
// States taken by the `stop` variable of the ncclProxyService loop.
enum {
PROXY_RUNNING = 0, // initial value: keep serving
PROXY_STOP = 1, // assigned when a ncclProxyMsgStop message is received from a peer socket
PROXY_ABORT = 2 // assigned when proxyState->abortFlag is observed to be non-zero
};
void* ncclProxyService(void* _args) {
struct ncclProxyState* proxyState = (struct ncclProxyState*) _args;
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
@@ -1405,6 +1532,8 @@ void* ncclProxyService(void* _args) {
}
// if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
// Prepare poll descriptor
struct ncclProxyConnectionPool connectionPool;
connectionPool.pools = NULL;
@@ -1426,13 +1555,13 @@ void* ncclProxyService(void* _args) {
int maxnpeers = 0;
int npeers = 0;
int stop = 0;
int stop = PROXY_RUNNING;
int asyncOpCount = 0;
while (stop == 0 || (stop == 1 && npeers > 0)) {
while (stop == PROXY_RUNNING || npeers > 0) {
/* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
* connections. Need to wait until all other related comms call abort and safely exit
* together, or we could face segmentation fault. */
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = 1;
if (__atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) != 0) stop = PROXY_ABORT;
/* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
int ret;
do {
@@ -1474,10 +1603,14 @@ void* ncclProxyService(void* _args) {
if (pollfds[s].fd == -1) continue;
// Progress all ops for this ncclProxyLocalPeer
if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1;
ncclProxyAsyncOp* op = peer->asyncOps;
while (op != nullptr) {
ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */
type = op->type;
// Coverity gets confused here by complex code structure. Yes, connectionPool.pools gets dereferenced, and
// while calling proxyProgressAsync() connectionPool.pools is NULL, but that changes before it's dereferenced.
// coverity[var_deref_model:FALSE]
res = proxyProgressAsync(op, proxyState, &asyncOpCount, peer, &connectionPool);
if (res == ncclSuccess || res == ncclInProgress) {
op = opnext;
@@ -1494,14 +1627,15 @@ void* ncclProxyService(void* _args) {
int closed;
res = ncclSocketTryRecv(sock, &type, sizeof(int), &closed, false /*blocking*/);
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
WARN("[Service thread] Could not receive type from localRank %d, res=%u, closed=%d", peer->tpLocalRank, res, closed);
closeConn = 1;
} else if (closed) {
INFO(NCCL_INIT|NCCL_NET|NCCL_PROXY, "[Service thread] Connection closed by localRank %d", peer->tpLocalRank);
closeConn = 1;
} else if (res == ncclSuccess) { // We received something from the sock
if (type == ncclProxyMsgStop) {
stop = 1;
stop = PROXY_STOP;
closeConn = 1;
} else if (type == ncclProxyMsgClose) {
closeConn = 1;
@@ -1518,12 +1652,13 @@ void* ncclProxyService(void* _args) {
closeConn = 1;
}
if (res != ncclSuccess && res != ncclInProgress) {
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
if (!__atomic_load_n(proxyState->abortFlag, __ATOMIC_RELAXED))
WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", proxyState->tpRank, ncclProxyMsgTypeStr[type], peer->tpRank, res);
closeConn = 1;
}
if (closeConn) {
ncclSocketClose(sock);
(void)ncclSocketClose(sock);
if (op != nullptr) {
asyncProxyOpDequeue(peer, op);
@@ -1540,10 +1675,10 @@ void* ncclProxyService(void* _args) {
WARN("[Proxy Service] proxyDestroy failed");
}
for (int s=0; s<maxnpeers; s++) {
ncclSocketClose(&peers[s].sock);
(void)ncclSocketClose(&peers[s].sock);
}
ncclProxyFreeConnections(&connectionPool, proxyState);
ncclSocketClose(proxyState->listenSock);
(void)ncclSocketClose(proxyState->listenSock);
free(proxyState->listenSock);
proxyOpsFree(proxyState);
return NULL;
@@ -1553,12 +1688,17 @@ void* ncclProxyService(void* _args) {
// Process a request on the UDS socket
static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd) {
ncclIpcHdr hdr;
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), NULL));
int rmtFd = -1;
NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd));
if (hdr.type == ncclProxyMsgGetFd) {
// cuMem API support
uint64_t handle = *(uint64_t*)hdr.data;
INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle);
return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle);
} else if (hdr.type == ncclProxyMsgQueryFd) {
INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd);
return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd);
}
return ncclInternalError;
@@ -1570,11 +1710,13 @@ void* ncclProxyServiceUDS(void* _args) {
struct pollfd pollfds[1];
if (setProxyThreadContext(proxyState)) {
INFO(NCCL_INIT, "[Proxy Service UDS] Created CUDA context on device %d", proxyState->cudaDev);
INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev);
} else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) {
WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev);
}
INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu());
if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) {
WARN("[Proxy Service UDS] Get listenSock fd fails");
return NULL;
@@ -1593,7 +1735,7 @@ void* ncclProxyServiceUDS(void* _args) {
}
// Check for stop/abort
if (proxyState->stop || *proxyState->abortFlag) break;
if (__atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE) || __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE)) break;
if (pollfds[0].revents) {
// A request was seen on the UDS fd
@@ -1638,14 +1780,16 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
proxyState->dmaBufSupport = comm->dmaBufSupport;
proxyState->ncclNet = comm->ncclNet;
proxyState->ncclCollNet = comm->ncclCollNet;
proxyState->profilerContext = comm->profilerContext;
proxyState->directMode = comm->directMode;
memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes));
pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState);
PTHREADCHECK(pthread_create(&comm->proxyState->thread, NULL, ncclProxyService, comm->proxyState), "pthread_create");
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
// UDS support
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
PTHREADCHECK(pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState), "pthread_create");
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
}
return ncclSuccess;
@@ -1658,17 +1802,17 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) {
if (comm->proxyState->threadUDS) {
// UDS support
comm->proxyState->stop = 1;
__atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE);
}
if (sharedProxyState->peerAddresses) {
if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) {
struct ncclSocket sock;
int type = ncclProxyMsgStop;
ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag);
if (ncclSocketConnect(&sock) == ncclSuccess) {
ncclSocketSend(&sock, &type, sizeof(int));
(void)ncclSocketSend(&sock, &type, sizeof(int));
}
ncclSocketClose(&sock);
(void)ncclSocketClose(&sock);
}
if (sharedProxyState->peerSocks) {
@@ -1686,7 +1830,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
}
}
int type = ncclProxyMsgClose;
ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
(void)ncclSocketSend(sharedProxyState->peerSocks + i, &type, sizeof(int));
NCCLCHECK(ncclSocketClose(sharedProxyState->peerSocks + i));
}
}
@@ -1700,13 +1844,15 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) {
ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
struct ncclProxyState* sharedProxyState = comm->sharedRes->proxyState;
assert(sharedProxyState->refCount == 0);
free(sharedProxyState->peerAddresses);
free(sharedProxyState->peerAddressesUDS);
free(sharedProxyState->peerSocks);
free(sharedProxyState->proxyOps);
free(sharedProxyState->sharedDevMems);
expectedProxyResponseFree(sharedProxyState);
free(sharedProxyState);
if (sharedProxyState) {
assert(sharedProxyState->refCount == 0);
free(sharedProxyState->peerAddresses);
free(sharedProxyState->peerAddressesUDS);
free(sharedProxyState->peerSocks);
free(sharedProxyState->proxyOps);
free(sharedProxyState->sharedDevMems);
expectedProxyResponseFree(sharedProxyState);
free(sharedProxyState);
}
return ncclSuccess;
}
+21 -4
Ver ficheiro
@@ -26,8 +26,8 @@ ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) {
ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) {
struct ncclRegCache* cache = &comm->regCache;
int netCount;
NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
int netCount = 0;
if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount));
if (netCount == 0) return ncclSuccess;
ncclResult_t ret = ncclSuccess;
@@ -105,7 +105,11 @@ ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, s
NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1);
ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) {
if (!ncclParamLocalRegister()) return ncclSuccess;
if (!ncclParamLocalRegister()) {
*handle = NULL;
return ncclSuccess;
}
INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size);
struct ncclRegCache* cache = &comm->regCache;
uintptr_t pageSize = cache->pageSize;
uintptr_t addr = (uintptr_t)data & -pageSize;
@@ -166,6 +170,10 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
struct ncclReg* reg = (struct ncclReg*)handle;
struct ncclRegCache* cache = &comm->regCache;
int slot;
int saveDev;
if (handle == NULL) goto exit;
CUDACHECK(cudaGetDevice(&saveDev));
CUDACHECK(cudaSetDevice(comm->cudaDev));
for (slot=0; slot<cache->population && cache->slots[slot] != reg; slot++);
if (slot == cache->population) {
WARN("Deregister: Could not find handle");
@@ -178,10 +186,19 @@ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) {
reg->regAddr = (CUdeviceptr)NULL;
}
if (reg->state & COLLNET_REG_COMPLETE) {
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->proxyconn, reg->collnetHandle));
NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle));
}
if (reg->state & IPC_REG_COMPLETE) {
for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i)
if (reg->ipcInfos[i])
NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i]));
if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs);
if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs));
}
free(reg);
memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*));
cache->population -= 1;
CUDACHECK(cudaSetDevice(saveDev));
exit:
return ncclSuccess;
}
+66 -42
Ver ficheiro
@@ -28,7 +28,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclTransport *transport = ncclTransports[t];
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo));
if (ret) {
connector->transportComm = transportComm;
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
@@ -70,25 +70,52 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
#include <sys/time.h>
// Inspect the pairs of local ranks of `comm` and report two properties:
//   *intraNodeP2pSupport : false when there is exactly one local rank or when
//                          ncclTransports[0]->canConnect() rejects some pair, true otherwise
//   *directMode          : true when some examined pair has identical hostHash and pidHash
// Returns ncclSuccess, or the error of a failing canConnect() call (in which case
// neither output is written).
ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) {
  bool p2pOk = true;
  bool sameProcess = false;

  if (comm->localRanks != 1) {
    for (int a = 0; a < comm->localRanks; ++a) {
      for (int b = a + 1; b < comm->localRanks; ++b) {
        struct ncclPeerInfo* infoA = comm->peerInfo + comm->localRankToRank[a];
        struct ncclPeerInfo* infoB = comm->peerInfo + comm->localRankToRank[b];
        int connectable = 0;
        NCCLCHECK(ncclTransports[0]->canConnect(&connectable, comm, NULL, infoA, infoB));
        if (!connectable) p2pOk = false;
        if (infoA->hostHash == infoB->hostHash && infoA->pidHash == infoB->pidHash) sameProcess = true;
        // Both answers are final at this point; this only leaves the inner loop.
        if (sameProcess && !p2pOk) break;
      }
    }
  } else {
    // A single local rank has no intra-node peer to talk to.
    p2pOk = false;
  }

  *intraNodeP2pSupport = p2pOk;
  *directMode = sameProcess;
  return ncclSuccess;
}
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
ncclResult_t ret = ncclSuccess;
int highestType = TRANSPORT_UNDEFINED; // track highest transport type
struct ncclConnect** data; // Store intermediate send/recvData structs for connect
struct ncclConnect** recvData; // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData; // Points to entries inside data for given send connection within a channel
struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel
struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel
int done = 0;
int maxPeers = ncclParamConnectRoundMaxPeers();
NCCLCHECK(ncclCalloc(&data, maxPeers));
NCCLCHECK(ncclCalloc(&recvData, maxPeers));
NCCLCHECK(ncclCalloc(&sendData, maxPeers));
struct timeval timeStart, timeLast;
gettimeofday(&timeStart, NULL);
timeLast = timeStart; // struct copy
bool timeReported = false;
NCCLCHECK(ncclCalloc(&data, maxPeers));
NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
@@ -104,7 +131,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
int p = i-(done+1);
if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail);
recvData[p] = data[p];
int sendChannels = 0, recvChannels = 0;
int type;
@@ -163,7 +190,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[sendPeer]->send + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset++, 1, comm->rank, conn), ret, fail);
NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData[p] + sendDataOffset, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
@@ -172,6 +199,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
allChannelsConnected = false;
}
}
sendDataOffset++;
}
TIME_STOP(3);
@@ -181,7 +209,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[recvPeer]->recv + connIndex;
// This connector hasn't completed connection yet
if (conn->connected == 0) {
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset++, 1, comm->rank, conn), ret, fail);
NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData[p] + recvDataOffset, 1, comm->rank, conn), ret, fail);
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
@@ -190,6 +218,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
allChannelsConnected = false;
}
}
recvDataOffset++;
}
TIME_STOP(4);
}
@@ -198,7 +227,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
data[p] = NULL;
}
}
if (ncclParamReportConnectProgress() && comm->rank == 0) {
if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) {
struct timeval now;
gettimeofday(&now, NULL);
if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) {
@@ -236,34 +265,31 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int bootstrapTag = (i << 8) + (1 << 7) + (graph ? graph->id + 1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
int flag = 0;
if (recvPeer != sendPeer) {
if (comm->connectSend[sendPeer] != 0UL)
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectRecv[recvPeer] != 0UL)
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectSend[sendPeer] != 0UL)
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectRecv[recvPeer] != 0UL)
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectSend[sendPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
if (comm->connectRecv[recvPeer] != 0UL) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, NULL, 0), ret, fail);
} else {
if (comm->connectSend[sendPeer] != 0UL || comm->connectRecv[recvPeer] != 0UL) {
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, NULL, 0), ret, fail);
}
}
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL;
}
free(data);
free(sendData);
free(recvData);
if (highestTransportType != NULL) *highestTransportType = highestType;
TIME_PRINT("P2P Setup/Connect");
exit:
for(int i=0; i<maxPeers; ++i){
if(data[i]) free(data[i]);
}
free(data);
if (sendData) free(sendData);
if (recvData) free(recvData);
NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream));
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream));
return ret;
@@ -275,8 +301,8 @@ extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails
int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
int fail = 1;
bool ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type, ncclConnect* connect) {
ncclResult_t ret = ncclSuccess;
int rank = comm->rank;
int nranks = comm->nRanks;
int nMasters = comm->nNodes;
@@ -297,24 +323,23 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
ncclResult_t res;
struct ncclConnect myConnect = { 0 };
struct {
int isMaster;
ncclConnect connect;
} *allConnects = NULL;
ncclConnect *masterConnects = NULL;
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
if (type == collNetRecv) { // recv side: AllGather
// all ranks must participate
NCCLCHECK(ncclCalloc(&allConnects, nranks));
NCCLCHECKGOTO(ncclCalloc(&allConnects, nranks), ret, cleanup);
allConnects[rank].isMaster = isMaster;
memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), ret, cleanup);
// consolidate
int c = 0;
for (int r = 0; r < nranks; r++) {
@@ -328,21 +353,20 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// connect
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), res, cleanup);
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, comm->node, conn), ret, cleanup);
struct ncclDevChannelPeer* devRoot;
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), res, cleanup);
CUDACHECKGOTO(cudaMemcpy(&devRoot, channel->devPeers + nranks, sizeof(struct ncclDevChannelPeer*), cudaMemcpyDeviceToHost), ret, cleanup);
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv + type : devRoot->send + type;
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), res, cleanup);
CUDACHECKGOTO(cudaMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice), ret, cleanup);
}
if (isMaster && type == collNetRecv) {
memcpy(connect, masterConnects+comm->node, sizeof(struct ncclConnect));
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, comm->node, nMasters, masterPeer);
}
fail = 0;
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
return fail;
return ret != ncclSuccess;
}
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
+11 -28
Ver ficheiro
@@ -18,15 +18,15 @@ int64_t ncclParamGdrCopySyncEnable();
int64_t ncclParamGdrCopyFlushEnable();
struct collNetRecvConnectInfo {
int rank;
int nranks;
collNetHandle_t collNetHandle;
};
static_assert(sizeof(collNetRecvConnectInfo) <= CONNECT_SIZE, "Collnet Recv Connect info is too large");
struct collNetSendConnectInfo {
void* mhandles[NCCL_NUM_PROTOCOLS];
void* reqFifo;
};
static_assert(sizeof(collNetSendConnectInfo) <= CONNECT_SIZE, "Collnet Send Connect info is too large");
#define COLLNET_GROUP_NSUBS 8
#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
@@ -135,7 +135,7 @@ struct recvResources {
int collNetRank;
};
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
@@ -154,15 +154,14 @@ struct setupReq {
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
int proxyRank;
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
@@ -175,7 +174,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
int proxyRank, tpProxyRank;
int proxyRank;
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
@@ -184,8 +183,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
static_assert(sizeof(collNetRecvConnectInfo) <= sizeof(struct ncclConnect), "Collnet Recv Connect info is too big");
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
ncclAtomicRefCountIncrement(&comm->collNetSharedRes->refCount);
req.collNet = comm->collNetSharedRes;
@@ -442,6 +441,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big");
struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
@@ -1039,7 +1039,7 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail);
if (handle) {
regRecord->state |= COLLNET_REG_COMPLETE;
regRecord->proxyconn = proxyconn;
regRecord->collnetProxyconn = proxyconn;
*outHandle = regRecord->collnetHandle = handle;
*outRegBufFlag = 1;
}
@@ -1091,7 +1091,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
record->size = buffSize;
*outHandle = record->mhandle = handle;
*outRegBufFlag = 1;
ncclIntruQueueEnqueue(cleanupQueue, &record->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record);
*nCleanupQueueElts += 1;
exit:
@@ -1214,23 +1214,6 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) {
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail);
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
if (highestTransportType0 != TRANSPORT_UNDEFINED && highestTransportType1 != TRANSPORT_UNDEFINED) {
int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_UNDEFINED };
comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
for (int i = 0; i < comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
}
if (comm->collNetSharedRes->intraHighestTransportType < comm->intraHighestTransportType)
comm->collNetSharedRes->intraHighestTransportType = comm->intraHighestTransportType;
} else if (comm->intraHighestTransportType == TRANSPORT_UNDEFINED) {
// reuse previous shared intraHighestTransportType
comm->intraHighestTransportType = comm->collNetSharedRes->intraHighestTransportType;
}
INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank);
exit:
+23
Ver ficheiro
@@ -34,3 +34,26 @@ exit:
fail:
goto exit;
}
ncclResult_t ncclTransportPatConnect(struct ncclComm* comm) {
ncclResult_t ret = ncclSuccess;
if (comm && comm->nRanks > 1) {
for (int mask=1; mask<comm->nRanks; mask<<=1) {
int prevPeer = (comm->rank + mask) % comm->nRanks;
int nextPeer = (comm->rank + comm->nRanks - mask) % comm->nRanks;
for (int c = 0; c < comm->nChannels; c++) {
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &prevPeer, 1, &nextPeer, 0), ret, fail); // ReduceScatter
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
for (int c = 0; c < comm->nChannels; c++) {
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &nextPeer, 1, &prevPeer, 0), ret, fail); // AllGather
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_TREE], 0), ret, fail);
}
INFO(NCCL_INIT, "Connected binomial trees");
}
exit:
return ret;
fail:
goto exit;
}
+74 -61
Ver ficheiro
@@ -10,10 +10,11 @@
#include "proxy.h"
#include "collectives.h"
#include "gdrwrap.h"
#include "shm.h"
#include "shmutils.h"
#include "p2p.h"
#include "profiler.h"
#include "transport.h"
#include "shm.h"
static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
@@ -62,9 +63,8 @@ struct connectMapMem{
char* cpuPtr;
int size;
ncclIpcDesc ipcDesc;
char shmPath[PATH_MAX];
ncclShmHandle_t attachHandle;
ncclShmHandle_t createHandle;
ncclShmIpcDesc_t attachDesc;
ncclShmIpcDesc_t createDesc;
};
struct connectMap {
@@ -142,11 +142,11 @@ struct recvNetResources {
};
/* Determine if two peers can communicate with NET */
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
if (info1->hostHash == info2->hostHash) {
// If on the same host, check intra-node net is not disabled.
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, ret));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, ret));
}
return ncclSuccess;
}
@@ -173,9 +173,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
* information for this peer */
static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct setupReq req = { 0 };
int tpProxyRank;
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
send->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
@@ -185,8 +184,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr));
send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
tpProxyRank = comm->topParentRanks[proxyRank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, tpProxyRank, &send->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
req.tpRemoteRank = comm->topParentRanks[peerInfo->rank];
@@ -199,7 +197,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
*((int*)connectInfo) = tpProxyRank;
*((int*)connectInfo) = comm->topParentRanks[proxyRank];
return ncclSuccess;
}
@@ -212,12 +210,12 @@ NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct setupReq req = { 0 };
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
recv->conn.shared = req.shared = graph || connIndex == 0 ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank, tpProxyRank;
int proxyRank;
int64_t netId;
NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr));
@@ -226,8 +224,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush));
// We don't support PXN on receive yet
tpProxyRank = comm->topParentRanks[myInfo->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
req.tpLocalRank = comm->topParentLocalRanks[comm->localRank];
req.tpRank = comm->topParentRanks[myInfo->rank];
@@ -238,26 +235,24 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
return ncclSuccess;
}
static ncclResult_t netMapShm(struct connectMapMem* mem) {
mem->cpuPtr = NULL;
mem->gpuPtr = NULL;
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->attachHandle));
static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) {
NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc));
return ncclSuccess;
}
static ncclResult_t netCreateShm(struct connectMapMem* mem) {
mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->createHandle));
static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) {
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr));
return ncclSuccess;
}
static ncclResult_t netDumpMap(struct connectMap* map) {
printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_DEVMEM;
printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
@@ -328,10 +323,10 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
}
}
} else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) {
if (!map->sameProcess) NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL;
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
map->mems[NCCL_NET_MAP_DEVMEM].size,
&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
@@ -341,7 +336,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
void** sharedDevMemPtr = comm->proxyState->sharedDevMems + send->proxyConn.tpLocalRank;
if (*sharedDevMemPtr == NULL) {
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = NULL;
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.tpRank,
NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank,
map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size,
&map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipcDesc,
sharedDevMemPtr));
@@ -463,24 +458,19 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
if (map) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
if (map->sameProcess && map->cudaDev == cudaDev) {
// Our own GPU, so it wasn't mapped in
free(map);
return ncclSuccess;
}
if (!map->sameProcess || ncclCuMemEnable()) {
if (!map->sameProcess) NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].attachHandle));
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
if (map->cudaDev != cudaDev && map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (ncclCuMemEnable()) {
// cuMem API support
NCCLCHECK(ncclP2pFreeShareableBuffer(&map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc));
NCCLCHECK(ncclCuMemFree(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
// Legacy CUDA IPC support
CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
}
if (!map->sameProcess) {
NCCLCHECK(ncclShmIpcClose(&map->mems[NCCL_NET_MAP_HOSTMEM].attachDesc));
}
free(map);
}
@@ -518,7 +508,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
if (cuda && state->cudaBuff == NULL) {
if (sameProcess == 0 || ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, 0, &state->ipcDesc, (void**)&state->cudaBuff));
} else {
NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
}
@@ -527,7 +517,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
}
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
if (gpuPtr) *gpuPtr = sameProcess ? *cpuPtr : NULL;
if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL;
if (ipcDesc) memcpy(ipcDesc, &state->ipcDesc, sizeof(state->ipcDesc));
return ncclSuccess;
}
@@ -543,7 +533,7 @@ static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int chan
static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, int tpLocalRank, int type, struct ncclProxyConnection* connection) {
if (proxyState->progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxyPeer* peer = proxyState->progressState.localPeers[tpLocalRank];
if (peer == NULL) NCCLCHECK(ncclInternalError;)
if (peer == NULL) NCCLCHECK(ncclInternalError);
struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
if (state->size == 0) NCCLCHECK(ncclInternalError);
if (ncclAtomicRefCountDecrement(&state->refcount) == 0) {
@@ -746,7 +736,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
if (resources->shared == 0) {
if (!map->sameProcess || ncclCuMemEnable()) {
ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
@@ -758,7 +748,11 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
} else {
NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM));
void* sendMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
void* recvMem = (void*)NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
memset(sendMem, 0, sizeof(struct ncclSendMem));
memset(recvMem, 0, sizeof(struct ncclRecvMem));
}
if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
uint64_t *cpuPtr, *gpuPtr;
@@ -896,7 +890,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
if (resources->shared == 0) {
if (ncclCuMemEnable()) {
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, 0, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
(void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
} else {
NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
@@ -968,7 +962,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
if (resources->map.sameProcess) {
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
} else {
NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].createHandle));
NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc));
}
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
if (!resources->map.sameProcess || ncclCuMemEnable()) {
@@ -1050,7 +1044,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
// Set step base for next op
resources->step = sub->base + sub->nsteps;
sub->posted = sub->transmitted = sub->done = 0;
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
ncclProfilerStartSendProxyOpEvent(s, args);
if (sub->reg && sub->nbytes > 0) {
NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
} else {
@@ -1072,6 +1066,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
// Post buffers to the GPU
if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps);
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (resources->shared) {
if (!sub->reg) {
@@ -1087,9 +1082,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS;
if (resources->gdcSync) wc_store_fence(); // Flush out WC write
} else sub->posted += args->sliceSteps;
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
}
ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted);
ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait);
args->idle = 0;
continue;
}
@@ -1130,12 +1124,18 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset;
}
if (ready) {
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait);
// Data is ready, try to send.
// Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
// since size is a plain integer.
// coverity[use_invalid:FALSE]
NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId);
sub->transmitted += args->sliceSteps;
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted);
ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait);
sub->transSize += size;
args->idle = 0;
continue;
}
@@ -1165,7 +1165,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
__sync_synchronize();
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done);
ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone);
if (resources->shared == 0) {
volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
@@ -1188,6 +1189,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
}
}
if (args->done == args->nsubs) {
for (int s=0; s<args->nsubs; s++) {
ncclProfilerStopProxyOpEvent(s, args);
}
args->state = ncclProxyOpNone;
}
}
@@ -1229,7 +1233,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
resources->step = sub->base + sub->nsteps;
sub->posted = sub->received = sub->transmitted = sub->done = 0;
for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
ncclProfilerStartRecvProxyOpEvent(s, args);
if (sub->reg && sub->nbytes > 0) {
// Register buffer
NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle));
@@ -1254,6 +1258,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
struct ncclProxySubArgs* sub = subGroup + i;
if (sub->posted < sub->nsteps) {
if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps);
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
if (sub->reg) maxDepth = 1;
int stepSize = resources->buffSizes[p] / NCCL_STEPS;
@@ -1294,7 +1299,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup+i;
sub->posted += args->sliceSteps;
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait);
}
args->idle = 0;
}
@@ -1337,7 +1343,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
}
sub->received += args->sliceSteps;
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
sub->transSize += sizes[i];
ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait);
if (step < sub->nsteps) {
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
if (resources->useGdr) needFlush |= resources->needFlush;
@@ -1393,7 +1401,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
struct ncclProxySubArgs* sub = subGroup + i;
sub->transmitted += args->sliceSteps;
for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted);
ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait);
if (step < sub->nsteps) {
__sync_synchronize();
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1431,7 +1440,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL;
}
sub->done += args->sliceSteps;
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done);
ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone);
args->idle = 0;
if (sub->done == sub->nsteps) {
struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources);
@@ -1447,6 +1457,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
for (int s=0; s<args->nsubs; s++) {
ncclProfilerStopProxyOpEvent(s, args);
}
}
}
return ncclSuccess;
+242 -102
Ver ficheiro
@@ -49,6 +49,11 @@ struct alignas(64) ncclIbMergedDev {
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
int speed;
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no
};
// Error statistics for the IB transport. An instance is embedded in
// struct ncclIbDev, and the QP/CQ fatal-error helpers reach one through the
// qp_context / cq_context pointers.
struct ncclIbStats {
// Number of fatal errors recorded so far. Read and written with the
// __atomic builtins (relaxed ordering) by the ncclIbStats* helpers.
int fatalErrorCount;
};
static int ncclNIbDevs = -1;
@@ -69,6 +74,7 @@ struct alignas(64) ncclIbDev {
struct ncclIbMrCache mrCache;
int ar; // ADAPTIVE_ROUTING
struct ibv_port_attr portAttr;
struct ncclIbStats stats;
};
#define MAX_IB_DEVS 32
@@ -80,7 +86,7 @@ static int ncclIbRelaxedOrderingEnabled = 0;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1);
NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1);
NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 18);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbPkey, "IB_PKEY", 0);
NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
@@ -90,6 +96,32 @@ NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);
NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0);
NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1);
NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1);
// Reset the fatal-error counter of `stat` to zero (relaxed atomic store).
// Always returns ncclSuccess.
static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) {
  int* counter = &stat->fatalErrorCount;
  __atomic_store_n(counter, 0, __ATOMIC_RELAXED);
  return ncclSuccess;
}
// Record one more fatal error in `stat` (relaxed atomic increment).
static void ncclIbStatsFatalError(struct ncclIbStats* stat) {
  int* counter = &stat->fatalErrorCount;
  __atomic_fetch_add(counter, 1, __ATOMIC_RELAXED);
}
// Report whether a fatal error has been recorded in `stat`.
//
// Returns ncclSuccess when the IB_RETURN_ASYNC_EVENTS parameter is 0 or when
// the counter is still zero. Otherwise logs a warning that names `funcName`
// as the detection site and returns ncclSystemError.
static ncclResult_t ncclIbStatsCheckFatalCount(struct ncclIbStats* stat, const char* funcName) {
  // The parameter is consulted first; the counter is only read when reporting is enabled.
  if (!ncclParamIbAsyncEvents()) return ncclSuccess;

  if (__atomic_load_n(&stat->fatalErrorCount, __ATOMIC_RELAXED) == 0) return ncclSuccess;

  WARN("communicator encountered a fatal error (detected in %s)\n", funcName);
  return ncclSystemError;
}
// Record a fatal error against the statistics object stored in the QP's
// qp_context pointer.
static void ncclIbQpFatalError(struct ibv_qp* qp) {
  struct ncclIbStats* qpStats = (struct ncclIbStats*)qp->qp_context;
  ncclIbStatsFatalError(qpStats);
}
// Record a fatal error against the statistics object stored in the CQ's
// cq_context pointer.
static void ncclIbCqFatalError(struct ibv_cq* cq) {
  struct ncclIbStats* cqStats = (struct ncclIbStats*)cq->cq_context;
  ncclIbStatsFatalError(cqStats);
}
// Record a fatal error against the statistics embedded in the device itself.
static void ncclIbDevFatalError(struct ncclIbDev* dev) {
  struct ncclIbStats* devStats = &dev->stats;
  ncclIbStatsFatalError(devStats);
}
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -98,9 +130,53 @@ static void* ncclIbAsyncThreadMain(void* args) {
struct ibv_async_event event;
if (ncclSuccess != wrap_ibv_get_async_event(dev->context, &event)) { break; }
char *str;
struct ibv_cq* cq = event.element.cq; // only valid if CQ error
struct ibv_qp* qp = event.element.qp; // only valid if QP error
struct ibv_srq* srq = event.element.srq; // only valid if SRQ error
if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
if (event.event_type != IBV_EVENT_COMM_EST)
WARN("NET/IB : %s:%d Got async event : %s", dev->devName, dev->portNum, str);
switch (event.event_type) {
case IBV_EVENT_DEVICE_FATAL:
// the above is device fatal error
WARN("NET/IB : %s:%d async fatal event: %s", dev->devName, dev->portNum, str);
ncclIbDevFatalError(dev);
break;
case IBV_EVENT_CQ_ERR:
// the above is a CQ fatal error
WARN("NET/IB : %s:%d async fatal event on CQ (%p): %s", dev->devName, dev->portNum, cq, str);
ncclIbCqFatalError(cq);
break;
case IBV_EVENT_QP_FATAL:
case IBV_EVENT_QP_REQ_ERR:
case IBV_EVENT_QP_ACCESS_ERR:
// the above are QP fatal errors
WARN("NET/IB : %s:%d async fatal event on QP (%p): %s", dev->devName, dev->portNum, qp, str);
ncclIbQpFatalError(qp);
break;
case IBV_EVENT_SRQ_ERR:
// SRQ are not used in NCCL
WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str);
break;
case IBV_EVENT_PATH_MIG_ERR:
case IBV_EVENT_PORT_ERR:
case IBV_EVENT_PATH_MIG:
case IBV_EVENT_PORT_ACTIVE:
case IBV_EVENT_SQ_DRAINED:
case IBV_EVENT_LID_CHANGE:
case IBV_EVENT_PKEY_CHANGE:
case IBV_EVENT_SM_CHANGE:
case IBV_EVENT_QP_LAST_WQE_REACHED:
case IBV_EVENT_CLIENT_REREGISTER:
case IBV_EVENT_SRQ_LIMIT_REACHED:
// the above are non-fatal
WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str);
break;
case IBV_EVENT_COMM_EST:
break;
default:
WARN("NET/IB : %s:%d unknown event type (%d)", dev->devName, dev->portNum, event.event_type);
break;
}
// acknowledgment needs to happen last to avoid use-after-free
if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
}
return NULL;
@@ -140,11 +216,11 @@ static void* envIbAddrRange(sa_family_t af, int* mask) {
char addrString[128] = { 0 };
snprintf(addrString, 128, "%s", env);
char *addrStrPtr = addrString;
char *maskStrPtr = strstr(addrString, "/") + 1;
char *maskStrPtr = strstr(addrString, "/");
if (NULL == maskStrPtr) {
return NULL;
}
*(maskStrPtr - 1) = '\0';
*(maskStrPtr++) = '\0';
if (inet_pton(af, addrStrPtr, ret) == 0) {
WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6");
@@ -242,12 +318,14 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum,
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1) {
WARN("NET/IB: open failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
return ncclSystemError;
}
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1) {
WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno));
return ncclSystemError;
}
@@ -420,7 +498,7 @@ int ncclIbFindMatchingDev(int dev) {
}
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclResult_t ret;
ncclResult_t ret = ncclSuccess;
if (ncclParamIbDisable()) return ncclInternalError;
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
@@ -496,11 +574,12 @@ build_ib_list:
ncclIbDevs[ncclNIbDevs].pdRefs = 0;
ncclIbDevs[ncclNIbDevs].pd = NULL;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail);
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats));
// Enable ADAPTIVE_ROUTING by default on IB networks
// But allow it to be overloaded by an env parameter
@@ -510,9 +589,9 @@ build_ib_list:
TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar);
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs);
PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
int mergedDev = ncclNMergedIbDevs;
if (mergeNics) {
@@ -592,10 +671,11 @@ build_ib_list:
}
pthread_mutex_unlock(&ncclIbLock);
}
return ncclSuccess;
exit:
return ret;
fail:
pthread_mutex_unlock(&ncclIbLock);
return ret;
goto exit;
}
ncclResult_t ncclIbDevices(int* ndev) {
@@ -607,46 +687,63 @@ ncclResult_t ncclIbDevices(int* ndev) {
// Returns :
// ncclSuccess : GDR works
// ncclSystemError : no module or module loaded but not supported by GPU
// Evaluates to 1 when the path `a` exists (access() with F_OK does not return -1), 0 otherwise.
#define KNL_MODULE_LOADED(a) ((access(a, F_OK) == -1) ? 0 : 1)
// Result of the probe below: set to 1 when any of the version files exists.
static int ncclIbGdrModuleLoaded = 0; // 1 = true, 0 = false
// Looks for the version file of the nv_peer_mem module, of the newer nv_mem_nc
// module, or of nvidia_peermem, and records in ncclIbGdrModuleLoaded whether
// at least one of them is present. Probing stops at the first file found.
static void ibGdrSupportInitOnce() {
  static const char* const moduleVersionFiles[] = {
    "/sys/kernel/mm/memory_peers/nv_mem/version",
    "/sys/kernel/mm/memory_peers/nv_mem_nc/version",
    "/sys/module/nvidia_peermem/version",
  };
  const size_t nFiles = sizeof(moduleVersionFiles) / sizeof(moduleVersionFiles[0]);
  int found = 0;
  for (size_t f = 0; f < nFiles && !found; f++) {
    found = KNL_MODULE_LOADED(moduleVersionFiles[f]);
  }
  ncclIbGdrModuleLoaded = found;
}
ncclResult_t ncclIbGdrSupport() {
static int moduleLoaded = -1;
if (moduleLoaded == -1) {
// Check for the nv_peer_mem module being loaded
moduleLoaded = ((access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) &&
// Also support the new nv_mem_nc module
(access("/sys/kernel/mm/memory_peers/nv_mem_nc/version", F_OK) == -1)) ? 0 : 1;
}
if (moduleLoaded == 0) return ncclSystemError;
static pthread_once_t once = PTHREAD_ONCE_INIT;
pthread_once(&once, ibGdrSupportInitOnce);
if (!ncclIbGdrModuleLoaded)
return ncclSystemError;
return ncclSuccess;
}
static __thread int ibDmaSupportInitDev; // which device to init, must be thread local
// Probes every real device behind the merged device selected by
// ibDmaSupportInitDev for kernel DMA-BUF support and stores the verdict in
// that merged device's dmaBufSupported field:
//    1 : every device passed the probe
//   -1 : a device reported the feature as unsupported, or allocating/releasing
//        the temporary protection domain failed
static void ibDmaBufSupportInitOnce(){
  ncclResult_t res;
  // Merged device picked by the caller through the thread-local selector
  struct ncclIbMergedDev* merged = &ncclIbMergedDevs[ibDmaSupportInitDev];
  int idx = 0;
  while (idx < merged->ndevs) {
    struct ibv_pd* probePd;
    struct ibv_context* ibCtx = ncclIbDevs[merged->devs[idx]].context;
    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&probePd, ibCtx), res, failure);
    // Dummy registration (fd=-1): only the errno it leaves behind matters
    (void)wrap_direct_ibv_reg_dmabuf_mr(probePd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/);
    // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
    // errno is sampled here, before the protection domain is released
    int unsupported = (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT);
    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(probePd), res, failure);
    // A single unsupported device is enough to give up on the merged device
    if (unsupported) goto failure;
    idx++;
  }
  merged->dmaBufSupported = 1;
  return;
failure:
  merged->dmaBufSupported = -1;
  return;
}
// Detect whether DMA-BUF support is present in the kernel
// Returns :
// ncclSuccess : DMA-BUF support is available
// ncclSystemError : DMA-BUF is not supported by the kernel
ncclResult_t ncclIbDmaBufSupport(int dev) {
static int dmaBufSupported = -1;
if (dmaBufSupported == -1) {
ncclResult_t res;
struct ibv_pd* pd;
struct ibv_context* ctx;
struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev;
struct oncewrap {
pthread_once_t once = PTHREAD_ONCE_INIT;
};
static oncewrap onces[MAX_IB_DEVS];
// init the device only once
ibDmaSupportInitDev = dev;
pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce);
// Test each dev
for (int i = 0; i < mergedDev->ndevs; i++) {
int ibDev = mergedDev->devs[i];
ctx = ncclIbDevs[ibDev].context;
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
// Test kernel DMA-BUF support with a dummy call (fd=-1)
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise)
dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
}
}
if (dmaBufSupported == 0) return ncclSystemError;
return ncclSuccess;
failure:
dmaBufSupported = 0;
int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported;
if (dmaBufSupported == 1) return ncclSuccess;
return ncclSystemError;
}
@@ -842,16 +939,19 @@ struct alignas(32) ncclIbNetCommBase {
// Track necessary remDevInfo here
int nRemDevs;
struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC];
// statistics about the comm
struct ncclIbStats stats;
};
struct ncclIbSendComm {
struct ncclIbNetCommBase base;
// Start with fifo and ibv structs as they have alignment restrictions
struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
// Each dev correlates to a mergedIbDev
struct ncclIbSendCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC];
struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
struct ncclIbRemSizesFifo remSizesFifo;
uint64_t fifoHead;
int ar; // Use adaptive routing when all merged devices have it enabled
@@ -903,8 +1003,7 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI
req->events[devIndex]++;
req->devBases[devIndex] = base;
}
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base) {
ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) {
base->ibDevN = ibDevN;
ncclIbDev* ibDev = ncclIbDevs + ibDevN;
pthread_mutex_lock(&ibDev->lock);
@@ -921,7 +1020,7 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base
pthread_mutex_unlock(&ibDev->lock);
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0));
return ncclSuccess;
}
@@ -940,9 +1039,10 @@ returning:
return res;
}
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, struct ncclIbQp* qp) {
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) {
struct ibv_qp_init_attr qpInitAttr;
memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
qpInitAttr.qp_context = qp_context;
qpInitAttr.send_cq = base->cq;
qpInitAttr.recv_cq = base->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
@@ -1026,6 +1126,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
}
ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
ncclResult_t ret = ncclSuccess;
struct ncclIbListenComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
@@ -1033,14 +1134,20 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
handle->magic = NCCL_SOCKET_MAGIC;
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
NCCLCHECK(ncclSocketListen(&comm->sock));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
*listenComm = comm;
return ncclSuccess;
exit:
return ret;
fail:
(void)ncclSocketClose(&comm->sock);
free(comm);
goto exit;
}
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
ncclResult_t ret = ncclSuccess;
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
struct ncclIbCommStage* stage = &handle->stage;
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
@@ -1055,16 +1162,18 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
WARN("Error: trying to connect already connected sendComm");
return ncclInternalError;
}
stage->buffer = NULL;
NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
NCCLCHECK(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
NCCLCHECKGOTO(ncclIbStatsInit(&comm->base.stats), ret, fail);
NCCLCHECKGOTO(ncclSocketInit(&comm->base.sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1), ret, fail);
stage->comm = comm;
stage->state = ncclIbCommStateConnect;
NCCLCHECK(ncclSocketConnect(&comm->base.sock));
NCCLCHECKGOTO(ncclSocketConnect(&comm->base.sock), ret, fail);
ib_connect_check:
/* since ncclSocketConnect is async, we must check if connection is complete */
NCCLCHECK(ncclSocketReady(&comm->base.sock, &ready));
NCCLCHECKGOTO(ncclSocketReady(&comm->base.sock, &ready), ret, fail);
if (!ready) return ncclSuccess;
// IB Setup
@@ -1078,7 +1187,7 @@ ib_connect_check:
comm->ar = 1; // Set to 1 for logic
for (int i = 0; i < mergedDev->ndevs; i++) {
int ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base));
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail);
comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled
}
@@ -1091,13 +1200,17 @@ ib_connect_check:
for (int q = 0; q < comm->base.nqps; q++) {
ncclIbSendCommDev* commDev = comm->devs + devIndex;
ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, comm->base.qps+q));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &commDev->base, IBV_ACCESS_REMOTE_WRITE, &comm->base.stats, comm->base.qps + q), ret, fail);
comm->base.qps[q].devIndex = devIndex;
meta.qpInfo[q].qpn = comm->base.qps[q].qp->qp_num;
meta.qpInfo[q].devIndex = comm->base.qps[q].devIndex;
// Query ece capabilities (enhanced connection establishment)
NCCLCHECK(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
if (ncclParamIbEceEnable()) {
// Query ece capabilities (enhanced connection establishment)
NCCLCHECKGOTO(wrap_ibv_query_ece(comm->base.qps[q].qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
} else {
meta.qpInfo[q].ece_supported = 0;
}
devIndex = (devIndex + 1) % comm->base.ndevs;
}
@@ -1112,13 +1225,13 @@ ib_connect_check:
devInfo->lid = ibDev->portAttr.lid;
// Prepare my fifo
NCCLCHECK(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
devInfo->fifoRkey = commDev->fifoMr->rkey;
// Pack local GID info
devInfo->link_layer = commDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer;
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid));
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &commDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, commDev->base.gidInfo.localGidIndex, &commDev->base.gidInfo.localGid), ret, fail);
devInfo->gid.global.subnet_prefix = commDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id;
@@ -1148,12 +1261,12 @@ ib_connect_check:
stage->state = ncclIbCommStateSend;
stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)));
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail);
memcpy(stage->buffer, &meta, sizeof(meta));
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(meta), &stage->offset), ret, fail);
if (stage->offset != sizeof(meta)) return ncclSuccess;
stage->state = ncclIbCommStateConnecting;
@@ -1163,7 +1276,7 @@ ib_send:
ib_connect:
struct ncclIbConnectionMetadata remMeta;
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclIbConnectionMetadata), &stage->offset), ret, fail);
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata));
@@ -1197,7 +1310,7 @@ ib_connect:
}
for (int i=0; i < comm->base.ndevs; i++) {
NCCLCHECK(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
}
comm->base.nRemDevs = remMeta.ndevs;
@@ -1212,10 +1325,10 @@ ib_connect:
struct ibv_qp* qp = comm->base.qps[q].qp;
if (remQpInfo->ece_supported)
NCCLCHECK(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported));
NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail);
NCCLCHECK(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false));
NCCLCHECK(ncclIbRtsQp(qp));
NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail);
}
if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE
@@ -1233,19 +1346,23 @@ ib_connect:
stage->offset = 0;
ib_send_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, &comm->base.ready, sizeof(int), &stage->offset), ret, fail);
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
stage->state = ncclIbCommStateStart;
*sendComm = comm;
return ncclSuccess;
exit:
if (stage->buffer) free(stage->buffer);
stage->state = ncclIbCommStateStart;
return ret;
fail:
free(comm);
goto exit;
}
NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
ncclResult_t ret = ncclSuccess;
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
struct ncclIbCommStage* stage = &lComm->stage;
struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
@@ -1262,22 +1379,23 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
}
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
NCCLCHECKGOTO(ncclIbStatsInit(&rComm->base.stats), ret, fail);
stage->comm = rComm;
stage->state = ncclIbCommStateAccept;
NCCLCHECK(ncclSocketInit(&rComm->base.sock));
NCCLCHECK(ncclSocketAccept(&rComm->base.sock, &lComm->sock));
NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail);
NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail);
ib_accept_check:
NCCLCHECK(ncclSocketReady(&rComm->base.sock, &ready));
NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail);
if (!ready) return ncclSuccess;
struct ncclIbConnectionMetadata remMeta;
stage->state = ncclIbCommStateRecv;
stage->offset = 0;
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)));
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail);
ib_recv:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail);
if (stage->offset != sizeof(remMeta)) return ncclSuccess;
/* copy back the received info */
@@ -1308,10 +1426,10 @@ ib_recv:
for (int i = 0; i < rComm->base.ndevs; i++) {
rCommDev = rComm->devs + i;
ibDevN = mergedDev->devs[i];
NCCLCHECK(ncclIbInitCommDevBase(ibDevN, &rCommDev->base));
NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail);
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex));
NCCLCHECK(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid));
NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail);
NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail);
}
// Copy remDevInfo for things like remGidInfo, remFifoAddr, etc.
@@ -1336,23 +1454,26 @@ ib_recv:
// Local ibDevN
ibDevN = rComm->devs[devIndex].base.ibDevN;
ibDev = ncclIbDevs + ibDevN;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, qp));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
qp->devIndex = devIndex;
devIndex = (devIndex + 1) % rComm->base.ndevs;
// Set the ece (enhanced connection establishment) on this QP before RTR
if (remMeta.qpInfo[q].ece_supported) {
NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
// Coverity suspects a copy-paste error below due to the use of remMeta in one argument and meta in another.
// However, this has been confirmed to be intentional.
// coverity[copy_paste_error]
NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
// Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
// Store this in our own qpInfo for returning to the requestor
if (meta.qpInfo[q].ece_supported)
NCCLCHECK(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail);
}
bool override_tc = (q == 0) ? true : false;
NCCLCHECK(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc));
NCCLCHECK(ncclIbRtsQp(qp->qp));
NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail);
}
rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess)
@@ -1366,17 +1487,17 @@ ib_recv:
// Retain remote fifo info and prepare my RDMA ops
rCommDev->fifoRkey = remMeta.devs[i].fifoRkey;
rComm->remFifo.addr = remMeta.fifoAddr;
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey;
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
// Allocate Flush dummy buffer for GPU Direct RDMA
if (rComm->flushEnabled) {
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail);
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
rCommDev->gpuFlush.sge.length = 1;
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp));
NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->base.stats, &rCommDev->gpuFlush.qp), ret, fail);
struct ncclIbDevInfo devInfo;
devInfo.lid = ibDev->portAttr.lid;
devInfo.link_layer = ibDev->portAttr.link_layer;
@@ -1384,8 +1505,8 @@ ib_recv:
devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix;
devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id;
devInfo.mtu = ibDev->portAttr.active_mtu;
NCCLCHECK(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false));
NCCLCHECK(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp));
NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail);
NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail);
}
// Fill Handle
@@ -1400,7 +1521,7 @@ ib_recv:
meta.devs[i].mtu = remMeta.devs[i].mtu;
// Prepare sizes fifo
NCCLCHECK(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey;
}
meta.fifoAddr = (uint64_t)rComm->sizesFifo;
@@ -1415,30 +1536,36 @@ ib_recv:
stage->state = ncclIbCommStateSend;
stage->offset = 0;
if (stage->buffer) free(stage->buffer);
NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)));
if (stage->buffer) {
free(stage->buffer);
stage->buffer = NULL;
}
NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbConnectionMetadata)), ret, fail);
memcpy(stage->buffer, &meta, sizeof(struct ncclIbConnectionMetadata));
ib_send:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(struct ncclIbConnectionMetadata), &stage->offset), ret, fail);
if (stage->offset < sizeof(struct ncclIbConnectionMetadata)) return ncclSuccess;
stage->offset = 0;
stage->state = ncclIbCommStatePendingReady;
ib_recv_ready:
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset));
NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, &rComm->base.ready, sizeof(int), &stage->offset), ret, fail);
if (stage->offset != sizeof(int)) return ncclSuccess;
free(stage->buffer);
*recvComm = rComm;
exit:
/* reset lComm stage */
if (stage->buffer) free(stage->buffer);
stage->state = ncclIbCommStateStart;
stage->offset = 0;
stage->comm = NULL;
stage->buffer = NULL;
return ncclSuccess;
return ret;
fail:
free(rComm);
goto exit;
}
ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbRequest** req) {
@@ -1531,16 +1658,21 @@ struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, in
/* DMA-BUF support */
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
ncclResult_t ret = ncclSuccess;
assert(size > 0);
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
for (int i = 0; i < base->ndevs; i++) {
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
NCCLCHECK(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i));
NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail);
}
*mhandle = (void*) mhandleWrapper;
return ncclSuccess;
exit:
return ret;
fail:
free(mhandleWrapper);
goto exit;
}
ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** mhandle) {
@@ -1694,6 +1826,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; }
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle;
@@ -1858,6 +1991,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; }
if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; }
if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__));
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
@@ -1937,10 +2071,13 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
return ncclSuccess;
}
#define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)
ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
while (1) {
NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__));
if (r->events[0] == 0 && r->events[1] == 0) {
TRACE(NCCL_NET, "r=%p done", r);
*done = 1;
@@ -1996,7 +2133,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d",
ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i);
#endif
if (req->type == NCCL_NET_IB_REQ_SEND) {
if (req && req->type == NCCL_NET_IB_REQ_SEND) {
for (int j = 0; j < req->nreqs; j++) {
struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
if ((sendReq->events[i] <= 0)) {
@@ -2018,6 +2155,9 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
req->events[i]--;
}
}
// Once an IB fatal event is reported in the async thread, we want to propagate this error
// to the communicator and prevent further polling to reduce error pollution.
NCCLCHECK(ncclIbStatsCheckFatalCount(&ncclIbDevs[r->devBases[i]->ibDevN].stats,__func__));
}
}
+36 -18
Ver ficheiro
@@ -73,22 +73,27 @@ ncclResult_t ncclNetSocketDevices(int* ndev) {
}
static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
ncclResult_t ret = ncclSuccess;
*speed = 0;
char speedPath[PATH_MAX];
sprintf(speedPath, "/sys/class/net/%s/speed", devName);
int fd = open(speedPath, O_RDONLY);
int fd = -1;
SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd);
if (fd != -1) {
char speedStr[] = " ";
if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
int n;
// Allow this to silently fail
n = read(fd, speedStr, sizeof(speedStr)-1);
if (n > 0) {
*speed = strtol(speedStr, NULL, 0);
}
close(fd);
}
if (*speed <= 0) {
INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
*speed = 10000;
}
return ncclSuccess;
if (fd != -1) SYSCHECK(close(fd), "close");
return ret;
}
ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
@@ -235,19 +240,24 @@ void* persistentSocketThread(void *args_) {
}
ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
ncclResult_t ret = ncclSuccess;
int nSocksPerThread = ncclParamSocketNsocksPerThread();
int nThreads = ncclParamSocketNthreads();
if (nThreads > MAX_THREADS) {
WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
nThreads = MAX_THREADS;
}
int fd = -1;
int nSocks;
if (nThreads == -2 || nSocksPerThread == -2) {
// Auto-detection
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
// Coverity is wrong. NULL second argument to realpath() is OK by POSIX.1-2008.
// coverity[alias_transfer:FALSE]
char* rPath = realpath(vendorPath, NULL);
int fd = open(rPath, O_RDONLY);
fd = open(rPath, O_RDONLY);
free(rPath);
if (fd == -1) {
// Could not find device vendor. This is handled silently so
@@ -257,9 +267,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
}
char vendor[7];
strncpy(vendor, "0x0000", 7);
int len;
SYSCHECKVAL(read(fd, vendor, 6), "read", len);
SYSCHECK(close(fd), "close");
SYSCHECKGOTO(read(fd, vendor, 6), "read", ret, fail);
if (strcmp(vendor, "0x1d0f") == 0) { // AWS
autoNt = 2;
autoNs = 8;
@@ -271,7 +279,7 @@ end:
if (nThreads == -2) nThreads = autoNt;
if (nSocksPerThread == -2) nSocksPerThread = autoNs;
}
int nSocks = nSocksPerThread * nThreads;
nSocks = nSocksPerThread * nThreads;
if (nSocks > MAX_SOCKETS) {
nSocksPerThread = MAX_SOCKETS/nThreads;
WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
@@ -280,28 +288,38 @@ end:
*ns = nSocks;
*nt = nThreads;
if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
return ncclSuccess;
exit:
if (fd != -1) close(fd);
return ret;
fail:
goto exit;
}
// Create a listening socket on socket device `dev` and describe it in the caller's handle.
//
// dev          : index into ncclNetSocketDevs; must be in [0, ncclNetIfs) or ncclInternalError is returned.
// opaqueHandle : caller-provided buffer, zeroed here and filled as a struct ncclNetSocketHandle
//                (magic, connectAddr, nSocks, nThreads).
// listenComm   : receives the newly allocated struct ncclNetSocketListenComm on success.
//
// On the `fail` path the socket is closed, `comm` is freed, and the failing result is returned.
//
// NOTE(review): this listing contains the four socket setup calls twice (NCCLCHECK form, then
// NCCLCHECKGOTO form) and a `return ncclSuccess;` directly before `exit:`. That looks like the
// pre- and post-change sides of a diff rendered without +/- markers; confirm which set is current.
ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
return ncclInternalError;
}
ncclResult_t ret = ncclSuccess;
struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
// Start from an all-zero handle so every field not set below is zero.
memset(handle, 0, sizeof(struct ncclNetSocketHandle));
// Compile-time guard: the handle must fit in NCCL_NET_HANDLE_MAXSIZE bytes.
static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
struct ncclNetSocketListenComm* comm;
// Allocated before any goto-based cleanup is armed, so a failure here returns directly.
NCCLCHECK(ncclCalloc(&comm, 1));
handle->magic = NCCL_SOCKET_MAGIC;
// Variant 1: plain NCCLCHECK (returns immediately on error, without going through `fail`).
NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
NCCLCHECK(ncclSocketListen(&comm->sock));
NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
// Variant 2: NCCLCHECKGOTO (stores the error in `ret` and jumps to `fail` for cleanup).
NCCLCHECKGOTO(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1), ret, fail);
NCCLCHECKGOTO(ncclSocketListen(&comm->sock), ret, fail);
NCCLCHECKGOTO(ncclSocketGetAddr(&comm->sock, &handle->connectAddr), ret, fail);
NCCLCHECKGOTO(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads), ret, fail);
// Copy the socket/thread counts chosen for this device into the handle.
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
comm->dev = dev;
*listenComm = comm;
return ncclSuccess;
exit:
return ret;
fail:
// Best-effort cleanup: the close result is deliberately discarded so `ret` keeps the original error.
(void)ncclSocketClose(&comm->sock);
free(comm);
goto exit;
}
ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) {
@@ -437,7 +455,7 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void*
res->comm = comm;
pthread_mutex_init(&res->threadLock, NULL);
pthread_cond_init(&res->threadCond, NULL);
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create");
ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
}
struct ncclNetSocketTask* r = queue->tasks+queue->next;
@@ -482,7 +500,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
union ncclSocketAddress addr;
ncclSocketGetAddr(r->ctrlSock, &addr);
NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr));
WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
ncclSocketToString(&addr, line), data, r->size);
@@ -579,7 +597,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) {
res->stop = 1;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
pthread_join(comm->helperThread[i], NULL);
PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join");
}
free(res->threadTaskQueue.tasks);
}
+46 -30
Ver ficheiro
@@ -26,7 +26,7 @@ struct localRegData {
intptr_t offset;
};
ncclResult_t nvlsCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
@@ -71,28 +71,31 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop,
// Obtain the multicast handle `*mcHandle` from the `shareableHandle` supplied by `rank`.
//
// The path taken depends on ncclCuMemHandleType:
//  - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR: ask the proxy (ncclProxyClientGetFdBlocking) to convert
//    the handle into a local fd, import that fd with cuMemImportFromShareableHandle, then close the fd.
//  - CU_MEM_HANDLE_TYPE_FABRIC: import `shareableHandle` directly.
//  - anything else: `shareableHandle` is copied byte-for-byte into `*mcHandle`.
//
// On the `fail` path any fd that is still open (fd != -1) is closed before returning `ret`.
//
// NOTE(review): this listing interleaves what look like pre- and post-change diff lines: an inner
// `int fd = -1;` that shadows the outer one, `tpProxyRank` next to a call that passes `rank`,
// NCCLCHECK/CUCHECK next to NCCLCHECKGOTO/CUCHECKGOTO, two close() calls, and a `return ncclSuccess;`
// directly before `exit:`. Confirm which lines are current before relying on this text.
ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) {
CUmemAllocationHandleType type = ncclCuMemHandleType;
int fd = -1;
ncclResult_t ret = ncclSuccess;
INFO(NCCL_NVLS, "NVLS importing shareableHandle %p from rank %d", shareableHandle, rank);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// cuMem UDS support
int fd = -1;
TRACE(NCCL_NVLS, "NVLS rank %d Importing shareable handle %p from rank %d", comm->localRank, shareableHandle, rank);
int tpProxyRank = comm->topParentRanks[rank];
TRACE(NCCL_NVLS, "NVLS rank %d request conversion of handle 0x%lx from rank %d", comm->localRank, *(uint64_t*)shareableHandle, rank);
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpProxyRank, shareableHandle, &fd));
NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, rank, shareableHandle, &fd), ret, fail);
TRACE(NCCL_NVLS, "NVLS rank %d received converted fd %d from rank %d", comm->localRank, fd, rank);
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type));
(void) close(fd);
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)(uintptr_t)fd, type), ret, fail);
// NOTE(review): fd is not reset to -1 after this close; no `goto fail` follows in this branch.
SYSCHECK(close(fd), "close");
} else {
if (type == CU_MEM_HANDLE_TYPE_FABRIC) {
CUCHECK(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type));
CUCHECKGOTO(cuMemImportFromShareableHandle(mcHandle, (void *)shareableHandle, type), ret, fail);
} else {
// No import step for other handle types: the bytes of shareableHandle are used as the handle itself.
memcpy(mcHandle, shareableHandle, sizeof(CUmemGenericAllocationHandle));
}
}
return ncclSuccess;
exit:
return ret;
fail:
if (fd != -1) close(fd);
goto exit;
}
ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAllocationHandle* mcHandle) {
@@ -100,7 +103,7 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll
INFO(NCCL_NVLS, "NVLS Unbind MC handle %llx size %zu dev %d", *mcHandle, size, dev);
// Unbind physical memory from group for the given device
CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
if (size) CUCHECK(cuMulticastUnbind(*mcHandle, dev, 0/*mcOffset*/, size));
return ncclSuccess;
}
@@ -117,14 +120,18 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr,
INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr);
// Release the UC memory and mapping
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
CUCHECK(cuMemRelease(*ucHandle));
if (ucptr) {
CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size));
CUCHECK(cuMemRelease(*ucHandle));
}
// Release the MC memory and mapping
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
CUCHECK(cuMemRelease(*mcHandle));
if (mcptr) {
CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size));
CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size));
CUCHECK(cuMemRelease(*mcHandle));
}
return ncclSuccess;
}
@@ -191,7 +198,9 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
size_t size = *sizePtr;
size_t originSize = size;
size_t ucgran, mcgran;
int allocMcHandle = 0;
*ucptr = *mcptr = NULL;
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
mcprop.numDevices = comm->localRanks;
mcprop.handleTypes = ncclCuMemHandleType;
@@ -203,10 +212,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
if (comm->localRank == 0) {
NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail);
allocMcHandle = 1;
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
} else {
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail);
NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail);
allocMcHandle = 1;
}
CUCHECKGOTO(cuMulticastAddDevice(*mcHandle, comm->cudaDev), ret, fail);
@@ -226,6 +237,8 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail);
CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail);
// intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort
NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
// Bind physical memory to the Multicast group
// NB: It will block until all ranks have been added to the Group
CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail);
@@ -239,6 +252,7 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit
exit:
return ret;
fail:
if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle));
goto exit;
}
@@ -350,10 +364,10 @@ setup:
struct ncclNvlsSharedRes* resources = NULL;
int nHeads = comm->channels[0].nvls.nHeads;
int nChannels = comm->nChannels;
size_t memSize = 16;
size_t memSize = 64;
size_t creditSize = nChannels * 2 * memSize * nHeads;
int nvlsStepSize = comm->nvlsChunkSize;
NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail);
comm->nvlsResources->inited = false;
comm->nvlsResources->refCount = 1;
@@ -466,7 +480,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) {
if (!comm->MNNVL && resources->nvlsShmemHandle)
NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle));
if (resources->ucCredit && resources->mcCredit) {
if (resources->ucCredit || resources->mcCredit) {
NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle));
NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle));
}
@@ -490,7 +504,6 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
char shareableHandle[NVLS_HANDLE_SIZE];
CUmemGenericAllocationHandle mcHandle;
size_t minSize = SIZE_MAX;
bool localRegBufUsed = false;
struct localRegData* regData = NULL;
cudaPointerAttributes attr;
size_t ucgran, mcgran;
@@ -500,7 +513,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
if (userBuff) {
NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, &regRecord), ret, fail);
if (regRecord) {
CUDACHECK(cudaPointerGetAttributes(&attr, (void*)regRecord->addr));
CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail);
if (attr.type == cudaMemoryTypeDevice) {
size_t regSize = regRecord->pages * comm->regCache.pageSize;
memset(&mcprop, 0, sizeof(CUmulticastObjectProp));
@@ -508,7 +521,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
mcprop.handleTypes = ncclCuMemHandleType;
mcprop.flags = 0;
mcprop.size = regSize;
CUCHECK(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED));
CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail);
memset(&ucprop, 0, sizeof(CUmemAllocationProp));
ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -517,7 +530,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
ucprop.requestedHandleTypes = ncclCuMemHandleType;
CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail);
CUCHECK(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr));
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&regRecord->baseAddr, &regRecord->baseSize, (CUdeviceptr)regRecord->addr), ret, fail);
if (regSize % mcgran == 0) {
regRecord->regSize = regSize;
} else {
@@ -560,6 +573,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
}
CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
// Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked
// (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
// coverity[var_deref_op]
CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail);
// Create a VA for the NVLS
@@ -584,15 +600,13 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
}
}
localRegBufUsed = true;
*regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
*regUsed = true;
exit:
if (localRegBufUsed) *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
*regUsed = localRegBufUsed;
free(regData);
return ret;
fail:
localRegBufUsed = false;
*regUsed = false;
goto exit;
}
@@ -862,19 +876,21 @@ exit:
}
if (recvRecord) {
// Yes, this is dead code. That's fine...
// coverity[dead_error_begin]
ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size);
free(recvRecord);
}
} else {
if (sendRecord) {
*outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend);
ncclIntruQueueEnqueue(cleanupQueue, &sendRecord->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord);
*nCleanupQueueEltsAdded += 1;
}
if (recvRecord) {
*outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv);
ncclIntruQueueEnqueue(cleanupQueue, &recvRecord->base);
ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord);
*nCleanupQueueEltsAdded += 1;
}
+495 -69
Ver ficheiro
@@ -7,9 +7,11 @@
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "shm.h"
#include "shmutils.h"
#include "p2p.h"
#include "transport.h"
#include <assert.h>
#include "shm.h"
enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM };
@@ -19,16 +21,28 @@ struct ncclP2pBuff {
ncclIpcDesc ipcDesc;
};
// Request payload sent with ncclProxyMsgSetup by p2pSendSetup/p2pRecvSetup; the proxy setup
// handlers reject any request whose size is not sizeof(struct ncclP2pRequest).
struct ncclP2pRequest {
// Buffer size to allocate (set from sendSize / recvSize by the caller).
size_t size;
// Number of extra cuMemRetainAllocationHandle calls ncclP2pAllocateShareableBuffer makes on the buffer.
// The setup code increments it once for each of peerInfo and myInfo that is in the same PID as the
// proxy rank but on a different cudaDev.
int refcount;
};
// Connection information for the P2P transport; the static_assert below requires it to fit in
// CONNECT_SIZE bytes.
// NOTE(review): this listing shows `shmName`/`shmSize` together with `desc`; other lines in view
// both read and stop reading info->shmName/info->shmSize, so this is presumably the pre- and
// post-change field sets of a diff shown together. Confirm the current layout.
struct p2pConnectInfo {
// Rank whose proxy is contacted; the setup code overwrites it with intermediateRank in some cases.
int rank;
int read;
// Buffer descriptor returned by the proxy setup call and later passed to p2pMap.
struct ncclP2pBuff p2pBuff;
// Used by CE memcpy
char shmName[7];
int shmSize;
// Shared-memory descriptor copied from proxyInfo.desc and imported by the receiver via
// ncclShmImportShareableBuffer.
ncclShmIpcDesc_t desc;
};
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
// Export-side description of a user buffer, filled in by ncclIpcLocalRegisterBuffer when it
// registers the buffer with a peer.
struct p2pIpcExpInfo {
// Handle for the buffer: devIpc is filled by cudaIpcGetMemHandle on the legacy path; memHandle
// receives the retained cuMem allocation handle when the peer is in the same process.
ncclIpcDesc ipcDesc;
// True when the buffer is exported through legacy CUDA IPC: initialised from
// CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, and forced to true when the cuMem export falls back.
bool legacyIpcCap;
// NOTE(review): impFd, size and offset are not used in the visible part of the file; presumably the
// imported fd and the size/offset of the registered range. Confirm against the code that fills them.
int impFd;
size_t size;
uintptr_t offset;
};
struct p2pShm {
struct ncclSendMem sendMem;
struct ncclRecvMem recvMem;
@@ -37,9 +51,7 @@ struct p2pShmProxyInfo {
// Shared memory between proxy and receiving GPU
struct p2pShm* shm;
struct p2pShm* devShm;
char shmName[7];
int shmSize;
ncclShmHandle_t handle;
ncclShmIpcDesc_t desc;
// Intermediate step for sender
struct ncclRecvMem* ceRecvMem;
@@ -62,13 +74,14 @@ struct p2pResources {
struct ncclRecvMem* recvDevMem;
};
void* sendMemIpc;
int sendMemSameProc;
void* recvMemIpc;
int recvMemSameProc;
// CE memcpy support
struct p2pShmProxyInfo proxyInfo;
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
ncclShmHandle_t handle;
ncclShmIpcDesc_t desc;
};
// cuMem API support
@@ -104,12 +117,12 @@ static void initCeOperation();
extern int64_t ncclParamMNNVLEnable();
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
initCeOperation();
// MNNVL support
if (ncclParamMNNVLEnable() != 0 && info1->hostHash != info2->hostHash) {
NCCLCHECK(ncclTopoCheckMNNVL(topo, info1, info2, ret));
if (comm->MNNVL && info1->hostHash != info2->hostHash) {
NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret));
if (*ret) return ncclSuccess;
}
@@ -121,7 +134,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check topology / p2p level.
int intermediateRank;
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
if (*ret == 0) return ncclSuccess;
if (intermediateRank != -1) {
if (useMemcpy) *ret = 0;
@@ -130,7 +143,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
// Check if NET would work better
int useNet = 0;
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
if (useNet) {
*ret = 0;
return ncclSuccess;
@@ -197,7 +210,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
} while (0)
// cuMem API support
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr) {
ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDesc *ipcDesc, void **ptr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
CUmemAllocationHandleType type = ncclCuMemHandleType;
@@ -211,6 +224,10 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
} else {
CUCHECK(cuMemExportToShareableHandle(&ipcDesc->cuDesc, handle, type, 0));
}
if (refcount) {
memcpy(&ipcDesc->memHandle, &handle, sizeof(handle));
for (int r = 0; r < refcount; ++r) CUCHECK(cuMemRetainAllocationHandle(&handle, *ptr));
}
#else
return ncclInternalError;
#endif
@@ -233,7 +250,7 @@ ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc) {
return ncclSuccess;
}
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int peer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr) {
if (ncclCuMemEnable()) {
#if CUDART_VERSION >= 11030
// cuMem API support
@@ -241,16 +258,25 @@ ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, siz
CUmemAllocationHandleType type = ncclCuMemHandleType;
CUmemGenericAllocationHandle handle;
ncclCuDesc *cuDesc = &ipcDesc->cuDesc;
CUmemAllocationProp prop = {};
size_t granularity = 0;
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes = type;
prop.location.id = comm->cudaDev;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
ALIGN_SIZE(size, granularity);
// Import and map the remote memory descriptor to the local GPU
if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
// UDS fd support
int fd = -1;
// Send cuMem handle to remote for conversion to an fd
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, tpPeer, &cuDesc->data, &fd));
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, tpPeer);
NCCLCHECK(ncclProxyClientGetFdBlocking(comm, peer, &cuDesc->data, &fd));
INFO(NCCL_P2P, "UDS converted handle 0x%lx to fd %d on remote peer %d", *(uint64_t*)&cuDesc->data, fd, peer);
CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
(void) close(fd);
SYSCHECK(close(fd), "close");
} else {
CUCHECK(cuMemImportFromShareableHandle(&handle, cuDesc, type));
}
@@ -291,7 +317,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
int p2p;
// Queries the topology to see if the GPUs are Ampere and
// connected via NVLink, if so we enable P2P Read by default
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, read, intermediateRank));
NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
int readEnable = ncclParamP2pReadEnable();
if (readEnable != -2) *read = readEnable;
@@ -311,24 +337,23 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
#if CUDART_VERSION >= 11030
// cuMem API support
if (ncclCuMemEnable()) {
// Allow direct access to the remote buffer from the local GPU
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = myInfo->cudaDev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
INFO(NCCL_P2P, "Set Access for buffer %p size %zu on dev %d", p2pBuff->directPtr, p2pBuff->size, peerInfo->cudaDev);
CUCHECK(cuMemSetAccess((CUdeviceptr) p2pBuff->directPtr, p2pBuff->size, &accessDesc, 1));
// for intra-process ranks, we should map memHandle of the peers to increase refcount.
// Otherwise, if peers abort and free the buffer, the rank can suffer invalid access.
NCCLCHECK(ncclCuMemAllocAddr(devMem, &p2pBuff->ipcDesc.memHandle, p2pBuff->size));
CUCHECK(cuMemRelease(p2pBuff->ipcDesc.memHandle));
*ipcPtr = *devMem;
} else {
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
}
#endif
} else {
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
}
*devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
// Different PID
NCCLCHECK(ncclP2pImportShareableBuffer(comm, comm->topParentRanks[peerInfo->rank], p2pBuff->size, &p2pBuff->ipcDesc, devMem));
NCCLCHECK(ncclP2pImportShareableBuffer(comm, peerInfo->rank, p2pBuff->size, &p2pBuff->ipcDesc, devMem));
*ipcPtr = *devMem;
}
return ncclSuccess;
@@ -338,7 +363,7 @@ static ncclResult_t p2pMap(struct ncclComm *comm, struct ncclProxyConnector* pro
ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct p2pResources* resources;
int tpProxyRank;
struct ncclP2pRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
int useRead, intermediateRank;
@@ -387,15 +412,18 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
comm->peerInfo[intermediateRank].nvmlDev, useReadStr);
}
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, tpProxyRank, &send->proxyConn));
req.size = sendSize;
req.refcount = 0;
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
if (useMemcpy) {
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pShmProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
memcpy(&info->desc, &resources->proxyInfo.desc, sizeof(ncclShmIpcDesc_t));
} else {
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, &send->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->sendDevMem, &resources->sendMemIpc));
resources->sendMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
}
return ncclSuccess;
@@ -405,7 +433,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) {
struct p2pResources* resources;
int tpProxyRank;
struct ncclP2pRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
int useRead, intermediateRank;
@@ -444,11 +472,15 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
info->rank = intermediateRank;
}
tpProxyRank = comm->topParentRanks[info->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, tpProxyRank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
req.size = recvSize;
req.refcount = 0;
if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++;
if (P2P_SAME_PID((comm->peerInfo + info->rank), myInfo) && (comm->peerInfo[info->rank].cudaDev != myInfo->cudaDev)) req.refcount++;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(struct ncclP2pRequest), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(comm, &recv->proxyConn, myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->recvDevMem, &resources->recvMemIpc));
resources->recvMemSameProc = P2P_SAME_PID(myInfo, (comm->peerInfo + info->rank));
return ncclSuccess;
}
@@ -459,6 +491,7 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
NCCLCHECK(p2pMap(comm, &send->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
resources->recvMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -499,17 +532,14 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct ncclSendMem* remDevMem = NULL;
if (useMemcpy) {
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
// Attach to peer's SHM segment
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
} else {
NCCLCHECK(p2pMap(comm, &recv->proxyConn, comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
resources->sendMemSameProc = P2P_SAME_PID((comm->peerInfo + rank), (comm->peerInfo + info->rank));
struct ncclRecvMem* devMem = resources->recvDevMem;
recv->conn.tail = &devMem->tail;
@@ -538,8 +568,21 @@ ncclResult_t p2pSendFree(struct ncclConnector* send) {
if (resources) {
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
if (resources->sendMemIpc) {
if (resources->sendMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
}
}
if (resources->recvMemIpc) {
if (resources->recvMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
}
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
@@ -555,14 +598,27 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
if (resources) {
if (ncclCuMemEnable()) {
// cuMem API support
if (resources->sendMemIpc) NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
if (resources->recvMemIpc) NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
if (resources->sendMemIpc) {
if (resources->sendMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->sendMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->sendMemIpc));
}
}
if (resources->recvMemIpc) {
if (resources->recvMemSameProc) {
NCCLCHECK(ncclCuMemFreeAddr(resources->recvMemIpc));
} else {
NCCLCHECK(ncclCudaFree(resources->recvMemIpc));
}
}
}
else {
if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->handle));
NCCLCHECK(ncclShmIpcClose(&resources->desc));
}
}
free(resources);
@@ -574,29 +630,27 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
if (useMemcpy) {
// CE memcpy support
struct p2pShmProxyInfo* proxyInfo;
size_t shmSize;
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
connection->transportResources = proxyInfo;
NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
char shmPath[PATH_MAX];
shmPath[0] = '\0';
proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
// Create a SHM segment for the peer to attach to
NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));
shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
if (respSize != sizeof(struct p2pShmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo));
} else {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
int size = req->size;
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
@@ -613,11 +667,12 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
}
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
struct ncclP2pRequest* req = (struct ncclP2pRequest*)reqBuff;
if (reqSize != sizeof(struct ncclP2pRequest)) return ncclInternalError;
int size = req->size;
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
NCCLCHECK(ncclP2pAllocateShareableBuffer(size, req->refcount, &p2pBuff->ipcDesc, &p2pBuff->directPtr));
p2pBuff->size = size;
if (ncclCuMemEnable()) {
// cuMem API support
@@ -651,7 +706,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
if (useMemcpy) {
struct p2pShmProxyInfo* proxyInfo = (struct p2pShmProxyInfo*)connection->transportResources;
if (proxyInfo) {
NCCLCHECK(ncclShmClose(proxyInfo->handle));
NCCLCHECK(ncclShmIpcClose(&proxyInfo->desc));
NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
NCCLCHECK(ncclCudaFree(proxyInfo->ceDevBuff));
CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
@@ -752,11 +807,382 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru
return ncclSuccess;
}
ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) {
ncclResult_t ret = ncclSuccess;
struct ncclReg *regRecord = NULL;
struct ncclIpcRegInfo* newInfo = NULL;
uintptr_t* peerRmtAddrs = NULL;
bool legacyIpcCap = false;
size_t baseSize = 0;
void* baseAddr = NULL;
bool needUpdate = false;
*regBufFlag = 0;
*offsetOut = 0;
*peerRmtAddrsOut = NULL;
if (comm && userbuff && buffSize > 0 && nPeers > 0) {
NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, &regRecord), ret, fail);
if (regRecord) {
// buffer was registered by the user, we need to start to register or reuse it
int peerLocalRank;
for (int p = 0; p < nPeers; p++) {
int peerRank = peerRanks[p];
peerLocalRank = comm->rankToLocalRank[peerRank];
if (regRecord->ipcInfos[peerLocalRank]) {
// We already have IPC info for peerLocalRank, no need to register it, we can reuse it
*regBufFlag = 1;
INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr);
} else {
// Register buffer with peerLocalRank
struct ncclProxyConnector* proxyConn = NULL;
struct p2pIpcExpInfo ipcInfo;
if (baseAddr == NULL) {
CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
}
if (comm->gproxyConn[peerRank].initialized == false)
NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
proxyConn = &comm->gproxyConn[peerRank];
ipcInfo.legacyIpcCap = legacyIpcCap;
// Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll
// get the CUDA legacy mem handle, or through cuMem*.
if (ipcInfo.legacyIpcCap) {
// legacy export
if (comm->directMode) goto fail;
CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
} else if (ncclCuMemEnable()) {
CUmemGenericAllocationHandle handle;
if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) {
// if cuMem* export fails, retry legacy export
if (comm->directMode) goto fail;
CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
ipcInfo.legacyIpcCap = true;
} else {
// cuMem* export to file descriptor or fabric handle
if (proxyConn->sameProcess) {
memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
} else {
if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
int expFd = -1;
CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
SYSCHECKGOTO(close(expFd), "close", ret, fail);
} else {
// Allow this to silently fail for cases where the user buff cannot be registered
if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) {
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
goto fail;
}
}
}
CUCHECKGOTO(cuMemRelease(handle), ret, fail);
}
} else {
// nothing works, just return
goto fail;
}
void* rmtRegAddr = NULL;
ipcInfo.size = baseSize;
ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr;
// Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side
// and get the remote register address back.
if (proxyConn)
NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
if (rmtRegAddr) {
NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail);
assert(regRecord->ipcInfos[peerLocalRank] == NULL);
regRecord->state |= IPC_REG_COMPLETE;
newInfo->peerRank = peerRank;
newInfo->baseAddr = baseAddr;
newInfo->impInfo.rmtRegAddr = rmtRegAddr;
newInfo->impInfo.offset = ipcInfo.offset;
newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
newInfo->ipcProxyconn = proxyConn;
regRecord->ipcInfos[peerLocalRank] = newInfo;
if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) {
NCCLCHECKGOTO(ncclCalloc(&regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
}
regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr;
needUpdate = true;
*regBufFlag = 1;
INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr);
}
}
}
if (*regBufFlag) {
if (type == NCCL_IPC_COLLECTIVE) {
// for collective, store registered remote buffers into dev memory for future reference
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) {
NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL)
NCCLCHECKGOTO(ncclCudaCallocAsync(&regRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
if (needUpdate)
NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
}
peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs;
} else {
assert(nPeers == 1);
// p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct
peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank];
}
*offsetOut = (uintptr_t)userbuff - regRecord->addr;
*peerRmtAddrsOut = peerRmtAddrs;
}
}
}
exit:
return ret;
fail:
*regBufFlag = 0;
*offsetOut = 0;
*peerRmtAddrsOut = NULL;
if (newInfo) free(newInfo);
goto exit;
}
// Cleanup record consumed by cleanupIpc(). Instances are heap-allocated by
// ncclIpcGraphRegisterBuffer(), enqueued on a cleanup queue, and freed by cleanupIpc().
struct ncclIpcCleanupCallback {
// Queue linkage and callback pointer. cleanupIpc() casts the ncclCommCallback* it receives
// back to ncclIpcCleanupCallback*, so this member has to remain the first one.
struct ncclCommCallback base;
// Selects the active union member: true -> regIpcAddrs, false -> regInfo.
bool isAddrs;
union {
// One peer registration; released through ncclIpcDeregBuffer().
struct ncclIpcRegInfo regInfo;
// Host/device arrays of remote addresses; released with free() / ncclCudaFree().
struct ncclPeerRegIpcAddr regIpcAddrs;
};
};
// Cleanup callback for records created by ncclIpcGraphRegisterBuffer().
//
// Depending on obj->isAddrs, either releases the host/device remote-address arrays or
// deregisters a single peer registration through ncclIpcDeregBuffer(). The record itself is
// always freed, including when the release step fails; the error of that step is returned.
static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) {
  struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb;
  ncclResult_t ret = ncclSuccess;

  if (obj->isAddrs) {
    free(obj->regIpcAddrs.hostPeerRmtAddrs);
    if (obj->regIpcAddrs.devPeerRmtAddrs)
      NCCLCHECKGOTO(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs), ret, exit);
  } else {
    NCCLCHECKGOTO(ncclIpcDeregBuffer(comm, &obj->regInfo), ret, exit);
  }

exit:
  // Free the record on every path: the queue no longer references it once the callback ran.
  free(obj);
  return ret;
}
// Register a user buffer with a set of intra-node peers for use inside a CUDA graph.
//
// Unlike ncclIpcLocalRegisterBuffer(), no registration cache is consulted: every call
// registers the whole allocation containing userbuff with each peer and enqueues one cleanup
// record per successful registration (plus one for the address arrays in the collective case).
//
//   comm              communicator owning the proxy connections and streams
//   userbuff          user buffer address; buffSize is only used for logging here
//   peerRanks         array of nPeers peer ranks
//   type              NCCL_IPC_COLLECTIVE returns a device array of remote addresses indexed
//                     by local rank; any other type requires nPeers == 1 and returns the
//                     remote address itself
//   regBufFlag        out: 1 when at least one registration succeeded, 0 otherwise
//   offsetOut         out: offset of userbuff from the base of its allocation
//   peerRmtAddrsOut   out: remote address(es), see `type`
//   cleanupQueuePtr   ncclIntruQueue of ncclCommCallback receiving non-legacy cleanup records;
//                     legacy-IPC records go to comm->legacyRegCleanupQueue instead
//   nCleanupQueueElts optional counter, incremented once per peer record enqueued
//
// Some paths jump to `fail` without setting an error (direct mode with legacy IPC, no usable
// export mechanism); the outputs are then reset and ncclSuccess is returned.
ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) {
  ncclResult_t ret = ncclSuccess;
  struct ncclProxyConnector* proxyConn = NULL;
  struct p2pIpcExpInfo ipcInfo;
  void* baseAddr = NULL;
  size_t baseSize = 0;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue = reinterpret_cast<struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>*>(cleanupQueuePtr);
  uintptr_t* peerRmtAddrs = NULL;
  struct ncclIpcCleanupCallback* addrsRecord = NULL;

  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail);
  CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail);
  if (type == NCCL_IPC_COLLECTIVE) {
    // collective needs host memory array to hold all remote buffer addrs.
    // We need to put this into graph release queue
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail);
    addrsRecord->base.fn = cleanupIpc;
    addrsRecord->isAddrs = true;
    NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail);
  } else {
    assert(nPeers == 1);
    // p2p does not need anything, just returning the remote buffer is enough, but for now, we register
    // peer one by one so nPeers must be 1
  }

  for (int p = 0; p < nPeers; ++p) {
    int peerRank = peerRanks[p];
    if (comm->gproxyConn[peerRank].initialized == false)
      NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail);
    proxyConn = &comm->gproxyConn[peerRank];
    // Same as local registration. Get the mem handle for that buffer. It may have been allocated through
    // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*.
    if (ipcInfo.legacyIpcCap) {
      if (comm->directMode) goto fail;
      CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
    } else if (ncclCuMemEnable()) {
      // cuMem* export
      CUmemGenericAllocationHandle handle;
      if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) {
        if (comm->directMode) goto fail;
        CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail);
        ipcInfo.legacyIpcCap = true;
      } else {
        if (proxyConn->sameProcess) {
          memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle));
        } else {
          if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
            // This branch is only reached for a peer in another process, so the exported fd
            // is always handed to the proxy and then closed locally.
            int expFd = -1;
            CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail);
            NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail);
            SYSCHECKGOTO(close(expFd), "close", ret, fail);
          } else {
            CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail);
          }
        }
        CUCHECKGOTO(cuMemRelease(handle), ret, fail);
      }
    } else {
      goto fail;
    }

    void* rmtRegAddr = NULL;
    ipcInfo.size = baseSize;
    ipcInfo.offset = 0;
    NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail);
    if (rmtRegAddr) {
      struct ncclIpcCleanupCallback* record;
      NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail);
      record->base.fn = cleanupIpc;
      record->isAddrs = false;
      record->regInfo.peerRank = peerRank;
      record->regInfo.baseAddr = baseAddr;
      record->regInfo.impInfo.rmtRegAddr = rmtRegAddr;
      record->regInfo.impInfo.offset = 0;
      record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap;
      record->regInfo.ipcProxyconn = proxyConn;
      // store the remote address into host addr array
      if (type == NCCL_IPC_COLLECTIVE)
        addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr;
      else
        peerRmtAddrs = (uintptr_t*)rmtRegAddr;
      *regBufFlag = 1;
      if (ipcInfo.legacyIpcCap)
        ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base);
      else
        ncclIntruQueueEnqueue(cleanupQueue, &record->base);
      if (nCleanupQueueElts) *nCleanupQueueElts += 1;
      INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr);
    }
  }

  if (type == NCCL_IPC_COLLECTIVE) {
    // allocate the dev addr array and copy all previously stored addrs into it.
    NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    // Both arrays hold comm->localRanks entries; the copy count has to match the allocation.
    NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail);
    NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail);
    peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs;
    // From here on the queue owns addrsRecord; nothing below can jump to `fail`.
    if (ipcInfo.legacyIpcCap)
      ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base);
    else
      ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base);
  }
  *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr;
  *peerRmtAddrsOut = peerRmtAddrs;

exit:
  return ret;
fail:
  *regBufFlag = 0;
  *offsetOut = 0;
  *peerRmtAddrsOut = NULL;
  // addrsRecord is only enqueued as the very last step of the success path, so it is still
  // exclusively ours here. Release it unless the device array was already created: in that
  // case stream work may still reference the host array, so it is deliberately left alone.
  if (addrsRecord && addrsRecord->regIpcAddrs.devPeerRmtAddrs == NULL) {
    free(addrsRecord->regIpcAddrs.hostPeerRmtAddrs);
    free(addrsRecord);
    addrsRecord = NULL;
  }
  goto exit;
}
// Ask the peer's proxy to drop one IPC registration.
// Sends regInfo->impInfo as an ncclProxyMsgDeregister request over regInfo->ipcProxyconn and
// waits for completion; no response payload is expected.
ncclResult_t ncclIpcDeregBuffer(struct ncclComm* comm, struct ncclIpcRegInfo* regInfo) {
  NCCLCHECK(ncclProxyCallBlocking(
      comm,
      regInfo->ipcProxyconn,
      ncclProxyMsgDeregister,
      &regInfo->impInfo,
      sizeof(struct ncclIpcImpInfo),
      /* respBuff */ NULL,
      /* respSize */ 0));

  INFO(NCCL_REG, "rank %d - IPC deregistered buffer %p peer %d ipc remote buffer %p", comm->rank, regInfo->baseAddr, regInfo->peerRank, regInfo->impInfo.rmtRegAddr);

  return ncclSuccess;
}
// Proxy-side handler for ncclProxyMsgRegister.
//
// The request (struct p2pIpcExpInfo) describes a buffer exported by the requesting rank. The
// proxy imports and maps it locally and writes the resulting address (base + offset) into
// respBuff as a void*. On failure NULL is written instead. *done is always set before
// returning from the exit path.
static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
  void* regAddr = NULL;
  ncclResult_t ret = ncclSuccess;
  bool mapped = false;
  // True only when THIS function obtained a reference on `handle` through
  // cuMemImportFromShareableHandle(). In the same-process case the handle value is merely
  // copied from the requester, so there is no reference of ours to release on failure.
  bool imported = false;
  CUmemGenericAllocationHandle handle;

  assert(sizeof(struct p2pIpcExpInfo) == reqSize);
  assert(sizeof(void*) == respSize);

  // request peer passes all necessary buffer info to import. The proxy thread would register
  // the buffer locally and return register addr back
  if (ipcExpInfo->legacyIpcCap) {
    // legacy import
    CUDACHECKGOTO(cudaIpcOpenMemHandle(&regAddr, ipcExpInfo->ipcDesc.devIpc, cudaIpcMemLazyEnablePeerAccess), ret, fail);
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  } else {
    // cuMem import
    if (connection->sameProcess) {
      // if proxy is same process as request peer, we just need to map the handle.
      memcpy(&handle, &ipcExpInfo->ipcDesc.memHandle, sizeof(CUmemGenericAllocationHandle));
    } else {
      if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)(uintptr_t)ipcExpInfo->impFd, ncclCuMemHandleType), ret, fail);
        // Mark the handle as ours before close(), so a close() failure still releases it.
        imported = true;
        SYSCHECKGOTO(close(ipcExpInfo->impFd), "close", ret, fail);
      } else {
        CUCHECKGOTO(cuMemImportFromShareableHandle(&handle, (void*)&ipcExpInfo->ipcDesc.cuDesc, ncclCuMemHandleType), ret, fail);
        imported = true;
      }
    }
    CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&regAddr, ipcExpInfo->size, /* alignment */ 0, /* addr */ 0, /* flags */ 0), ret, fail);
    CUCHECKGOTO(cuMemMap((CUdeviceptr)regAddr, ipcExpInfo->size, /* offset */ 0, handle, /* flags */ 0), ret, fail);
    mapped = true;
    // Allow access by the local GPU
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = proxyState->cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail);
    // Apply the offset last: the cleanup below relies on regAddr still being the mapping base.
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  }
  INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess);

exit:
  memcpy(respBuff, (void*)&regAddr, sizeof(void*));
  *done = 1;
  return ret;
fail:
  if (!ipcExpInfo->legacyIpcCap) {
    // Best-effort rollback. Results are intentionally ignored so that the original error in
    // `ret` is preserved and the response / *done are still written at `exit`.
    if (mapped) (void)CUPFN(cuMemUnmap((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (regAddr) (void)CUPFN(cuMemAddressFree((CUdeviceptr)regAddr, ipcExpInfo->size));
    if (imported) (void)CUPFN(cuMemRelease(handle));
  }
  regAddr = NULL;
  goto exit;
}
// Proxy-side handler for ncclProxyMsgDeregister.
// The request is a struct ncclIpcImpInfo; the mapping base is recovered by subtracting the
// stored offset from rmtRegAddr and released with the routine matching how it was imported.
// *done is set on both the success and the error path.
static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) {
  struct ncclIpcImpInfo* impInfo = (struct ncclIpcImpInfo*)reqBuff;
  ncclResult_t status = ncclSuccess;
  void* mappingBase;

  assert(sizeof(struct ncclIpcImpInfo) == reqSize);

  mappingBase = (void*)((uintptr_t)impInfo->rmtRegAddr - impInfo->offset);
  if (impInfo->legacyIpcCap) {
    // Legacy CUDA IPC mapping.
    CUDACHECKGOTO(cudaIpcCloseMemHandle(mappingBase), status, finish);
  } else if (connection->sameProcess) {
    // Requester lives in this process.
    NCCLCHECKGOTO(ncclCuMemFreeAddr(mappingBase), status, finish);
  } else {
    // Requester lives in another process.
    NCCLCHECKGOTO(ncclCudaFree(mappingBase), status, finish);
  }

finish:
  *done = 1;
  return status;
}
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, NULL }
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL, p2pProxyRegister, p2pProxyDeregister }
};
static void initCeOperation() {
+313 -101
Ver ficheiro
@@ -5,35 +5,58 @@
************************************************************************/
#include "comm.h"
#include "shmutils.h"
#include "shm.h"
#include "transport.h"
struct shmConnectInfo {
char shmName[7];
int shmSize;
#define SHM_PATH_MAX 128
#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
// Pair of pointers describing one shared buffer, filled in by
// ncclShmAllocateShareableBuffer() in the proxy setup handler.
struct shmBuffInfo {
// Stored by the setup functions as resources->hostMem (presumably the host-side mapping).
void *hptr;
// Stored by the setup functions as resources->devHostMem (presumably the device-visible mapping).
void *dptr;
};
// Connect information exchanged between SHM peers; it is stored in a struct ncclConnect and
// is also the response of the proxy setup call.
struct shmConnectInfo {
// Descriptor of the shared buffer; the peer passes it to ncclShmImportShareableBuffer().
ncclShmIpcDesc_t desc;
// Pointers to the buffer as seen by the allocating side.
struct shmBuffInfo buf;
};
static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
struct shmSendResources {
int remShmSize;
struct ncclRecvMem* remHostMem;
struct ncclRecvMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
ncclShmIpcDesc_t remDesc;
struct ncclSendMem* hostMem;
struct ncclSendMem* devHostMem;
ncclShmHandle_t hostHandle;
};
struct shmRecvResources {
int remShmSize;
struct ncclSendMem* remHostMem;
struct ncclSendMem* devRemHostMem;
ncclShmHandle_t remHandle;
int shmSize;
ncclShmIpcDesc_t remDesc;
struct ncclRecvMem* hostMem;
struct ncclRecvMem* devHostMem;
ncclShmHandle_t hostHandle;
};
// Per-connection state kept by the SHM proxy in connection->transportResources. It is also
// the request/response payload of the proxy connect call.
struct shmProxyInfo {
// Allocated with ncclCudaHostCalloc() in the proxy connect handlers.
struct ncclRecvMem* ceRecvMem;
// Allocated with ncclCudaCalloc() (size buffSizes[NCCL_PROTO_SIMPLE]) in the proxy connect handlers.
char* devFifo;
// Copied from the connect request; the callers pass conn.buffs[NCCL_PROTO_SIMPLE].
char* shmFifo;
// Copied from the connect request.
struct ncclSendMem* sendMem;
// Copied from the connect request.
struct ncclRecvMem* recvMem;
// used by progress only
uint64_t step;
// Created in the proxy connect handlers, destroyed in the proxy free handlers.
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
// Descriptor of the shared buffer allocated in proxy setup; closed with ncclShmIpcClose()
// in the proxy free handlers.
ncclShmIpcDesc_t desc;
};
// Request payload of the SHM proxy setup call (ncclProxyMsgSetup).
struct shmRequest {
// Size in bytes of the shared buffer to allocate.
size_t size;
// Set by the setup functions when both peers have the same hostHash and pidHash;
// forwarded as-is to ncclShmAllocateShareableBuffer().
bool legacy;
};
#define SHM_SEND_SIDE 1
@@ -48,14 +71,14 @@ static int shmLocality = 0;
static void initCeOperation();
/* Determine two peers can communicate with SHM */
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 0;
initCeOperation();
if (ncclParamShmDisable() == 1) return ncclSuccess;
int useNet = 0;
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
if (useNet) return ncclSuccess;
// Same host?
@@ -76,22 +99,29 @@ static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct
/* Create and return connect structures for this peer to connect to me */
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct shmSendResources* resources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
size_t shmSize = sizeof(struct ncclSendMem);
struct shmRequest req;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char shmPath[PATH_MAX];
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclSendMem);
if (shmLocality == SHM_SEND_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
req.size = shmSize;
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
req.legacy = true;
else
req.legacy = false;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
resources->hostMem = (struct ncclSendMem*)info->buf.hptr;
resources->devHostMem = (struct ncclSendMem*)info->buf.dptr;
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%d] -> %d[%d] via SHM/%s/%s", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct");
return ncclSuccess;
@@ -99,52 +129,43 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct shmRecvResources* resources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
size_t shmSize = sizeof(struct ncclRecvMem);
struct shmRequest req;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char shmPath[PATH_MAX];
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclRecvMem);
if (shmLocality == SHM_RECV_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
req.size = shmSize;
if (myInfo->hostHash == peerInfo->hostHash && myInfo->pidHash == peerInfo->pidHash)
req.legacy = true;
else
req.legacy = false;
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn));
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo)));
resources->hostMem = (struct ncclRecvMem*)info->buf.hptr;
resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr;
return ncclSuccess;
}
struct shmProxyInfo {
struct ncclRecvMem* ceRecvMem;
char* devFifo;
char* shmFifo;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
// used by progress only
uint64_t step;
cudaStream_t stream;
cudaEvent_t events[NCCL_STEPS];
};
/* Connect to this peer */
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
char* buff;
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = buff;
buff += comm->buffSizes[p];
@@ -157,9 +178,6 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
send->conn.connFifo = resources->devRemHostMem->connFifo;
}
if (useMemcpySend) {
int tpProxyRank;
tpProxyRank = comm->topParentRanks[comm->rank];
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, tpProxyRank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -177,14 +195,11 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
char* buff;
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));
NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc));
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = buff;
buff += comm->buffSizes[p];
@@ -194,7 +209,6 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
recv->conn.stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
@@ -210,8 +224,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
static ncclResult_t shmSendFree(struct ncclConnector* send) {
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
free(resources);
send->transportResources = NULL;
}
@@ -221,8 +234,7 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
if (resources) {
NCCLCHECK(ncclShmClose(resources->hostHandle));
NCCLCHECK(ncclShmClose(resources->remHandle));
NCCLCHECK(ncclShmIpcClose(&resources->remDesc));
free(resources);
recv->transportResources = NULL;
}
@@ -230,51 +242,76 @@ static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
}
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
ncclResult_t ret = ncclSuccess;
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
proxyInfo->shmFifo = reqInfo->shmFifo;
proxyInfo->sendMem = reqInfo->sendMem;
proxyInfo->recvMem = reqInfo->recvMem;
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
}
connection->proxyAppendPtr = &connection->proxyAppend;
connection->transportResources = proxyInfo;
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, respSize);
return ncclSuccess;
*done = 1;
exit:
return ret;
fail:
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
free(proxyInfo);
goto exit;
}
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
ncclResult_t ret = ncclSuccess;
if (reqSize != sizeof(struct shmProxyInfo) || respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
struct shmProxyInfo* proxyInfo;
NCCLCHECK(ncclCalloc(&proxyInfo, 1));
if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(proxyInfo, reqBuff, reqSize);
NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]));
NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
CUDACHECK(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking));
struct shmProxyInfo* reqInfo = (struct shmProxyInfo*)reqBuff;
proxyInfo = (struct shmProxyInfo*)connection->transportResources;
proxyInfo->shmFifo = reqInfo->shmFifo;
proxyInfo->sendMem = reqInfo->sendMem;
proxyInfo->recvMem = reqInfo->recvMem;
NCCLCHECKGOTO(ncclCudaCalloc(&proxyInfo->devFifo, proxyState->buffSizes[NCCL_PROTO_SIMPLE]), ret, fail);
NCCLCHECKGOTO(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1), ret, fail);
CUDACHECKGOTO(cudaStreamCreateWithFlags(&proxyInfo->stream, cudaStreamNonBlocking), ret, fail);
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventCreate(proxyInfo->events+i));
CUDACHECKGOTO(cudaEventCreate(proxyInfo->events+i), ret, fail);
}
connection->proxyAppendPtr = &connection->proxyAppend;
connection->transportResources = proxyInfo;
if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
memcpy(respBuff, proxyInfo, respSize);
return ncclSuccess;
*done = 1;
exit:
return ret;
fail:
if (proxyInfo->ceRecvMem) ncclCudaHostFree(proxyInfo->ceRecvMem);
if (proxyInfo->devFifo) (void)ncclCudaFree(proxyInfo->devFifo);
free(proxyInfo);
goto exit;
}
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) {
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
if (useMemcpySend) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
}
NCCLCHECK(ncclShmIpcClose(&resources->desc));
free(connection->transportResources);
connection->transportResources = NULL;
}
@@ -285,12 +322,15 @@ static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, str
struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
if (resources) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
if (useMemcpyRecv) {
CUDACHECK(cudaStreamDestroy(resources->stream));
NCCLCHECK(ncclCudaFree(resources->devFifo));
NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
for (int i=0; i<NCCL_STEPS; i++) {
CUDACHECK(cudaEventDestroy(resources->events[i]));
}
}
NCCLCHECK(ncclShmIpcClose(&resources->desc));
free(connection->transportResources);
connection->transportResources = NULL;
}
@@ -413,12 +453,37 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru
return ncclSuccess;
}
// Descriptor for the shared-memory transport: its name ("SHM"), the
// connectivity check, then one callback table for the send side and one for
// the recv side. Apart from setup/connect/free, every slot is NULL in this
// initializer. initCeOperation() assigns shmTransport.send/recv.proxyConnect,
// .proxyFree and .proxyProgress at runtime when useMemcpySend / useMemcpyRecv
// are set.
// NOTE(review): slot order is defined by struct ncclTransport, which is not
// visible here — confirm before adding or reordering entries.
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL, NULL }
};
/*
 * Proxy-side setup for the send half of a SHM connection.
 *
 * reqBuff must hold a struct shmRequest and respBuff must have room for a
 * struct shmConnectInfo; any other size yields ncclInternalError. On success
 * a shareable host buffer of req->size bytes is allocated, its IPC descriptor
 * and host/device pointers are written into the response, and the new
 * shmProxyInfo is stored in connection->transportResources.
 *
 * If the buffer allocation fails, the shmProxyInfo is freed before returning
 * the error, and connection->transportResources is left untouched.
 * `done` is not written by this function.
 */
static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  /* From here on proxyInfo is owned by this function until it is published
   * on the connection, so a failure must release it. */
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;
exit:
  return ret;
fail:
  free(proxyInfo);
  goto exit;
}
/*
 * Proxy-side setup for the recv half of a SHM connection.
 *
 * reqBuff must hold a struct shmRequest and respBuff must have room for a
 * struct shmConnectInfo; any other size yields ncclInternalError. On success
 * a shareable host buffer of req->size bytes is allocated, its IPC descriptor
 * and host/device pointers are written into the response, and the new
 * shmProxyInfo is stored in connection->transportResources.
 *
 * If the buffer allocation fails, the shmProxyInfo is freed before returning
 * the error, and connection->transportResources is left untouched.
 * `done` is not written by this function.
 */
static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  ncclResult_t ret = ncclSuccess;
  struct shmRequest* req = (struct shmRequest*)reqBuff;
  struct shmConnectInfo* info = (struct shmConnectInfo*)respBuff;
  struct shmProxyInfo* proxyInfo = NULL;

  /* check message size */
  if (reqSize != sizeof(struct shmRequest)) return ncclInternalError;
  if (respSize != sizeof(struct shmConnectInfo)) return ncclInternalError;

  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  /* From here on proxyInfo is owned by this function until it is published
   * on the connection, so a failure must release it. */
  NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), ret, fail);
  memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t));
  connection->transportResources = proxyInfo;
exit:
  return ret;
fail:
  free(proxyInfo);
  goto exit;
}
static void initCeOperation() {
static int init = 0;
@@ -427,12 +492,10 @@ static void initCeOperation() {
useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
if (useMemcpySend) {
shmTransport.send.proxyConnect = shmSendProxyConnect;
shmTransport.send.proxyFree = shmSendProxyFree;
shmTransport.send.proxyProgress = shmSendProxyProgress;
}
if (useMemcpyRecv) {
shmTransport.recv.proxyConnect = shmRecvProxyConnect;
shmTransport.recv.proxyFree = shmRecvProxyFree;
shmTransport.recv.proxyProgress = shmRecvProxyProgress;
}
shmLocality = ncclParamShmLocality();
@@ -443,3 +506,152 @@ static void initCeOperation() {
init = 1;
}
}
/*
 * Allocate a host buffer of `size` bytes that can be shared with other
 * processes and describe it in `desc`.
 *
 * Two mechanisms are used:
 *  - cuMem host allocation, when built with CUDART_VERSION >= 12020 and
 *    ncclCuMemEnable() && ncclCuMemHostEnable() hold and `legacy` is false.
 *    desc->shmci is filled and desc->legacy is set to false.
 *  - ncclShmOpen() otherwise. desc->shmli is filled (size, handle, and the
 *    part of the generated path that follows "/dev/shm/nccl-") and
 *    desc->legacy is set to true.
 *
 * hptr receives the host pointer. dptr may be NULL on the cuMem path, where
 * it otherwise receives the same address as *hptr; on the other path it is
 * handed to ncclShmOpen() unchanged.
 *
 * Returns ncclInvalidArgument if desc or hptr is NULL or tpProxyRank < -1.
 */
ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) {
  if (desc == NULL || hptr == NULL || tpProxyRank < -1) {
    WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank);
    return ncclInvalidArgument;
  }

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !legacy) {
    // cuMem API support
    CUmemAllocationHandleType handleType = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle allocHandle;

    NCCLCHECK(ncclCuMemHostAlloc(hptr, &allocHandle, size));
    if (handleType != CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, allocHandle, handleType, 0));
    } else {
      // Return the native cuMem handle for later Export/Import via UDS
      memcpy(&desc->shmci.data, &allocHandle, sizeof(allocHandle));
      desc->shmci.tpProxyRank = tpProxyRank;
    }
    desc->shmci.size = size;
    desc->shmci.ptr = *hptr;
    if (dptr) *dptr = *hptr;
    desc->legacy = false;
    INFO(NCCL_SHM, "CUMEM allocated shareable buffer %p size %zi", desc->shmci.ptr, desc->shmci.size);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */

  // ncclShmOpen path: used when cuMem host allocation is unavailable, disabled
  // or explicitly bypassed via `legacy`.
  {
    char path[SHM_PATH_MAX] = { '\0' };
    desc->shmli.shmSize = size;
    NCCLCHECK(ncclShmOpen(path, size, hptr, dptr, 1, &desc->shmli.handle));
    // Only the part of the path after the fixed prefix is kept in the descriptor.
    memcpy(desc->shmli.shmSuffix, path + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix));
    desc->legacy = true;
    INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", path, desc->shmli.shmSize, *hptr);
  }
  return ncclSuccess;
}
/*
 * Map into this process a shareable host buffer described by `desc`.
 *
 * Two mechanisms are used, selected the same way on every call:
 *  - cuMem, when built with CUDART_VERSION >= 12020 and
 *    ncclCuMemEnable() && ncclCuMemHostEnable() hold and desc->legacy is
 *    false. The handle is imported (through an fd obtained from the proxy
 *    when SHM_HANDLE_TYPE is a POSIX file descriptor), an address range is
 *    reserved and mapped, and read/write access is granted to the current
 *    CUDA device and to its host NUMA node. descOut->shmci.ptr receives the
 *    mapped address and descOut->legacy is set to false.
 *  - ncclShmOpen() on "/dev/shm/nccl-<suffix>" otherwise.
 *    descOut->shmli.handle receives the handle and descOut->legacy is set
 *    to true.
 *
 * hptr receives the host pointer. dptr may be NULL on the cuMem path, where
 * it otherwise receives the same address; on the other path it is handed to
 * ncclShmOpen() unchanged.
 *
 * Returns ncclInvalidArgument if comm, desc, hptr or descOut is NULL.
 */
ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) {
  if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) {
    WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut);
    return ncclInvalidArgument;
  }

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    // cuMem API support
    CUdeviceptr hostptr = 0;
    CUmemAllocationHandleType type = SHM_HANDLE_TYPE;
    CUmemGenericAllocationHandle handle;
    int cudaDev;
    CUdevice currentDev;
    CUmemAccessDesc accessDesc = {};
    int cpuNumaNodeId;
    size_t granularity;
    size_t size = desc->shmci.size;
    CUmemAllocationProp prop = {};

    // Import and map the remote memory descriptor to the local GPU
    if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) {
      // UDS fd support
      int fd = -1;
      // Send cuMem handle to remote for conversion to an fd
      NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd));
      CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type));
      (void) close(fd);
    } else {
      CUCHECK(cuMemImportFromShareableHandle(&handle, &desc->shmci.handle, type));
    }

    // Get cpu numa id
    CUDACHECK(cudaGetDevice(&cudaDev));
    CUCHECK(cuDeviceGet(&currentDev, cudaDev));
    CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev));
    // A negative id is replaced by node 0.
    if (cpuNumaNodeId < 0) cpuNumaNodeId = 0;

    // Get granularity
    prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.requestedHandleTypes = type;
    prop.location.id = cpuNumaNodeId;
    CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    // The mapped size is the descriptor's size adjusted by ALIGN_SIZE.
    ALIGN_SIZE(size, granularity);

    // Reserve and map address
    CUCHECK(cuMemAddressReserve(&hostptr, size, /* alignment */ 0, /* addr */ 0, /* flags */ 0));
    CUCHECK(cuMemMap(hostptr, size, /* offset */ 0, handle, /* flags */ 0));

    // Allow access by the local GPU
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = cudaDev;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    // Allow access by the local numa
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    accessDesc.location.id = cpuNumaNodeId;
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CUCHECK(cuMemSetAccess(hostptr, size, &accessDesc, 1));

    descOut->shmci.ptr = *hptr = (void *)hostptr;
    descOut->legacy = false;
    if (dptr) *dptr = (void *)hostptr;
    // granularity is a size_t, so it is printed with %zu.
    INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %zu", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity);
    return ncclSuccess;
  }
#endif /* CUDART_VERSION >= 12020 */

  // ncclShmOpen path: rebuild the path from the suffix stored in the descriptor.
  {
    char shmPath[SHM_PATH_MAX];
    // Bounded formatting: the suffix comes from a descriptor produced by
    // another process, so never write past shmPath.
    snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix);
    NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle));
    descOut->legacy = true;
    INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr);
  }
  return ncclSuccess;
}
/*
 * Release the buffer referenced by an SHM IPC descriptor.
 *
 * A NULL descriptor is accepted and treated as a no-op. When built with
 * CUDART_VERSION >= 12020 and ncclCuMemEnable() && ncclCuMemHostEnable()
 * hold and desc->legacy is false, desc->shmci.ptr is freed with
 * ncclCuMemHostFree(); in every other case desc->shmli.handle is closed with
 * ncclShmClose(). Any error from those calls is returned to the caller.
 */
ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc) {
  if (desc == NULL) return ncclSuccess;

#if CUDART_VERSION >= 12020
  if (ncclCuMemEnable() && ncclCuMemHostEnable() && !desc->legacy) {
    NCCLCHECK(ncclCuMemHostFree(desc->shmci.ptr));
    return ncclSuccess;
  }
#endif

  NCCLCHECK(ncclShmClose(desc->shmli.handle));
  return ncclSuccess;
}
// Descriptor for the shared-memory transport: its name ("SHM"), the
// connectivity check, then one callback table for the send side and one for
// the recv side. Besides setup/connect/free, each table statically installs a
// proxy setup and a proxy free callback; the remaining slots are NULL here.
// initCeOperation() assigns further shmTransport.send/recv proxy members
// (e.g. proxyConnect, proxyProgress) at runtime when useMemcpySend /
// useMemcpyRecv are set.
// NOTE(review): slot order is defined by struct ncclTransport, which is not
// visible here — confirm before adding or reordering entries.
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, shmSendProxySetup, NULL, shmSendProxyFree, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, shmRecvProxySetup, NULL, shmRecvProxyFree, NULL }
};