2.8.3-1
Optimization for Tree allreduce on A100.
Improve aggregation performance.
Use shared buffers for inter-node send/recv.
Add NVTX profiling hooks.
Accelerate alltoall connections by merging communication for all
channels.
Add support for one hop communication through NVLink, for faster
send/recv communication on cubemesh topologies like DGX-1.
Improve alltoall scheduling to better balance intra/inter node
communication.
Increase send/recv parallelism by 8x, each warp sending or
receiving to a different peer.
Net: move to v4.
Net: make flush operation asynchronous to accelerate alltoall.
Net: define maximum number of requests.
Fix hang when using LL128 protocol after 2^31 steps.
Fix #379 : topology injection failing when using less GPUs than
described in the XML.
Fix #394 : protocol mismatch causing hangs or crashes when using
one GPU per node.
[ROCm/rccl commit: 920dbe5b35]
This commit is contained in:
@@ -127,7 +127,7 @@ void ncclDebugInit() {
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
if (ncclDebugLevel < level) return;
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
|
||||
|
||||
// Gather the rank information. This can take > 1us so we want to make sure
|
||||
// we only do it when needed.
|
||||
@@ -144,11 +144,11 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
if (level == NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
|
||||
else if (level == NCCL_LOG_INFO)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
|
||||
else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
|
||||
Verwijs in nieuw issue
Block a user