920dbe5b35
Optimization for Tree allreduce on A100. Improve aggregation performance. Use shared buffers for inter-node send/recv. Add NVTX profiling hooks. Accelerate alltoall connections by merging communication for all channels. Add support for one hop communication through NVLink, for faster send/recv communication on cubemesh topologies like DGX-1. Improve alltoall scheduling to better balance intra/inter node communication. Increase send/recv parallelism by 8x, each warp sending or receiving to a different peer. Net: move to v4. Net: make flush operation asynchronous to accelerate alltoall. Net: define maximum number of requests. Fix hang when using LL128 protocol after 2^31 steps. Fix #379 : topology injection failing when using less GPUs than described in the XML. Fix #394 : protocol mismatch causing hangs or crashes when using one GPU per node.
110 行
3.8 KiB
C++
110 行
3.8 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "nccl.h"
|
|
|
|
#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
|
|
|
|
/* Btree which alternates leaves and nodes.
|
|
* Assumes root is 0, which conveniently builds a tree on powers of two,
|
|
* (because we have pow2-1 ranks) which lets us manipulate bits.
|
|
* Find first non-zero bit, then :
|
|
* Find the parent :
|
|
* xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
|
|
* xx11[0] -> xx10[0] (3,7,11 below)
|
|
* Find the children :
|
|
* xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
|
|
* xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
|
|
*
|
|
* Illustration :
|
|
* 0---------------8
|
|
* ______/ \______
|
|
* 4 12
|
|
* / \ / \
|
|
* 2 6 10 \
|
|
* / \ / \ / \ \
|
|
* 1 3 5 7 9 11 13
|
|
*/
|
|
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
|
|
int up, down0, down1;
|
|
int bit;
|
|
for (bit=1; bit<nranks; bit<<=1) {
|
|
if (bit & rank) break;
|
|
}
|
|
|
|
if (rank == 0) {
|
|
*u = -1;
|
|
*d0 = -1;
|
|
// Child rank is > 0 so it has to be our child 1, not 0.
|
|
*d1 = nranks > 1 ? bit >> 1 : -1;
|
|
return ncclSuccess;
|
|
}
|
|
|
|
up = (rank ^ bit) | (bit << 1);
|
|
// if smaller than the parent, we are his first child, otherwise we're his second
|
|
if (up >= nranks) up = (rank ^ bit);
|
|
*parentChildType = (rank < up) ? 0 : 1;
|
|
*u = up;
|
|
|
|
int lowbit = bit >> 1;
|
|
// down0 is always within bounds
|
|
down0 = lowbit == 0 ? -1 : rank-lowbit;
|
|
|
|
down1 = lowbit == 0 ? -1 : rank+lowbit;
|
|
// Make sure down1 is within bounds
|
|
while (down1 >= nranks) {
|
|
down1 = lowbit == 0 ? -1 : rank+lowbit;
|
|
lowbit >>= 1;
|
|
}
|
|
*d0 = down0; *d1 = down1;
|
|
|
|
return ncclSuccess;
|
|
}
|
|
|
|
/* Build a double binary tree. Take the previous tree for the first tree.
|
|
* For the second tree, we use a mirror tree (if nranks is even)
|
|
*
|
|
* 0---------------8 3----------------11
|
|
* ______/ \ / \______
|
|
* 4 \ / 7
|
|
* / \ \ / / \
|
|
* 2 6 10 1 5 9
|
|
* / \ / \ / \ / \ / \ / \
|
|
* 1 3 5 7 9 11 0 2 4 6 8 10
|
|
*
|
|
* or shift it by one rank (if nranks is odd).
|
|
*
|
|
* 0---------------8 1---------------9
|
|
* ______/ \______ ______/ \______
|
|
* 4 12 5 0
|
|
* / \ / / \ /
|
|
* 2 6 10 3 7 11
|
|
* / \ / \ / \ / \ / \ / \
|
|
* 1 3 5 7 9 11 2 4 6 8 10 12
|
|
*/
|
|
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
|
|
// First tree ... use a btree
|
|
ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);
|
|
// Second tree ... mirror or shift
|
|
if (nranks % 2 == 1) {
|
|
// shift
|
|
int shiftrank = (rank-1+nranks) % nranks;
|
|
int u, d0, d1;
|
|
ncclGetBtree(nranks, shiftrank, &u, &d0, &d1, parentChildType1);
|
|
*s1 = u == -1 ? -1 : (u+1) % nranks;
|
|
*d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
|
|
*d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
|
|
} else {
|
|
// mirror
|
|
int u, d0, d1;
|
|
ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1, parentChildType1);
|
|
*s1 = u == -1 ? -1 : nranks-1-u;
|
|
*d1_0 = d0 == -1 ? -1 : nranks-1-d0;
|
|
*d1_1 = d1 == -1 ? -1 : nranks-1-d1;
|
|
}
|
|
return ncclSuccess;
|
|
}
|