* Changes to device code

* Changes to src/misc

* Changes to graph

* src/include changes

* src/transport changes

* changes in init, enqueue, proxy

* Changes to CMakeLists.txt

* Additional changes to device code

* Additional changes to net.cc

* adding 'compiler warning' tag to ease upstream merge'

* typo correction

* Addessing comments

* Additional changes for new commits

[ROCm/rccl commit: 3f8cac388e]
Этот коммит содержится в:
Avinash
2025-08-05 17:36:23 -05:00
коммит произвёл GitHub
родитель df3b7e477f
Коммит f34d760613
28 изменённых файлов: 104 добавлений и 106 удалений
+2 -1
Просмотреть файл
@@ -1112,7 +1112,8 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized)
target_compile_options(rccl PRIVATE -Wall)
target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy)
target_compile_options(rccl PRIVATE -Wno-format-nonliteral)
target_compile_options(rccl PRIVATE -fgpu-rdc) # Generate relocatable device code (required for extern __shared__)
target_compile_options(rccl PRIVATE -Wno-unused-function)
target_compile_options(rccl PRIVATE -fgpu-rdc)
## Set RCCL compile and linker options for unit tests and code coverage
if(ENABLE_CODE_COVERAGE)
+11 -5
Просмотреть файл
@@ -16,7 +16,10 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
@@ -27,9 +30,6 @@ namespace {
int nelem;
int rankDest;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
@@ -229,6 +229,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
static constexpr int nworkers = NCCL_PAT_NWORKERS;
struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
uint64_t pollCount = 0;
(void)pollCount; // unused variable - compiler warning
__syncthreads(); // Don't start using shared mem until everyone arrives
for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
if (tid == 0) shmem->localAccSize = 0;
@@ -238,11 +239,14 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
if (tid == nworkers) { // Algo computation thread
PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
(void)parallelFactor;// unused variable - compiler warning
int step = 0;
while (1) {
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
int* poll = &ps->flags;
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) {
pollCount++ ;// Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
}
patAlgo.getNextOp(ps);
int last = ps->last;
step++;
@@ -267,7 +271,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
while(1) {
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
int* poll = &ps->flags;
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0){
pollCount++; // Wait for compute thread
}
int last = ps->last;
prims.patCopy(ps, shmem);
if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
+9 -7
Просмотреть файл
@@ -23,7 +23,10 @@ namespace {
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const int nranks = ncclShmem.comm.nRanks;
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
ssize_t size;
ssize_t gridOffset;
ssize_t channelCount;
@@ -34,9 +37,6 @@ namespace {
int nelem;
int chunk;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
@@ -216,7 +216,10 @@ namespace {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
ncclTree *tree = &ncclShmem.channel.tree;
size_t size;
size_t gridOffset;
@@ -226,9 +229,6 @@ namespace {
size_t offset;
int nelem;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
@@ -364,7 +364,9 @@ namespace {
#else
__device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
const int bid = ncclShmem.channelId - work->channelLo;
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
#endif
ncclTree *tree = &ncclShmem.channel.tree;
size_t size;
size_t gridOffset;
+3 -3
Просмотреть файл
@@ -15,7 +15,10 @@ namespace {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
#endif
ncclRing *ring = &ncclShmem.channel.ring;
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
@@ -30,9 +33,6 @@ namespace {
int workNthreads;
bool isNetOffload = work->isOneRPN && work->netRegUsed;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
+1 -1
Просмотреть файл
@@ -20,7 +20,7 @@
if (nthreads == NCCL_MAX_NTHREADS) { \
__THREAD_FENCE; __builtin_amdgcn_s_barrier(); \
} else { \
const int w = threadIdx.x/WARP_SIZE; \
/**const int w = threadIdx.x/WARP_SIZE //unused variable - compiler warning**/;\
const int wid = threadIdx.x%WARP_SIZE; \
if (wid == 0) { \
(BARRIER_NEXT) += (NWORKERS) / WARP_SIZE; \
+4 -3
Просмотреть файл
@@ -148,7 +148,8 @@ private:
__device__ uint64_t readLL(int offset, int i) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
uint32_t data1, flag1, data2, flag2;
(void)data1; (void)flag1; (void)data2; (void)flag2; // unused variable - compiler warning
int spins = 0;
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
@@ -865,8 +866,8 @@ public:
int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
DataLoader dl;
ncclLLFifoLine line[MaxRecv];
uint64_t data, peerData;
// ncclLLFifoLine line[MaxRecv];//unused variable - compiler warning
uint64_t data /*peerData*/; //unused variable - compiler warning
dl.loadBegin(srcElts, eltInLine);
srcElts += eltPerTrip;
data = dl.loadFinish();
+3 -4
Просмотреть файл
@@ -521,10 +521,9 @@ public:
bool ipcReg = false, bool netReg = false, int stepSize_ = 0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
warpInBlock(threadIdx.x/WARP_SIZE),
flagThread((tid%4)==3), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), /*compiler warnings*/
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)),
warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group){
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[group].barrier;
int nrecv=0, nsend=0;
+2 -2
Просмотреть файл
@@ -751,8 +751,8 @@ public:
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr,
struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group) {
barriers = &ncclShmem.groups[group].barrier;
// PAT uses the same barrier for each group
+8 -3
Просмотреть файл
@@ -178,7 +178,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
static constexpr int nworkers = NCCL_PAT_NWORKERS;
struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
uint64_t pollCount = 0;
//uint64_t pollCount = 0; unused variable - compiler warning
__syncthreads(); // Don't start using shared mem until everyone arrives
for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
if (tid == 0) shmem->localAccSize = 0;
@@ -188,11 +188,14 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
if (tid == nworkers) { // Algo computation thread
PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
(void)parallelFactor;// unused variable - compiler warning
int step = 0;
while (1) {
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
int* poll = &ps->flags;
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0){
//pollCount++;// unused variable - compiler warning // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
}
patAlgo.getNextOp(ps);
int last = ps->last;
step++;
@@ -217,7 +220,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
while(1) {
struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
int* poll = &ps->flags;
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) {
//pollCount++; // unused variable - compiler warning // Wait for compute thread
}
int last = ps->last;
prims.patReduce(ps, shmem);
if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
+2 -2
Просмотреть файл
@@ -1238,7 +1238,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
warned = 1;
WARN("Waiting for work FIFO to become available. "
"Work fifo exhaustion can happen in large scale/high iteration count of alltoall. "
"In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %ld).\n\n"
"In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %d).\n\n"
"RCCL continues to retry...", comm->workFifoBytes);
}
@@ -1776,7 +1776,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
ncclIntruQueueConstruct(&planner->planQueue);
bool capturing = ncclCudaGraphValid(planner->capturingGraph);
cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
//cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
cudaStream_t deviceStream, launchOrder;
if (capturing || planner->numStreams != 1) {
+2 -2
Просмотреть файл
@@ -119,14 +119,14 @@ bool isRankHere(const char* s, int start, int end, int rank) {
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
int x=0, y=0;
int x=0;
for (int i=0; treeGraph->treeBase[i][0]!=0; i++)
{
x=i+1;
}
if( treeGraph->treeBase[0][0] == 0) return ncclSuccess;
int nChannels = comm->nChannels;
int localRanks = comm->topo->nodes[GPU].count;
//int localRanks = comm->topo->nodes[GPU].count; // unused variable - compiler warning
//new tree
for (int c=0; c<nChannels; c++) { // in here
int buff = c%x;
+6 -4
Просмотреть файл
@@ -286,14 +286,17 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
// GPU not found, we can't use p2p.
return ncclSuccess;
}
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
int intermediateIndex = -1;
#endif
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
if (path->count == 2) {
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
if (intermediateNode->type == GPU) {
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
#endif
if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
}
}
@@ -324,8 +327,7 @@ compare:
// Compute the PCI distance and compare with the p2pLevel.
if (path->type <= p2pLevel) *p2p = 1;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#else
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
if (*p2p == 1) {
// NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
// validate against NVML at all since they are pretending to be on other hw.
@@ -515,10 +517,10 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev,
int g;
struct ncclTopoSystem* system = comm->topo;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
*flush = 1;
#else
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // unused variable - compiler warning
// Flush is required on Ampere and earlier
if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
// On C2C platforms, data could go through a PCI switch while completions and
+1 -10
Просмотреть файл
@@ -1338,7 +1338,6 @@ end:
*/
ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) {
int gpus[NCCL_TOPO_MAX_NODES]; //transcribe/change according to gpu_map
int nChannels = 0;
int gpu = 0;
int offset = 0;
int start_offset = offset;
@@ -1348,7 +1347,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str
}
int status = 0; // 0 : between numbers, 1 : inside number
int ngpus = system->nodes[GPU].count;
int x=0, y=0;
int x=0;
do {
int digit = str[offset] - '0';
if (digit >= 0 && digit <= 9) {
@@ -1855,7 +1854,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
bool isAlltoall = checkAlltoallWidth(&romeTopo);
if (!isAlltoall) return ncclSuccess;
int gcnt = 0;
int *g8, n[NCCL_TOPO_MAX_NODES];
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
struct timeval tvs, tve;
@@ -1878,7 +1876,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
}
if (ngpusPerNuma == 0) continue;
if (ngpusPerNuma != NUMA_GPUS) break;
gcnt++;
// init GPU mapping
for (int k = 0; k < ngpus; k++) {
if (romeTopo.gpuNuma[k] != j) continue;
@@ -1927,7 +1924,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
if (p < TOTAL_PERMUTE_COUNT) break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
//printf("No solution in %.2fms\n", t);
return ncclSuccess;
@@ -2035,7 +2031,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
int i;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
int nnets = system->nodes[NET].count;
// Only support ring and tree graphs
@@ -2129,7 +2124,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
}
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
//printf("No solution in %.2fms (%d iter)\n", t, time);
return ncclSuccess;
@@ -2241,7 +2235,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
// only match for system with 16 GPUs
if (ngpus != 16 || ncpus != NUMA_CPUS) return ncclSuccess;
int gcnt = 0;
int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
struct timeval tvs, tve;
@@ -2262,7 +2255,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
}
if (ngpusPerNuma == 0) continue;
if (ngpusPerNuma != NUMA_GPUS) break;
gcnt++;
// init GPU mapping
for (int k = 0; k < ngpus; k++) {
if (romeTopo.gpuNuma[k] != j) continue;
@@ -2317,7 +2309,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
if (p < TOTAL_PERMUTE_COUNT) break;
}
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
//printf("No solution in %.2fms\n", t);
return ncclSuccess;
+11 -6
Просмотреть файл
@@ -168,7 +168,7 @@ static struct tuningModel tuning_model_0 {
{ 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
},
.llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
};
static struct tuningModel tuning_model_1 {
@@ -200,7 +200,7 @@ static struct tuningModel tuning_model_1 {
{ 0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0, },
},
.llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
};
static struct tuningModel tuning_model_2 {
@@ -232,7 +232,7 @@ static struct tuningModel tuning_model_2 {
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
},
.llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
};
static struct tuningModel tuning_model_3 {
@@ -264,7 +264,7 @@ static struct tuningModel tuning_model_3 {
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
},
.llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
};
static struct tuningModel tuning_model_4 {
@@ -296,7 +296,7 @@ static struct tuningModel tuning_model_4 {
{ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
},
.llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
};
static struct tuningModel tuning_model_5 {
@@ -394,6 +394,7 @@ static struct tuningModel rcclTuningModel[] = {
#define HOPPER_COMPCAP_IDX 2
#define BLACKWELL_COMPCAP_IDX 3
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
// LL128 max BW per channel
static const double llMaxBws[][3] = {
/* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
@@ -420,6 +421,7 @@ static const double perChMaxTreeBws[][3] = {
/* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
/* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
};
#endif
NCCL_PARAM(PatEnable, "PAT_ENABLE", 0);
static int ncclPatEnable(struct ncclComm* comm) {
@@ -470,7 +472,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
if (nRanks <= 1) return ncclSuccess;
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
int index2 = nNodes <= 2 ? nNodes-1 : 2;
// LL: for single node, we look at GPU type; for multi-node, we look at CPU type
@@ -480,6 +482,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
#endif
// De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
//if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
float ppn = (float)nRanks / nNodes;
@@ -805,11 +808,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
// factor is not ideal but works quite well. Powers of two, 64 B to 256MB.
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
{ 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 },
{ 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 },
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 }
};
#endif
ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
float bw = comm->bandwidths[coll][algorithm][protocol];
+4 -4
Просмотреть файл
@@ -180,11 +180,11 @@ static gdr_t ncclGdrInit() {
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
gdr_info_t info;
// gdr_info_t info; // unused variable - compiler warning
size_t mapSize;
gdr_mh_t mh;
// gdr_mh_t mh; // unused variable - compiler warning
char *devMem;
void *gdrMap;
// void *gdrMap; // unused variable - compiler warning
mapSize = ncclSizeOfT<T>()*nelem;
@@ -216,7 +216,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
//gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; // unused variable - compiler warning
memcpy(dst, src, nelem*sizeof(T));
return ncclSuccess;
}
+1 -6
Просмотреть файл
@@ -73,11 +73,6 @@ inline __device__ rccl_float8 hadd(rccl_float8 x, rccl_float8 y)
u.i16_vec = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(v1, v1, /* scale */ 1.f, 0);
return u.fp8[0];
#elif __HIP_DEVICE_COMPILE__ && defined(__gfx942__)
union
{
uint32_t i32val;
rccl_float8 i8val[4];
} val;
float2_t v;
uint32_t ival = 0;
@@ -235,7 +230,7 @@ namespace rocblas_hip_f8_impl
else
x = reinterpret_cast<uint16_t&>(_x);
uint32_t y, head, mantissa;
uint32_t head, mantissa;
int exponent, bias;
uint32_t sign;
+3 -4
Просмотреть файл
@@ -709,7 +709,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
int nRanks = comm->nRanks;
struct ncclDevCommAndChannels tmpCommAndChans;
struct ncclDevCommAndChannels *devCommAndChans = NULL;
struct ncclNvmlCCStatus ccStatus;
//struct ncclNvmlCCStatus ccStatus; //unused variable - compiler warning
bool ccEnable = false;
cudaStream_t deviceStream;
@@ -1882,9 +1882,6 @@ fail:
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
ncclComm_t comm = job->comm;
#ifdef ENABLE_MSCCLPP
ncclUniqueId origUniqueId = *job->commId;
#endif
ncclResult_t res = ncclSuccess;
int archMajor, archMinor;
size_t maxLocalSizeBytes = 0;
@@ -1895,8 +1892,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
double sum_timers = 0;
uint64_t timers[TIMERS_INIT_COUNT] = {0};
unsigned long long commIdHash;
#ifdef USE_INDIRECT_FUNCTION_CALL
int64_t stackSize;
hipDeviceProp_t devProp;
#endif
timers[TIMER_INIT_TOTAL] = clockNano();
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
+1 -5
Просмотреть файл
@@ -48,7 +48,6 @@ struct ARSMI_systemNode {
std::string s_card;
};
static const char *kPathDRMRoot = "/sys/class/drm";
static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
static const uint32_t kAmdGpuId = 0x1002;
@@ -66,7 +65,6 @@ static thread_local int ARSMI_num_devices=-1;
int ARSMI_init(void)
{
std::string err_msg;
uint32_t count = 0;
std::multimap<uint64_t, ARSMI_systemNode> ARSMI_allSystemNodes;
if (ARSMI_num_devices > 0) {
@@ -101,7 +99,7 @@ int ARSMI_init(void)
int ret_loc_id = read_node_properties(node_id, "location_id", &location_id, properties);
int ret_domain = read_node_properties(node_id, "domain", &domain, properties);
int ret_vendor = read_node_properties(node_id, "vendor_id", &vendor_id, properties);
if (ret_gpu_id == 0 && ~(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0 || ret_vendor != 0) &&
if (ret_gpu_id == 0 && !(ret_unique_id != 0 || ret_loc_id != 0 || ret_domain != 0 || ret_vendor != 0) &&
(gpu_id != 0) && (vendor_id == kAmdGpuId)) {
// Do not try to build a node if one of these fields
// do not exist in KFD (0 as values okay)
@@ -194,7 +192,6 @@ int ARSMI_init(void)
// the order of each block.
for (auto i=0; i < first_elem.size(); i++) {
// Find the first_elem[i] in sort_vecs in
bool found = false;
for (auto j = 0; j < sort_vecs.size(); j++ ) {
if (first_elem[i] == sort_vecs[j][0].s_bdf) {
for (auto k=0; k<sort_vecs[j].size(); k++) {
@@ -226,7 +223,6 @@ int ARSMI_init(void)
continue;
}
uint64_t hops;
uint64_t type;
uint64_t weight;
uint64_t min_bandwidth;
-1
Просмотреть файл
@@ -30,7 +30,6 @@
RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
RCCL_PARAM(MscclEnableSingleProcess, "MSCCL_ENABLE_SINGLE_PROCESS", 1);
static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
bool mscclEnabled() {
#ifdef COMPILE_MSCCL_KERNEL
-2
Просмотреть файл
@@ -708,7 +708,6 @@ ncclResult_t mscclXmlLoadSingleNode(FILE* file, struct mscclXmlNode* node) {
}
ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode* node) {
ncclResult_t ret = ncclSuccess;
FILE* file = fopen(xmlFilePath, "r");
if (file == NULL) {
fprintf(stderr, "Could not open MSCCL XML algorithm file %s : %s", xmlFilePath, strerror(errno));
@@ -720,7 +719,6 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
}
ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
ncclResult_t ret = ncclSuccess;
struct mscclXmlNode* node;
node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));
-1
Просмотреть файл
@@ -123,7 +123,6 @@ ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm)
static ncclResult_t mscclSetupProxyImpl(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
mscclStatus& status = mscclGetStatus(comm);
mscclThreadLocalStatus& threadLocalStatus = mscclGetThreadLocalStatus();
struct ncclProxyOp proxyOp = {};
proxyOp.connIndex = 0;
proxyOp.sliceSteps = status.sliceSteps;
+17 -17
Просмотреть файл
@@ -147,8 +147,8 @@ std::string facebook_rccl::ProxyTrace::dump(uint64_t commHash) {
sortedDumpStrMap[traceKey.str()] = proxyOpMap.second.str();
}
}
for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
result += proxyOpStr;
for (const auto &pair : sortedDumpStrMap) {
result += pair.second; //proxyOpStr
}
return result;
}
@@ -159,22 +159,22 @@ std::string facebook_rccl::ProxyTrace::dump() {
// maps serialized key to serliazed proxyOp; sorted by key
std::map<std::string, std::string> sortedDumpStrMap;
for (auto &[commHash, opCountMap] : activeOps) {
for (auto &[opCount, proxyOpMap] : opCountMap) {
for (auto &[opId, opEntry] : proxyOpMap) {
ProxyTraceRecordKey traceKey = {commHash, opCount, opId};
opEntry.computeStatus();
sortedDumpStrMap[traceKey.str()] = opEntry.str();
for (auto &commHash_opCountMap : activeOps) {
for (auto &opCount_proxyOpMap : commHash_opCountMap.second /*opCountMap*/) {
for (auto &opId_opEntry : opCount_proxyOpMap.second/*proxyOpMap*/) {
ProxyTraceRecordKey traceKey = {commHash_opCountMap.first, opCount_proxyOpMap.first, opId_opEntry.first};
opId_opEntry.second.computeStatus();
sortedDumpStrMap[traceKey.str()] = opId_opEntry.second.str();
}
}
}
// add the recent finished ops as well
for (const auto &[keyStr, proxyOpStr] : finishedOps) {
sortedDumpStrMap[keyStr] = proxyOpStr;
for (const auto &keyStr_proxyOpStr : finishedOps) {
sortedDumpStrMap[keyStr_proxyOpStr.first] = keyStr_proxyOpStr.second;
}
for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
result += proxyOpStr;
for (const auto &keyStr_proxyOpStr : sortedDumpStrMap) {
result += keyStr_proxyOpStr.second;
}
return result;
}
@@ -207,15 +207,15 @@ std::string facebook_rccl::ProxyTraceOp::str() {
float facebook_rccl::ProxyTrace::getMapSizeMB() const {
float size = 0;
for (const auto &[commHash, opCountMap] : activeOps) {
for (const auto &[opCount, proxyOpMap] : opCountMap) {
size += proxyOpMap.size() *
for (const auto &commHash_opCountMap : activeOps) {
for (const auto &opCount_proxyOpMap : commHash_opCountMap.second) {
size += opCount_proxyOpMap.second.size() *
(sizeof(ProxyTraceOp) +
sizeof(std::unique_ptr<facebook_rccl::ProxyTraceOp>));
}
}
for (const auto &[keyStr, proxyOpStr] : finishedOps) {
size += keyStr.size() + proxyOpStr.size();
for (const auto &keyStr_proxyOpStr : finishedOps) {
size += keyStr_proxyOpStr.first.size() + keyStr_proxyOpStr.second.size();
}
return size / 1024.0 / 1024.0;
}
+1 -1
Просмотреть файл
@@ -40,8 +40,8 @@ rcclApiCall::rcclApiCall(rcclCall_t type, const ncclInfo& info)://name(rcclCallS
datatype(info.datatype),
op(info.op),
root(info.root),
comm(info.comm),
nRanks(info.comm->nRanks),
comm(info.comm),
stream(info.stream),
nTasks(info.comm->planner.nTasksP2p + info.comm->planner.nTasksColl),
globalRank(info.comm->localRankToRank[info.comm->localRank])
+2 -1
Просмотреть файл
@@ -663,8 +663,9 @@ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
}
ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
#ifdef ENABLE_TRACE
char line[SOCKET_NAME_MAXLEN+1];
#endif
if (sock == NULL) {
WARN("ncclSocketConnect: pass NULL socket");
return ncclInvalidArgument;
-3
Просмотреть файл
@@ -292,10 +292,8 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
struct ncclProxyArgs* op = state->active;
int poolIndex, opIndex;
int list_len = 0;
int sublist_len = 0;
fprintf(stderr, "ACTIVE OPS\n");
while (op) {
sublist_len = 0;
NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
if (op->state & OP_SEEN) {
WARN("List loop at element %d-%d", poolIndex, opIndex);
@@ -304,7 +302,6 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
op->state |= OP_SEEN;
struct ncclProxyArgs* nextOp = op->nextPeer;
while (nextOp) {
sublist_len++;
NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
if (nextOp->state & OP_SEEN) {
WARN("List loop at element %d-%d", poolIndex, opIndex);
+6 -4
Просмотреть файл
@@ -524,6 +524,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
int dmabuf_fd = -1;
(void)dmabuf_fd; /*compiler warnings fix - unused variable*/
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
if (resources->useGdr && resources->useDmaBuf) {
@@ -600,8 +601,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
struct connectMapMem* mapMem = map->mems+bank;
NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
int dmabuf_fd = -1;
(void)dmabuf_fd; /*compiler warnings fix - unused variable*/
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
if (resources->useGdr && resources->useDmaBuf) {
@@ -1314,8 +1316,8 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
needReg = false;
}
#endif
peermem:
#endif
if (dmabuf_fd != -1) {
(void)close(dmabuf_fd);
dmabuf_fd = -1;
@@ -1342,20 +1344,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s
assert(reqSize == sizeof(struct collnetRegInfo));
assert(respSize == sizeof(void*));
int dmabuf_fd = -1;
#if CUDART_VERSION >= 11070
int dmabuf_fd = -1;
/* DMA-BUF support */
if (resources->useGdr && resources->useDmaBuf) {
CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem);
NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
needReg = false;
}
#endif
peermem:
if (dmabuf_fd != -1) {
(void)close(dmabuf_fd);
dmabuf_fd = -1;
}
#endif
if (needReg) {
NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
}
+1 -1
Просмотреть файл
@@ -362,7 +362,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
send->transportResources = map;
opId = send;
INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
netSendConnectArgs args = {0};
netSendConnectArgs args = {{},0};
memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t));
args.trafficClass = comm->config.trafficClass;
NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId));
+3 -3
Просмотреть файл
@@ -1077,9 +1077,6 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
void* regAddr = NULL;
ncclResult_t ret = ncclSuccess;
bool mapped = false;
bool imported = false;
CUmemGenericAllocationHandle handle;
assert(sizeof(struct p2pIpcExpInfo) == reqSize);
assert(sizeof(void*) == respSize);
@@ -1094,6 +1091,9 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
} else {
#if CUDART_VERSION >= 11030
bool mapped = false; /*compiler warning, defining vars only if needed*/
bool imported = false;
CUmemGenericAllocationHandle handle;
// cuMem import
if (connection->sameProcess) {
// if proxy is same process as request peer, we just need to map the handle.