diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 6f22da7228..ddc3f26bfa 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -1112,7 +1112,8 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized) target_compile_options(rccl PRIVATE -Wall) target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy) target_compile_options(rccl PRIVATE -Wno-format-nonliteral) -target_compile_options(rccl PRIVATE -fgpu-rdc) # Generate relocatable device code (required for extern __shared__) +target_compile_options(rccl PRIVATE -Wno-unused-function) +target_compile_options(rccl PRIVATE -fgpu-rdc) ## Set RCCL compile and linker options for unit tests and code coverage if(ENABLE_CODE_COVERAGE) diff --git a/projects/rccl/src/device/all_gather.h b/projects/rccl/src/device/all_gather.h index 20dd25a3ca..ddaf386b20 100644 --- a/projects/rccl/src/device/all_gather.h +++ b/projects/rccl/src/device/all_gather.h @@ -16,7 +16,10 @@ namespace { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #endif +#if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; + int npKitCtxIdx = bid; // unused variable - compiler warning +#endif ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; @@ -27,9 +30,6 @@ namespace { int nelem; int rankDest; -#if defined(ENABLE_NPKIT) - int npKitCtxIdx = bid; -#endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { @@ -229,6 +229,7 @@ struct RunWorkCollpatSteps[i].flags = 0; if (tid == 0) shmem->localAccSize = 0; @@ -238,11 +239,14 @@ struct RunWorkColl patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + (void)parallelFactor;// unused variable - compiler warning int step = 0; while (1) { struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); int* poll = &ps->flags; - while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) { + pollCount++ ;// Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + } patAlgo.getNextOp(ps); int last = ps->last; step++; @@ -267,7 +271,9 @@ struct RunWorkCollpatSteps+(step%NCCL_SHMEM_PAT_STEPS); int* poll = &ps->flags; - while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread + while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0){ + pollCount++; // Wait for compute thread + } int last = ps->last; prims.patCopy(ps, shmem); if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread diff --git a/projects/rccl/src/device/all_reduce.h b/projects/rccl/src/device/all_reduce.h index ee4dc29185..901420352c 100644 --- a/projects/rccl/src/device/all_reduce.h +++ b/projects/rccl/src/device/all_reduce.h @@ -23,7 +23,10 @@ namespace { ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const int nranks = ncclShmem.comm.nRanks; +#if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; + int npKitCtxIdx = bid; // unused variable - compiler warning +#endif ssize_t size; ssize_t gridOffset; ssize_t channelCount; @@ -34,9 +37,6 @@ namespace { int nelem; int chunk; -#if defined(ENABLE_NPKIT) - int npKitCtxIdx = bid; -#endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { @@ -216,7 +216,10 @@ namespace { #else __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { #endif +#if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; + int npKitCtxIdx = bid; // unused variable - compiler warning +#endif ncclTree *tree = &ncclShmem.channel.tree; size_t size; size_t gridOffset; @@ -226,9 +229,6 @@ namespace { size_t offset; int nelem; -#if defined(ENABLE_NPKIT) - int npKitCtxIdx = bid; -#endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { @@ -364,7 +364,9 @@ namespace { #else __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { #endif - const int bid = ncclShmem.channelId - work->channelLo; +#if defined(ENABLE_NPKIT) + const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning +#endif ncclTree *tree = &ncclShmem.channel.tree; size_t size; size_t gridOffset; diff --git a/projects/rccl/src/device/broadcast.h b/projects/rccl/src/device/broadcast.h index a100b4b893..e39da0bad4 100644 --- a/projects/rccl/src/device/broadcast.h +++ b/projects/rccl/src/device/broadcast.h @@ -15,7 +15,10 @@ namespace { #else __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { #endif +#if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; + int npKitCtxIdx = bid; // unused variable - compiler warning +#endif ncclRing *ring = &ncclShmem.channel.ring; const int rank = ring->userRanks[0]; const int nextRank = ring->userRanks[1]; @@ -30,9 +33,6 @@ namespace { int workNthreads; bool isNetOffload = work->isOneRPN && work->netRegUsed; -#if defined(ENABLE_NPKIT) - int npKitCtxIdx = bid; -#endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { diff --git a/projects/rccl/src/device/primitives.h b/projects/rccl/src/device/primitives.h index 79a93239c0..1c56d8c24a 100644 --- a/projects/rccl/src/device/primitives.h +++ b/projects/rccl/src/device/primitives.h @@ -20,7 +20,7 @@ if (nthreads == NCCL_MAX_NTHREADS) { \ __THREAD_FENCE; __builtin_amdgcn_s_barrier(); \ } else { \ - const int w = threadIdx.x/WARP_SIZE; \ + /**const int w = threadIdx.x/WARP_SIZE //unused variable - compiler warning**/;\ const int wid = threadIdx.x%WARP_SIZE; \ if (wid == 0) { \ (BARRIER_NEXT) += (NWORKERS) / WARP_SIZE; \ diff --git a/projects/rccl/src/device/prims_ll.h b/projects/rccl/src/device/prims_ll.h index c81eba89bf..0e2dbf300d 100644 --- a/projects/rccl/src/device/prims_ll.h +++ b/projects/rccl/src/device/prims_ll.h @@ -148,7 +148,8 @@ private: __device__ uint64_t readLL(int offset, int i) { union ncclLLFifoLine* src = recvPtr(i) + offset; uint32_t flag = recvFlag(i); - uint32_t data1, flag1, data2, flag2; + uint32_t data1, flag1, data2, flag2; + (void)data1; (void)flag1; (void)data2; (void)flag2; // unused variable - compiler warning int spins = 0; #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) @@ -865,8 +866,8 @@ public: int eltInLine = EltPerLine < nelem ? EltPerLine : nelem; DataLoader dl; - ncclLLFifoLine line[MaxRecv]; - uint64_t data, peerData; + // ncclLLFifoLine line[MaxRecv];//unused variable - compiler warning + uint64_t data /*peerData*/; //unused variable - compiler warning dl.loadBegin(srcElts, eltInLine); srcElts += eltPerTrip; data = dl.loadFinish(); diff --git a/projects/rccl/src/device/prims_ll128.h b/projects/rccl/src/device/prims_ll128.h index 42a66ab065..4b2a9f8fa5 100644 --- a/projects/rccl/src/device/prims_ll128.h +++ b/projects/rccl/src/device/prims_ll128.h @@ -521,10 +521,9 @@ public: bool ipcReg = false, bool netReg = false, int stepSize_ = 0 ): redOp(redOpArg), - tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), - warpInBlock(threadIdx.x/WARP_SIZE), - flagThread((tid%4)==3), group(group), - stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { + tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), /*compiler warnings*/ + stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)), + warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group){ auto *channel = &ncclShmem.channel; barriers = &ncclShmem.groups[group].barrier; int nrecv=0, nsend=0; diff --git a/projects/rccl/src/device/prims_simple.h b/projects/rccl/src/device/prims_simple.h index fd8b75a147..86f7bef84f 100644 --- a/projects/rccl/src/device/prims_simple.h +++ b/projects/rccl/src/device/prims_simple.h @@ -751,8 +751,8 @@ public: uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr, struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault ): - tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), - stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { + tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/ + stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group) { barriers = &ncclShmem.groups[group].barrier; // PAT uses the same barrier for each group diff --git a/projects/rccl/src/device/reduce_scatter.h b/projects/rccl/src/device/reduce_scatter.h index 9f6bb7ad1b..ac4c33f079 100644 --- a/projects/rccl/src/device/reduce_scatter.h +++ b/projects/rccl/src/device/reduce_scatter.h @@ -178,7 +178,7 @@ struct RunWorkCollpatSteps[i].flags = 0; if (tid == 0) shmem->localAccSize = 0; @@ -188,11 +188,14 @@ struct RunWorkColl patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + (void)parallelFactor;// unused variable - compiler warning int step = 0; while (1) { struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); int* poll = &ps->flags; - while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0){ + //pollCount++;// unused variable - compiler warning // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + } patAlgo.getNextOp(ps); int last = ps->last; step++; @@ -217,7 +220,9 @@ struct RunWorkCollpatSteps+(step%NCCL_SHMEM_PAT_STEPS); int* poll = &ps->flags; - while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread + while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) { + //pollCount++; // unused variable - compiler warning // Wait for compute thread + } int last = ps->last; prims.patReduce(ps, shmem); if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 6e9b47ed87..f352c3974e 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -1238,7 +1238,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce warned = 1; WARN("Waiting for work FIFO to become available. " "Work fifo exhaustion can happen in large scale/high iteration count of alltoall. " - "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %ld).\n\n" + "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %d).\n\n" "RCCL continues to retry...", comm->workFifoBytes); } @@ -1776,7 +1776,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { ncclIntruQueueConstruct(&planner->planQueue); bool capturing = ncclCudaGraphValid(planner->capturingGraph); - cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch + //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning cudaStream_t deviceStream, launchOrder; if (capturing || planner->numStreams != 1) { diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc index c26a775555..ea53ff16a7 100644 --- a/projects/rccl/src/graph/connect.cc +++ b/projects/rccl/src/graph/connect.cc @@ -119,14 +119,14 @@ bool isRankHere(const char* s, int start, int end, int rank) { ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph) { - int x=0, y=0; + int x=0; for (int i=0; treeGraph->treeBase[i][0]!=0; i++) { x=i+1; } if( treeGraph->treeBase[0][0] == 0) return ncclSuccess; int nChannels = comm->nChannels; - int localRanks = comm->topo->nodes[GPU].count; + //int localRanks = comm->topo->nodes[GPU].count; // unused variable - compiler warning //new tree for (int c=0; cpaths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; if (intermediateNode->type == GPU) { + #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) intermediateIndex = intermediateNode - system->nodes[GPU].nodes; + #endif if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } @@ -324,8 +327,7 @@ compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) -#else +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to // validate against NVML at all since they are pretending to be on other hw. @@ -515,10 +517,10 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int g; struct ncclTopoSystem* system = comm->topo; NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) *flush = 1; #else + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // unused variable - compiler warning // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; // On C2C platforms, data could go through a PCI switch while completions and diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index a2fa9f9653..ba10262ba6 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -1338,7 +1338,6 @@ end: */ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) { int gpus[NCCL_TOPO_MAX_NODES]; //transcribe/change according to gpu_map - int nChannels = 0; int gpu = 0; int offset = 0; int start_offset = offset; @@ -1348,7 +1347,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str } int status = 0; // 0 : between numbers, 1 : inside number int ngpus = system->nodes[GPU].count; - int x=0, y=0; + int x=0; do { int digit = str[offset] - '0'; if (digit >= 0 && digit <= 9) { @@ -1855,7 +1854,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra bool isAlltoall = checkAlltoallWidth(&romeTopo); if (!isAlltoall) return ncclSuccess; - int gcnt = 0; int *g8, n[NCCL_TOPO_MAX_NODES]; int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int)); struct timeval tvs, tve; @@ -1878,7 +1876,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra } if (ngpusPerNuma == 0) continue; if (ngpusPerNuma != NUMA_GPUS) break; - gcnt++; // init GPU mapping for (int k = 0; k < ngpus; k++) { if (romeTopo.gpuNuma[k] != j) continue; @@ -1927,7 +1924,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra if (p < TOTAL_PERMUTE_COUNT) break; } gettimeofday(&tve, NULL); - float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { //printf("No solution in %.2fms\n", t); return ncclSuccess; @@ -2035,7 +2031,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* int i; int ngpus = system->nodes[GPU].count; - int ncpus = system->nodes[CPU].count; int nnets = system->nodes[NET].count; // Only support ring and tree graphs @@ -2129,7 +2124,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* } } gettimeofday(&tve, NULL); - float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { //printf("No solution in %.2fms (%d iter)\n", t, time); return ncclSuccess; @@ -2241,7 +2235,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra // only match for system with 16 GPUs if (ngpus != 16 || ncpus != NUMA_CPUS) return ncclSuccess; - int gcnt = 0; int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS]; int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int)); struct timeval tvs, tve; @@ -2262,7 +2255,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra } if (ngpusPerNuma == 0) continue; if (ngpusPerNuma != NUMA_GPUS) break; - gcnt++; // init GPU mapping for (int k = 0; k < ngpus; k++) { if (romeTopo.gpuNuma[k] != j) continue; @@ -2317,7 +2309,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra if (p < TOTAL_PERMUTE_COUNT) break; } gettimeofday(&tve, NULL); - float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { //printf("No solution in %.2fms\n", t); return ncclSuccess; diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index 7ccb776f45..682c220c79 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -168,7 +168,7 @@ static struct tuningModel tuning_model_0 { { 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, - .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED}, + .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, }; static struct tuningModel tuning_model_1 { @@ -200,7 +200,7 @@ static struct tuningModel tuning_model_1 { { 0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0, }, }, - .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED}, + .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, }; static struct tuningModel tuning_model_2 { @@ -232,7 +232,7 @@ static struct tuningModel tuning_model_2 { { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, - .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED}, + .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, }; static struct tuningModel tuning_model_3 { @@ -264,7 +264,7 @@ static struct tuningModel tuning_model_3 { { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, - .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED}, + .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, }; static struct tuningModel tuning_model_4 { @@ -296,7 +296,7 @@ static struct tuningModel tuning_model_4 { { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, - .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED}, + .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, }; static struct tuningModel tuning_model_5 { @@ -394,6 +394,7 @@ static struct tuningModel rcclTuningModel[] = { #define HOPPER_COMPCAP_IDX 2 #define BLACKWELL_COMPCAP_IDX 3 +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) // LL128 max BW per channel static const double llMaxBws[][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, @@ -420,6 +421,7 @@ static const double perChMaxTreeBws[][3] = { /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, }; +#endif NCCL_PARAM(PatEnable, "PAT_ENABLE", 0); static int ncclPatEnable(struct ncclComm* comm) { @@ -470,7 +472,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nNodes = comm->nNodes; int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type @@ -480,6 +482,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; +#endif // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring //if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; @@ -805,11 +808,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 256MB. +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0, 1.0 }, { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .6, .6, .6, .8, .9, .9, .9, .9, 1.0, 1.0 }, { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; +#endif ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) { float bw = comm->bandwidths[coll][algorithm][protocol]; diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h index 281d30f9f9..a71dddb54a 100644 --- a/projects/rccl/src/include/gdrwrap.h +++ b/projects/rccl/src/include/gdrwrap.h @@ -180,11 +180,11 @@ static gdr_t ncclGdrInit() { template static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) { - gdr_info_t info; + // gdr_info_t info; // unused variable - compiler warning size_t mapSize; - gdr_mh_t mh; + // gdr_mh_t mh; // unused variable - compiler warning char *devMem; - void *gdrMap; + // void *gdrMap; // unused variable - compiler warning mapSize = ncclSizeOfT()*nelem; @@ -216,7 +216,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** template static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { - gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + //gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; // unused variable - compiler warning memcpy(dst, src, nelem*sizeof(T)); return ncclSuccess; } diff --git a/projects/rccl/src/include/rccl_float8.h b/projects/rccl/src/include/rccl_float8.h index 4dc975918c..8ccd4bd517 100755 --- a/projects/rccl/src/include/rccl_float8.h +++ b/projects/rccl/src/include/rccl_float8.h @@ -73,11 +73,6 @@ inline __device__ rccl_float8 hadd(rccl_float8 x, rccl_float8 y) u.i16_vec = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(v1, v1, /* scale */ 1.f, 0); return u.fp8[0]; #elif __HIP_DEVICE_COMPILE__ && defined(__gfx942__) - union - { - uint32_t i32val; - rccl_float8 i8val[4]; - } val; float2_t v; uint32_t ival = 0; @@ -235,7 +230,7 @@ namespace rocblas_hip_f8_impl else x = reinterpret_cast(_x); - uint32_t y, head, mantissa; + uint32_t head, mantissa; int exponent, bias; uint32_t sign; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index ca99ef2382..aaedc5e06b 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -709,7 +709,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { int nRanks = comm->nRanks; struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; - struct ncclNvmlCCStatus ccStatus; + //struct ncclNvmlCCStatus ccStatus; //unused variable - compiler warning bool ccEnable = false; cudaStream_t deviceStream; @@ -1882,9 +1882,6 @@ fail: static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t comm = job->comm; -#ifdef ENABLE_MSCCLPP - ncclUniqueId origUniqueId = *job->commId; -#endif ncclResult_t res = ncclSuccess; int archMajor, archMinor; size_t maxLocalSizeBytes = 0; @@ -1895,8 +1892,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { double sum_timers = 0; uint64_t timers[TIMERS_INIT_COUNT] = {0}; unsigned long long commIdHash; + #ifdef USE_INDIRECT_FUNCTION_CALL int64_t stackSize; hipDeviceProp_t devProp; + #endif timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); diff --git a/projects/rccl/src/misc/alt_rsmi.cc b/projects/rccl/src/misc/alt_rsmi.cc index 0dffe07cd5..d748d2618e 100644 --- a/projects/rccl/src/misc/alt_rsmi.cc +++ b/projects/rccl/src/misc/alt_rsmi.cc @@ -48,7 +48,6 @@ struct ARSMI_systemNode { std::string s_card; }; -static const char *kPathDRMRoot = "/sys/class/drm"; static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes"; static const uint32_t kAmdGpuId = 0x1002; @@ -66,7 +65,6 @@ static thread_local int ARSMI_num_devices=-1; int ARSMI_init(void) { std::string err_msg; - uint32_t count = 0; std::multimap ARSMI_allSystemNodes; if (ARSMI_num_devices > 0) { @@ -101,7 +99,7 @@ int ARSMI_init(void) int ret_loc_id = read_node_properties(node_id, "location_id", &location_id, properties); int ret_domain = read_node_properties(node_id, "domain", &domain, properties); int ret_vendor = read_node_properties(node_id, "vendor_id", &vendor_id, properties); - if (ret_gpu_id == 0 && ~(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0 || ret_vendor != 0) && + if (ret_gpu_id == 0 && !(ret_unique_id != 0 || ret_loc_id != 0 || ret_domain != 0 || ret_vendor != 0) && (gpu_id != 0) && (vendor_id == kAmdGpuId)) { // Do not try to build a node if one of these fields // do not exist in KFD (0 as values okay) @@ -194,7 +192,6 @@ int ARSMI_init(void) // the order of each block. for (auto i=0; i < first_elem.size(); i++) { // Find the first_elem[i] in sort_vecs in - bool found = false; for (auto j = 0; j < sort_vecs.size(); j++ ) { if (first_elem[i] == sort_vecs[j][0].s_bdf) { for (auto k=0; k sortedDumpStrMap; - for (auto &[commHash, opCountMap] : activeOps) { - for (auto &[opCount, proxyOpMap] : opCountMap) { - for (auto &[opId, opEntry] : proxyOpMap) { - ProxyTraceRecordKey traceKey = {commHash, opCount, opId}; - opEntry.computeStatus(); - sortedDumpStrMap[traceKey.str()] = opEntry.str(); + for (auto &commHash_opCountMap : activeOps) { + for (auto &opCount_proxyOpMap : commHash_opCountMap.second /*opCountMap*/) { + for (auto &opId_opEntry : opCount_proxyOpMap.second/*proxyOpMap*/) { + ProxyTraceRecordKey traceKey = {commHash_opCountMap.first, opCount_proxyOpMap.first, opId_opEntry.first}; + opId_opEntry.second.computeStatus(); + sortedDumpStrMap[traceKey.str()] = opId_opEntry.second.str(); } } } // add the recent finished ops as well - for (const auto &[keyStr, proxyOpStr] : finishedOps) { - sortedDumpStrMap[keyStr] = proxyOpStr; + for (const auto &keyStr_proxyOpStr : finishedOps) { + sortedDumpStrMap[keyStr_proxyOpStr.first] = keyStr_proxyOpStr.second; } - for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) { - result += proxyOpStr; + for (const auto &keyStr_proxyOpStr : sortedDumpStrMap) { + result += keyStr_proxyOpStr.second; } return result; } @@ -207,15 +207,15 @@ std::string facebook_rccl::ProxyTraceOp::str() { float facebook_rccl::ProxyTrace::getMapSizeMB() const { float size = 0; - for (const auto &[commHash, opCountMap] : activeOps) { - for (const auto &[opCount, proxyOpMap] : opCountMap) { - size += proxyOpMap.size() * + for (const auto &commHash_opCountMap : activeOps) { + for (const auto &opCount_proxyOpMap : commHash_opCountMap.second) { + size += opCount_proxyOpMap.second.size() * (sizeof(ProxyTraceOp) + sizeof(std::unique_ptr)); } } - for (const auto &[keyStr, proxyOpStr] : finishedOps) { - size += keyStr.size() + proxyOpStr.size(); + for (const auto &keyStr_proxyOpStr : finishedOps) { + size += keyStr_proxyOpStr.first.size() + keyStr_proxyOpStr.second.size(); } return size / 1024.0 / 1024.0; } diff --git a/projects/rccl/src/misc/recorder.cc b/projects/rccl/src/misc/recorder.cc index 8268378ad0..3d390225c7 100644 --- a/projects/rccl/src/misc/recorder.cc +++ b/projects/rccl/src/misc/recorder.cc @@ -40,8 +40,8 @@ rcclApiCall::rcclApiCall(rcclCall_t type, const ncclInfo& info)://name(rcclCallS datatype(info.datatype), op(info.op), root(info.root), - comm(info.comm), nRanks(info.comm->nRanks), + comm(info.comm), stream(info.stream), nTasks(info.comm->planner.nTasksP2p + info.comm->planner.nTasksColl), globalRank(info.comm->localRankToRank[info.comm->localRank]) diff --git a/projects/rccl/src/misc/socket.cc b/projects/rccl/src/misc/socket.cc index 8d902d23d2..87f2e7eb10 100644 --- a/projects/rccl/src/misc/socket.cc +++ b/projects/rccl/src/misc/socket.cc @@ -663,8 +663,9 @@ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) { } ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { +#ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - +#endif if (sock == NULL) { WARN("ncclSocketConnect: pass NULL socket"); return ncclInvalidArgument; diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc index dbd4a80ace..370b1ce471 100644 --- a/projects/rccl/src/proxy.cc +++ b/projects/rccl/src/proxy.cc @@ -292,10 +292,8 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { struct ncclProxyArgs* op = state->active; int poolIndex, opIndex; int list_len = 0; - int sublist_len = 0; fprintf(stderr, "ACTIVE OPS\n"); while (op) { - sublist_len = 0; NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); if (op->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); @@ -304,7 +302,6 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { op->state |= OP_SEEN; struct ncclProxyArgs* nextOp = op->nextPeer; while (nextOp) { - sublist_len++; NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex)); if (nextOp->state & OP_SEEN) { WARN("List loop at element %d-%d", poolIndex, opIndex); diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc index 7bcea409eb..483c18e24b 100644 --- a/projects/rccl/src/transport/coll_net.cc +++ b/projects/rccl/src/transport/coll_net.cc @@ -524,6 +524,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); int dmabuf_fd = -1; + (void)dmabuf_fd; /*compiler warnings fix - unused variable*/ #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { @@ -600,8 +601,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); - + int dmabuf_fd = -1; + (void)dmabuf_fd; /*compiler warnings fix - unused variable*/ #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { @@ -1314,8 +1316,8 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); needReg = false; } -#endif peermem: +#endif if (dmabuf_fd != -1) { (void)close(dmabuf_fd); dmabuf_fd = -1; @@ -1342,20 +1344,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); - int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 + int dmabuf_fd = -1; /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); needReg = false; } -#endif peermem: if (dmabuf_fd != -1) { (void)close(dmabuf_fd); dmabuf_fd = -1; } +#endif if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index 1669ac0679..ac3fd55e25 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -362,7 +362,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne send->transportResources = map; opId = send; INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); - netSendConnectArgs args = {0}; + netSendConnectArgs args = {{},0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index a5d89c4b43..7bdfbe07b3 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -1077,9 +1077,6 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff; void* regAddr = NULL; ncclResult_t ret = ncclSuccess; - bool mapped = false; - bool imported = false; - CUmemGenericAllocationHandle handle; assert(sizeof(struct p2pIpcExpInfo) == reqSize); assert(sizeof(void*) == respSize); @@ -1094,6 +1091,9 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); } else { #if CUDART_VERSION >= 11030 + bool mapped = false; /*compiler warning, defining vars only if needed*/ + bool imported = false; + CUmemGenericAllocationHandle handle; // cuMem import if (connection->sameProcess) { // if proxy is same process as request peer, we just need to map the handle.