diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index 6f22da7228..ddc3f26bfa 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -1112,7 +1112,8 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized)
 target_compile_options(rccl PRIVATE -Wall)
 target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy)
 target_compile_options(rccl PRIVATE -Wno-format-nonliteral)
-target_compile_options(rccl PRIVATE -fgpu-rdc)      # Generate relocatable device code (required for extern __shared__)
+target_compile_options(rccl PRIVATE -Wno-unused-function)
+target_compile_options(rccl PRIVATE -fgpu-rdc)
 
 ## Set RCCL compile and linker options for unit tests and code coverage
 if(ENABLE_CODE_COVERAGE)
diff --git a/projects/rccl/src/device/all_gather.h b/projects/rccl/src/device/all_gather.h
index 20dd25a3ca..ddaf386b20 100644
--- a/projects/rccl/src/device/all_gather.h
+++ b/projects/rccl/src/device/all_gather.h
@@ -16,7 +16,10 @@ namespace {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
     ncclRing *ring = &ncclShmem.channel.ring;
     const int *ringRanks = ring->userRanks;
     const int nranks = ncclShmem.comm.nRanks;
@@ -27,9 +30,6 @@ namespace {
     int nelem;
     int rankDest;
 
-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
@@ -229,6 +229,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
     static constexpr int nworkers = NCCL_PAT_NWORKERS;
     struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
     uint64_t pollCount = 0;
+    (void)pollCount; // unused variable - compiler warning
     __syncthreads(); // Don't start using shared mem until everyone arrives
     for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
     if (tid == 0) shmem->localAccSize = 0;
@@ -238,11 +239,14 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
     if (tid == nworkers) { // Algo computation thread
       PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
       int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      (void)parallelFactor;// unused variable - compiler warning
       int step = 0;
       while (1) {
         struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
         int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) { 
+          pollCount++ ;// Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        }
         patAlgo.getNextOp(ps);
         int last = ps->last;
         step++;
@@ -267,7 +271,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
       while(1) {
         struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
         int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0){
+          pollCount++; // Wait for compute thread 
+        }
         int last = ps->last;
         prims.patCopy(ps, shmem);
         if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
diff --git a/projects/rccl/src/device/all_reduce.h b/projects/rccl/src/device/all_reduce.h
index ee4dc29185..901420352c 100644
--- a/projects/rccl/src/device/all_reduce.h
+++ b/projects/rccl/src/device/all_reduce.h
@@ -23,7 +23,10 @@ namespace {
     ncclRing *ring = &ncclShmem.channel.ring;
     int ringIx = ring->index;
     const int nranks = ncclShmem.comm.nRanks;
+#if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
     ssize_t size;
     ssize_t gridOffset;
     ssize_t channelCount;
@@ -34,9 +37,6 @@ namespace {
     int nelem;
     int chunk;
 
-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
@@ -216,7 +216,10 @@ namespace {
 #else
   __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
     ncclTree *tree = &ncclShmem.channel.tree;
     size_t size;
     size_t gridOffset;
@@ -226,9 +229,6 @@ namespace {
     size_t offset;
     int nelem;
 
-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
@@ -364,7 +364,9 @@ namespace {
 #else
   __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
-    const int bid = ncclShmem.channelId - work->channelLo;
+#if defined(ENABLE_NPKIT)
+    const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
+#endif
     ncclTree *tree = &ncclShmem.channel.tree;
     size_t size;
     size_t gridOffset;
diff --git a/projects/rccl/src/device/broadcast.h b/projects/rccl/src/device/broadcast.h
index a100b4b893..e39da0bad4 100644
--- a/projects/rccl/src/device/broadcast.h
+++ b/projects/rccl/src/device/broadcast.h
@@ -15,7 +15,10 @@ namespace {
 #else
   __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
     ncclRing *ring = &ncclShmem.channel.ring;
     const int rank = ring->userRanks[0];
     const int nextRank = ring->userRanks[1];
@@ -30,9 +33,6 @@ namespace {
     int workNthreads;
     bool isNetOffload = work->isOneRPN && work->netRegUsed;
 
-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
diff --git a/projects/rccl/src/device/primitives.h b/projects/rccl/src/device/primitives.h
index 79a93239c0..1c56d8c24a 100644
--- a/projects/rccl/src/device/primitives.h
+++ b/projects/rccl/src/device/primitives.h
@@ -20,7 +20,7 @@
   if (nthreads == NCCL_MAX_NTHREADS) { \
     __THREAD_FENCE; __builtin_amdgcn_s_barrier(); \
   } else { \
-    const int w = threadIdx.x/WARP_SIZE; \
+    /**const int w = threadIdx.x/WARP_SIZE //unused variable - compiler warning**/;\
     const int wid = threadIdx.x%WARP_SIZE; \
     if (wid == 0) { \
       (BARRIER_NEXT) += (NWORKERS) / WARP_SIZE; \
diff --git a/projects/rccl/src/device/prims_ll.h b/projects/rccl/src/device/prims_ll.h
index c81eba89bf..0e2dbf300d 100644
--- a/projects/rccl/src/device/prims_ll.h
+++ b/projects/rccl/src/device/prims_ll.h
@@ -148,7 +148,8 @@ private:
   __device__ uint64_t readLL(int offset, int i) {
     union ncclLLFifoLine* src = recvPtr(i) + offset;
     uint32_t flag = recvFlag(i);
-    uint32_t data1, flag1, data2, flag2;
+    uint32_t data1, flag1, data2, flag2; 
+    (void)data1; (void)flag1; (void)data2; (void)flag2; // unused variable - compiler warning
     int spins = 0;
 
 #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
@@ -865,8 +866,8 @@ public:
       int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
 
       DataLoader dl;
-      ncclLLFifoLine line[MaxRecv];
-      uint64_t data, peerData;
+      // ncclLLFifoLine line[MaxRecv];//unused variable - compiler warning
+      uint64_t data /*peerData*/;     //unused variable - compiler warning
       dl.loadBegin(srcElts, eltInLine);
       srcElts += eltPerTrip;
       data = dl.loadFinish();
diff --git a/projects/rccl/src/device/prims_ll128.h b/projects/rccl/src/device/prims_ll128.h
index 42a66ab065..4b2a9f8fa5 100644
--- a/projects/rccl/src/device/prims_ll128.h
+++ b/projects/rccl/src/device/prims_ll128.h
@@ -521,10 +521,9 @@ public:
       bool ipcReg = false, bool netReg = false, int stepSize_ = 0
     ):
     redOp(redOpArg),
-    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
-    warpInBlock(threadIdx.x/WARP_SIZE),
-    flagThread((tid%4)==3), group(group),
-    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
+    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE),                                /*compiler warnings*/
+    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)), 
+    warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group){
     auto *channel = &ncclShmem.channel;
     barriers = &ncclShmem.groups[group].barrier;
     int nrecv=0, nsend=0;
diff --git a/projects/rccl/src/device/prims_simple.h b/projects/rccl/src/device/prims_simple.h
index fd8b75a147..86f7bef84f 100644
--- a/projects/rccl/src/device/prims_simple.h
+++ b/projects/rccl/src/device/prims_simple.h
@@ -751,8 +751,8 @@ public:
       uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr,
       struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
     ):
-    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
-    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
+    tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
+    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group) {
 
     barriers = &ncclShmem.groups[group].barrier;
     // PAT uses the same barrier for each group
diff --git a/projects/rccl/src/device/reduce_scatter.h b/projects/rccl/src/device/reduce_scatter.h
index 9f6bb7ad1b..ac4c33f079 100644
--- a/projects/rccl/src/device/reduce_scatter.h
+++ b/projects/rccl/src/device/reduce_scatter.h
@@ -178,7 +178,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
 
     static constexpr int nworkers = NCCL_PAT_NWORKERS;
     struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
-    uint64_t pollCount = 0;
+    //uint64_t pollCount = 0; unused variable - compiler warning
     __syncthreads(); // Don't start using shared mem until everyone arrives
     for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
     if (tid == 0) shmem->localAccSize = 0;
@@ -188,11 +188,14 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
     if (tid == nworkers) { // Algo computation thread
       PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
       int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      (void)parallelFactor;// unused variable - compiler warning
       int step = 0;
       while (1) {
         struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
         int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0){
+          //pollCount++;// unused variable - compiler warning // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        }
         patAlgo.getNextOp(ps);
         int last = ps->last;
         step++;
@@ -217,7 +220,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
       while(1) {
         struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
         int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) {
+          //pollCount++; // unused variable - compiler warning // Wait for compute thread
+        }
         int last = ps->last;
         prims.patReduce(ps, shmem);
         if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc
index 6e9b47ed87..f352c3974e 100644
--- a/projects/rccl/src/enqueue.cc
+++ b/projects/rccl/src/enqueue.cc
@@ -1238,7 +1238,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
       warned = 1;
       WARN("Waiting for work FIFO to become available. "
            "Work fifo exhaustion can happen in large scale/high iteration count of alltoall. "
-           "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %ld).\n\n"
+           "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %d).\n\n"
            "RCCL continues to retry...", comm->workFifoBytes);
     }
 
@@ -1776,7 +1776,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
     ncclIntruQueueConstruct(&planner->planQueue);
 
     bool capturing = ncclCudaGraphValid(planner->capturingGraph);
-    cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
+    //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
     cudaStream_t deviceStream, launchOrder;
 
     if (capturing || planner->numStreams != 1) {
diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc
index c26a775555..ea53ff16a7 100644
--- a/projects/rccl/src/graph/connect.cc
+++ b/projects/rccl/src/graph/connect.cc
@@ -119,14 +119,14 @@ bool isRankHere(const char* s, int start, int end, int rank) {
 
 ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
     struct ncclTopoGraph* treeGraph) {
-  int x=0, y=0;
+  int x=0;
   for (int i=0;  treeGraph->treeBase[i][0]!=0; i++)
   {
     x=i+1;
   }
   if( treeGraph->treeBase[0][0] == 0) return ncclSuccess;
   int nChannels = comm->nChannels;
-  int localRanks = comm->topo->nodes[GPU].count;
+  //int localRanks = comm->topo->nodes[GPU].count; // unused variable - compiler warning
   //new tree
   for (int c=0; c<nChannels; c++) { // in here
     int buff = c%x;
diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc
index 61055eeb38..9dfa1d5671 100644
--- a/projects/rccl/src/graph/paths.cc
+++ b/projects/rccl/src/graph/paths.cc
@@ -286,14 +286,17 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
     // GPU not found, we can't use p2p.
     return ncclSuccess;
   }
-
+  #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
   int intermediateIndex = -1;
+  #endif
   // Set intermediate GPU rank, if routing through an intermediate GPU.
   struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
   if (path->count == 2) {
     struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
     if (intermediateNode->type == GPU) {
+      #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
       intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
+      #endif
       if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
     }
   }
@@ -324,8 +327,7 @@ compare:
   // Compute the PCI distance and compare with the p2pLevel.
   if (path->type <= p2pLevel) *p2p = 1;
 
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
-#else
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
   if (*p2p == 1) {
     // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
     // validate against NVML at all since they are pretending to be on other hw.
@@ -515,10 +517,10 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev,
   int g;
   struct ncclTopoSystem* system = comm->topo;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
-  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
   *flush = 1;
 #else
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // unused variable - compiler warning
   // Flush is required on Ampere and earlier
   if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
   // On C2C platforms, data could go through a PCI switch while completions and
diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc
index a2fa9f9653..ba10262ba6 100644
--- a/projects/rccl/src/graph/rome_models.cc
+++ b/projects/rccl/src/graph/rome_models.cc
@@ -1338,7 +1338,6 @@ end:
  */
 ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) {
   int gpus[NCCL_TOPO_MAX_NODES]; //transcribe/change according to gpu_map
-  int nChannels = 0;
   int gpu = 0;
   int offset = 0;
   int start_offset = offset;
@@ -1348,7 +1347,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str
   }
   int status = 0; // 0 : between numbers, 1 : inside number
   int ngpus = system->nodes[GPU].count;
-  int x=0, y=0;
+  int x=0;
   do {
     int digit = str[offset] - '0';
     if (digit >= 0 && digit <= 9) {
@@ -1855,7 +1854,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
   bool isAlltoall = checkAlltoallWidth(&romeTopo);
   if (!isAlltoall) return ncclSuccess;
 
-  int gcnt = 0;
   int *g8, n[NCCL_TOPO_MAX_NODES];
   int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
   struct timeval tvs, tve;
@@ -1878,7 +1876,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
       }
       if (ngpusPerNuma == 0) continue;
       if (ngpusPerNuma != NUMA_GPUS) break;
-      gcnt++;
       // init GPU mapping
       for (int k = 0; k < ngpus; k++) {
         if (romeTopo.gpuNuma[k] != j) continue;
@@ -1927,7 +1924,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
     if (p < TOTAL_PERMUTE_COUNT) break;
   }
   gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
   if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
     //printf("No solution in %.2fms\n", t);
     return ncclSuccess;
@@ -2035,7 +2031,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
   int i;
 
   int ngpus = system->nodes[GPU].count;
-  int ncpus = system->nodes[CPU].count;
   int nnets = system->nodes[NET].count;
 
   // Only support ring and tree graphs
@@ -2129,7 +2124,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
     }
   }
   gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
   if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
     //printf("No solution in %.2fms (%d iter)\n", t, time);
     return ncclSuccess;
@@ -2241,7 +2235,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
   // only match for system with 16 GPUs
   if (ngpus != 16 || ncpus != NUMA_CPUS) return ncclSuccess;
 
-  int gcnt = 0;
   int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
   int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
   struct timeval tvs, tve;
@@ -2262,7 +2255,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
       }
       if (ngpusPerNuma == 0) continue;
       if (ngpusPerNuma != NUMA_GPUS) break;
-      gcnt++;
       // init GPU mapping
       for (int k = 0; k < ngpus; k++) {
         if (romeTopo.gpuNuma[k] != j) continue;
@@ -2317,7 +2309,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
     if (p < TOTAL_PERMUTE_COUNT) break;
   }
   gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
   if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
     //printf("No solution in %.2fms\n", t);
     return ncclSuccess;
diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc
index 7ccb776f45..682c220c79 100644
--- a/projects/rccl/src/graph/tuning.cc
+++ b/projects/rccl/src/graph/tuning.cc
@@ -168,7 +168,7 @@ static struct tuningModel tuning_model_0 {
     { 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
   },
 
-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };
 
 static struct tuningModel tuning_model_1 {
@@ -200,7 +200,7 @@ static struct tuningModel tuning_model_1 {
     { 0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0, },
   },
 
-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };
 
 static struct tuningModel tuning_model_2 {
@@ -232,7 +232,7 @@ static struct tuningModel tuning_model_2 {
     { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
   },
 
-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };
 
 static struct tuningModel tuning_model_3 {
@@ -264,7 +264,7 @@ static struct tuningModel tuning_model_3 {
     { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
   },
 
-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };
 
 static struct tuningModel tuning_model_4 {
@@ -296,7 +296,7 @@ static struct tuningModel tuning_model_4 {
     { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
   },
 
-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };
 
 static struct tuningModel tuning_model_5 {
@@ -394,6 +394,7 @@ static struct tuningModel rcclTuningModel[] = {
 #define HOPPER_COMPCAP_IDX 2
 #define BLACKWELL_COMPCAP_IDX 3
 
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
 // LL128 max BW per channel
 static const double llMaxBws[][3] = {
   /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
@@ -420,6 +421,7 @@ static const double perChMaxTreeBws[][3] = {
   /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
   /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
 };
+#endif
 
 NCCL_PARAM(PatEnable, "PAT_ENABLE", 0);
 static int ncclPatEnable(struct ncclComm* comm) {
@@ -470,7 +472,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   int nNodes = comm->nNodes;
   int nRanks = comm->nRanks;
   if (nRanks <= 1) return ncclSuccess;
-
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
   int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
   int index2 = nNodes <= 2 ? nNodes-1 : 2;
   // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
@@ -480,6 +482,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
   double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
   double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
   double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
+#endif
   // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
   //if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
   float ppn = (float)nRanks / nNodes;
@@ -805,11 +808,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 
 // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
 // factor is not ideal but works quite well. Powers of two, 64 B to 256MB.
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
 static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
   { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .4,  .4,  .5,  .6,  .7,  .8,  .9, 1.0, 1.0, 1.0, 1.0 },
   { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .7,  .6,  .6,  .6,  .6,  .6,  .6,  .8,  .9,  .9,  .9,  .9, 1.0, 1.0 },
   {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };
+#endif
 
 ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
   float bw = comm->bandwidths[coll][algorithm][protocol];
diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h
index 281d30f9f9..a71dddb54a 100644
--- a/projects/rccl/src/include/gdrwrap.h
+++ b/projects/rccl/src/include/gdrwrap.h
@@ -180,11 +180,11 @@ static gdr_t ncclGdrInit() {
 
 template <typename T>
 static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
-  gdr_info_t info;
+  // gdr_info_t info; // unused variable - compiler warning
   size_t mapSize;
-  gdr_mh_t mh;
+  // gdr_mh_t mh;     // unused variable - compiler warning
   char *devMem;
-  void *gdrMap;
+  // void *gdrMap;    // unused variable - compiler warning
 
   mapSize = ncclSizeOfT<T>()*nelem;
 
@@ -216,7 +216,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
 
 template <typename T>
 static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
-  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
+  //gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; // unused variable - compiler warning
   memcpy(dst, src, nelem*sizeof(T));
   return ncclSuccess;
 }
diff --git a/projects/rccl/src/include/rccl_float8.h b/projects/rccl/src/include/rccl_float8.h
index 4dc975918c..8ccd4bd517 100755
--- a/projects/rccl/src/include/rccl_float8.h
+++ b/projects/rccl/src/include/rccl_float8.h
@@ -73,11 +73,6 @@ inline __device__  rccl_float8 hadd(rccl_float8 x, rccl_float8 y)
     u.i16_vec = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(v1, v1, /* scale */ 1.f, 0);
     return u.fp8[0];
 #elif __HIP_DEVICE_COMPILE__ && defined(__gfx942__)
-     union
-    {
-        uint32_t i32val;
-        rccl_float8 i8val[4];
-    } val;
 
     float2_t v;
     uint32_t ival = 0;
@@ -235,7 +230,7 @@ namespace rocblas_hip_f8_impl
         else
             x = reinterpret_cast<uint16_t&>(_x);
 
-        uint32_t y, head, mantissa;
+        uint32_t head, mantissa;
         int      exponent, bias;
         uint32_t sign;
 
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index ca99ef2382..aaedc5e06b 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -709,7 +709,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
   int nRanks = comm->nRanks;
   struct ncclDevCommAndChannels tmpCommAndChans;
   struct ncclDevCommAndChannels *devCommAndChans = NULL;
-  struct ncclNvmlCCStatus ccStatus;
+  //struct ncclNvmlCCStatus ccStatus; //unused variable - compiler warning
   bool ccEnable = false;
   cudaStream_t deviceStream;
 
@@ -1882,9 +1882,6 @@ fail:
 static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
   ncclComm_t comm = job->comm;
-#ifdef ENABLE_MSCCLPP
-  ncclUniqueId origUniqueId = *job->commId;
-#endif
   ncclResult_t res = ncclSuccess;
   int archMajor, archMinor;
   size_t maxLocalSizeBytes = 0;
@@ -1895,8 +1892,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   double sum_timers = 0;
   uint64_t timers[TIMERS_INIT_COUNT] = {0};
   unsigned long long commIdHash;
+  #ifdef USE_INDIRECT_FUNCTION_CALL
   int64_t stackSize;
   hipDeviceProp_t devProp;
+  #endif
 
   timers[TIMER_INIT_TOTAL] = clockNano();
   CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
diff --git a/projects/rccl/src/misc/alt_rsmi.cc b/projects/rccl/src/misc/alt_rsmi.cc
index 0dffe07cd5..d748d2618e 100644
--- a/projects/rccl/src/misc/alt_rsmi.cc
+++ b/projects/rccl/src/misc/alt_rsmi.cc
@@ -48,7 +48,6 @@ struct ARSMI_systemNode {
     std::string s_card;
 };
 
-static const char *kPathDRMRoot = "/sys/class/drm";
 static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
 static const uint32_t kAmdGpuId = 0x1002;
 
@@ -66,7 +65,6 @@ static thread_local int ARSMI_num_devices=-1;
 int ARSMI_init(void)
 {
     std::string err_msg;
-    uint32_t count = 0;
     std::multimap<uint64_t, ARSMI_systemNode> ARSMI_allSystemNodes;
 
     if (ARSMI_num_devices > 0) {
@@ -101,7 +99,7 @@ int ARSMI_init(void)
         int ret_loc_id = read_node_properties(node_id, "location_id", &location_id, properties);
         int ret_domain = read_node_properties(node_id, "domain", &domain, properties);
         int ret_vendor = read_node_properties(node_id, "vendor_id", &vendor_id, properties);
-        if (ret_gpu_id == 0 &&  ~(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0 || ret_vendor != 0) &&
+        if (ret_gpu_id == 0 &&  !(ret_unique_id != 0 || ret_loc_id != 0 || ret_domain != 0 || ret_vendor != 0) &&
             (gpu_id != 0) && (vendor_id == kAmdGpuId)) {
             // Do not try to build a node if one of these fields
             // do not exist in KFD (0 as values okay)
@@ -194,7 +192,6 @@ int ARSMI_init(void)
     // the order of each block.
     for (auto i=0; i < first_elem.size(); i++) {
         // Find the first_elem[i] in sort_vecs in
-        bool found = false;
         for (auto j = 0; j < sort_vecs.size(); j++ ) {
             if (first_elem[i] == sort_vecs[j][0].s_bdf) {
                 for (auto k=0; k<sort_vecs[j].size(); k++) {
@@ -226,7 +223,6 @@ int ARSMI_init(void)
                 continue;
             }
 
-            uint64_t hops;
             uint64_t type;
             uint64_t weight;
             uint64_t min_bandwidth;
diff --git a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
index 1fa16c3849..ab8631950c 100644
--- a/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
+++ b/projects/rccl/src/misc/msccl/msccl_lifecycle.cc
@@ -30,7 +30,6 @@
 RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
 RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
 RCCL_PARAM(MscclEnableSingleProcess, "MSCCL_ENABLE_SINGLE_PROCESS", 1);
-static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
 
 bool mscclEnabled() {
 #ifdef COMPILE_MSCCL_KERNEL
diff --git a/projects/rccl/src/misc/msccl/msccl_parser.cc b/projects/rccl/src/misc/msccl/msccl_parser.cc
index 2d532af6da..6cd9d0fadd 100644
--- a/projects/rccl/src/misc/msccl/msccl_parser.cc
+++ b/projects/rccl/src/misc/msccl/msccl_parser.cc
@@ -708,7 +708,6 @@ ncclResult_t mscclXmlLoadSingleNode(FILE* file, struct mscclXmlNode* node) {
 }
 
 ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode* node) {
-  ncclResult_t ret = ncclSuccess;
   FILE* file = fopen(xmlFilePath, "r");
   if (file == NULL) {
     fprintf(stderr, "Could not open MSCCL XML algorithm file %s : %s", xmlFilePath, strerror(errno));
@@ -720,7 +719,6 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
 }
 
 ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
-  ncclResult_t ret = ncclSuccess;
   struct mscclXmlNode* node;
   node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
   NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));
diff --git a/projects/rccl/src/misc/msccl/msccl_setup.cc b/projects/rccl/src/misc/msccl/msccl_setup.cc
index ed5075d049..1cae215c41 100644
--- a/projects/rccl/src/misc/msccl/msccl_setup.cc
+++ b/projects/rccl/src/misc/msccl/msccl_setup.cc
@@ -123,7 +123,6 @@ ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm)
 
 static ncclResult_t mscclSetupProxyImpl(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
   mscclStatus& status = mscclGetStatus(comm);
-  mscclThreadLocalStatus& threadLocalStatus = mscclGetThreadLocalStatus();
   struct ncclProxyOp proxyOp = {};
   proxyOp.connIndex = 0;
   proxyOp.sliceSteps = status.sliceSteps;
diff --git a/projects/rccl/src/misc/proxy_trace/proxy_trace.cc b/projects/rccl/src/misc/proxy_trace/proxy_trace.cc
index 5d585389d4..c93b961853 100644
--- a/projects/rccl/src/misc/proxy_trace/proxy_trace.cc
+++ b/projects/rccl/src/misc/proxy_trace/proxy_trace.cc
@@ -147,8 +147,8 @@ std::string facebook_rccl::ProxyTrace::dump(uint64_t commHash) {
       sortedDumpStrMap[traceKey.str()] = proxyOpMap.second.str();
     }
   }
-  for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
-    result += proxyOpStr;
+  for (const auto &pair : sortedDumpStrMap) {
+    result += pair.second; //proxyOpStr
   }
   return result;
 }
@@ -159,22 +159,22 @@ std::string facebook_rccl::ProxyTrace::dump() {
 
   // maps serialized key to serliazed proxyOp; sorted by key
   std::map<std::string, std::string> sortedDumpStrMap;
-  for (auto &[commHash, opCountMap] : activeOps) {
-    for (auto &[opCount, proxyOpMap] : opCountMap) {
-      for (auto &[opId, opEntry] : proxyOpMap) {
-        ProxyTraceRecordKey traceKey = {commHash, opCount, opId};
-        opEntry.computeStatus();
-        sortedDumpStrMap[traceKey.str()] = opEntry.str();
+  for (auto &commHash_opCountMap : activeOps) {
+    for (auto &opCount_proxyOpMap : commHash_opCountMap.second /*opCountMap*/) {
+      for (auto &opId_opEntry : opCount_proxyOpMap.second/*proxyOpMap*/) {
+        ProxyTraceRecordKey traceKey = {commHash_opCountMap.first, opCount_proxyOpMap.first, opId_opEntry.first};
+        opId_opEntry.second.computeStatus();
+        sortedDumpStrMap[traceKey.str()] = opId_opEntry.second.str();
       }
     }
   }
 
   // add the recent finished ops as well
-  for (const auto &[keyStr, proxyOpStr] : finishedOps) {
-    sortedDumpStrMap[keyStr] = proxyOpStr;
+  for (const auto &keyStr_proxyOpStr : finishedOps) {
+    sortedDumpStrMap[keyStr_proxyOpStr.first] = keyStr_proxyOpStr.second;
   }
-  for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
-    result += proxyOpStr;
+  for (const auto &keyStr_proxyOpStr : sortedDumpStrMap) {
+    result += keyStr_proxyOpStr.second;
   }
   return result;
 }
@@ -207,15 +207,15 @@ std::string facebook_rccl::ProxyTraceOp::str() {
 
 float facebook_rccl::ProxyTrace::getMapSizeMB() const {
   float size = 0;
-  for (const auto &[commHash, opCountMap] : activeOps) {
-    for (const auto &[opCount, proxyOpMap] : opCountMap) {
-      size += proxyOpMap.size() *
+  for (const auto &commHash_opCountMap : activeOps) {
+    for (const auto &opCount_proxyOpMap : commHash_opCountMap.second) {
+      size += opCount_proxyOpMap.second.size() *
               (sizeof(ProxyTraceOp) +
                sizeof(std::unique_ptr<facebook_rccl::ProxyTraceOp>));
     }
   }
-  for (const auto &[keyStr, proxyOpStr] : finishedOps) {
-    size += keyStr.size() + proxyOpStr.size();
+  for (const auto &keyStr_proxyOpStr : finishedOps) {
+    size += keyStr_proxyOpStr.first.size() + keyStr_proxyOpStr.second.size();
   }
   return size / 1024.0 / 1024.0;
 }
diff --git a/projects/rccl/src/misc/recorder.cc b/projects/rccl/src/misc/recorder.cc
index 8268378ad0..3d390225c7 100644
--- a/projects/rccl/src/misc/recorder.cc
+++ b/projects/rccl/src/misc/recorder.cc
@@ -40,8 +40,8 @@ rcclApiCall::rcclApiCall(rcclCall_t type, const ncclInfo& info)://name(rcclCallS
                                                                 datatype(info.datatype),
                                                                 op(info.op),
                                                                 root(info.root),
-                                                                comm(info.comm),
                                                                 nRanks(info.comm->nRanks),
+                                                                comm(info.comm),
                                                                 stream(info.stream),
                                                                 nTasks(info.comm->planner.nTasksP2p + info.comm->planner.nTasksColl),
                                                                 globalRank(info.comm->localRankToRank[info.comm->localRank])
diff --git a/projects/rccl/src/misc/socket.cc b/projects/rccl/src/misc/socket.cc
index 8d902d23d2..87f2e7eb10 100644
--- a/projects/rccl/src/misc/socket.cc
+++ b/projects/rccl/src/misc/socket.cc
@@ -663,8 +663,9 @@ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
 }
 
 ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+#ifdef ENABLE_TRACE
   char line[SOCKET_NAME_MAXLEN+1];
-
+#endif
   if (sock == NULL) {
     WARN("ncclSocketConnect: pass NULL socket");
     return ncclInvalidArgument;
diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc
index dbd4a80ace..370b1ce471 100644
--- a/projects/rccl/src/proxy.cc
+++ b/projects/rccl/src/proxy.cc
@@ -292,10 +292,8 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
   struct ncclProxyArgs* op = state->active;
   int poolIndex, opIndex;
   int list_len = 0;
-  int sublist_len = 0;
   fprintf(stderr, "ACTIVE OPS\n");
   while (op) {
-    sublist_len = 0;
     NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
     if (op->state & OP_SEEN) {
       WARN("List loop at element %d-%d", poolIndex, opIndex);
@@ -304,7 +302,6 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
     op->state |= OP_SEEN;
     struct ncclProxyArgs* nextOp = op->nextPeer;
     while (nextOp) {
-      sublist_len++;
       NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
       if (nextOp->state & OP_SEEN) {
         WARN("List loop at element %d-%d", poolIndex, opIndex);
diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc
index 7bcea409eb..483c18e24b 100644
--- a/projects/rccl/src/transport/coll_net.cc
+++ b/projects/rccl/src/transport/coll_net.cc
@@ -524,6 +524,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
   NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 
   int dmabuf_fd = -1;
+  (void)dmabuf_fd; /*compiler warnings fix - unused variable*/
 #if CUDA_VERSION >= 11070
   /* DMA-BUF support */
   if (resources->useGdr && resources->useDmaBuf) {
@@ -600,8 +601,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
   struct connectMapMem* mapMem = map->mems+bank;
   NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
   NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
-
+  
   int dmabuf_fd = -1;
+  (void)dmabuf_fd; /*compiler warnings fix - unused variable*/
 #if CUDA_VERSION >= 11070
   /* DMA-BUF support */
   if (resources->useGdr && resources->useDmaBuf) {
@@ -1314,8 +1316,8 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
     NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
     needReg = false;
   }
-#endif
 peermem:
+#endif
   if (dmabuf_fd != -1) {
     (void)close(dmabuf_fd);
     dmabuf_fd = -1;
@@ -1342,20 +1344,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s
 
   assert(reqSize == sizeof(struct collnetRegInfo));
   assert(respSize == sizeof(void*));
-  int dmabuf_fd = -1;
   #if CUDART_VERSION >= 11070
+  int dmabuf_fd = -1;
   /* DMA-BUF support */
   if (resources->useGdr && resources->useDmaBuf) {
     CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem);
     NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
     needReg = false;
   }
-#endif
 peermem:
   if (dmabuf_fd != -1) {
     (void)close(dmabuf_fd);
     dmabuf_fd = -1;
   }
+#endif
   if (needReg) {
     NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
   }
diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc
index 1669ac0679..ac3fd55e25 100644
--- a/projects/rccl/src/transport/net.cc
+++ b/projects/rccl/src/transport/net.cc
@@ -362,7 +362,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
     send->transportResources = map;
     opId = send;
     INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
-    netSendConnectArgs args = {0};
+    netSendConnectArgs args = {{},0};
     memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t));
     args.trafficClass = comm->config.trafficClass;
     NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId));
diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc
index a5d89c4b43..7bdfbe07b3 100644
--- a/projects/rccl/src/transport/p2p.cc
+++ b/projects/rccl/src/transport/p2p.cc
@@ -1077,9 +1077,6 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
   struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
   void* regAddr = NULL;
   ncclResult_t ret = ncclSuccess;
-  bool mapped = false;
-  bool imported = false;
-  CUmemGenericAllocationHandle handle;
 
   assert(sizeof(struct p2pIpcExpInfo) == reqSize);
   assert(sizeof(void*) == respSize);
@@ -1094,6 +1091,9 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
     regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
   } else {
 #if CUDART_VERSION >= 11030
+    bool mapped = false; /*compiler warning, defining vars only if needed*/
+    bool imported = false;
+    CUmemGenericAllocationHandle handle;
     // cuMem import
     if (connection->sameProcess) {
       // if proxy is same process as request peer, we just need to map the handle.