Compiler warnings fix 2 (#1801)

* Changes to device code * Changes to src/misc * Changes to graph * src/include changes * src/transport changes * changes in init, enqueue, proxy * Changes to CMakeLists.txt * Additional changes to device code * Additional changes to net.cc * adding 'compiler warning' tag to ease upstream merge' * typo correction * Addessing comments * Additional changes for new commits [ROCm/rccl commit: 3f8cac388e]
2025-08-05 17:36:23 -05:00
@@ -1112,7 +1112,8 @@ target_compile_options(rccl PRIVATE -Werror=sometimes-uninitialized)
 target_compile_options(rccl PRIVATE -Wall)
 target_compile_options(rccl PRIVATE -Werror=deprecated-copy-with-user-provided-copy)
 target_compile_options(rccl PRIVATE -Wno-format-nonliteral)
-target_compile_options(rccl PRIVATE -fgpu-rdc)      # Generate relocatable device code (required for extern __shared__)
+target_compile_options(rccl PRIVATE -Wno-unused-function)
+target_compile_options(rccl PRIVATE -fgpu-rdc)

 ## Set RCCL compile and linker options for unit tests and code coverage
 if(ENABLE_CODE_COVERAGE)
@@ -16,7 +16,10 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
    const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
    ncclRing *ring = &ncclShmem.channel.ring;
    const int *ringRanks = ring->userRanks;
    const int nranks = ncclShmem.comm.nRanks;
@@ -27,9 +30,6 @@ namespace {
    int nelem;
    int rankDest;

-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
    if (tid == 0) {
@@ -229,6 +229,7 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
    static constexpr int nworkers = NCCL_PAT_NWORKERS;
    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
    uint64_t pollCount = 0;
+    (void)pollCount; // unused variable - compiler warning
    __syncthreads(); // Don't start using shared mem until everyone arrives
    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
    if (tid == 0) shmem->localAccSize = 0;
@@ -238,11 +239,14 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
    if (tid == nworkers) { // Algo computation thread
      PatAGAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      (void)parallelFactor;// unused variable - compiler warning
      int step = 0;
      while (1) {
        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
        int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) { 
+          pollCount++ ;// Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        }
        patAlgo.getNextOp(ps);
        int last = ps->last;
        step++;
@@ -267,7 +271,9 @@ struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE
      while(1) {
        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
        int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0){
+          pollCount++; // Wait for compute thread 
+        }
        int last = ps->last;
        prims.patCopy(ps, shmem);
        if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
@@ -23,7 +23,10 @@ namespace {
    ncclRing *ring = &ncclShmem.channel.ring;
    int ringIx = ring->index;
    const int nranks = ncclShmem.comm.nRanks;
+#if defined(ENABLE_NPKIT)
    const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
    ssize_t size;
    ssize_t gridOffset;
    ssize_t channelCount;
@@ -34,9 +37,6 @@ namespace {
    int nelem;
    int chunk;

-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
    if (tid == 0) {
@@ -216,7 +216,10 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
    const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
    ncclTree *tree = &ncclShmem.channel.tree;
    size_t size;
    size_t gridOffset;
@@ -226,9 +229,6 @@ namespace {
    size_t offset;
    int nelem;

-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
    if (tid == 0) {
@@ -364,7 +364,9 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
-    const int bid = ncclShmem.channelId - work->channelLo;
+#if defined(ENABLE_NPKIT)
+    const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
+#endif
    ncclTree *tree = &ncclShmem.channel.tree;
    size_t size;
    size_t gridOffset;
@@ -15,7 +15,10 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+#if defined(ENABLE_NPKIT)
    const int bid = ncclShmem.channelId - work->channelLo;
+    int npKitCtxIdx = bid; // unused variable - compiler warning
+#endif
    ncclRing *ring = &ncclShmem.channel.ring;
    const int rank = ring->userRanks[0];
    const int nextRank = ring->userRanks[1];
@@ -30,9 +33,6 @@ namespace {
    int workNthreads;
    bool isNetOffload = work->isOneRPN && work->netRegUsed;

-#if defined(ENABLE_NPKIT)
-    int npKitCtxIdx = bid;
-#endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
    if (tid == 0) {
@@ -20,7 +20,7 @@
  if (nthreads == NCCL_MAX_NTHREADS) { \
    __THREAD_FENCE; __builtin_amdgcn_s_barrier(); \
  } else { \
-    const int w = threadIdx.x/WARP_SIZE; \
+    /**const int w = threadIdx.x/WARP_SIZE //unused variable - compiler warning**/;\
    const int wid = threadIdx.x%WARP_SIZE; \
    if (wid == 0) { \
      (BARRIER_NEXT) += (NWORKERS) / WARP_SIZE; \
@@ -148,7 +148,8 @@ private:
  __device__ uint64_t readLL(int offset, int i) {
    union ncclLLFifoLine* src = recvPtr(i) + offset;
    uint32_t flag = recvFlag(i);
-    uint32_t data1, flag1, data2, flag2;
+    uint32_t data1, flag1, data2, flag2; 
+    (void)data1; (void)flag1; (void)data2; (void)flag2; // unused variable - compiler warning
    int spins = 0;

 #if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
@@ -865,8 +866,8 @@ public:
      int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;

      DataLoader dl;
-      ncclLLFifoLine line[MaxRecv];
-      uint64_t data, peerData;
+      // ncclLLFifoLine line[MaxRecv];//unused variable - compiler warning
+      uint64_t data /*peerData*/;     //unused variable - compiler warning
      dl.loadBegin(srcElts, eltInLine);
      srcElts += eltPerTrip;
      data = dl.loadFinish();
@@ -521,10 +521,9 @@ public:
      bool ipcReg = false, bool netReg = false, int stepSize_ = 0
    ):
    redOp(redOpArg),
-    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
-    warpInBlock(threadIdx.x/WARP_SIZE),
-    flagThread((tid%4)==3), group(group),
-    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
+    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE),                                /*compiler warnings*/
+    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)), 
+    warp(tid/WARP_SIZE), warpInBlock(threadIdx.x/WARP_SIZE), flagThread((tid%4)==3), group(group){
    auto *channel = &ncclShmem.channel;
    barriers = &ncclShmem.groups[group].barrier;
    int nrecv=0, nsend=0;
@@ -751,8 +751,8 @@ public:
      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr,
      struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault
    ):
-    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
-    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {
+    tid(tid), tidInBlock(threadIdx.x), nthreads(nthreads), /*compiler warnings*/
+    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_), group(group) {

    barriers = &ncclShmem.groups[group].barrier;
    // PAT uses the same barrier for each group
@@ -178,7 +178,7 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI

    static constexpr int nworkers = NCCL_PAT_NWORKERS;
    struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0);
-    uint64_t pollCount = 0;
+    //uint64_t pollCount = 0; unused variable - compiler warning
    __syncthreads(); // Don't start using shared mem until everyone arrives
    for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0;
    if (tid == 0) shmem->localAccSize = 0;
@@ -188,11 +188,14 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
    if (tid == nworkers) { // Algo computation thread
      PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks);
      int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor();
+      (void)parallelFactor;// unused variable - compiler warning
      int step = 0;
      while (1) {
        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
        int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) != 0){
+          //pollCount++;// unused variable - compiler warning // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS'
+        }
        patAlgo.getNextOp(ps);
        int last = ps->last;
        step++;
@@ -217,7 +220,9 @@ struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SI
      while(1) {
        struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS);
        int* poll = &ps->flags;
-        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) pollCount++; // Wait for compute thread
+        while (__hip_atomic_load(poll, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) == 0) {
+          //pollCount++; // unused variable - compiler warning // Wait for compute thread
+        }
        int last = ps->last;
        prims.patReduce(ps, shmem);
        if (tidInGroup == 0) __hip_atomic_store(poll, 0, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); // Return element to compute thread
@@ -1238,7 +1238,7 @@ static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduce
      warned = 1;
      WARN("Waiting for work FIFO to become available. "
           "Work fifo exhaustion can happen in large scale/high iteration count of alltoall. "
-           "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %ld).\n\n"
+           "In order to increase work FIFO size, set NCCL_WORK_FIFO_BYTES to higher number (current: %d).\n\n"
           "RCCL continues to retry...", comm->workFifoBytes);
    }

@@ -1776,7 +1776,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
    ncclIntruQueueConstruct(&planner->planQueue);

    bool capturing = ncclCudaGraphValid(planner->capturingGraph);
-    cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
+    //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
    cudaStream_t deviceStream, launchOrder;

    if (capturing || planner->numStreams != 1) {
@@ -119,14 +119,14 @@ bool isRankHere(const char* s, int start, int end, int rank) {

 ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
    struct ncclTopoGraph* treeGraph) {
-  int x=0, y=0;
+  int x=0;
  for (int i=0;  treeGraph->treeBase[i][0]!=0; i++)
  {
    x=i+1;
  }
  if( treeGraph->treeBase[0][0] == 0) return ncclSuccess;
  int nChannels = comm->nChannels;
-  int localRanks = comm->topo->nodes[GPU].count;
+  //int localRanks = comm->topo->nodes[GPU].count; // unused variable - compiler warning
  //new tree
  for (int c=0; c<nChannels; c++) { // in here
    int buff = c%x;
@@ -286,14 +286,17 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
    // GPU not found, we can't use p2p.
    return ncclSuccess;
  }
-
+  #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  int intermediateIndex = -1;
+  #endif
  // Set intermediate GPU rank, if routing through an intermediate GPU.
  struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
  if (path->count == 2) {
    struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
    if (intermediateNode->type == GPU) {
+      #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
      intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
+      #endif
      if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
    }
  }
@@ -324,8 +327,7 @@ compare:
  // Compute the PCI distance and compare with the p2pLevel.
  if (path->type <= p2pLevel) *p2p = 1;

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
-#else
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  if (*p2p == 1) {
    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
    // validate against NVML at all since they are pretending to be on other hw.
@@ -515,10 +517,10 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev,
  int g;
  struct ncclTopoSystem* system = comm->topo;
  NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
-  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  *flush = 1;
 #else
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // unused variable - compiler warning
  // Flush is required on Ampere and earlier
  if (gpu->gpu.cudaCompCap >= 90) *flush = 0;
  // On C2C platforms, data could go through a PCI switch while completions and
@@ -1338,7 +1338,6 @@ end:
 */
 ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) {
  int gpus[NCCL_TOPO_MAX_NODES]; //transcribe/change according to gpu_map
-  int nChannels = 0;
  int gpu = 0;
  int offset = 0;
  int start_offset = offset;
@@ -1348,7 +1347,7 @@ ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, str
  }
  int status = 0; // 0 : between numbers, 1 : inside number
  int ngpus = system->nodes[GPU].count;
-  int x=0, y=0;
+  int x=0;
  do {
    int digit = str[offset] - '0';
    if (digit >= 0 && digit <= 9) {
@@ -1855,7 +1854,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
  bool isAlltoall = checkAlltoallWidth(&romeTopo);
  if (!isAlltoall) return ncclSuccess;

-  int gcnt = 0;
  int *g8, n[NCCL_TOPO_MAX_NODES];
  int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
  struct timeval tvs, tve;
@@ -1878,7 +1876,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
      }
      if (ngpusPerNuma == 0) continue;
      if (ngpusPerNuma != NUMA_GPUS) break;
-      gcnt++;
      // init GPU mapping
      for (int k = 0; k < ngpus; k++) {
        if (romeTopo.gpuNuma[k] != j) continue;
@@ -1927,7 +1924,6 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
    if (p < TOTAL_PERMUTE_COUNT) break;
  }
  gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
  if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
    //printf("No solution in %.2fms\n", t);
    return ncclSuccess;
@@ -2035,7 +2031,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
  int i;

  int ngpus = system->nodes[GPU].count;
-  int ncpus = system->nodes[CPU].count;
  int nnets = system->nodes[NET].count;

  // Only support ring and tree graphs
@@ -2129,7 +2124,6 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
    }
  }
  gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
  if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
    //printf("No solution in %.2fms (%d iter)\n", t, time);
    return ncclSuccess;
@@ -2241,7 +2235,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
  // only match for system with 16 GPUs
  if (ngpus != 16 || ncpus != NUMA_CPUS) return ncclSuccess;

-  int gcnt = 0;
  int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
  int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
  struct timeval tvs, tve;
@@ -2262,7 +2255,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
      }
      if (ngpusPerNuma == 0) continue;
      if (ngpusPerNuma != NUMA_GPUS) break;
-      gcnt++;
      // init GPU mapping
      for (int k = 0; k < ngpus; k++) {
        if (romeTopo.gpuNuma[k] != j) continue;
@@ -2317,7 +2309,6 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
    if (p < TOTAL_PERMUTE_COUNT) break;
  }
  gettimeofday(&tve, NULL);
-  float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
  if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
    //printf("No solution in %.2fms\n", t);
    return ncclSuccess;
@@ -168,7 +168,7 @@ static struct tuningModel tuning_model_0 {
    { 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
  },

-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_1 {
@@ -200,7 +200,7 @@ static struct tuningModel tuning_model_1 {
    { 0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0, },
  },

-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_2 {
@@ -232,7 +232,7 @@ static struct tuningModel tuning_model_2 {
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
  },

-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_3 {
@@ -264,7 +264,7 @@ static struct tuningModel tuning_model_3 {
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
  },

-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_4 {
@@ -296,7 +296,7 @@ static struct tuningModel tuning_model_4 {
    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, },
  },

-  .llProtoRanges = {RCCL_LL_LIMITS_UNDEFINED},
+  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_5 {
@@ -394,6 +394,7 @@ static struct tuningModel rcclTuningModel[] = {
 #define HOPPER_COMPCAP_IDX 2
 #define BLACKWELL_COMPCAP_IDX 3

+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
 // LL128 max BW per channel
 static const double llMaxBws[][3] = {
  /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
@@ -420,6 +421,7 @@ static const double perChMaxTreeBws[][3] = {
  /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
  /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0},
 };
+#endif

 NCCL_PARAM(PatEnable, "PAT_ENABLE", 0);
 static int ncclPatEnable(struct ncclComm* comm) {
@@ -470,7 +472,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  int nNodes = comm->nNodes;
  int nRanks = comm->nRanks;
  if (nRanks <= 1) return ncclSuccess;
-
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
  int index2 = nNodes <= 2 ? nNodes-1 : 2;
  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
@@ -480,6 +482,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2];
  double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
  double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
+#endif
  // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
  //if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
  float ppn = (float)nRanks / nNodes;
@@ -805,11 +808,13 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom

 // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
 // factor is not ideal but works quite well. Powers of two, 64 B to 256MB.
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
 static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .4,  .4,  .5,  .6,  .7,  .8,  .9, 1.0, 1.0, 1.0, 1.0 },
  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .7,  .6,  .6,  .6,  .6,  .6,  .6,  .8,  .9,  .9,  .9,  .9, 1.0, 1.0 },
  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };
+#endif

 ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
  float bw = comm->bandwidths[coll][algorithm][protocol];
@@ -180,11 +180,11 @@ static gdr_t ncclGdrInit() {

 template <typename T>
 static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
-  gdr_info_t info;
+  // gdr_info_t info; // unused variable - compiler warning
  size_t mapSize;
-  gdr_mh_t mh;
+  // gdr_mh_t mh;     // unused variable - compiler warning
  char *devMem;
-  void *gdrMap;
+  // void *gdrMap;    // unused variable - compiler warning

  mapSize = ncclSizeOfT<T>()*nelem;

@@ -216,7 +216,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**

 template <typename T>
 static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
-  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
+  //gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; // unused variable - compiler warning
  memcpy(dst, src, nelem*sizeof(T));
  return ncclSuccess;
 }
@@ -73,11 +73,6 @@ inline __device__  rccl_float8 hadd(rccl_float8 x, rccl_float8 y)
    u.i16_vec = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(v1, v1, /* scale */ 1.f, 0);
    return u.fp8[0];
 #elif __HIP_DEVICE_COMPILE__ && defined(__gfx942__)
-     union
-    {
-        uint32_t i32val;
-        rccl_float8 i8val[4];
-    } val;

    float2_t v;
    uint32_t ival = 0;
@@ -235,7 +230,7 @@ namespace rocblas_hip_f8_impl
        else
            x = reinterpret_cast<uint16_t&>(_x);

-        uint32_t y, head, mantissa;
+        uint32_t head, mantissa;
        int      exponent, bias;
        uint32_t sign;

@@ -709,7 +709,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
  int nRanks = comm->nRanks;
  struct ncclDevCommAndChannels tmpCommAndChans;
  struct ncclDevCommAndChannels *devCommAndChans = NULL;
-  struct ncclNvmlCCStatus ccStatus;
+  //struct ncclNvmlCCStatus ccStatus; //unused variable - compiler warning
  bool ccEnable = false;
  cudaStream_t deviceStream;

@@ -1882,9 +1882,6 @@ fail:
 static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
  ncclComm_t comm = job->comm;
-#ifdef ENABLE_MSCCLPP
-  ncclUniqueId origUniqueId = *job->commId;
-#endif
  ncclResult_t res = ncclSuccess;
  int archMajor, archMinor;
  size_t maxLocalSizeBytes = 0;
@@ -1895,8 +1892,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  double sum_timers = 0;
  uint64_t timers[TIMERS_INIT_COUNT] = {0};
  unsigned long long commIdHash;
+  #ifdef USE_INDIRECT_FUNCTION_CALL
  int64_t stackSize;
  hipDeviceProp_t devProp;
+  #endif

  timers[TIMER_INIT_TOTAL] = clockNano();
  CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
@@ -48,7 +48,6 @@ struct ARSMI_systemNode {
    std::string s_card;
 };

-static const char *kPathDRMRoot = "/sys/class/drm";
 static const char *kKFDNodesPathRoot = "/sys/class/kfd/kfd/topology/nodes";
 static const uint32_t kAmdGpuId = 0x1002;

@@ -66,7 +65,6 @@ static thread_local int ARSMI_num_devices=-1;
 int ARSMI_init(void)
 {
    std::string err_msg;
-    uint32_t count = 0;
    std::multimap<uint64_t, ARSMI_systemNode> ARSMI_allSystemNodes;

    if (ARSMI_num_devices > 0) {
@@ -101,7 +99,7 @@ int ARSMI_init(void)
        int ret_loc_id = read_node_properties(node_id, "location_id", &location_id, properties);
        int ret_domain = read_node_properties(node_id, "domain", &domain, properties);
        int ret_vendor = read_node_properties(node_id, "vendor_id", &vendor_id, properties);
-        if (ret_gpu_id == 0 &&  ~(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0 || ret_vendor != 0) &&
+        if (ret_gpu_id == 0 &&  !(ret_unique_id != 0 || ret_loc_id != 0 || ret_domain != 0 || ret_vendor != 0) &&
            (gpu_id != 0) && (vendor_id == kAmdGpuId)) {
            // Do not try to build a node if one of these fields
            // do not exist in KFD (0 as values okay)
@@ -194,7 +192,6 @@ int ARSMI_init(void)
    // the order of each block.
    for (auto i=0; i < first_elem.size(); i++) {
        // Find the first_elem[i] in sort_vecs in
-        bool found = false;
        for (auto j = 0; j < sort_vecs.size(); j++ ) {
            if (first_elem[i] == sort_vecs[j][0].s_bdf) {
                for (auto k=0; k<sort_vecs[j].size(); k++) {
@@ -226,7 +223,6 @@ int ARSMI_init(void)
                continue;
            }

-            uint64_t hops;
            uint64_t type;
            uint64_t weight;
            uint64_t min_bandwidth;
@@ -30,7 +30,6 @@
 RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
 RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
 RCCL_PARAM(MscclEnableSingleProcess, "MSCCL_ENABLE_SINGLE_PROCESS", 1);
-static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";

 bool mscclEnabled() {
 #ifdef COMPILE_MSCCL_KERNEL
@@ -708,7 +708,6 @@ ncclResult_t mscclXmlLoadSingleNode(FILE* file, struct mscclXmlNode* node) {
 }

 ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode* node) {
-  ncclResult_t ret = ncclSuccess;
  FILE* file = fopen(xmlFilePath, "r");
  if (file == NULL) {
    fprintf(stderr, "Could not open MSCCL XML algorithm file %s : %s", xmlFilePath, strerror(errno));
@@ -720,7 +719,6 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
 }

 ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
-  ncclResult_t ret = ncclSuccess;
  struct mscclXmlNode* node;
  node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
  NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));
@@ -123,7 +123,6 @@ ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm)

 static ncclResult_t mscclSetupProxyImpl(struct mscclAlgo* hostAlgo, ncclComm_t comm) {
  mscclStatus& status = mscclGetStatus(comm);
-  mscclThreadLocalStatus& threadLocalStatus = mscclGetThreadLocalStatus();
  struct ncclProxyOp proxyOp = {};
  proxyOp.connIndex = 0;
  proxyOp.sliceSteps = status.sliceSteps;
@@ -147,8 +147,8 @@ std::string facebook_rccl::ProxyTrace::dump(uint64_t commHash) {
      sortedDumpStrMap[traceKey.str()] = proxyOpMap.second.str();
    }
  }
-  for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
-    result += proxyOpStr;
+  for (const auto &pair : sortedDumpStrMap) {
+    result += pair.second; //proxyOpStr
  }
  return result;
 }
@@ -159,22 +159,22 @@ std::string facebook_rccl::ProxyTrace::dump() {

  // maps serialized key to serliazed proxyOp; sorted by key
  std::map<std::string, std::string> sortedDumpStrMap;
-  for (auto &[commHash, opCountMap] : activeOps) {
-    for (auto &[opCount, proxyOpMap] : opCountMap) {
-      for (auto &[opId, opEntry] : proxyOpMap) {
-        ProxyTraceRecordKey traceKey = {commHash, opCount, opId};
-        opEntry.computeStatus();
-        sortedDumpStrMap[traceKey.str()] = opEntry.str();
+  for (auto &commHash_opCountMap : activeOps) {
+    for (auto &opCount_proxyOpMap : commHash_opCountMap.second /*opCountMap*/) {
+      for (auto &opId_opEntry : opCount_proxyOpMap.second/*proxyOpMap*/) {
+        ProxyTraceRecordKey traceKey = {commHash_opCountMap.first, opCount_proxyOpMap.first, opId_opEntry.first};
+        opId_opEntry.second.computeStatus();
+        sortedDumpStrMap[traceKey.str()] = opId_opEntry.second.str();
      }
    }
  }

  // add the recent finished ops as well
-  for (const auto &[keyStr, proxyOpStr] : finishedOps) {
-    sortedDumpStrMap[keyStr] = proxyOpStr;
+  for (const auto &keyStr_proxyOpStr : finishedOps) {
+    sortedDumpStrMap[keyStr_proxyOpStr.first] = keyStr_proxyOpStr.second;
  }
-  for (const auto &[keyStr, proxyOpStr] : sortedDumpStrMap) {
-    result += proxyOpStr;
+  for (const auto &keyStr_proxyOpStr : sortedDumpStrMap) {
+    result += keyStr_proxyOpStr.second;
  }
  return result;
 }
@@ -207,15 +207,15 @@ std::string facebook_rccl::ProxyTraceOp::str() {

 float facebook_rccl::ProxyTrace::getMapSizeMB() const {
  float size = 0;
-  for (const auto &[commHash, opCountMap] : activeOps) {
-    for (const auto &[opCount, proxyOpMap] : opCountMap) {
-      size += proxyOpMap.size() *
+  for (const auto &commHash_opCountMap : activeOps) {
+    for (const auto &opCount_proxyOpMap : commHash_opCountMap.second) {
+      size += opCount_proxyOpMap.second.size() *
              (sizeof(ProxyTraceOp) +
               sizeof(std::unique_ptr<facebook_rccl::ProxyTraceOp>));
    }
  }
-  for (const auto &[keyStr, proxyOpStr] : finishedOps) {
-    size += keyStr.size() + proxyOpStr.size();
+  for (const auto &keyStr_proxyOpStr : finishedOps) {
+    size += keyStr_proxyOpStr.first.size() + keyStr_proxyOpStr.second.size();
  }
  return size / 1024.0 / 1024.0;
 }
@@ -40,8 +40,8 @@ rcclApiCall::rcclApiCall(rcclCall_t type, const ncclInfo& info)://name(rcclCallS
                                                                datatype(info.datatype),
                                                                op(info.op),
                                                                root(info.root),
-                                                                comm(info.comm),
                                                                nRanks(info.comm->nRanks),
+                                                                comm(info.comm),
                                                                stream(info.stream),
                                                                nTasks(info.comm->planner.nTasksP2p + info.comm->planner.nTasksColl),
                                                                globalRank(info.comm->localRankToRank[info.comm->localRank])
@@ -663,8 +663,9 @@ ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
 }

 ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+#ifdef ENABLE_TRACE
  char line[SOCKET_NAME_MAXLEN+1];
-
+#endif
  if (sock == NULL) {
    WARN("ncclSocketConnect: pass NULL socket");
    return ncclInvalidArgument;
@@ -292,10 +292,8 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
  struct ncclProxyArgs* op = state->active;
  int poolIndex, opIndex;
  int list_len = 0;
-  int sublist_len = 0;
  fprintf(stderr, "ACTIVE OPS\n");
  while (op) {
-    sublist_len = 0;
    NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
    if (op->state & OP_SEEN) {
      WARN("List loop at element %d-%d", poolIndex, opIndex);
@@ -304,7 +302,6 @@ ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
    op->state |= OP_SEEN;
    struct ncclProxyArgs* nextOp = op->nextPeer;
    while (nextOp) {
-      sublist_len++;
      NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
      if (nextOp->state & OP_SEEN) {
        WARN("List loop at element %d-%d", poolIndex, opIndex);
@@ -524,6 +524,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);

  int dmabuf_fd = -1;
+  (void)dmabuf_fd; /*compiler warnings fix - unused variable*/
 #if CUDA_VERSION >= 11070
  /* DMA-BUF support */
  if (resources->useGdr && resources->useDmaBuf) {
@@ -600,8 +601,9 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
  struct connectMapMem* mapMem = map->mems+bank;
  NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
  NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
-
+  
  int dmabuf_fd = -1;
+  (void)dmabuf_fd; /*compiler warnings fix - unused variable*/
 #if CUDA_VERSION >= 11070
  /* DMA-BUF support */
  if (resources->useGdr && resources->useDmaBuf) {
@@ -1314,8 +1316,8 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s
    NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
    needReg = false;
  }
-#endif
 peermem:
+#endif
  if (dmabuf_fd != -1) {
    (void)close(dmabuf_fd);
    dmabuf_fd = -1;
@@ -1342,20 +1344,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s

  assert(reqSize == sizeof(struct collnetRegInfo));
  assert(respSize == sizeof(void*));
-  int dmabuf_fd = -1;
  #if CUDART_VERSION >= 11070
+  int dmabuf_fd = -1;
  /* DMA-BUF support */
  if (resources->useGdr && resources->useDmaBuf) {
    CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem);
    NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem);
    needReg = false;
  }
-#endif
 peermem:
  if (dmabuf_fd != -1) {
    (void)close(dmabuf_fd);
    dmabuf_fd = -1;
  }
+#endif
  if (needReg) {
    NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail);
  }
@@ -362,7 +362,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne
    send->transportResources = map;
    opId = send;
    INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId);
-    netSendConnectArgs args = {0};
+    netSendConnectArgs args = {{},0};
    memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t));
    args.trafficClass = comm->config.trafficClass;
    NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId));
@@ -1077,9 +1077,6 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
  struct p2pIpcExpInfo* ipcExpInfo = (struct p2pIpcExpInfo*)reqBuff;
  void* regAddr = NULL;
  ncclResult_t ret = ncclSuccess;
-  bool mapped = false;
-  bool imported = false;
-  CUmemGenericAllocationHandle handle;

  assert(sizeof(struct p2pIpcExpInfo) == reqSize);
  assert(sizeof(void*) == respSize);
@@ -1094,6 +1091,9 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str
    regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset);
  } else {
 #if CUDART_VERSION >= 11030
+    bool mapped = false; /*compiler warning, defining vars only if needed*/
+    bool imported = false;
+    CUmemGenericAllocationHandle handle;
    // cuMem import
    if (connection->sameProcess) {
      // if proxy is same process as request peer, we just need to map the handle.