diff --git a/CMakeLists.txt b/CMakeLists.txt
index bdd8ed5b92..f71929e5c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -300,7 +300,7 @@ foreach(target ${AMDGPU_TARGETS})
 endforeach()
 
 if("${HIP_COMPILER}" MATCHES "clang")
-  target_compile_options(rccl PRIVATE -fvisibility=hidden)
+  target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
   foreach(target ${AMDGPU_TARGETS})
     target_compile_options(rccl PRIVATE -fgpu-rdc)
   endforeach()
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index d83425895d..81a091cb23 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -16,12 +16,12 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclRing *ring = &ncclShmem->channel.ring;
+    ncclRing *ring = &ncclShmem.channel.ring;
     const int *ringRanks = ring->userRanks;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
     // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
-    const int nranks = ncclShmem->comm.nRanks;
+    const int nranks = ncclShmem.comm.nRanks;
     const ssize_t loopSize = nChannels*int(chunkSize);
     const ssize_t size = args->count;
 
@@ -79,7 +79,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
@@ -87,14 +87,14 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL>(args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea90cbd37f..f71953aa26 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -20,10 +20,10 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclRing *ring = &ncclShmem->channel.ring;
+    ncclRing *ring = &ncclShmem.channel.ring;
     int ringIx = ring->index;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
-    const int nranks = ncclShmem->comm.nRanks;
+    const int nranks = ncclShmem.comm.nRanks;
     const ssize_t loopSize = nChannels*nranks*chunkSize;
     const ssize_t size = args->count;
 
@@ -33,23 +33,23 @@ namespace {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
-      uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
+      uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -102,7 +102,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -112,7 +112,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -121,7 +121,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
       if (tid == 0 && nranks > 2) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -136,7 +136,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
       if (tid == 0 && nranks > 2) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -149,7 +149,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -159,14 +159,14 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
       if (tid == 0 && nranks > 2) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -182,14 +182,14 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
       if (tid == 0 && nranks > 2) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -203,7 +203,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -212,7 +212,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -224,7 +224,7 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclTree *tree = &ncclShmem->channel.tree;
+    ncclTree *tree = &ncclShmem.channel.tree;
     ssize_t chunkSize = int(
       Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
                    /* LL & LL128 */  : Proto::calcBytePerStep()/sizeof(T));
@@ -240,23 +240,23 @@ namespace {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (tid == 0) {
-      uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
+      uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -276,7 +276,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -306,7 +306,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -325,7 +325,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -355,7 +355,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
       if (tid == 0) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -364,7 +364,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -376,7 +376,7 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem->channel.binTree : &ncclShmem->channel.tree;
+    ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem.channel.binTree : &ncclShmem.channel.tree;
     ssize_t chunkSize = int(
       Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
                                  : Proto::calcBytePerStep()/sizeof(T));
@@ -410,23 +410,23 @@ namespace {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (isNpKitThread) {
-      uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
+      uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
     if (isNpKitThread) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
     if (isNpKitThread) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -447,7 +447,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -461,7 +461,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -487,7 +487,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -510,7 +510,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -529,7 +529,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -552,7 +552,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -561,7 +561,7 @@ namespace {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
     if (isNpKitThread) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -570,7 +570,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
@@ -578,7 +578,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
   }
 };
@@ -590,7 +590,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
     const int tid = threadIdx.x;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    struct ncclDirect* tree = &ncclShmem->channel.collTree;
+    struct ncclDirect* tree = &ncclShmem.channel.collTree;
     const ssize_t chunkSize = int(args->lastChunkSize);
     const ssize_t size = args->count;
     const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
@@ -683,14 +683,14 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL>(args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
     else runTreeSplit<T, RedOp, ProtoLL>(args);
   }
@@ -698,7 +698,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL128>(args);
     //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
   }
@@ -706,7 +706,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runTreeSplit<T, RedOp, ProtoLL128>(args);
     //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
   }
diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h
index 948bdc2827..0ffa7caee9 100644
--- a/src/collectives/device/alltoall_pivot.h
+++ b/src/collectives/device/alltoall_pivot.h
@@ -14,8 +14,8 @@ namespace {
     const int tid = threadIdx.x;
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
-    const int nranks = ncclShmem->comm.nRanks;
-    const ncclRing *ring = &ncclShmem->channel.ring;
+    const int nranks = ncclShmem.comm.nRanks;
+    const ncclRing *ring = &ncclShmem.channel.ring;
     const int num_bi_rings = args->pivotA2ANumBiRings;
     const int num_uni_rings = num_bi_rings * 2;
     const int num_chunks = args->nChannels / 2;
@@ -71,7 +71,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index c54a7d006c..85c1999ded 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -15,7 +15,7 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclRing *ring = &ncclShmem->channel.ring;
+    ncclRing *ring = &ncclShmem.channel.ring;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
     const ssize_t loopSize = nChannels*chunkSize;
@@ -61,7 +61,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
@@ -69,14 +69,14 @@ struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL>(args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 569515c20a..448e2f7b45 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -188,7 +188,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{
 
 template<unsigned short f, unsigned short l, bool u>
 struct Caller {
-  static __device__ __host__
+  static __forceinline__ __device__ __host__
   void call(unsigned short funcIndex) noexcept
   {
     constexpr unsigned short m = f + (l - f) / 2;
@@ -199,7 +199,7 @@ struct Caller {
 
 template<unsigned short f, bool u>
 struct Caller<f, f + 1, u>{
-  static __device__ __host__
+  static __forceinline__ __device__ __host__
   void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
 };
 
@@ -207,7 +207,7 @@ static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index");
 static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function index");
 
 template<bool USING_LL128>
-inline
+__forceinline__
 __device__
 void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
 #if defined(BUILD_ALLREDUCE_ONLY)
@@ -320,14 +320,14 @@ class ncclFunction {
 
 #ifdef ENABLE_COLLTRACE
 #define traceColl(launch_type) { \
-    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
     collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
     collTrace->bid = blockIdx.x; \
-    collTrace->funcIndex = shmem.work.header.funcIndex; \
+    collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
     asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
-    if (shmem.work.header.type == ncclWorkTypeP2p) { \
-      struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
+    if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
+      struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
       collTrace->p2p[0].connIndex = 0; \
 	    collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
       collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
@@ -341,8 +341,8 @@ class ncclFunction {
       collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
       collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
       collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
-    } else if (shmem.work.header.type == ncclWorkTypeColl) { \
-      struct ncclWorkElem *elems = shmem.work.elems; \
+    } else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
+      struct ncclWorkElem *elems = ncclShmem.work.elems; \
       collTrace->opCount = elems[0].opCount; \
       collTrace->coll.nWarps = elems[0].nWarps; \
       collTrace->coll.bid = elems[0].bid; \
@@ -355,23 +355,23 @@ class ncclFunction {
     traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
   }
 #define traceKernelEnd()  { \
-    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
     collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
     collTrace->bid = blockIdx.x; \
     collTrace->type = ncclCollTraceKernelEndType; \
   }
 #define traceAbort()  { \
-    uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
+    uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
     collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
     collTrace->bid = blockIdx.x; \
     collTrace->type = ncclCollTraceAbortType; \
   }
 //  traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
 #define traceData(data2, data4, data8_0, data8_1) { \
-    uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
-    struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
+    uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
+    struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
     collTrace->bid = blockIdx.x; \
     collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
     collTrace->funcIndex = data2; \
@@ -408,14 +408,14 @@ struct ncclShmemData {
   struct ncclProf prof;
 #endif
 };
-static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
+static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
 
 #ifdef ENABLE_PROFILING
 #define __insert_timestamp(line_num) do { \
-      if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
-        shmem.prof.elem[shmem.prof.count].line = line_num; \
-        shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
-        shmem.prof.count++; \
+      if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
+        ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
+        ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+        ncclShmem.prof.count++; \
       } \
     } while(0);
 #else
@@ -459,7 +459,7 @@ struct RunWork {
   }
 };
 
-static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
+static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
   if (we->isUsed && we->redOpArgIsPtr) {
     /* redOpArg is a pointer to the scalar value, so we'll dereference it
      * here so that redOpArg holds the bits of the scalar going forward.
@@ -480,19 +480,17 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
   }
 }
 
-extern __device__ struct ncclShmemData *ncclShmem;
+extern __shared__ ncclShmemData ncclShmem;
 
 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
-__device__ void ncclKernel(
+__forceinline__ __device__ void ncclKernel(
     struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
   )  {
   int tid = threadIdx.x;
-  __shared__ struct ncclShmemData shmem;
-  ncclShmem = &shmem;
   if (tid == 0) {
     for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
-      shmem.groups[i].barrier = 0;
-      for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
+      ncclShmem.groups[i].barrier = 0;
+      for (auto j = 0; j < NCCL_MAX_GROUPS; j++) ncclShmem.groups[i].barrier_next[j] = 0;
     }
   }
   // To map blockId to channelId, we need the n'th set bit of channelMask which
@@ -501,18 +499,18 @@ __device__ void ncclKernel(
     int x = tid;
     if (channelMask & (1ull<<x)) {
       int y = __popcll(channelMask & ((1ull<<x)-1));
-      if (blockIdx.x == y) shmem.channelId = x;
+      if (blockIdx.x == y) ncclShmem.channelId = x;
     }
     if (32 < MAXCHANNELS) {
       x = 32 + tid;
       if (channelMask & (1ull<<x)) {
         int y = __popcll(channelMask & ((1ull<<x)-1));
-        if (blockIdx.x == y) shmem.channelId = x;
+        if (blockIdx.x == y) ncclShmem.channelId = x;
       }
     }
   }
-  __syncthreads(); // publish shmem.channelId
-  int channelId = shmem.channelId;
+  __syncthreads(); // publish ncclShmem.channelId
+  int channelId = ncclShmem.channelId;
 
   if (true) {
     void *dst, *src;
@@ -520,20 +518,20 @@ __device__ void ncclKernel(
     // Use first 3 warps to load comm, channel, and work into shmem
     switch (tid/WARP_SIZE) {
     case 0:
-      dst = &shmem.comm;
+      dst = &ncclShmem.comm;
       src = comm;
       bytes = sizeof(ncclDevComm);
       static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
       break;
     case 1:
       // Get address of channel without incurring indirect load from ncclDevComm::channels
-      dst = &shmem.channel;
+      dst = &ncclShmem.channel;
       src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
       bytes = sizeof(ncclDevChannel);
       static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
       break;
     case 2:
-      dst = &shmem.work;
+      dst = &ncclShmem.work;
       src = workHead + blockIdx.x;
       bytes = sizeof(ncclWork);
       static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
@@ -547,8 +545,8 @@ __device__ void ncclKernel(
   __syncthreads(); // publish shmem
 #ifdef ENABLE_PROFILING
   if (tid == 0) {
-    shmem.prof.count = 0;
-    shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
+    ncclShmem.prof.count = 0;
+    ncclShmem.prof.seq = ncclShmem.comm.devProf[blockIdx.x].seq;
   }
 #endif
   if (tid == 0) __insert_timestamp(__LINE__);
@@ -557,34 +555,34 @@ __device__ void ncclKernel(
 
   while (true) {
     // Notify host that all fifo reads are complete.
-    if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
-      *shmem.channel.workFifoDone = shmem.work.header.doneAcks;
+    if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
+      *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
     }
 
     __syncwarp();
-    if (shmem.work.header.type == ncclWorkTypeColl) {
-      if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&shmem.work.elems[tid]);
-    } else if (shmem.work.header.type == ncclWorkTypeRegColl) {
-      if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&shmem.work.regElems[tid].elem);
+    if (ncclShmem.work.header.type == ncclWorkTypeColl) {
+      if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
+    } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
+      if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
     }
     __syncthreads();
 
     if (tid == 0) __insert_timestamp(__LINE__);
-    if (shmem.work.header.funcIndex == FnIndex) {
-      RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
+    if (ncclShmem.work.header.funcIndex == FnIndex) {
+      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
     } else {
-      NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
+      NCCL_CALL_FUNCTIONS<USING_LL128>(ncclShmem.work.header.funcIndex);
     }
 
-    int workIxNext = shmem.work.header.workNext;
+    int workIxNext = ncclShmem.work.header.workNext;
     __syncthreads();
-    if (shmem.work.header.isLast) break;
+    if (ncclShmem.work.header.isLast) break;
 
-    copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
+    copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
 
     { // Check whether the last operation was aborted and make sure all threads exit
       int aborted = tid == 0 ? *comm->abortFlag : 0;
-      if (__any(aborted)) { // publish shmem.work
+      if (__any(aborted)) { // publish ncclShmem.work
         traceAbort();
         break;
       }
@@ -593,10 +591,10 @@ __device__ void ncclKernel(
   }
   if (COLLTRACE && tid == 0) traceKernelEnd();
 #ifdef ENABLE_PROFILING
-  if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
+  if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
     __syncthreads();
-    copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
-    if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
+    copyToShmem16(tid, ncclShmem.comm.devProf+MAXCHANNELS*ncclShmem.prof.seq+blockIdx.x, &ncclShmem.prof, sizeof(struct ncclProf));
+    if (tid == 0) ncclShmem.comm.devProf[blockIdx.x].seq++;
   }
 #endif
 }
@@ -626,7 +624,7 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st
 /* Functions for aggregation case */
 #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
 __device__  __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
-  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \
+  RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
 }
 
 // Only generate inline kernels for LL
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index e9958db464..c29744c975 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -9,7 +9,7 @@
 #include "collectives.h"
 #include "common.h"
 
-__device__ struct ncclShmemData* ncclShmem;
+__shared__ ncclShmemData ncclShmem;
 
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
index d10345e7d2..256b236386 100644
--- a/src/collectives/device/onerank_reduce.cu
+++ b/src/collectives/device/onerank_reduce.cu
@@ -13,7 +13,7 @@
 namespace {
   template<typename T, typename RedOp>
   __device__ __attribute__((noinline)) void oneRankReduce() {
-    ncclWork *w = &ncclShmem->work;
+    ncclWork *w = &ncclShmem.work;
     int tid = threadIdx.x;
     int tn = blockDim.x;
     #pragma unroll 1
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 3246988342..3f6897de81 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -45,7 +45,7 @@ struct ProtoSimple {
 
   // Data bytes (no flags etc) in one step of the fifo queue.
   __device__ static int calcBytePerStep() {
-    return ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+    return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
   }
   // Granularity of data bytes transferred per thread.
   __device__ static int calcBytePerGrain() {
@@ -63,7 +63,7 @@ struct ProtoLL {
 
   // Data bytes (no flags etc) in one step of the fifo queue.
   __device__ static int calcBytePerStep() {
-    return ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
+    return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
   }
   // Granularity of data bytes transferred per thread.
   __device__ static int calcBytePerGrain() {
@@ -81,7 +81,7 @@ struct ProtoLL128 {
 
   // Data bytes (no flags etc) in one step of the fifo queue.
   __device__ static int calcBytePerStep() {
-    return (ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
+    return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
   }
   // Granularity of data bytes transferred per thread.
   __device__ static int calcBytePerGrain() {
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index 4e91e188c5..0c96ca9022 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -83,7 +83,7 @@ private:
   inline __device__ int checkAbort(int &spins, int send) {
     spins++;
     if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      abort = __atomic_load_n((ncclShmem->comm.abortFlag), __ATOMIC_SEQ_CST);
+      abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
       spins = 0;
     }
     return abort;
@@ -93,7 +93,7 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     if (sendConnHeadPtr) {
@@ -114,7 +114,7 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -376,7 +376,7 @@ private:
       npKitWaitRecvTotalTime = 0;
       npKitWaitRecvDataProcessSize = nelem*sizeof(T);
       NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
-          npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -446,7 +446,7 @@ private:
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
           npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
@@ -494,11 +494,11 @@ private:
     ):
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
-    stepLines(ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
-    barriers = &ncclShmem->groups[this->group].barrier;
-    barrier_next = ncclShmem->groups[this->group].barrier_next;
+    stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
+    barriers = &ncclShmem.groups[this->group].barrier;
+    barrier_next = ncclShmem.groups[this->group].barrier_next;
 
-    auto *channel = &ncclShmem->channel;
+    auto *channel = &ncclShmem.channel;
     // If we are going to support oneshot collNet + LL, then we would need to add connector index here
     int nrecv=0, nsend=0;
     while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
@@ -539,14 +539,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -554,14 +554,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -569,14 +569,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -584,14 +584,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -599,14 +599,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -614,14 +614,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -629,14 +629,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
@@ -644,14 +644,14 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
     LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
     if (tid == 0) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
   }
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 58653a7c9e..f5d7f31a66 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -78,7 +78,7 @@ private:
   inline __device__ int checkAbort(int &spins, int i, int send) {
     spins++;
     if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      abort = __atomic_load_n(ncclShmem->comm.abortFlag, __ATOMIC_SEQ_CST);
+      abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
       spins = 0;
     }
     return abort;
@@ -408,11 +408,11 @@ public:
     redOp(redOpArg),
     tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
     flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
-    stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
-    barriers = &ncclShmem->groups[this->group].barrier;
-    barrier_next = ncclShmem->groups[this->group].barrier_next;
+    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
+    barriers = &ncclShmem.groups[this->group].barrier;
+    barrier_next = ncclShmem.groups[this->group].barrier_next;
 
-    auto *channel = &ncclShmem->channel;
+    auto *channel = &ncclShmem.channel;
     int nrecv=0, nsend=0;
     while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
       loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index 24012f1dad..e268d4f306 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -90,7 +90,7 @@ private:
   inline __device__ bool checkAbort(int &spins) {
     spins++;
     if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      flags |= atomicAdd_system((unsigned int *)ncclShmem->comm.abortFlag, 0) ? Aborted : 0;
+      flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0;
       spins = 0;
     }
     return flags & Aborted;
@@ -108,7 +108,7 @@ private:
         __builtin_amdgcn_s_sleep(8);
         connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
         if (checkAbort(spins)) break;
-        //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
+        //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
         if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
       }
       __asm__ __volatile__("s_wakeup");
@@ -118,8 +118,8 @@ private:
       if (isSendNotRecv && (flags & SizesFifoEnabled))
         __atomic_store_n((connSizesFifoPtr+step%NCCL_STEPS), nelts*sizeof(T), __ATOMIC_SEQ_CST);
 
-      void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst)
-                                  : (ncclShmem->groups[group].srcs + Src);
+      void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
+                                  : (ncclShmem.groups[group].srcs + Src);
       if (flags & OffsFifoEnabled)
         ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
       else if (isSendNotRecv && DirectSend) {
@@ -200,19 +200,19 @@ private:
       do {
         sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
         if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
-          ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
+          ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
         if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
-          ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
+          ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
         waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
         subBarrier();
-        if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
+        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
           if (Send) {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
             if (tid == 0) {
               NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                  ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                  ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
             }
 #endif
 
@@ -225,8 +225,8 @@ private:
             // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
             ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
               (tid, nworkers, nullptr, false,
-               1, (T const**)ncclShmem->groups[group].srcs,
-               fan.nsend(), (T**)ncclShmem->groups[group].dsts+1,
+               1, (T const**)ncclShmem.groups[group].srcs,
+               fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
                sliceSize);
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -239,18 +239,18 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
             if (tid == 0) {
               NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                  ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                  ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
             }
 #endif
 
           }
-        } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) {
+        } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
           // For broadcast in CollNet to do empty send
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
           if (tid == 0) {
             NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
           }
 #endif
 
@@ -261,9 +261,9 @@ private:
 #endif
 
           ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
-            (tid, nworkers, ncclShmem->redOpArgs, postOp,
-             Recv, (T const**)ncclShmem->groups[group].srcs,
-             Dst, (T**)ncclShmem->groups[group].dsts,
+            (tid, nworkers, ncclShmem.redOpArgs, postOp,
+             Recv, (T const**)ncclShmem.groups[group].srcs,
+             Dst, (T**)ncclShmem.groups[group].dsts,
              sliceSize);
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -276,7 +276,7 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
           if (tid == 0) {
             NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
           }
 #endif
 
@@ -285,7 +285,7 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
           if (tid == 0) {
             NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
           }
 #endif
 
@@ -298,9 +298,9 @@ private:
           constexpr int PreOpN = SrcBuf != Input ? 0 :
                                  DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
           ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
-            (tid, nworkers, ncclShmem->redOpArgs, postOp,
-             Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs,
-             Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts,
+            (tid, nworkers, ncclShmem.redOpArgs, postOp,
+             Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
+             Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
              sliceSize);
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -313,15 +313,13 @@ private:
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
           if (tid == 0) {
             NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-                ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+                ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
           }
 #endif
 
         }
         barrier(); // This barrier has a counterpart in following loop
-#if defined(__gfx1030__)
-	if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
-#endif
+	if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system();
 	__syncwarp();
         postPeer<Recv, Send>();
         offset += sliceSize;
@@ -341,9 +339,7 @@ private:
         waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
       }
       barrier(); // Has couterpart in preceding worker-only loop.
-#if defined(__gfx1030__)
-      if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
-#endif
+      if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
       __syncwarp();
       postPeer<Recv, Send>();
       offset += sliceSize;
@@ -370,8 +366,8 @@ private:
         if (Send) {
           // Scatter pre-scales data of input buffer only in non-Direct case
           constexpr int PreOpN = DirectSend ? 0 : 1;
-          if (flags & RoleInput) ncclShmem->groups[group].srcs[0] = userBuff + inpIx + offset;
-          if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] = 0; // Skip the threadfence
+          if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
+          if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
           // realSize is not accurate here; but intra-node does not rely on sizes FIFO
           waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
           subBarrier();
@@ -382,22 +378,22 @@ private:
             int peerOffset = i*peerElem;
             // Skip the data I am responsible of reducing myself
             if (skip >= 0 && i >= skip) peerOffset += peerElem;
-            const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset;
+            const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
             int realPeerSize = min(realSize, totalElem-peerOffset);
-            if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) {
-              ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize);
+            if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
+              ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
               // Mark for threadfence at the end
-              if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize;
+              if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
             }
           }
         } else if (Recv) {
-          if (flags & RoleOutput) ncclShmem->groups[group].dsts[0] = userBuff + outIx + offset;
+          if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
           int peerOffset = index*peerElem;
           if (skip >= 0 && index >= skip) peerOffset += peerElem;
           // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
           waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
           subBarrier();
-          if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
+          if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
             // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
             // Do nothing
           } else {
@@ -406,16 +402,16 @@ private:
               int i = (j+shift)%fan.nrecv();
               peerOffset = i*peerElem;
               if (skip >= 0 && i >= skip) peerOffset += peerElem;
-              T* dst0 = (T*)ncclShmem->groups[group].dsts[0] + peerOffset;
+              T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
               int realPeerSize = min(realSize, totalElem-peerOffset);
-              if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem->redOpArgs, postOp, 1, (const T**)ncclShmem->groups[group].srcs+i, 1, &dst0, realPeerSize);
+              if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
             }
           }
         }
       }
       barrier();
       // If we indeed send something, threadfence
-      if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0)
+      if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
         __threadfence_system();
       __syncwarp();
       postPeer<Recv, Send>();
@@ -433,7 +429,7 @@ private:
         atomicExch_system((unsigned long long *)connStepPtr, step); // Return credits in case we rounded up.
       }
       if (flags & RoleWaitRecv) {
-        ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
+        ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->tail;
         connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
         flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -473,7 +469,7 @@ private:
 	    next_hdp_reg = conn->next_hdp_reg;
       }
       if (flags & RoleWaitSend) {
-        ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
+        ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->head;
         connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
         flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -508,20 +504,20 @@ private:
   }
 
  public:
-  __device__ Primitives(
+  __forceinline__ __device__ Primitives(
       int tid, int nthreads, int const *recvPeers, int const *sendPeers,
       void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
     ):
     tid(tid),
-    stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
+    stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
 
     // For send operations, we need an extra warp to overlap the threadfence and the copy
     this->nthreads = nthreads;
     this->nworkers = nthreads;
     this->group = group & (uint16_t)0xFFFF;
     int connIndex = group >> 16;
-    barriers = &ncclShmem->groups[this->group].barrier;
-    barrier_next = ncclShmem->groups[this->group].barrier_next;
+    barriers = &ncclShmem.groups[this->group].barrier;
+    barrier_next = ncclShmem.groups[this->group].barrier_next;
 
     int nrecv=0, nsend=0;
     while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -551,30 +547,30 @@ private:
     if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
     if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
 
-    loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
-    loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);
+    loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
+    loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
 
     setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
   }
 
-  __device__ ~Primitives() {
-    // Ensure ncclShmem->groups[].send/recvConns are available
+  __forceinline__ __device__ ~Primitives() {
+    // Ensure ncclShmem.groups[].send/recvConns are available
     if (!(flags & ThreadsSynced))
       barrier();
     // Save steps for the next operation
     if (flags & (RolePostSend|RolePostRecv)) {
-      auto *conns = (flags & RolePostSend) ? ncclShmem->groups[group].sendConns : ncclShmem->groups[group].recvConns;
+      auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
       conns[index]->step = step;
     }
     // Make sure all threads are done writing back conn->step and done using
-    // ncclShmem->groups[group]
+    // ncclShmem.groups[group]
     barrier();
   }
 
   __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
     if (flags & RoleInput) {
       userBuff = (T*)inputBuf;
-      ncclShmem->redOpArgs[0] = redOpArg;  // scaler for local input
+      ncclShmem.redOpArgs[0] = redOpArg;  // scaler for local input
     }
     if (flags & RoleOutput) userBuff = (T*)outputBuf;
     bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
@@ -585,7 +581,7 @@ private:
 
     if (Direct && recvProvider) {
       int spins = 0;
-      void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
+      void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
       // Wait for consumer to consume previous value before trampling it.
       while ((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
       directBuff = (T*)outputBuf;
@@ -596,7 +592,7 @@ private:
     }
     if (Direct && sendAcceptor) {
       int spins = 0;
-      void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
+      void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
       void *ptr;
       while (true) {
         ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
@@ -608,9 +604,9 @@ private:
     }
     if (Direct && sendProvider) {
       int spins = 0;
-      void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
-      volatile uint64_t* argSlot0 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange;
-      volatile uint64_t* argSlot1 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange+1;
+      void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
+      volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
+      volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
       // Wait for consumer to consume previous value before trampling it.
       while (((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
       // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
@@ -626,9 +622,9 @@ private:
     }
     if (Direct && recvAcceptor) {
       int spins = 0;
-      void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
-      volatile uint64_t* argSlot0 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange;
-      volatile uint64_t* argSlot1 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange+1;
+      void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
+      volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
+      volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
       void *ptr;
       while (true) {
         ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
@@ -644,7 +640,7 @@ private:
           arg1 = *argSlot1;
           if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
         }
-        ncclShmem->redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
+        ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
       }
       *argSlot0 = 0; *argSlot1 = 0;
       *slot = nullptr;
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 08bb7f9f1b..ac40072921 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -16,13 +16,13 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclRing *ring = &ncclShmem->channel.ring;
+    ncclRing *ring = &ncclShmem.channel.ring;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
-    const int nranks = ncclShmem->comm.nRanks;
+    const int nranks = ncclShmem.comm.nRanks;
     const ssize_t loopSize = nChannels*chunkSize;
     const ssize_t size = args->count;
-    const int rank = ncclShmem->comm.rank;
+    const int rank = ncclShmem.comm.rank;
     const int prevRank = ring->userRanks[nranks-1];
     const int root = args->root;
 
@@ -71,7 +71,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
@@ -79,14 +79,14 @@ struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL>(args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 975c4533ba..9600a900dc 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -16,12 +16,12 @@ namespace {
     const int nthreads = args->nWarps*WARP_SIZE;
     const int bid = args->bid;
     const int nChannels = args->nChannels;
-    ncclRing *ring = &ncclShmem->channel.ring;
+    ncclRing *ring = &ncclShmem.channel.ring;
     int const *ringRanks = ring->userRanks;
     const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
     // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
     const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
-    const int nranks = ncclShmem->comm.nRanks;
+    const int nranks = ncclShmem.comm.nRanks;
     const ssize_t loopSize = nChannels*chunkSize;
     const ssize_t size = args->count;
 
@@ -69,7 +69,7 @@ namespace {
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
     runRing<T, RedOp, Proto>(args);
   }
@@ -77,14 +77,14 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL>(args);
   }
 };
 
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+  __device__ __forceinline__ void run(ncclWorkElem *args) {
     runRing<T, RedOp, ProtoLL128>(args);
   }
 };
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index c6f082f859..ef5b26aeae 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -25,20 +25,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (isNpKitThread) {
-      uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
+      uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
     if (isNpKitThread) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
-    if (args->peer == ncclShmem->comm.rank) {
+    if (args->peer == ncclShmem.comm.rank) {
       struct ncclWorkElemP2p* recvArgs = args-1;
       void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
       if (buff != recvBuff) {
@@ -46,14 +46,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
         if (isNpKitThread) {
           NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+              ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
         if (isNpKitThread) {
           NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+              ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         }
 #endif
 
@@ -62,14 +62,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
         if (isNpKitThread) {
           NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+              ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
         if (isNpKitThread) {
           NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-              ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+              ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         }
 #endif
 
@@ -90,7 +90,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -105,7 +105,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
@@ -120,20 +120,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
     if (isNpKitThread) {
-      uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
+      uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
     if (isNpKitThread) {
       NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
-          ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+          ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
     }
 #endif
 
-    if (args->peer != ncclShmem->comm.rank) {
+    if (args->peer != ncclShmem.comm.rank) {
       using Proto = ProtoSimple<1, 1>;
       void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
       ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
@@ -151,7 +151,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
         prims.npKitDataProcessTotalTime = 0;
       }
 #endif
@@ -166,14 +166,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
       if (isNpKitThread) {
         NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
-            ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
+            ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
       }
 #endif
 
     }
   }
 
-  __device__ __forceinline__ void run(ncclWork *work) {
+  __device__  __attribute__((noinline)) void run(ncclWork *work) {
     struct ncclWorkElemP2p* args = work->p2pElems;
     int ngroups = args->ngroups;
     int tid = threadIdx.x;