diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f4c7b2758..d7c0e6e393 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,10 @@ list(APPEND CMAKE_PREFIX_PATH
 find_package(hip REQUIRED)
 message(STATUS "HIP compiler: ${HIP_COMPILER}")
 message(STATUS "HIP runtime: ${HIP_RUNTIME}")
+check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
+if(${HIP_EVENT_DISABLE_FENCE})
+  add_definitions(-DHIP_EVENT_DISABLE_FENCE)
+endif()
 
 find_package(hsa-runtime64 REQUIRED)
 get_target_property(HSA_INCLUDE_PATH hsa-runtime64::hsa-runtime64 INTERFACE_INCLUDE_DIRECTORIES)
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 6717520183..9f53b305d4 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -19,9 +19,9 @@
 #define __synclds() \
   asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
 
-#if defined(__gfx90a__)
+#ifdef __GFX9__
 #define STORE(DST, SRC) \
-  { __threadfence_block(); atomicExch((unsigned long long *)(DST), (SRC)); }
+  { atomicExch((unsigned long long *)(DST), (SRC)); }
 #else
 #define STORE(DST, SRC) \
   { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index c6a4f7a088..b8b370a10c 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -20,7 +20,7 @@ static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
 inline __device__ int loadInt(int* ptr) {
   int v;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  v = atomicAdd_system((unsigned long long *)ptr, 0);
+  v = atomicAdd((unsigned long long *)ptr, 0);
 #else
   asm volatile("ld.volatile.global.u32 %0, [%1];"
       : "=r"(v) : "l"(ptr));
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 6d769e23f3..ea91b61f6f 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -15,17 +15,14 @@
 #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
 
 #define barrier_by_group() do { \
-  if (nthreads == NCCL_MAX_NTHREADS) \
-    __syncthreads(); \
-  else { \
-    const int w = threadIdx.x/WARP_SIZE; \
-    const int wid = threadIdx.x%WARP_SIZE; \
-    if (wid == 0) { \
-      barrier_next[w] += nthreads/WARP_SIZE; \
-      atomicAdd((unsigned long long *)barriers, 1); \
-      while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
-      __asm__ __volatile__("s_wakeup"); \
-    } \
+  const int w = threadIdx.x/WARP_SIZE; \
+  const int wid = threadIdx.x%WARP_SIZE; \
+  if (wid == 0) { \
+    __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)"); \
+    barrier_next[w] += nthreads/WARP_SIZE; \
+    atomicAdd((unsigned long long *)barriers, 1); \
+    while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
+    __asm__ __volatile__("s_wakeup"); \
   } \
 } while (0)
 
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index ab4d51d728..ce0dba04d3 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -104,7 +104,7 @@ private:
       int spins = 0;
       while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
         __builtin_amdgcn_s_sleep(1);
-        sendConnHeadCache = atomicAdd_system((unsigned long long *)sendConnHeadPtr, 0);
+        sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
         if (checkAbort(spins, 1)) break;
       }
       __asm__ __volatile__("s_wakeup");
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index f0beca4dce..f954ab9d34 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -112,7 +112,7 @@ private:
       int spins = 0;
       while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
         __builtin_amdgcn_s_sleep(1);
-        connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
+        connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
         if (checkAbort(spins)) break;
         //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
         if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
@@ -327,13 +327,8 @@ private:
 
         }
         barrier(); // This barrier has a counterpart in following loop
-#if defined(__gfx90a__)
-        if (Send && (flags & RolePostSend) && index == 0) {
-          if (MaxSend == 0 || MaxRecv == 0)
-            __threadfence_system();
-          else
-            __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0); buffer_wbinvl1_vol");
-        }
+#ifdef __GFX9__
+        if (Send && (flags & RolePostSend) && index == 0) __asm__ __volatile__("buffer_wbinvl1_vol");
 #else
         if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
 #endif
@@ -355,13 +350,8 @@ private:
         waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
       }
       barrier(); // Has couterpart in preceding worker-only loop.
-#if defined(__gfx90a__)
-      if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) {
-        if (MaxSend == 0 || MaxRecv == 0)
-          __threadfence_system();
-        else
-          __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0); buffer_wbinvl1_vol");
-      }
+#ifdef __GFX9__
+      if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __asm__ __volatile__("buffer_wbinvl1_vol");
 #else
       if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
 #endif
@@ -482,7 +472,7 @@ private:
       if (flags & RoleWaitRecv) {
         ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->tail;
-        connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
+        connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
         flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
         if (Direct) {
           // User buffers have been registered
@@ -522,7 +512,7 @@ private:
       if (flags & RoleWaitSend) {
         ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
         connStepPtr = conn->head;
-        connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
+        connStepCache = atomicAdd((unsigned long long *)connStepPtr, 0);
         flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
         if (flags & OffsFifoEnabled)
           connOffsFifoPtr = conn->offsFifo;
@@ -634,7 +624,7 @@ private:
       int spins = 0;
       void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
       // Wait for consumer to consume previous value before trampling it.
-      while ((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
+      while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
       directBuff = (T*)outputBuf;
       // Encode pointer by XOR'ing against some address they definitely wouldn't send
       // since we want to allow them sending us nullptr while not colliding with
@@ -646,7 +636,7 @@ private:
       void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
       void *ptr;
       while (true) {
-        ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
+        ptr = (void *)atomicAdd((unsigned long long *) slot,0);
         if (ptr != nullptr || checkAbort(spins)) break;
       }
       directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
@@ -659,7 +649,7 @@ private:
       volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
       volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
       // Wait for consumer to consume previous value before trampling it.
-      while (((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
+      while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
       // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
       // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
       directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
@@ -678,7 +668,7 @@ private:
       volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
       void *ptr;
       while (true) {
-        ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
+        ptr = (void *)atomicAdd((unsigned long long *) slot,0);
         if (ptr != nullptr || checkAbort(spins)) break;
       }
       directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
diff --git a/src/init.cc b/src/init.cc
index 853b5cce85..68d3d16e94 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -472,7 +472,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
   // Try to create a CUDA object right away. If there is something wrong with
   // the device we're on (failure cause #1) , better know it early.
   hipEvent_t doneEvent;
+#ifdef HIP_EVENT_DISABLE_FENCE
+  CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming|hipEventDisableSystemFence));
+#else
   CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming));
+#endif
 
   NCCLCHECK(ncclStrongStreamConstruct(&comm->deviceStream));
   NCCLCHECK(ncclStrongStreamConstruct(&comm->hostStream));