diff --git a/projects/rccl/src/device/primitives.h b/projects/rccl/src/device/primitives.h index f0c6986884..3ef9fd6126 100644 --- a/projects/rccl/src/device/primitives.h +++ b/projects/rccl/src/device/primitives.h @@ -29,10 +29,10 @@ const int wid = threadIdx.x%WARP_SIZE; \ if (wid == 0) { \ barrier_next += nthreads/WARP_SIZE; \ + __THREAD_FENCE; \ __hip_atomic_fetch_add(barriers, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \ int spins = 0; \ int rate_limit = 50; \ - __THREAD_FENCE; \ while (__hip_atomic_load(barriers, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) < barrier_next) { \ spins++; \ if (spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { \