Move __THREAD_FENCE so that it is issued before the atomic fetch-add on the barrier counter (#1672)

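* Apply __THREAD_FENCE only in threads where `wid == 0` (i.e. `threadIdx.x % WARP_SIZE == 0`), immediately before the `__hip_atomic_fetch_add` instead of after it --------- Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com> [ROCm/rccl commit: 1cefcee51f]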
2025-05-14 10:10:05 -05:00
@@ -29,10 +29,10 @@
    const int wid = threadIdx.x%WARP_SIZE; \
    if (wid == 0) { \
      barrier_next += nthreads/WARP_SIZE; \
+      __THREAD_FENCE; \
      __hip_atomic_fetch_add(barriers, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
      int spins = 0; \
      int rate_limit = 50; \
-      __THREAD_FENCE; \
      while (__hip_atomic_load(barriers, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) < barrier_next) { \
        spins++; \
        if (spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { \