Move the thread fence so that it is applied before the atomic fetch-add (#1672)

* Apply the thread fence only on threads with wid == 0 (threadIdx.x % WARP_SIZE == 0), before the atomic fetch-add

---------

Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>

[ROCm/rccl commit: 1cefcee51f]
Šī revīzija ir iekļauta:
mberenjk
2025-05-14 10:10:05 -05:00
revīziju iesūtīja GitHub
vecāks 128b0e7074
revīzija 08c0b8b0fc
+1 -1
Parādīt failu
@@ -29,10 +29,10 @@
const int wid = threadIdx.x%WARP_SIZE; \
if (wid == 0) { \
barrier_next += nthreads/WARP_SIZE; \
__THREAD_FENCE; \
__hip_atomic_fetch_add(barriers, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
int spins = 0; \
int rate_limit = 50; \
__THREAD_FENCE; \
while (__hip_atomic_load(barriers, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) < barrier_next) { \
spins++; \
if (spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { \