From 08c0b8b0fca166e8e34edd7fd5378b01e9d9889c Mon Sep 17 00:00:00 2001 From: mberenjk <146776561+mberenjk@users.noreply.github.com> Date: Wed, 14 May 2025 10:10:05 -0500 Subject: [PATCH] moving the thread_fence to apply before atomic fetch (#1672) * applying thread_fence only on warp 0 before atomic fetch --------- Co-authored-by: Marzieh Berenjkoub [ROCm/rccl commit: 1cefcee51fe47c1dafdd82979a1437dc350ff289] --- projects/rccl/src/device/primitives.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rccl/src/device/primitives.h b/projects/rccl/src/device/primitives.h index f0c6986884..3ef9fd6126 100644 --- a/projects/rccl/src/device/primitives.h +++ b/projects/rccl/src/device/primitives.h @@ -29,10 +29,10 @@ const int wid = threadIdx.x%WARP_SIZE; \ if (wid == 0) { \ barrier_next += nthreads/WARP_SIZE; \ + __THREAD_FENCE; \ __hip_atomic_fetch_add(barriers, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \ int spins = 0; \ int rate_limit = 50; \ - __THREAD_FENCE; \ while (__hip_atomic_load(barriers, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) < barrier_next) { \ spins++; \ if (spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { \