Move the thread fence so that it is applied before the atomic fetch-add (#1672)
* Apply the thread fence only on lane 0 of each warp (wid == 0, where wid = threadIdx.x % WARP_SIZE), immediately before the atomic fetch-add
---------
Co-authored-by: Marzieh Berenjkoub <mberenjk@amd.com>
[ROCm/rccl commit: 1cefcee51f]
Šī revīzija ir iekļauta:
@@ -29,10 +29,10 @@
|
||||
const int wid = threadIdx.x%WARP_SIZE; \
|
||||
if (wid == 0) { \
|
||||
barrier_next += nthreads/WARP_SIZE; \
|
||||
__THREAD_FENCE; \
|
||||
__hip_atomic_fetch_add(barriers, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP); \
|
||||
int spins = 0; \
|
||||
int rate_limit = 50; \
|
||||
__THREAD_FENCE; \
|
||||
while (__hip_atomic_load(barriers, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP) < barrier_next) { \
|
||||
spins++; \
|
||||
if (spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { \
|
||||
|
||||
Atsaukties uz šo jaunā problēmā
Block a user