From 393d0ba7f8d6fce2fe09043465c18030c1d3208f Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Sat, 18 Feb 2023 08:50:31 -0800 Subject: [PATCH] Add back __syncthreads() in barrier and adjust stack size (#688) [ROCm/rccl commit: 1c166046a2d9cef933d2db2597d1f663ea7f6787] --- projects/rccl/CHANGELOG.md | 1 + projects/rccl/src/collectives/device/common.h | 2 +- .../rccl/src/collectives/device/primitives.h | 16 ++++++++++------ projects/rccl/src/init.cc | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 1c8416a84f..bff88ebf96 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: - Compatibility with NCCL 2.16.2 ### Added ### Fixed +- Remove workaround and use indirect function call ### Removed ## Unreleased - RCCL 2.15.5 for ROCm 5.5.0 diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 9065def46e..d82aead3bf 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -21,7 +21,7 @@ #ifdef __GFX9__ #define STORE(DST, SRC) \ - { atomicExch((unsigned long long *)(DST), (SRC)); } + { __threadfence(); atomicExch((unsigned long long *)(DST), (SRC)); } #else #define STORE(DST, SRC) \ { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); } diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index ea91b61f6f..50377e3323 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -17,12 +17,16 @@ #define barrier_by_group() do { \ const int w = threadIdx.x/WARP_SIZE; \ const int wid = threadIdx.x%WARP_SIZE; \ - if (wid == 0) { \ - __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)"); \ - barrier_next[w] += nthreads/WARP_SIZE; \ - atomicAdd((unsigned long long *)barriers, 1); \ - while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \ - __asm__ __volatile__("s_wakeup"); \ + __threadfence(); \ + if (nthreads == NCCL_MAX_NTHREADS) { \ + __syncthreads(); \ + } else { \ + if (wid == 0) { \ + barrier_next[w] += nthreads/WARP_SIZE; \ + atomicAdd((unsigned long long *)barriers, 1); \ + while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \ + __asm__ __volatile__("s_wakeup"); \ + } \ } \ } while (0) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 67430718e1..e58f291750 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1415,7 +1415,7 @@ fail: #ifdef USE_INDIRECT_FUNCTION_CALL NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1); -RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 8); +RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 256); #else NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);