Add back __syncthreads() in barrier and adjust stack size (#688)

[ROCm/rccl commit: 1c166046a2]
This commit is contained in:
Wenkai Du
2023-02-18 08:50:31 -08:00
zatwierdzone przez GitHub
rodzic 7c1290f995
commit 393d0ba7f8
4 zmienionych plików z 13 dodań i 8 usunięć
+1
Wyświetl plik
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
- Compatibility with NCCL 2.16.2
### Added
### Fixed
- Remove workaround and use indirect function call
### Removed
## Unreleased - RCCL 2.15.5 for ROCm 5.5.0
@@ -21,7 +21,7 @@
#ifdef __GFX9__
#define STORE(DST, SRC) \
{ atomicExch((unsigned long long *)(DST), (SRC)); }
{ __threadfence(); atomicExch((unsigned long long *)(DST), (SRC)); }
#else
#define STORE(DST, SRC) \
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
@@ -17,12 +17,16 @@
#define barrier_by_group() do { \
const int w = threadIdx.x/WARP_SIZE; \
const int wid = threadIdx.x%WARP_SIZE; \
if (wid == 0) { \
__asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)"); \
barrier_next[w] += nthreads/WARP_SIZE; \
atomicAdd((unsigned long long *)barriers, 1); \
while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
__asm__ __volatile__("s_wakeup"); \
__threadfence(); \
if (nthreads == NCCL_MAX_NTHREADS) { \
__syncthreads(); \
} else { \
if (wid == 0) { \
barrier_next[w] += nthreads/WARP_SIZE; \
atomicAdd((unsigned long long *)barriers, 1); \
while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
__asm__ __volatile__("s_wakeup"); \
} \
} \
} while (0)
+1 -1
Wyświetl plik
@@ -1415,7 +1415,7 @@ fail:
#ifdef USE_INDIRECT_FUNCTION_CALL
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 8);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 256);
#else
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);