[DEVICE] Use threadfence on gfx950 for LL protocol (#1686)
Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
[ROCm/rccl commit: b926203c05]
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
7abc3160e7
işleme
b797b62f6b
@@ -15,13 +15,7 @@
|
||||
|
||||
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
|
||||
|
||||
#if defined(__gfx942__) || defined(__gfx950__)
|
||||
#define __THREAD_FENCE __threadfence_block()
|
||||
#else
|
||||
#define __THREAD_FENCE __threadfence()
|
||||
#endif
|
||||
|
||||
#define barrier_by_group() do { \
|
||||
#define barrier_by_group_common(__THREAD_FENCE) do { \
|
||||
if (nthreads == NCCL_MAX_NTHREADS) { \
|
||||
__THREAD_FENCE; __builtin_amdgcn_s_barrier(); \
|
||||
} else { \
|
||||
@@ -53,6 +47,9 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define barrier_by_group() barrier_by_group_common(__threadfence())
|
||||
#define barrier_by_group_block() barrier_by_group_common(__threadfence_block())
|
||||
|
||||
/* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128
|
||||
* We use these as template args to the Primtiives class instead of integral
|
||||
* enums (e.g. NCCL_PROTO_LL) because for SIMPLE we need to carry a few extra
|
||||
|
||||
@@ -71,7 +71,11 @@ private:
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
if (nthreads != WARP_SIZE)
|
||||
barrier_by_group();
|
||||
#if defined(__gfx942__)
|
||||
barrier_by_group_block();
|
||||
#else
|
||||
barrier_by_group();
|
||||
#endif
|
||||
#else
|
||||
if (nthreads == WARP_SIZE) {
|
||||
__syncwarp();
|
||||
|
||||
@@ -76,7 +76,11 @@ private:
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
if (nthreads != WARP_SIZE)
|
||||
barrier_by_group();
|
||||
#if defined(__gfx942__) || defined(__gfx950__)
|
||||
barrier_by_group_block();
|
||||
#else
|
||||
barrier_by_group();
|
||||
#endif
|
||||
#else
|
||||
barrier_sync(15-group, nthreads);
|
||||
#endif
|
||||
|
||||
@@ -79,7 +79,11 @@ private:
|
||||
if (nthreads == WARP_SIZE)
|
||||
__syncwarp();
|
||||
else
|
||||
barrier_by_group();
|
||||
#if defined(__gfx942__) || defined(__gfx950__)
|
||||
barrier_by_group_block();
|
||||
#else
|
||||
barrier_by_group();
|
||||
#endif
|
||||
}
|
||||
inline __device__ void subBarrier() {
|
||||
barrier();
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle