Cheaper threadfence for gfx942 in postPeer [1/N]: enable for single node allreduce (#1766)
Boosts single-node bfloat16 allreduce performance by up to 20% for some data sizes. The cheaper fence can be turned off at runtime with the RCCL_GFX942_CHEAP_FENCE_OFF environment variable.
This commit is contained in:
@@ -38,6 +38,7 @@ option(PROFILE "Enable profiling"
|
||||
option(TIMETRACE "Enable time-trace during compilation" OFF)
|
||||
option(TRACE "Enable additional tracing" OFF)
|
||||
option(FAULT_INJECTION "Enable fault injection" ON)
|
||||
option(DISABLE_CHEAP_THREADFENCE "Compile-time killswitch for simpler fence" OFF)
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
@@ -437,6 +438,7 @@ set(SRC_FILES
|
||||
src/device/broadcast.h
|
||||
src/device/common.h
|
||||
src/device/common_kernel.h
|
||||
src/device/gfx9_threadfence.h
|
||||
src/device/op128.h
|
||||
src/device/primitives.h
|
||||
src/device/prims_ll128.h
|
||||
@@ -445,6 +447,7 @@ set(SRC_FILES
|
||||
src/device/reduce.h
|
||||
src/device/reduce_kernel.h
|
||||
src/device/reduce_scatter.h
|
||||
src/device/rccl_metadata.h
|
||||
src/device/sendrecv.h
|
||||
src/device/common.cu
|
||||
src/device/onerank.cu
|
||||
@@ -1112,6 +1115,12 @@ if (FAULT_INJECTION)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
|
||||
message(STATUS "Fault injection enabled")
|
||||
endif()
|
||||
# Compile-time kill switch: when the DISABLE_CHEAP_THREADFENCE option is ON, the DISABLE_CHEAP_THREADFENCE definition is added to the rccl target; in both cases the choice is reported at configure time.
if (DISABLE_CHEAP_THREADFENCE)
|
||||
target_compile_definitions(rccl PRIVATE DISABLE_CHEAP_THREADFENCE)
|
||||
message(STATUS "Cheap thread fence disabled")
|
||||
else()
|
||||
message(STATUS "Cheap thread fence enabled for some collectives/parameters")
|
||||
endif()
|
||||
|
||||
## Set RCCL linked library directories
|
||||
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
|
||||
|
||||
+14
-8
@@ -14,7 +14,7 @@
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
template<typename T, typename RedOp, typename Proto, int RCCLMetadata, int COLL_UNROLL>
|
||||
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
|
||||
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
#else
|
||||
@@ -61,7 +61,7 @@ namespace {
|
||||
// Coverity reports that the callee treats &ring->next as an array. However, due to the use of
|
||||
// FanSymmetric<1>, only the first element is ever accessed, so it's fine.
|
||||
// coverity[callee_ptr_arith:FALSE]
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0, false, RCCLMetadata> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex, work);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -562,15 +562,21 @@ namespace {
|
||||
#define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \
|
||||
if(work->rcclUseOneSlice){ \
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS_SINGLE_NODE, ALLREDUCE_SLICESTEPS_SINGLE_NODE>; \
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work); \
|
||||
} else{ \
|
||||
if(work->regUsed || work->netRegUsed || work->gfx942CheapFenceOff){ \
|
||||
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
|
||||
} \
|
||||
else { \
|
||||
runRing<T, RedOp, Proto, RCCL_ONE_NODE_RING_SIMPLE>(tid, nthreads, work); \
|
||||
} \
|
||||
} \
|
||||
else{ \
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>; \
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work); \
|
||||
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
|
||||
}
|
||||
#else
|
||||
#define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>; \
|
||||
runRing<T, RedOp, Proto>(tid, nthreads, work);
|
||||
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work);
|
||||
#endif
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
@@ -1099,7 +1105,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PR
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
|
||||
runRing<T, RedOp, ProtoLL, RCCL_METADATA_EMPTY>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1113,7 +1119,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
|
||||
runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
|
||||
runRing<T, RedOp, ProtoLL128, RCCL_METADATA_EMPTY>(tid, nthreads, work);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// This is only okay when the protocol buffer is allocated in uncached memory.
|
||||
#if defined(__gfx942__) && defined(HIP_UNCACHED_MEMORY) && !defined(DISABLE_CHEAP_THREADFENCE)
|
||||
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 1
|
||||
#else
|
||||
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 0
|
||||
#endif
|
||||
|
||||
template<bool UseCheaperThreadFence>
|
||||
inline __device__ void gfx9ThreadFence();
|
||||
|
||||
template<>
|
||||
inline __device__ void gfx9ThreadFence<true>() {
|
||||
asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)");
|
||||
asm volatile("buffer_inv sc0 sc1");
|
||||
}
|
||||
|
||||
template<>
|
||||
inline __device__ void gfx9ThreadFence<false>() {
|
||||
__threadfence();
|
||||
}
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include <type_traits>
|
||||
#include "reduce_kernel.h" // for reduction funcs
|
||||
#include "rccl_metadata.h"
|
||||
#include "common_kernel.h"
|
||||
#include "common.h"
|
||||
|
||||
@@ -136,7 +137,7 @@ struct FanSymmetric {
|
||||
};
|
||||
|
||||
// The primitives class. Specialized per protocol in the other headers.
|
||||
template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p, bool isNetOffload = false>
|
||||
template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p, bool isNetOffload = false, int Metadata = RCCL_METADATA_EMPTY>
|
||||
class Primitives;
|
||||
|
||||
// Used by LL & LL128 to implement direct members in the naive way.
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
#include "device/gfx9_threadfence.h"
|
||||
#include "device/rccl_metadata.h"
|
||||
#include "msccl/msccl_struct.h"
|
||||
#include "network/unpack/unpack.h"
|
||||
#include <cassert>
|
||||
@@ -21,9 +23,9 @@ enum primsMode {
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct,
|
||||
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload>
|
||||
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload, int Metadata>
|
||||
class Primitives<
|
||||
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p, isNetOffload
|
||||
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p, isNetOffload, Metadata
|
||||
> {
|
||||
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
|
||||
static constexpr int Input=0, Output=1;
|
||||
@@ -199,12 +201,13 @@ private:
|
||||
|
||||
template<int Recv, int Send>
|
||||
inline __device__ void postPeer(bool dataStored) {
|
||||
if (Send && (flags & RolePostSend) && dataStored)
|
||||
if (Send && (flags & RolePostSend) && dataStored){
|
||||
#ifdef __GFX9__
|
||||
__threadfence();
|
||||
gfx9ThreadFence<isOneNodeRingSimple(Metadata) && RCCL_CHEAP_THREADFENCE_OK_SOMETIMES>();
|
||||
#else
|
||||
__threadfence_system();
|
||||
#endif
|
||||
}
|
||||
|
||||
if ((flags & Send*RolePostSend) && next_hdp_reg)
|
||||
STORE((unsigned int *)next_hdp_reg, 0x1);
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* This file implements methods to extract metadata from an integer Metadata field passed in as a template parameter. Feel free to add additional fields below.*/
|
||||
|
||||
// Metadata value with no flag bits set.
#define RCCL_METADATA_EMPTY 0
// Flag bit 0 of the metadata word; queried through isOneNodeRingSimple().
#define RCCL_ONE_NODE_RING_SIMPLE (1 << 0)

// Returns true when the RCCL_ONE_NODE_RING_SIMPLE bit is set in `metadata`,
// regardless of any other bits that may be present.
constexpr bool isOneNodeRingSimple(int metadata) {
  return (metadata & RCCL_ONE_NODE_RING_SIMPLE) != 0;
}

// Compile-time sanity checks: the flag must round-trip through its accessor,
// and the empty metadata value must not report it.
static_assert(isOneNodeRingSimple(RCCL_ONE_NODE_RING_SIMPLE), "isOneNodeRingSimple must report true when RCCL_ONE_NODE_RING_SIMPLE is set");
static_assert(!isOneNodeRingSimple(RCCL_METADATA_EMPTY), "isOneNodeRingSimple must report false for RCCL_METADATA_EMPTY");
|
||||
@@ -366,6 +366,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
|
||||
devWork.oneNode = (comm->nNodes == 1);
|
||||
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
|
||||
devWork.gfx942CheapFenceOff = comm->gfx942CheapFenceOff;
|
||||
devWork.isOneRPN = comm->isOneRPN;
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
|
||||
|
||||
@@ -488,6 +488,7 @@ struct ncclComm {
|
||||
int node;
|
||||
int nNodes;
|
||||
int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive
|
||||
int gfx942CheapFenceOff; // RCCL: true if gfx942 cheap fence is disabled
|
||||
int localRank;
|
||||
int localRanks;
|
||||
int maxLocalRanks;
|
||||
|
||||
@@ -305,7 +305,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
// nChannels == (channelHi - channelLo) + 1
|
||||
uint32_t channelLo:8, channelHi:8;
|
||||
uint32_t nWarps:8;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx942CheapFenceOff:1;
|
||||
uint32_t root:30, connIndex:2;
|
||||
uint16_t pivotA2ANumBiRings:15, profilerEnabled:1;
|
||||
void* recvbuff;
|
||||
|
||||
@@ -103,6 +103,8 @@ RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(16*1024*1024));
|
||||
static constexpr int64_t defaultEnableMscclpp = 0;
|
||||
RCCL_PARAM(MscclppEnabled, "MSCCLPP_ENABLE", defaultEnableMscclpp);
|
||||
RCCL_PARAM(MscclppForceEnabled, "MSCCLPP_FORCE_ENABLE", 0);
|
||||
// Turn off cheap fence for gfx942
|
||||
RCCL_PARAM(Gfx942CheapFenceOff, "GFX942_CHEAP_FENCE_OFF", 0);
|
||||
|
||||
// GDRCOPY support: Off by default
|
||||
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
|
||||
@@ -1365,6 +1367,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
// RCCL: Only use one slice per primitive on some single node gfx9xx systems
|
||||
comm->rcclUseOneSlice = !managed && nNodes == 1;
|
||||
comm->gfx942CheapFenceOff = rcclParamGfx942CheapFenceOff();
|
||||
if (managed && nNodes > 1) {
|
||||
// This forces the minimum channels to 24
|
||||
allGather3Data[rank].nc = 6;
|
||||
|
||||
Reference in New Issue
Block a user