Disable the __threadfence on the sender side of the simple protocol when possible. (#1830)
Leverages the traits of extended-scope fine-grain memory to get rid of a device-scope acquire-release fence. This improves throughput for single node workloads on gfx942 and gfx950 for some input sizes (e.g., ~32 MiB to about 256 MiB) when using the simple protocol. Multinode workloads on MI300X see a smaller but statistically significant uplift for some message sizes. Runtime disablement is supported via setting the environment variable RCCL_GFX942_CHEAP_FENCE_ON to 0.
This commit is contained in:
committad av
GitHub
förälder
c61152baa4
incheckning
1aa2570b48
@@ -39,7 +39,6 @@ option(TIMETRACE "Enable time-trace during compila
|
||||
option(TRACE "Enable additional tracing" OFF)
|
||||
option(FAULT_INJECTION "Enable fault injection" ON)
|
||||
option(FORCE_REDUCE_PIPELINING "Force reduce pipelining" OFF)
|
||||
option(DISABLE_CHEAP_THREADFENCE "Compile-time killswitch for simpler fence" OFF)
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
@@ -446,7 +445,6 @@ set(SRC_FILES
|
||||
src/device/broadcast.h
|
||||
src/device/common.h
|
||||
src/device/common_kernel.h
|
||||
src/device/gfx9_threadfence.h
|
||||
src/device/op128.h
|
||||
src/device/primitives.h
|
||||
src/device/prims_ll128.h
|
||||
@@ -1176,12 +1174,6 @@ if (FAULT_INJECTION)
|
||||
target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
|
||||
message(STATUS "Fault injection enabled")
|
||||
endif()
|
||||
if (DISABLE_CHEAP_THREADFENCE)
|
||||
target_compile_definitions(rccl PRIVATE DISABLE_CHEAP_THREADFENCE)
|
||||
message(STATUS "Cheap thread fence disabled")
|
||||
else()
|
||||
message(STATUS "Cheap thread fence enabled for some collectives/parameters")
|
||||
endif()
|
||||
|
||||
## Set RCCL linked library directories
|
||||
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
|
||||
|
||||
@@ -560,16 +560,11 @@ namespace {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
|
||||
#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
|
||||
#define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \
|
||||
if(work->rcclUseOneSlice){ \
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS_SINGLE_NODE, ALLREDUCE_SLICESTEPS_SINGLE_NODE>; \
|
||||
if(work->regUsed || work->netRegUsed || work->gfx942CheapFenceOff){ \
|
||||
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
|
||||
} \
|
||||
else { \
|
||||
runRing<T, RedOp, Proto, RCCL_ONE_NODE_RING_SIMPLE>(tid, nthreads, work); \
|
||||
} \
|
||||
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
|
||||
} \
|
||||
else{ \
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>; \
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// This is only okay when the protocol buffer is allocated in uncached memory.
|
||||
#if defined(__gfx942__) && defined(HIP_UNCACHED_MEMORY) && !defined(DISABLE_CHEAP_THREADFENCE)
|
||||
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 1
|
||||
#else
|
||||
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 0
|
||||
#endif
|
||||
|
||||
template<bool UseCheaperThreadFence>
|
||||
inline __device__ void gfx9ThreadFence();
|
||||
|
||||
template<>
|
||||
inline __device__ void gfx9ThreadFence<true>() {
|
||||
asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)");
|
||||
asm volatile("buffer_inv sc0 sc1");
|
||||
}
|
||||
|
||||
template<>
|
||||
inline __device__ void gfx9ThreadFence<false>() {
|
||||
__threadfence();
|
||||
}
|
||||
@@ -10,7 +10,6 @@
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
#include "device/gfx9_threadfence.h"
|
||||
#include "device/rccl_metadata.h"
|
||||
#include "msccl/msccl_struct.h"
|
||||
#include "network/unpack/unpack.h"
|
||||
@@ -68,6 +67,7 @@ class Primitives<
|
||||
uint64_t* barriers_pat;
|
||||
uint64_t barrier_next_pat = 0;
|
||||
int repeat;
|
||||
bool skip_fence = 0;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
public:
|
||||
@@ -201,9 +201,14 @@ private:
|
||||
|
||||
template<int Recv, int Send>
|
||||
inline __device__ void postPeer(bool dataStored) {
|
||||
if (Send && (flags & RolePostSend) && dataStored){
|
||||
if (skip_fence){
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
barrier_generic(asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)"), nworkers, barrier_next, barriers);
|
||||
__atomic_signal_fence(__ATOMIC_SEQ_CST);
|
||||
}
|
||||
else if((flags & RolePostSend) && dataStored){
|
||||
#ifdef __GFX9__
|
||||
gfx9ThreadFence<isOneNodeRingSimple(Metadata) && RCCL_CHEAP_THREADFENCE_OK_SOMETIMES>();
|
||||
__threadfence();
|
||||
#else
|
||||
__threadfence_system();
|
||||
#endif
|
||||
@@ -869,6 +874,9 @@ public:
|
||||
}
|
||||
patBarrier();
|
||||
}
|
||||
if(collWork){
|
||||
skip_fence = !collWork -> gfx942CheapFenceOff;
|
||||
}
|
||||
}
|
||||
|
||||
__forceinline__ __device__ ~Primitives() {
|
||||
|
||||
@@ -24,11 +24,3 @@ THE SOFTWARE.
|
||||
/* This file implements methods to extract metadata from an integer Metadata field passed in as a template parameter. Feel free to add additional fields below.*/
|
||||
|
||||
#define RCCL_METADATA_EMPTY 0
|
||||
#define RCCL_ONE_NODE_RING_SIMPLE (1 << 0)
|
||||
|
||||
constexpr bool isOneNodeRingSimple(int metadata) {
|
||||
return (metadata & RCCL_ONE_NODE_RING_SIMPLE) != 0;
|
||||
}
|
||||
|
||||
static_assert(isOneNodeRingSimple(RCCL_ONE_NODE_RING_SIMPLE), "RCCL_ONE_NODE_RING_SIMPLE should be set to (1 << 0)");
|
||||
static_assert(isOneNodeRingSimple(0) == 0, "RCCL_ONE_NODE_RING_SIMPLE should not be set when metadata is 0");
|
||||
+9
-1
@@ -335,6 +335,13 @@ static bool testBudget(
|
||||
return ok;
|
||||
}
|
||||
|
||||
// Returns whether this should be disabled at the device level. Should be called after devWork fields have been set for what
|
||||
// it depends on.
|
||||
bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
|
||||
bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck;
|
||||
return !fenceOk;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
struct ncclKernelPlanner* planner = &comm->planner;
|
||||
struct ncclTaskColl *task;
|
||||
@@ -367,9 +374,10 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
|
||||
devWork.oneNode = (comm->nNodes == 1);
|
||||
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
|
||||
devWork.gfx942CheapFenceOff = comm->gfx942CheapFenceOff;
|
||||
|
||||
devWork.isOneRPN = comm->isOneRPN;
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff);
|
||||
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
|
||||
if (task->regBufType & NCCL_NET_REG_BUFFER)
|
||||
devWork.netRegUsed = 1;
|
||||
|
||||
+11
-1
@@ -1369,13 +1369,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
|
||||
if (ringGraph->nChannels > MAXCHANNELS/2)
|
||||
allGather3Data[rank].nc = 1;
|
||||
comm -> gfx942CheapFenceOff = 1;
|
||||
#ifdef HIP_UNCACHED_MEMORY
|
||||
if(!rcclParamGfx942CheapFenceOff()){
|
||||
if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){
|
||||
comm -> gfx942CheapFenceOff = 0;
|
||||
}
|
||||
else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
|
||||
comm -> gfx942CheapFenceOff = nNodes > 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
|
||||
// Multi-node MI300A
|
||||
int managed = 0;
|
||||
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
// RCCL: Only use one slice per primitive on some single node gfx9xx systems
|
||||
comm->rcclUseOneSlice = !managed && nNodes == 1;
|
||||
comm->gfx942CheapFenceOff = rcclParamGfx942CheapFenceOff();
|
||||
if (managed && nNodes > 1) {
|
||||
// This forces the minimum channels to 24
|
||||
allGather3Data[rank].nc = 6;
|
||||
|
||||
Referens i nytt ärende
Block a user