Disable the __threadfence on the sender side of the simple protocol when possible. (#1830)

Leverages the traits of extended-scope fine-grain memory to get rid of a device-scope acquire-release fence.  This improves throughput for single node workloads on gfx942 and gfx950 for some input sizes (e.g., ~32 MiB to about 256 MiB) when using the simple protocol.  Multinode workloads on MI300X see a smaller but statistically significant uplift for some message sizes.  Runtime disablement is supported via setting the environment variable RCCL_GFX942_CHEAP_FENCE_ON to 0.
This commit is contained in:
alex-breslow-amd
2025-08-15 07:54:54 -07:00
committad av GitHub
förälder c61152baa4
incheckning 1aa2570b48
7 ändrade filer med 33 tillägg och 72 borttagningar
-8
Visa fil
@@ -39,7 +39,6 @@ option(TIMETRACE "Enable time-trace during compila
option(TRACE "Enable additional tracing" OFF)
option(FAULT_INJECTION "Enable fault injection" ON)
option(FORCE_REDUCE_PIPELINING "Force reduce pipelining" OFF)
option(DISABLE_CHEAP_THREADFENCE "Compile-time killswitch for simpler fence" OFF)
# Default GPU architectures to build
#==================================================================================================
@@ -446,7 +445,6 @@ set(SRC_FILES
src/device/broadcast.h
src/device/common.h
src/device/common_kernel.h
src/device/gfx9_threadfence.h
src/device/op128.h
src/device/primitives.h
src/device/prims_ll128.h
@@ -1176,12 +1174,6 @@ if (FAULT_INJECTION)
target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
message(STATUS "Fault injection enabled")
endif()
if (DISABLE_CHEAP_THREADFENCE)
target_compile_definitions(rccl PRIVATE DISABLE_CHEAP_THREADFENCE)
message(STATUS "Cheap thread fence disabled")
else()
message(STATUS "Cheap thread fence enabled for some collectives/parameters")
endif()
## Set RCCL linked library directories
target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
+2 -7
Visa fil
@@ -560,16 +560,11 @@ namespace {
}
}
#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
#define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \
if(work->rcclUseOneSlice){ \
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS_SINGLE_NODE, ALLREDUCE_SLICESTEPS_SINGLE_NODE>; \
if(work->regUsed || work->netRegUsed || work->gfx942CheapFenceOff){ \
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
} \
else { \
runRing<T, RedOp, Proto, RCCL_ONE_NODE_RING_SIMPLE>(tid, nthreads, work); \
} \
runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
} \
else{ \
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>; \
-44
Visa fil
@@ -1,44 +0,0 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// This is only okay when the protocol buffer is allocated in uncached memory.
#if defined(__gfx942__) && defined(HIP_UNCACHED_MEMORY) && !defined(DISABLE_CHEAP_THREADFENCE)
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 1
#else
#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 0
#endif
template<bool UseCheaperThreadFence>
inline __device__ void gfx9ThreadFence();
template<>
inline __device__ void gfx9ThreadFence<true>() {
asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)");
asm volatile("buffer_inv sc0 sc1");
}
template<>
inline __device__ void gfx9ThreadFence<false>() {
__threadfence();
}
+11 -3
Visa fil
@@ -10,7 +10,6 @@
#include "npkit/npkit.h"
#endif
#include "device/gfx9_threadfence.h"
#include "device/rccl_metadata.h"
#include "msccl/msccl_struct.h"
#include "network/unpack/unpack.h"
@@ -68,6 +67,7 @@ class Primitives<
uint64_t* barriers_pat;
uint64_t barrier_next_pat = 0;
int repeat;
bool skip_fence = 0;
#if defined(ENABLE_NPKIT)
public:
@@ -201,9 +201,14 @@ private:
template<int Recv, int Send>
inline __device__ void postPeer(bool dataStored) {
if (Send && (flags & RolePostSend) && dataStored){
if (skip_fence){
__atomic_signal_fence(__ATOMIC_SEQ_CST);
barrier_generic(asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)"), nworkers, barrier_next, barriers);
__atomic_signal_fence(__ATOMIC_SEQ_CST);
}
else if((flags & RolePostSend) && dataStored){
#ifdef __GFX9__
gfx9ThreadFence<isOneNodeRingSimple(Metadata) && RCCL_CHEAP_THREADFENCE_OK_SOMETIMES>();
__threadfence();
#else
__threadfence_system();
#endif
@@ -869,6 +874,9 @@ public:
}
patBarrier();
}
if(collWork){
skip_fence = !collWork -> gfx942CheapFenceOff;
}
}
__forceinline__ __device__ ~Primitives() {
-8
Visa fil
@@ -24,11 +24,3 @@ THE SOFTWARE.
/* This file implements methods to extract metadata from an integer Metadata field passed in as a template parameter. Feel free to add additional fields below.*/
#define RCCL_METADATA_EMPTY 0
#define RCCL_ONE_NODE_RING_SIMPLE (1 << 0)
constexpr bool isOneNodeRingSimple(int metadata) {
return (metadata & RCCL_ONE_NODE_RING_SIMPLE) != 0;
}
static_assert(isOneNodeRingSimple(RCCL_ONE_NODE_RING_SIMPLE), "RCCL_ONE_NODE_RING_SIMPLE should be set to (1 << 0)");
static_assert(isOneNodeRingSimple(0) == 0, "RCCL_ONE_NODE_RING_SIMPLE should not be set when metadata is 0");
+9 -1
Visa fil
@@ -335,6 +335,13 @@ static bool testBudget(
return ok;
}
// Returns whether this should be disabled at the device level. Should be called after devWork fields have been set for what
// it depends on.
bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck;
return !fenceOk;
}
ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
struct ncclKernelPlanner* planner = &comm->planner;
struct ncclTaskColl *task;
@@ -367,9 +374,10 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
devWork.oneNode = (comm->nNodes == 1);
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
devWork.gfx942CheapFenceOff = comm->gfx942CheapFenceOff;
devWork.isOneRPN = comm->isOneRPN;
devWork.netRegUsed = devWork.regUsed = 0;
devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff);
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
if (task->regBufType & NCCL_NET_REG_BUFFER)
devWork.netRegUsed = 1;
+11 -1
Visa fil
@@ -1369,13 +1369,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
if (ringGraph->nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
comm -> gfx942CheapFenceOff = 1;
#ifdef HIP_UNCACHED_MEMORY
if(!rcclParamGfx942CheapFenceOff()){
if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){
comm -> gfx942CheapFenceOff = 0;
}
else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
comm -> gfx942CheapFenceOff = nNodes > 1;
}
}
#endif
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
// Multi-node MI300A
int managed = 0;
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
// RCCL: Only use one slice per primitive on some single node gfx9xx systems
comm->rcclUseOneSlice = !managed && nNodes == 1;
comm->gfx942CheapFenceOff = rcclParamGfx942CheapFenceOff();
if (managed && nNodes > 1) {
// This forces the minimum channels to 24
allGather3Data[rank].nc = 6;