diff --git a/CMakeLists.txt b/CMakeLists.txt index b209ebfcf1..7d97b713ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,6 @@ option(TIMETRACE "Enable time-trace during compila option(TRACE "Enable additional tracing" OFF) option(FAULT_INJECTION "Enable fault injection" ON) option(FORCE_REDUCE_PIPELINING "Force reduce pipelining" OFF) -option(DISABLE_CHEAP_THREADFENCE "Compile-time killswitch for simpler fence" OFF) # Default GPU architectures to build #================================================================================================== @@ -446,7 +445,6 @@ set(SRC_FILES src/device/broadcast.h src/device/common.h src/device/common_kernel.h - src/device/gfx9_threadfence.h src/device/op128.h src/device/primitives.h src/device/prims_ll128.h @@ -1176,12 +1174,6 @@ if (FAULT_INJECTION) target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION) message(STATUS "Fault injection enabled") endif() -if (DISABLE_CHEAP_THREADFENCE) - target_compile_definitions(rccl PRIVATE DISABLE_CHEAP_THREADFENCE) - message(STATUS "Cheap thread fence disabled") -else() - message(STATUS "Cheap thread fence enabled for some collectives/parameters") -endif() ## Set RCCL linked library directories target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR}) diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index ac2347bcd5..b26c89df63 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -560,16 +560,11 @@ namespace { } } -#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices. +#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices. #define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \ if(work->rcclUseOneSlice){ \ using Proto = ProtoSimple; \ - if(work->regUsed || work->netRegUsed || work->gfx942CheapFenceOff){ \ - runRing(tid, nthreads, work); \ - } \ - else { \ - runRing(tid, nthreads, work); \ - } \ + runRing(tid, nthreads, work); \ } \ else{ \ using Proto = ProtoSimple; \ diff --git a/src/device/gfx9_threadfence.h b/src/device/gfx9_threadfence.h deleted file mode 100644 index 352cf6d291..0000000000 --- a/src/device/gfx9_threadfence.h +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#pragma once - -// This is only okay when the protocol buffer is allocated in uncached memory. -#if defined(__gfx942__) && defined(HIP_UNCACHED_MEMORY) && !defined(DISABLE_CHEAP_THREADFENCE) -#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 1 -#else -#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 0 -#endif - -template -inline __device__ void gfx9ThreadFence(); - -template<> -inline __device__ void gfx9ThreadFence() { - asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)"); - asm volatile("buffer_inv sc0 sc1"); -} - -template<> -inline __device__ void gfx9ThreadFence() { - __threadfence(); -} diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 9a5f067e04..11d3adf927 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -10,7 +10,6 @@ #include "npkit/npkit.h" #endif -#include "device/gfx9_threadfence.h" #include "device/rccl_metadata.h" #include "msccl/msccl_struct.h" #include "network/unpack/unpack.h" @@ -68,6 +67,7 @@ class Primitives< uint64_t* barriers_pat; uint64_t barrier_next_pat = 0; int repeat; + bool skip_fence = 0; #if defined(ENABLE_NPKIT) public: @@ -201,9 +201,14 @@ private: template inline __device__ void postPeer(bool dataStored) { - if (Send && (flags & RolePostSend) && dataStored){ + if (skip_fence){ + __atomic_signal_fence(__ATOMIC_SEQ_CST); + barrier_generic(asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)"), nworkers, barrier_next, barriers); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + } + else if((flags & RolePostSend) && dataStored){ #ifdef __GFX9__ - gfx9ThreadFence(); + __threadfence(); #else __threadfence_system(); #endif @@ -869,6 +874,9 @@ public: } patBarrier(); } + if(collWork){ + skip_fence = !collWork -> gfx942CheapFenceOff; + } } __forceinline__ __device__ ~Primitives() { diff --git a/src/device/rccl_metadata.h b/src/device/rccl_metadata.h index dbd7330091..4875137a61 100644 --- a/src/device/rccl_metadata.h +++ b/src/device/rccl_metadata.h @@ -24,11 +24,3 @@ THE SOFTWARE. /* This file implements methods to extract metadata from an integer Metadata field passed in as a template parameter. Feel free to add additional fields below.*/ #define RCCL_METADATA_EMPTY 0 -#define RCCL_ONE_NODE_RING_SIMPLE (1 << 0) - -constexpr bool isOneNodeRingSimple(int metadata) { - return (metadata & RCCL_ONE_NODE_RING_SIMPLE) != 0; -} - -static_assert(isOneNodeRingSimple(RCCL_ONE_NODE_RING_SIMPLE), "RCCL_ONE_NODE_RING_SIMPLE should be set to (1 << 0)"); -static_assert(isOneNodeRingSimple(0) == 0, "RCCL_ONE_NODE_RING_SIMPLE should not be set when metadata is 0"); \ No newline at end of file diff --git a/src/enqueue.cc b/src/enqueue.cc index f352c3974e..2e893c597d 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -335,6 +335,13 @@ static bool testBudget( return ok; } +// Returns whether this should be disabled at the device level. Should be called after devWork fields have been set for what +// it depends on. +bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){ + bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck; + return !fenceOk; +} + ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; struct ncclTaskColl *task; @@ -367,9 +374,10 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; devWork.oneNode = (comm->nNodes == 1); devWork.rcclUseOneSlice = comm->rcclUseOneSlice; - devWork.gfx942CheapFenceOff = comm->gfx942CheapFenceOff; + devWork.isOneRPN = comm->isOneRPN; devWork.netRegUsed = devWork.regUsed = 0; + devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff); devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; diff --git a/src/init.cc b/src/init.cc index aaedc5e06b..9a97ecde5a 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1369,13 +1369,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels); if (ringGraph->nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; + comm -> gfx942CheapFenceOff = 1; + #ifdef HIP_UNCACHED_MEMORY + if(!rcclParamGfx942CheapFenceOff()){ + if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){ + comm -> gfx942CheapFenceOff = 0; + } + else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){ + comm -> gfx942CheapFenceOff = nNodes > 1; + } + } + #endif if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) { // Multi-node MI300A int managed = 0; CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0)); // RCCL: Only use one slice per primitive on some single node gfx9xx systems comm->rcclUseOneSlice = !managed && nNodes == 1; - comm->gfx942CheapFenceOff = rcclParamGfx942CheapFenceOff(); if (managed && nNodes > 1) { // This forces the minimum channels to 24 allGather3Data[rank].nc = 6;