Disable the __threadfence on the sender side of the simple protocol when possible. (#1830)

Leverages the traits of extended-scope fine-grain memory to get rid of a device-scope acquire-release fence. This improves throughput for single node workloads on gfx942 and gfx950 for some input sizes (e.g., ~32 MiB to about 256 MiB) when using the simple protocol. Multinode workloads on MI300X see a smaller but statistically significant uplift for some message sizes. Runtime disablement is supported via setting the environment variable RCCL_GFX942_CHEAP_FENCE_ON to 0.
2025-08-15 07:54:54 -07:00
@@ -39,7 +39,6 @@ option(TIMETRACE                               "Enable time-trace during compila
 option(TRACE                                   "Enable additional tracing"                     OFF)
 option(FAULT_INJECTION                         "Enable fault injection"                        ON)
 option(FORCE_REDUCE_PIPELINING                 "Force reduce pipelining"                       OFF)
-option(DISABLE_CHEAP_THREADFENCE               "Compile-time killswitch for simpler fence"     OFF)

 # Default GPU architectures to build
 #==================================================================================================
@@ -446,7 +445,6 @@ set(SRC_FILES
  src/device/broadcast.h
  src/device/common.h
  src/device/common_kernel.h
-  src/device/gfx9_threadfence.h
  src/device/op128.h
  src/device/primitives.h
  src/device/prims_ll128.h
@@ -1176,12 +1174,6 @@ if (FAULT_INJECTION)
  target_compile_definitions(rccl PRIVATE ENABLE_FAULT_INJECTION)
  message(STATUS "Fault injection enabled")
 endif()
-if (DISABLE_CHEAP_THREADFENCE)
-  target_compile_definitions(rccl PRIVATE DISABLE_CHEAP_THREADFENCE)
-  message(STATUS "Cheap thread fence disabled")
-else()
-  message(STATUS "Cheap thread fence enabled for some collectives/parameters")
-endif()

 ## Set RCCL linked library directories
 target_link_directories(rccl PRIVATE ${ROCM_SMI_LIB_DIR})
@@ -560,16 +560,11 @@ namespace {
  }
 }

-#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
+#if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices. 
 #define rcclAllReduceRunRingSimpleProtoImpl(tid, nthreads, work) \
  if(work->rcclUseOneSlice){ \
    using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS_SINGLE_NODE, ALLREDUCE_SLICESTEPS_SINGLE_NODE>; \
-    if(work->regUsed || work->netRegUsed || work->gfx942CheapFenceOff){ \
-      runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
-    } \
-    else { \
-      runRing<T, RedOp, Proto, RCCL_ONE_NODE_RING_SIMPLE>(tid, nthreads, work); \
-    } \
+    runRing<T, RedOp, Proto, RCCL_METADATA_EMPTY>(tid, nthreads, work); \
  } \
  else{ \
    using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>; \
@@ -1,44 +0,0 @@
-/*
-Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#pragma once
-
-// This is only okay when the protocol buffer is allocated in uncached memory.
-#if defined(__gfx942__) && defined(HIP_UNCACHED_MEMORY) && !defined(DISABLE_CHEAP_THREADFENCE)
-#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 1
-#else
-#define RCCL_CHEAP_THREADFENCE_OK_SOMETIMES 0
-#endif 
-
-template<bool UseCheaperThreadFence>
-inline __device__ void gfx9ThreadFence();
-
-template<>
-inline __device__ void gfx9ThreadFence<true>() {
-    asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)");
-    asm volatile("buffer_inv sc0 sc1");
-}
-
-template<>
-inline __device__ void gfx9ThreadFence<false>() {
-    __threadfence();
-}
@@ -10,7 +10,6 @@
 #include "npkit/npkit.h"
 #endif

-#include "device/gfx9_threadfence.h"
 #include "device/rccl_metadata.h"
 #include "msccl/msccl_struct.h"
 #include "network/unpack/unpack.h"
@@ -68,6 +67,7 @@ class Primitives<
  uint64_t* barriers_pat;
  uint64_t barrier_next_pat = 0;
  int repeat;
+  bool skip_fence = 0;

 #if defined(ENABLE_NPKIT)
 public:
@@ -201,9 +201,14 @@ private:

  template<int Recv, int Send>
  inline __device__ void postPeer(bool dataStored) {
-    if (Send && (flags & RolePostSend) && dataStored){
+    if (skip_fence){
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+      barrier_generic(asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)"), nworkers, barrier_next, barriers);
+      __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    }
+    else if((flags & RolePostSend) && dataStored){
 #ifdef __GFX9__
-    gfx9ThreadFence<isOneNodeRingSimple(Metadata) && RCCL_CHEAP_THREADFENCE_OK_SOMETIMES>();
+    __threadfence();
 #else
    __threadfence_system();
 #endif
@@ -869,6 +874,9 @@ public:
      }
      patBarrier();
    }
+    if(collWork){
+      skip_fence = !collWork -> gfx942CheapFenceOff;
+    }
  }

  __forceinline__ __device__ ~Primitives() {
@@ -24,11 +24,3 @@ THE SOFTWARE.
 /* This file implements methods to extract metadata from an integer Metadata field passed in as a template parameter. Feel free to add additional fields below.*/

 #define RCCL_METADATA_EMPTY 0
-#define RCCL_ONE_NODE_RING_SIMPLE (1 << 0)
-
-constexpr bool isOneNodeRingSimple(int metadata) {
-  return (metadata & RCCL_ONE_NODE_RING_SIMPLE) != 0;
-}
-
-static_assert(isOneNodeRingSimple(RCCL_ONE_NODE_RING_SIMPLE), "RCCL_ONE_NODE_RING_SIMPLE should be set to (1 << 0)");
-static_assert(isOneNodeRingSimple(0) == 0, "RCCL_ONE_NODE_RING_SIMPLE should not be set when metadata is 0");
@@ -335,6 +335,13 @@ static bool testBudget(
  return ok;
 }

+// Returns whether this should be disabled at the device level.  Should be called after devWork fields have been set for what
+// it depends on.
+bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
+    bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck;
+    return !fenceOk;
+}
+
 ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
  struct ncclKernelPlanner* planner = &comm->planner;
  struct ncclTaskColl *task;
@@ -367,9 +374,10 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
    devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
    devWork.oneNode = (comm->nNodes == 1);
    devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
-    devWork.gfx942CheapFenceOff = comm->gfx942CheapFenceOff;
+    
    devWork.isOneRPN = comm->isOneRPN;
    devWork.netRegUsed = devWork.regUsed = 0;
+    devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff);
    devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
    if (task->regBufType & NCCL_NET_REG_BUFFER)
      devWork.netRegUsed = 1;
@@ -1369,13 +1369,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
  if (ringGraph->nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;
+  comm -> gfx942CheapFenceOff = 1;
+  #ifdef HIP_UNCACHED_MEMORY
+  if(!rcclParamGfx942CheapFenceOff()){
+    if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){
+      comm -> gfx942CheapFenceOff = 0;
+    }
+    else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
+      comm -> gfx942CheapFenceOff = nNodes > 1;
+    }
+  }
+  #endif
  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
    // Multi-node MI300A
    int managed = 0;
    CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
    // RCCL: Only use one slice per primitive on some single node gfx9xx systems
    comm->rcclUseOneSlice = !managed && nNodes == 1;
-    comm->gfx942CheapFenceOff = rcclParamGfx942CheapFenceOff();
    if (managed && nNodes > 1) {
      // This forces the minimum channels to 24
      allGather3Data[rank].nc = 6;