From 722b0cd579bf265f61cfa5cec29d4322682d40b3 Mon Sep 17 00:00:00 2001 From: Artem Kuzmitckii Date: Sat, 27 Sep 2025 21:19:50 +0200 Subject: [PATCH] Revert disabling of context tracking for Radeon (#1927) * Revert disabling of context tracking for Radeon Original commit df3b7e47 `Disable context tracking for the current version. (#1839)` * Add env variable for disabling of context tracking for Radeon `export NCCL_DISABLE_CONTEXT_TRACKING=1` to force disable of context tracking * Update docs/how-to/rccl-usage-tips.rst Fix grammar, thanks @amd-jnovotny Co-authored-by: Jeffrey Novotny * Rename NCCL_DISABLE_CONTEXT_TRACKING -> RCCL_DISABLE_CONTEXT_TRACKING * Revert changes in includes and rename util function --------- Co-authored-by: Jeffrey Novotny [ROCm/rccl commit: 07925ec027404198d9264cfefaf3a074595d376c] --- projects/rccl/docs/how-to/rccl-usage-tips.rst | 11 ++++++++++- projects/rccl/src/include/utils.h | 2 ++ projects/rccl/src/init.cc | 13 +++++++++++-- projects/rccl/src/misc/utils.cc | 14 ++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/projects/rccl/docs/how-to/rccl-usage-tips.rst b/projects/rccl/docs/how-to/rccl-usage-tips.rst index 852a1c33b0..2c08f63b38 100644 --- a/projects/rccl/docs/how-to/rccl-usage-tips.rst +++ b/projects/rccl/docs/how-to/rccl-usage-tips.rst @@ -253,4 +253,13 @@ set during the benchmark in the following manner: The default allreduce PyTorch benchmark peak bus bandwidth performance is ~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a -single OAM peaks at ~315 GB/s. \ No newline at end of file +single OAM peaks at ~315 GB/s. + +Context tracking on Radeon GPUs +---------------------------------------- +Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance but is enabled for Radeon GPUs. +To disable context tracking for Radeon GPUs, set the following environment variable: + +.. 
code-block:: shell + + export RCCL_DISABLE_CONTEXT_TRACKING=1 \ No newline at end of file diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h index 9bb642e1ab..12977aee6b 100644 --- a/projects/rccl/src/include/utils.h +++ b/projects/rccl/src/include/utils.h @@ -68,6 +68,8 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { return ret; } +bool rcclNeedEnableContextTrack(int cuDeviceId); + //////////////////////////////////////////////////////////////////////////////// template diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 727af9dbcd..59d619decc 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -531,7 +531,12 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. - //ncclCudaContextDrop(comm->context); + // but enable for Radeon due to big impact on performance + if (rcclNeedEnableContextTrack(comm->cudaDev)) { + ncclCudaContextDrop(comm->context); + INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev); + } + free(comm); return ncclSuccess; @@ -627,7 +632,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in CUDACHECK(cudaGetDevice(&comm->cudaDev)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. 
- //NCCLCHECK(ncclCudaContextTrack(&comm->context)); + // but enable for Radeon due to big impact on performance + if (rcclNeedEnableContextTrack(comm->cudaDev)) { + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev); + } NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); char busId[]="0000:00:00.0"; diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc index ee72bf60d3..26d1d4918f 100644 --- a/projects/rccl/src/misc/utils.cc +++ b/projects/rccl/src/misc/utils.cc @@ -8,6 +8,8 @@ #include "core.h" #include "nvmlwrap.h" +#include "archinfo.h" + #include #include @@ -191,6 +193,18 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz return false; } +RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0); +bool rcclNeedEnableContextTrack(int cuDeviceId) { + hipDeviceProp_t devProp; + if (rcclParamDisableContextTracking() == 1) + return false; + if (hipGetDeviceProperties(&devProp, cuDeviceId) != 0) + return false; + return IsArchMatch(devProp.gcnArchName,"gfx11") + || IsArchMatch(devProp.gcnArchName,"gfx12") + || IsArchMatch(devProp.gcnArchName,"gfx10"); +} + __thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer(); void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {