From 00a42c80f3c6244417f2f4df35e3fd3c82782391 Mon Sep 17 00:00:00 2001 From: Artem Kuzmitckii Date: Thu, 9 Oct 2025 10:24:09 +0200 Subject: [PATCH] Reverse logic of context tracking enablement from #1927 (#1971) In this commit it is disabled by default and can be enabled via `RCCL_ENABLE_CONTEXT_TRACKING=1` for both (CDNA, RDNA) Original PR https://github.com/ROCm/rccl/pull/1927 --- docs/how-to/rccl-usage-tips.rst | 10 ++++++---- src/include/utils.h | 2 -- src/init.cc | 9 +++++---- src/misc/utils.cc | 13 ------------- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/docs/how-to/rccl-usage-tips.rst b/docs/how-to/rccl-usage-tips.rst index 352ccdef83..530b6b961c 100644 --- a/docs/how-to/rccl-usage-tips.rst +++ b/docs/how-to/rccl-usage-tips.rst @@ -269,11 +269,13 @@ The default allreduce PyTorch benchmark peak bus bandwidth performance is ~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a single OAM peaks at ~315 GB/s. -Context tracking on Radeon GPUs +Context tracking on GPUs ---------------------------------------- -Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance but is enabled for Radeon GPUs. -To disable context tracking for Radeon GPUs, set the following environment variable: +Context tracking is disabled by default for optimal performance. However, enabling of context tracking can significantly improve performance +in certain scenarios. To enable context tracking, set the following environment variable: .. 
code-block:: shell - export RCCL_DISABLE_CONTEXT_TRACKING=1 + + export RCCL_ENABLE_CONTEXT_TRACKING=1 + diff --git a/src/include/utils.h b/src/include/utils.h index 12977aee6b..9bb642e1ab 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -68,8 +68,6 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { return ret; } -bool rcclNeedEnableContextTrack(int cuDeviceId); - //////////////////////////////////////////////////////////////////////////////// template diff --git a/src/init.cc b/src/init.cc index c869f79b0d..3828b9b5c1 100644 --- a/src/init.cc +++ b/src/init.cc @@ -246,6 +246,7 @@ RCCL_PARAM(EnableProxyTrace, "ENABLE_PROXY_TRACE", 0); RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0); RCCL_PARAM(KernelCollTraceThreadEnable, "KERNEL_COLL_TRACE_THREAD_ENABLE", 0); +RCCL_PARAM(EnableContextTracking, "ENABLE_CONTEXT_TRACKING", 0); #ifdef ENABLE_COLLTRACE // Should be in sync with 'ALL_COLLS' in Generator.cmake @@ -532,8 +533,8 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. - // but enable for Radeon due to big impact on performance - if (rcclNeedEnableContextTrack(comm->cudaDev)) { + // but can be enabled via environment variable + if (rcclParamEnableContextTracking() == 1) { ncclCudaContextDrop(comm->context); INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev); } @@ -633,8 +634,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in CUDACHECK(cudaGetDevice(&comm->cudaDev)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. 
- // but enable for Radeon due to big impact on performance - if (rcclNeedEnableContextTrack(comm->cudaDev)) { + // but can be enabled via environment variable + if (rcclParamEnableContextTracking() == 1) { NCCLCHECK(ncclCudaContextTrack(&comm->context)); INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev); } diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 26d1d4918f..45a4b5b474 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -8,7 +8,6 @@ #include "core.h" #include "nvmlwrap.h" -#include "archinfo.h" #include #include @@ -193,18 +192,6 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz return false; } -RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0); -bool rcclNeedEnableContextTrack(int cuDeviceId) { - hipDeviceProp_t devProp; - if (rcclParamDisableContextTracking() == 1) - return false; - if (hipGetDeviceProperties(&devProp, cuDeviceId) != 0) - return false; - return IsArchMatch(devProp.gcnArchName,"gfx11") - || IsArchMatch(devProp.gcnArchName,"gfx12") - || IsArchMatch(devProp.gcnArchName,"gfx10"); -} - __thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer(); void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {