Reverse logic of context tracking enablement from #1927 (#1971)

In this commit, context tracking is disabled by default and can be enabled via
`RCCL_ENABLE_CONTEXT_TRACKING=1` on both CDNA and RDNA GPUs.
Original PR https://github.com/ROCm/rccl/pull/1927
This commit is contained in:
Artem Kuzmitckii
2025-10-09 10:24:09 +02:00
committed by GitHub
parent cede6d0134
commit 00a42c80f3
4 changed files with 11 additions and 23 deletions
+6 -4
View File
@@ -269,11 +269,13 @@ The default allreduce PyTorch benchmark peak bus bandwidth performance is
~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
single OAM peaks at ~315 GB/s.
Context tracking on Radeon GPUs
Context tracking on GPUs
----------------------------------------
Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance but is enabled for Radeon GPUs.
To disable context tracking for Radeon GPUs, set the following environment variable:
Context tracking is disabled by default for optimal performance. However, enabling context tracking can significantly improve performance
in certain scenarios. To enable context tracking, set the following environment variable:
.. code-block:: shell
export RCCL_DISABLE_CONTEXT_TRACKING=1
export RCCL_ENABLE_CONTEXT_TRACKING=1
-2
View File
@@ -68,8 +68,6 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
return ret;
}
bool rcclNeedEnableContextTrack(int cuDeviceId);
////////////////////////////////////////////////////////////////////////////////
template<typename Int>
+5 -4
View File
@@ -246,6 +246,7 @@ RCCL_PARAM(EnableProxyTrace, "ENABLE_PROXY_TRACE", 0);
RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
RCCL_PARAM(KernelCollTraceThreadEnable, "KERNEL_COLL_TRACE_THREAD_ENABLE", 0);
RCCL_PARAM(EnableContextTracking, "ENABLE_CONTEXT_TRACKING", 0);
#ifdef ENABLE_COLLTRACE
// Should be in sync with 'ALL_COLLS' in Generator.cmake
@@ -532,8 +533,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclProfilerPluginFinalize(comm));
NCCLCHECK(ncclNetFinalize(comm));
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
// but enable for Radeon due to big impact on performance
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
// but can be enabled via environment variable
if (rcclParamEnableContextTracking() == 1) {
ncclCudaContextDrop(comm->context);
INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev);
}
@@ -633,8 +634,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
CUDACHECK(cudaGetDevice(&comm->cudaDev));
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
// but enable for Radeon due to big impact on performance
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
// but can be enabled via environment variable
if (rcclParamEnableContextTracking() == 1) {
NCCLCHECK(ncclCudaContextTrack(&comm->context));
INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev);
}
-13
View File
@@ -8,7 +8,6 @@
#include "core.h"
#include "nvmlwrap.h"
#include "archinfo.h"
#include <unistd.h>
#include <stdlib.h>
@@ -193,18 +192,6 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
return false;
}
RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0);
bool rcclNeedEnableContextTrack(int cuDeviceId) {
hipDeviceProp_t devProp;
if (rcclParamDisableContextTracking() == 1)
return false;
if (hipGetDeviceProperties(&devProp, cuDeviceId) != 0)
return false;
return IsArchMatch(devProp.gcnArchName,"gfx11")
|| IsArchMatch(devProp.gcnArchName,"gfx12")
|| IsArchMatch(devProp.gcnArchName,"gfx10");
}
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {