Revert disabling of context tracking for Radeon (#1927)

* Revert disabling of context tracking for Radeon

Original commit df3b7e47
 `Disable context tracking for the current version. (#1839)`

* Add env variable for disabling of context tracking for Radeon

Set `export NCCL_DISABLE_CONTEXT_TRACKING=1` to force-disable context tracking

* Update docs/how-to/rccl-usage-tips.rst

Fix grammar, thanks @amd-jnovotny

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

* Rename NCCL_DISABLE_CONTEXT_TRACKING -> RCCL_DISABLE_CONTEXT_TRACKING

* Revert changes in includes and rename util function

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

[ROCm/rccl commit: 07925ec027]
Этот коммит содержится в:
Artem Kuzmitckii
2025-09-27 21:19:50 +02:00
коммит произвёл GitHub
родитель 6423f5b024
Коммит 722b0cd579
4 изменённых файлов: 37 добавлений и 3 удалений
+10 -1
Просмотреть файл
@@ -253,4 +253,13 @@ set during the benchmark in the following manner:
The default allreduce PyTorch benchmark peak bus bandwidth performance is
~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
single OAM peaks at ~315 GB/s.
single OAM peaks at ~315 GB/s.
Context tracking on Radeon GPUs
----------------------------------------
Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance, but is enabled by default on Radeon GPUs.
To disable context tracking for Radeon GPUs, set the following environment variable:
.. code-block:: shell
export RCCL_DISABLE_CONTEXT_TRACKING=1
+2
Просмотреть файл
@@ -68,6 +68,8 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
return ret;
}
// Reports whether context tracking should be enabled for the given device.
// Returns false when the RCCL_DISABLE_CONTEXT_TRACKING parameter is set to 1 or
// when the device properties cannot be queried; otherwise returns true only if
// the device architecture name matches "gfx10", "gfx11" or "gfx12".
bool rcclNeedEnableContextTrack(int cuDeviceId);
////////////////////////////////////////////////////////////////////////////////
template<typename Int>
+11 -2
Просмотреть файл
@@ -531,7 +531,12 @@ static ncclResult_t commFree(ncclComm_t comm) {
NCCLCHECK(ncclProfilerPluginFinalize(comm));
NCCLCHECK(ncclNetFinalize(comm));
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
//ncclCudaContextDrop(comm->context);
// but enable for Radeon due to big impact on performance
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
ncclCudaContextDrop(comm->context);
INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev);
}
free(comm);
return ncclSuccess;
@@ -627,7 +632,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
CUDACHECK(cudaGetDevice(&comm->cudaDev));
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
//NCCLCHECK(ncclCudaContextTrack(&comm->context));
// but enable for Radeon due to big impact on performance
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
NCCLCHECK(ncclCudaContextTrack(&comm->context));
INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev);
}
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
char busId[]="0000:00:00.0";
+14
Просмотреть файл
@@ -8,6 +8,8 @@
#include "core.h"
#include "nvmlwrap.h"
#include "archinfo.h"
#include <unistd.h>
#include <stdlib.h>
@@ -191,6 +193,18 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
return false;
}
RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0);
bool rcclNeedEnableContextTrack(int cuDeviceId) {
hipDeviceProp_t devProp;
if (rcclParamDisableContextTracking() == 1)
return false;
if (hipGetDeviceProperties(&devProp, cuDeviceId) != 0)
return false;
return IsArchMatch(devProp.gcnArchName,"gfx11")
|| IsArchMatch(devProp.gcnArchName,"gfx12")
|| IsArchMatch(devProp.gcnArchName,"gfx10");
}
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {