diff --git a/projects/rccl/docs/how-to/rccl-usage-tips.rst b/projects/rccl/docs/how-to/rccl-usage-tips.rst index 852a1c33b0..2c08f63b38 100644 --- a/projects/rccl/docs/how-to/rccl-usage-tips.rst +++ b/projects/rccl/docs/how-to/rccl-usage-tips.rst @@ -253,4 +253,13 @@ set during the benchmark in the following manner: The default allreduce PyTorch benchmark peak bus bandwidth performance is ~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a -single OAM peaks at ~315 GB/s. \ No newline at end of file +single OAM peaks at ~315 GB/s. + +Context tracking on Radeon GPUs +---------------------------------------- +Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance but is enabled for Radeon GPUs. +To disable context tracking for Radeon GPUs, set the following environment variable: + +.. code-block:: shell + + export RCCL_DISABLE_CONTEXT_TRACKING=1 \ No newline at end of file diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h index 9bb642e1ab..12977aee6b 100644 --- a/projects/rccl/src/include/utils.h +++ b/projects/rccl/src/include/utils.h @@ -68,6 +68,8 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { return ret; } +bool rcclNeedEnableContextTrack(int cuDeviceId); + //////////////////////////////////////////////////////////////////////////////// template diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 727af9dbcd..59d619decc 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -531,7 +531,12 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. 
- //ncclCudaContextDrop(comm->context); + // but enable for Radeon due to big impact on performance + if (rcclNeedEnableContextTrack(comm->cudaDev)) { + ncclCudaContextDrop(comm->context); + INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev); + } + free(comm); return ncclSuccess; @@ -627,7 +632,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in CUDACHECK(cudaGetDevice(&comm->cudaDev)); // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support. - //NCCLCHECK(ncclCudaContextTrack(&comm->context)); + // but enable for Radeon due to big impact on performance + if (rcclNeedEnableContextTrack(comm->cudaDev)) { + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev); + } NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); char busId[]="0000:00:00.0"; diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc index ee72bf60d3..26d1d4918f 100644 --- a/projects/rccl/src/misc/utils.cc +++ b/projects/rccl/src/misc/utils.cc @@ -8,6 +8,8 @@ #include "core.h" #include "nvmlwrap.h" +#include "archinfo.h" + #include #include @@ -191,6 +193,22 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz return false; } +RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0); +// Returns true when context tracking should be enabled for device cuDeviceId. +// Tracking is off when RCCL_DISABLE_CONTEXT_TRACKING=1 or when the device +// properties cannot be queried; otherwise it is on only for devices whose +// gcnArchName matches gfx10, gfx11 or gfx12 (per IsArchMatch). +bool rcclNeedEnableContextTrack(int cuDeviceId) { + hipDeviceProp_t devProp; + if (rcclParamDisableContextTracking() == 1) + return false; + if (hipGetDeviceProperties(&devProp, cuDeviceId) != hipSuccess) + return false; + return IsArchMatch(devProp.gcnArchName,"gfx11") + || IsArchMatch(devProp.gcnArchName,"gfx12") + || IsArchMatch(devProp.gcnArchName,"gfx10"); +} + __thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer(); void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {