In this commit, context tracking is disabled by default and can be enabled via `RCCL_ENABLE_CONTEXT_TRACKING=1` on both CDNA and RDNA GPUs. Original PR: https://github.com/ROCm/rccl/pull/1927
This commit is contained in:
@@ -269,11 +269,13 @@ The default allreduce PyTorch benchmark peak bus bandwidth performance is
|
||||
~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
|
||||
single OAM peaks at ~315 GB/s.
|
||||
|
||||
Context tracking on Radeon GPUs
|
||||
Context tracking on GPUs
|
||||
----------------------------------------
|
||||
Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance but is enabled for Radeon GPUs.
|
||||
To disable context tracking for Radeon GPUs, set the following environment variable:
|
||||
Context tracking is disabled by default for optimal performance. However, enabling context tracking can significantly improve performance
|
||||
in certain scenarios. To enable context tracking, set the following environment variable:
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
export RCCL_DISABLE_CONTEXT_TRACKING=1
|
||||
|
||||
export RCCL_ENABLE_CONTEXT_TRACKING=1
|
||||
|
||||
|
||||
@@ -68,8 +68,6 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool rcclNeedEnableContextTrack(int cuDeviceId);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename Int>
|
||||
|
||||
+5
-4
@@ -246,6 +246,7 @@ RCCL_PARAM(EnableProxyTrace, "ENABLE_PROXY_TRACE", 0);
|
||||
|
||||
RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
|
||||
RCCL_PARAM(KernelCollTraceThreadEnable, "KERNEL_COLL_TRACE_THREAD_ENABLE", 0);
|
||||
RCCL_PARAM(EnableContextTracking, "ENABLE_CONTEXT_TRACKING", 0);
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
// Should be in sync with 'ALL_COLLS' in Generator.cmake
|
||||
@@ -532,8 +533,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
NCCLCHECK(ncclProfilerPluginFinalize(comm));
|
||||
NCCLCHECK(ncclNetFinalize(comm));
|
||||
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
|
||||
// but enable for Radeon due to big impact on performance
|
||||
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
|
||||
// but can be enabled via the RCCL_ENABLE_CONTEXT_TRACKING environment variable
|
||||
if (rcclParamEnableContextTracking() == 1) {
|
||||
ncclCudaContextDrop(comm->context);
|
||||
INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev);
|
||||
}
|
||||
@@ -633,8 +634,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
CUDACHECK(cudaGetDevice(&comm->cudaDev));
|
||||
|
||||
// Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
|
||||
// but enable for Radeon due to big impact on performance
|
||||
if (rcclNeedEnableContextTrack(comm->cudaDev)) {
|
||||
// but can be enabled via the RCCL_ENABLE_CONTEXT_TRACKING environment variable
|
||||
if (rcclParamEnableContextTracking() == 1) {
|
||||
NCCLCHECK(ncclCudaContextTrack(&comm->context));
|
||||
INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev);
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "core.h"
|
||||
|
||||
#include "nvmlwrap.h"
|
||||
#include "archinfo.h"
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
@@ -193,18 +192,6 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
|
||||
return false;
|
||||
}
|
||||
|
||||
RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0);
|
||||
bool rcclNeedEnableContextTrack(int cuDeviceId) {
|
||||
hipDeviceProp_t devProp;
|
||||
if (rcclParamDisableContextTracking() == 1)
|
||||
return false;
|
||||
if (hipGetDeviceProperties(&devProp, cuDeviceId) != 0)
|
||||
return false;
|
||||
return IsArchMatch(devProp.gcnArchName,"gfx11")
|
||||
|| IsArchMatch(devProp.gcnArchName,"gfx12")
|
||||
|| IsArchMatch(devProp.gcnArchName,"gfx10");
|
||||
}
|
||||
|
||||
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
|
||||
|
||||
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
|
||||
|
||||
Reference in New Issue
Block a user