diff --git a/projects/rccl/docs/how-to/rccl-usage-tips.rst b/projects/rccl/docs/how-to/rccl-usage-tips.rst
index 852a1c33b0..2c08f63b38 100644
--- a/projects/rccl/docs/how-to/rccl-usage-tips.rst
+++ b/projects/rccl/docs/how-to/rccl-usage-tips.rst
@@ -253,4 +253,13 @@ set during the benchmark in the following manner:
 
 The default allreduce PyTorch benchmark peak bus bandwidth performance is
 ~170 GB/s on a single OAM with ROCm 6.2.4, while the optimized run for CPX on a
-single OAM peaks at ~315 GB/s.
\ No newline at end of file
+single OAM peaks at ~315 GB/s.
+
+Context tracking on Radeon GPUs
+----------------------------------------
+Context tracking is disabled on the AMD Instinct™ series of GPUs for better performance, but it is enabled by default on Radeon GPUs.
+To disable context tracking on Radeon GPUs as well, set the following environment variable:
+
+.. code-block:: shell
+
+   export RCCL_DISABLE_CONTEXT_TRACKING=1
\ No newline at end of file
diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h
index 9bb642e1ab..12977aee6b 100644
--- a/projects/rccl/src/include/utils.h
+++ b/projects/rccl/src/include/utils.h
@@ -68,6 +68,8 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
   return ret;
 }
 
+bool rcclNeedEnableContextTrack(int cuDeviceId);  // true if context tracking should be enabled for device cuDeviceId (defined in misc/utils.cc)
+
 ////////////////////////////////////////////////////////////////////////////////
 
 template<typename Int>
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 727af9dbcd..59d619decc 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -531,7 +531,12 @@ static ncclResult_t commFree(ncclComm_t comm) {
   NCCLCHECK(ncclProfilerPluginFinalize(comm));
   NCCLCHECK(ncclNetFinalize(comm));
   // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
-  //ncclCudaContextDrop(comm->context);
+  // but enable for Radeon due to big impact on performance
+  if (rcclNeedEnableContextTrack(comm->cudaDev)) {
+    ncclCudaContextDrop(comm->context);
+    INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev);
+  }
+
   free(comm);
 
   return ncclSuccess;
@@ -627,7 +632,11 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   CUDACHECK(cudaGetDevice(&comm->cudaDev));
 
   // Disable until we validate NCCL_LAUNCH_IMPLICIT_ORDER support.
-  //NCCLCHECK(ncclCudaContextTrack(&comm->context));
+  // but enable for Radeon due to big impact on performance
+  if (rcclNeedEnableContextTrack(comm->cudaDev)) {
+    NCCLCHECK(ncclCudaContextTrack(&comm->context));
+    INFO(NCCL_INIT, "cudaDev %d context tracking created", comm->cudaDev);
+  }
 
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
   char busId[]="0000:00:00.0";
diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc
index ee72bf60d3..26d1d4918f 100644
--- a/projects/rccl/src/misc/utils.cc
+++ b/projects/rccl/src/misc/utils.cc
@@ -8,6 +8,8 @@
 #include "core.h"
 
 #include "nvmlwrap.h"
+#include "archinfo.h"
+
 #include <unistd.h>
 #include <stdlib.h>
 
@@ -191,6 +193,18 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
   return false;
 }
 
+RCCL_PARAM(DisableContextTracking, "DISABLE_CONTEXT_TRACKING", 0);
+// Reports whether context tracking should be turned on for device cuDeviceId.
+bool rcclNeedEnableContextTrack(int cuDeviceId) {
+  // A DisableContextTracking parameter value of 1 switches tracking off for every device.
+  if (rcclParamDisableContextTracking() == 1) return false;
+  hipDeviceProp_t deviceProps;
+  // If the device properties cannot be read, tracking stays off.
+  if (hipGetDeviceProperties(&deviceProps, cuDeviceId) != hipSuccess) return false;
+  return IsArchMatch(deviceProps.gcnArchName, "gfx11") || IsArchMatch(deviceProps.gcnArchName, "gfx12") ||
+         IsArchMatch(deviceProps.gcnArchName, "gfx10");
+}
+
 __thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
 
 void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {