diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 45b2d23c9e..e75106cf41 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -557,6 +557,10 @@ ncclResult_t ncclIbDevices(int* ndev) { return ncclSuccess; } +// RCCL_FORCE_ENABLE_GDRDMA=1 makes ncclIbGdrSupport() treat the GPU-NIC RDMA module as loaded without running its detection +// Use ONLY for debugging! +RCCL_PARAM(ForceEnableGdrdma, "FORCE_ENABLE_GDRDMA", -1); + // Detect whether GDR can work on a given NIC with the current CUDA device // Returns : // ncclSuccess : GDR works @@ -564,6 +568,14 @@ ncclResult_t ncclIbDevices(int* ndev) { ncclResult_t ncclIbGdrSupport() { static int moduleLoaded = -1; + if (rcclParamForceEnableGdrdma() == 1) { + // RCCL_FORCE_ENABLE_GDRDMA=1 only overrides the RCCL-side check: moduleLoaded is set to 1, so the moduleLoaded == -1 detection below is skipped + // GPU-NIC RDMA itself still requires support from the NIC driver modules + // Use ONLY for debugging! + moduleLoaded = 1; + INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1"); + } + if (moduleLoaded == -1) { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) // Check for `memory_peers` directory containing `amdkfd/version`