[TRANSPORT] Add RCCL_FORCE_ENABLE_GDRDMA for debugging (#1356)

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>

[ROCm/rccl commit: 8ad76f8d10]
Этот коммит содержится в:
Nilesh M Negi
2024-10-06 18:43:49 -05:00
коммит произвёл GitHub
родитель 9c4ac4cae5
Коммит cd29f1e22f
+12
Просмотреть файл
@@ -557,6 +557,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
return ncclSuccess;
}
// RCCL_FORCE_ENABLE_GDRDMA: debugging-only override for GPU-NIC RDMA detection.
// When set to 1, ncclIbGdrSupport() marks the GPU-NIC RDMA module as loaded
// without probing for it. This does not load any module itself, so the NIC
// driver modules must still provide the support. The declared default is -1.
// Use ONLY for debugging!
RCCL_PARAM(ForceEnableGdrdma, "FORCE_ENABLE_GDRDMA", -1);
// Detect whether GDR can work on a given NIC with the current CUDA device
// Returns :
// ncclSuccess : GDR works
@@ -564,6 +568,14 @@ ncclResult_t ncclIbDevices(int* ndev) {
ncclResult_t ncclIbGdrSupport() {
static int moduleLoaded = -1;
if (rcclParamForceEnableGdrdma() == 1) {
// RCCL_FORCE_ENABLE_GDRDMA=1 enables GPU-NIC RDMA only from RCCL-side
// Requires support from NIC driver modules
// Use ONLY for debugging!
moduleLoaded = 1;
INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
}
if (moduleLoaded == -1) {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
// Check for `memory_peers` directory containing `amdkfd/version`