[TRANSPORT] Add RCCL_FORCE_ENABLE_GDRDMA for debugging (#1356)
Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
[ROCm/rccl commit: 8ad76f8d10]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
9c4ac4cae5
Коммит
cd29f1e22f
@@ -557,6 +557,10 @@ ncclResult_t ncclIbDevices(int* ndev) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// RCCL_FORCE_ENABLE_GDRDMA=1 makes RCCL treat the GPU-NIC RDMA module as loaded, skipping detection
|
||||
// Use ONLY for debugging!
|
||||
RCCL_PARAM(ForceEnableGdrdma, "FORCE_ENABLE_GDRDMA", -1);
|
||||
|
||||
// Detect whether GDR can work on a given NIC with the current CUDA device
|
||||
// Returns :
|
||||
// ncclSuccess : GDR works
|
||||
@@ -564,6 +568,14 @@ ncclResult_t ncclIbDevices(int* ndev) {
|
||||
ncclResult_t ncclIbGdrSupport() {
|
||||
static int moduleLoaded = -1;
|
||||
|
||||
if (rcclParamForceEnableGdrdma() == 1) {
|
||||
// RCCL_FORCE_ENABLE_GDRDMA=1 enables GPU-NIC RDMA on the RCCL side only
|
||||
// Still requires support from the NIC driver modules
|
||||
// Use ONLY for debugging!
|
||||
moduleLoaded = 1;
|
||||
INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
|
||||
}
|
||||
|
||||
if (moduleLoaded == -1) {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
// Check for `memory_peers` directory containing `amdkfd/version`
|
||||
|
||||
Ссылка в новой задаче
Block a user