From fb59328a7ba7aeee30ecaf8997be1ca409b9989a Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 2 Mar 2020 11:16:27 -0800 Subject: [PATCH] Check fine-grained memory before enabling RDMA Add back the check that was lost in the 2.5 merge. --- src/transport/net.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/transport/net.cc b/src/transport/net.cc index 6b4aee176a..320a0e0bc0 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -61,6 +61,14 @@ NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) { *useGdr = 0; + int cudaDev; + CUDACHECK(hipGetDevice(&cudaDev)); + + if (!hasFineGrainVramPcie()) { + INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev); + return ncclSuccess; + } + if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess;