From e98891d039ba66dd71cad33599061498eca55538 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 2 Jun 2020 02:00:21 +0800 Subject: [PATCH] Log NUMA node of RDMA host buffer allocation --- CMakeLists.txt | 2 +- src/transport/net.cc | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6feb3e2b87..fa42e4c6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,7 +191,7 @@ if("${HIP_COMPILER}" MATCHES "hcc") endif() endif() -target_link_libraries(rccl PRIVATE hip::device) +target_link_libraries(rccl PRIVATE hip::device numa) target_link_libraries(rccl INTERFACE hip::host) #Setup librccl.so version diff --git a/src/transport/net.cc b/src/transport/net.cc index 850912332c..8edee80421 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -9,6 +9,7 @@ #include "net.h" #include "graph.h" #include +#include struct netConnectInfo { ncclNetHandle_t netHandle; @@ -90,8 +91,13 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra if (resources->buffSizes[LOC_DEVMEM]) { NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr)); } + char line[16]; if (resources->buffSizes[LOC_HOSTMEM]) { NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); + int status[1] = {-1}; + line[0]= 0; + if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0)) + sprintf(line, "/MEM%d", status[0]); } int offsets[LOC_COUNT]; @@ -103,7 +109,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : ""); + resources->useGdr ? "/GDRDMA" : line); return ncclSuccess; } @@ -139,8 +145,13 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra if (resources->buffSizes[LOC_DEVMEM]) { NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM], resources->useGdr)); } + char line[16]; if (resources->buffSizes[LOC_HOSTMEM]) { NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); + int status[1] = {-1}; + line[0]= 0; + if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0)) + sprintf(line, "/MEM%d", status[0]); } int offsets[LOC_COUNT]; @@ -152,7 +163,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : ""); + resources->useGdr ? "/GDRDMA" : line); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));