From 0eb19a563ad3154ac910e580423f4c168da6827c Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 22 Jun 2020 13:06:25 -0700 Subject: [PATCH] Use posix_memalign for network buffer allocation on host memory (#221) * Use posix_memalign for network buffer allocation on host memory * ib-test: add ability to specify run iterations * ib-test: define iterations as multiple of default cycles * Add checking to posix_memalign return value --- CMakeLists.txt | 2 +- src/graph/topo.cc | 6 ++++++ src/include/devcomm.h | 3 +++ src/transport/net.cc | 36 +++++++++++++++++++++++++++++++ src/transport/net_ib.cc | 2 +- tools/ib-test/ib_test.cpp | 45 ++++++++++++++++++++++++++++++--------- 6 files changed, 82 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa42e4c6cd..ff79e764ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,7 +169,7 @@ endforeach() if("${HIP_COMPILER}" MATCHES "clang") foreach(target ${AMDGPU_TARGETS}) - target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc) + target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc -DUSE_MEMALIGN) endforeach() target_link_libraries(rccl PRIVATE -fgpu-rdc) target_include_directories(rccl PRIVATE /opt/rocm/hsa/include) diff --git a/src/graph/topo.cc b/src/graph/topo.cc index fd82fbfa37..ea53d90324 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -20,6 +20,7 @@ #endif #include "xml.h" #include "cpuset.h" +#include #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) @@ -733,6 +734,11 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) { NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); + int ret = numa_run_on_node(cpu->id); + if (ret != 0) + INFO(NCCL_INIT, "Failed to run on numa node %ld", cpu->id); + else + INFO(NCCL_INIT, "Thread is set to run on numa node %ld", cpu->id); } return ncclSuccess; } diff --git a/src/include/devcomm.h b/src/include/devcomm.h index e5c5f43727..e136444f92 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -92,6 +92,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK struct ncclConnInfo { // Regular comm mechanism char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send +#ifdef USE_MEMALIGN + char *hostBuffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send +#endif uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv uint64_t *opCountLoc; // opCount of local rank diff --git a/src/transport/net.cc b/src/transport/net.cc index 7a9bf960a5..73e9bfffc1 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -26,6 +26,9 @@ struct netSendResources { int netDev; int useGdr; char* buffers[LOC_COUNT]; +#ifdef USE_MEMALIGN + char* hostBuffer; +#endif int buffSizes[LOC_COUNT]; void* mhandles[LOC_COUNT]; void** mhandlesProto[NCCL_NUM_PROTOCOLS]; @@ -93,10 +96,24 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } char line[16]; if (resources->buffSizes[LOC_HOSTMEM]) { +#ifdef USE_MEMALIGN + int page_size = getpagesize(); + if (posix_memalign((void **)&resources->hostBuffer, page_size, resources->buffSizes[LOC_HOSTMEM])) { + WARN("Failed to posix_memalign %d bytes", resources->buffSizes[LOC_HOSTMEM]); + return ncclSystemError; + } + CUDACHECK(hipHostRegister(resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], hipHostRegisterMapped)); + CUDACHECK(hipHostGetDevicePointer((void **)resources->buffers+LOC_HOSTMEM, resources->hostBuffer, 0)); +#else NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); +#endif int status[1] = {-1}; line[0]= 0; +#ifdef USE_MEMALIGN + if (!move_pages(0, 1, (void **)&resources->hostBuffer, NULL, status, 0)) +#else if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0)) +#endif sprintf(line, "/MEM%d", status[0]); } @@ -105,6 +122,12 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra for (int p=0; pmhandlesProto[p] = resources->mhandles+protoLoc[p]; send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; +#ifdef USE_MEMALIGN + if (protoLoc[p] == LOC_HOSTMEM) + send->conn.hostBuffs[p] = resources->hostBuffer + offsets[protoLoc[p]]; + else + send->conn.hostBuffs[p] = send->conn.buffs[p]; +#endif offsets[protoLoc[p]] += buffSizes[p]; } @@ -182,7 +205,11 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM])); } if (resources->buffSizes[LOC_HOSTMEM]) { +#ifdef USE_MEMALIGN + NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); +#else NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); +#endif } return ncclSuccess; } @@ -213,7 +240,12 @@ ncclResult_t netSendFree(void* transportResources) { if (resources->buffers[l]) NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l])); } +#ifdef USE_MEMALIGN + CUDACHECK(hipHostUnregister(resources->hostBuffer)); + free(resources->hostBuffer); +#else NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM])); +#endif CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM])); NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); free(resources); @@ -251,7 +283,11 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->state == ncclProxyOpProgress) { int p = args->protocol; int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; +#ifdef USE_MEMALIGN + char* localBuff = args->connector->conn.hostBuffs[p]; +#else char* localBuff = args->connector->conn.buffs[p]; +#endif void* mhandle = *(resources->mhandlesProto[p]); args->idle = 1; if (args->head < args->end) { diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 2bcfeab0c3..879fc821ee 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -291,7 +291,7 @@ struct ncclIbListenComm { int fd; }; -struct ncclIbSendFifo { +struct alignas(64) ncclIbSendFifo { uint64_t addr; int size; uint32_t seq; diff --git a/tools/ib-test/ib_test.cpp b/tools/ib-test/ib_test.cpp index 23f4c73f02..53556f9451 100755 --- a/tools/ib-test/ib_test.cpp +++ b/tools/ib-test/ib_test.cpp @@ -46,7 +46,7 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) { #define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */ #define SLICE_STEPS 4 -#define ITERATIONS 2000 +#define DEFAULT_CYCLES 4000 #define VEGA_GPU_RTC_FREQUENCY 2.5E7 #define ENABLE_VALIDATION #define USE_MEMALIGN @@ -91,6 +91,8 @@ bool runSend = false, runRecv = false; uint64_t send_byte; uint64_t recv_byte; +uint64_t iterations = 1; + __device__ inline __attribute((always_inline)) long long int __rtc64() { @@ -316,6 +318,12 @@ int main(int argc,char* argv[]) CUDACHECK(hipSetDevice(atol(gpu))); } + char *iters = getCmdOption(argv, argv + argc, "-i"); + if (iters) { + iterations = atol(iters); + printf("Running %ld iterations\n", iterations); + } + char *gdr_read = getCmdOption(argv, argv + argc, "-r"); if (gdr_read) { use_gdr_read = atol(gdr_read); @@ -479,29 +487,30 @@ int main(int argc,char* argv[]) *sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0; send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0; hipLaunchKernelGGL(DataSourceKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0, - NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes); + NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes); runSend = true; } else { *recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0; recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0; hipLaunchKernelGGL(DataSinkKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0, - NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes); + NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes); runRecv = true; } - struct timeval tv_start, tv_end; + struct timeval tv_start, tv_end, tv_prev; gettimeofday(&tv_start, NULL); + gettimeofday(&tv_prev, NULL); memset(&sendArgs, 0, sizeof(struct ncclProxyArgs)); sendArgs.head = 0; sendArgs.tail = 0; - sendArgs.end = NCCL_STEPS*ITERATIONS; + sendArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES; sendArgs.sliceSteps = sliceSteps; memset(&recvArgs, 0, sizeof(struct ncclProxyArgs)); recvArgs.head = 0; recvArgs.tail = 0; - recvArgs.end = NCCL_STEPS*ITERATIONS; + recvArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES; recvArgs.sliceSteps = sliceSteps; do { @@ -509,6 +518,22 @@ int main(int argc,char* argv[]) NCCLCHECK(netRecvProxy(&recvArgs)); if (runSend) NCCLCHECK(netSendProxy(&sendArgs)); + + gettimeofday(&tv_end, NULL); + uint64_t timelap = ((uint64_t)(tv_end.tv_sec - tv_prev.tv_sec)*1000*1000 + tv_end.tv_usec - tv_prev.tv_usec); + if (timelap > 100000UL) { + uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec); + if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\r", + sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0, + send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count, + *sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128)); + if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\r", + recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0, + recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count, + *sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128), + *recvErrorCount); + gettimeofday(&tv_prev, NULL); + } } while (runSend || runRecv); CUDACHECK(hipDeviceSynchronize()); @@ -516,12 +541,12 @@ int main(int argc,char* argv[]) gettimeofday(&tv_end, NULL); uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec); - if (send_byte) printf("# Send %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n", - (total_time) ? (double)send_byte/total_time/1000.0 : 0, + if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n", + sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0, send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count, *sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128)); - if (recv_byte) printf("# Recv %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Data Error Counts %ld\n", - (total_time) ? (double)recv_byte/total_time/1000.0 : 0, + if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\n", + recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0, recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count, *sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128), *recvErrorCount);