Use posix_memalign for network buffer allocation on host memory (#221)

* Use posix_memalign for network buffer allocation on host memory * ib-test: add ability to specify run iterations * ib-test: define iterations as multiple of default cycles * Add checking to posix_memalign return value
2020-06-22 13:06:25 -07:00
@@ -169,7 +169,7 @@ endforeach()

 if("${HIP_COMPILER}" MATCHES "clang")
  foreach(target ${AMDGPU_TARGETS})
-    target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc)
+    target_compile_options(rccl PRIVATE --amdgpu-target=${target} PRIVATE -fgpu-rdc -DUSE_MEMALIGN)
  endforeach()
  target_link_libraries(rccl PRIVATE -fgpu-rdc)
  target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
@@ -20,6 +20,7 @@
 #endif
 #include "xml.h"
 #include "cpuset.h"
+#include <numa.h>

 #define BUSID_SIZE (sizeof("0000:00:00.0"))
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
@@ -733,6 +734,11 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+    int ret = numa_run_on_node(cpu->id);
+    if (ret != 0)
+      INFO(NCCL_INIT, "Failed to run on numa node %ld", cpu->id);
+    else
+      INFO(NCCL_INIT, "Thread is set to run on numa node %ld", cpu->id);
  }
  return ncclSuccess;
 }
@@ -92,6 +92,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 struct ncclConnInfo {
  // Regular comm mechanism
  char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
+#ifdef USE_MEMALIGN
+  char *hostBuffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
+#endif
  uint64_t *tail;     // Local for recv, remote for send
  uint64_t *head;     // Local for send, remote for recv
  uint64_t *opCountLoc; // opCount of local rank
@@ -26,6 +26,9 @@ struct netSendResources {
  int netDev;
  int useGdr;
  char* buffers[LOC_COUNT];
+#ifdef USE_MEMALIGN
+  char* hostBuffer;
+#endif
  int buffSizes[LOC_COUNT];
  void* mhandles[LOC_COUNT];
  void** mhandlesProto[NCCL_NUM_PROTOCOLS];
@@ -93,10 +96,24 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  }
  char line[16];
  if (resources->buffSizes[LOC_HOSTMEM]) {
+#ifdef USE_MEMALIGN
+    int page_size = getpagesize();
+    if (posix_memalign((void **)&resources->hostBuffer, page_size, resources->buffSizes[LOC_HOSTMEM])) {
+      WARN("Failed to posix_memalign %d bytes", resources->buffSizes[LOC_HOSTMEM]);
+      return ncclSystemError;
+    }
+    CUDACHECK(hipHostRegister(resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], hipHostRegisterMapped));
+    CUDACHECK(hipHostGetDevicePointer((void **)resources->buffers+LOC_HOSTMEM, resources->hostBuffer, 0));
+#else
    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+#endif
    int status[1] = {-1};
    line[0]= 0;
+#ifdef USE_MEMALIGN
+    if (!move_pages(0, 1, (void **)&resources->hostBuffer, NULL, status, 0))
+#else
    if (!move_pages(0, 1, (void **)resources->buffers+LOC_HOSTMEM, NULL, status, 0))
+#endif
      sprintf(line, "/MEM%d", status[0]);
  }

@@ -105,6 +122,12 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
    send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+#ifdef USE_MEMALIGN
+    if (protoLoc[p] == LOC_HOSTMEM)
+      send->conn.hostBuffs[p] = resources->hostBuffer + offsets[protoLoc[p]];
+    else
+      send->conn.hostBuffs[p] = send->conn.buffs[p];
+#endif
    offsets[protoLoc[p]] += buffSizes[p];
  }

@@ -182,7 +205,11 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
  }
  if (resources->buffSizes[LOC_HOSTMEM]) {
+#ifdef USE_MEMALIGN
+    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->hostBuffer, resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+#else
    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+#endif
  }
  return ncclSuccess;
 }
@@ -213,7 +240,12 @@ ncclResult_t netSendFree(void* transportResources) {
    if (resources->buffers[l])
      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
  }
+#ifdef USE_MEMALIGN
+  CUDACHECK(hipHostUnregister(resources->hostBuffer));
+  free(resources->hostBuffer);
+#else
  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+#endif
  CUDACHECK(hipFree(resources->buffers[LOC_DEVMEM]));
  NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
  free(resources);
@@ -251,7 +283,11 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpProgress) {
    int p = args->protocol;
    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+#ifdef USE_MEMALIGN
+    char* localBuff = args->connector->conn.hostBuffs[p];
+#else
    char* localBuff = args->connector->conn.buffs[p];
+#endif
    void* mhandle = *(resources->mhandlesProto[p]);
    args->idle = 1;
    if (args->head < args->end) {
@@ -291,7 +291,7 @@ struct ncclIbListenComm {
  int fd;
 };

-struct ncclIbSendFifo {
+struct alignas(64) ncclIbSendFifo {
  uint64_t addr;
  int      size;
  uint32_t seq;
@@ -46,7 +46,7 @@ bool cmdOptionExists(char** begin, char** end, const std::string& option) {

 #define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
 #define SLICE_STEPS 4
-#define ITERATIONS 2000
+#define DEFAULT_CYCLES 4000
 #define VEGA_GPU_RTC_FREQUENCY 2.5E7
 #define ENABLE_VALIDATION
 #define USE_MEMALIGN
@@ -91,6 +91,8 @@ bool runSend = false, runRecv = false;
 uint64_t send_byte;
 uint64_t recv_byte;

+uint64_t iterations = 1;
+
 __device__
 inline  __attribute((always_inline))
 long long int __rtc64() {
@@ -316,6 +318,12 @@ int main(int argc,char* argv[])
    CUDACHECK(hipSetDevice(atol(gpu)));
  }

+  char *iters = getCmdOption(argv, argv + argc, "-i");
+  if (iters) {
+    iterations = atol(iters);
+    printf("Running %ld iterations\n", iterations);
+  }
+
  char *gdr_read = getCmdOption(argv, argv + argc, "-r");
  if (gdr_read) {
    use_gdr_read = atol(gdr_read);
@@ -479,29 +487,30 @@ int main(int argc,char* argv[])
    *sendHead = 0; *sendTail = 0; *sourceCycle = 0; *sourceBytes = 0;
    send_sizes = 0; send_bw_cumulative = 0; send_bw_count =0; send_byte = 0;
    hipLaunchKernelGGL(DataSourceKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
-      NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
+      NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_read ? sendDevBuffer : d_sendHostBuffer), sendHead, sendTail, sourceCycle, sourceBytes);
    runSend = true;
  } else {
    *recvHead = 0; *recvTail = 0; *recvErrorCount = 0; *sinkCycle = 0, *sinkBytes = 0;
    recv_sizes = 0; recv_bw_cumulative = 0; recv_bw_count =0; recv_byte = 0;
    hipLaunchKernelGGL(DataSinkKernel, dim3(1, 1, 1), dim3(256, 1, 1), 0, 0,
-      NCCL_STEPS*ITERATIONS, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
+      NCCL_STEPS*iterations*DEFAULT_CYCLES, (Pack128 *)(use_gdr_write ? recvDevBuffer : d_recvHostBuffer), recvHead, recvTail, recvErrorCount, sinkCycle, sinkBytes);
    runRecv = true;
  }

-  struct timeval tv_start, tv_end;
+  struct timeval tv_start, tv_end, tv_prev;
  gettimeofday(&tv_start, NULL);
+  gettimeofday(&tv_prev, NULL);

  memset(&sendArgs, 0, sizeof(struct ncclProxyArgs));
  sendArgs.head = 0;
  sendArgs.tail = 0;
-  sendArgs.end = NCCL_STEPS*ITERATIONS;
+  sendArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
  sendArgs.sliceSteps = sliceSteps;

  memset(&recvArgs, 0, sizeof(struct ncclProxyArgs));
  recvArgs.head = 0;
  recvArgs.tail = 0;
-  recvArgs.end = NCCL_STEPS*ITERATIONS;
+  recvArgs.end = NCCL_STEPS*iterations*DEFAULT_CYCLES;
  recvArgs.sliceSteps = sliceSteps;

  do {
@@ -509,6 +518,22 @@ int main(int argc,char* argv[])
      NCCLCHECK(netRecvProxy(&recvArgs));
    if (runSend)
      NCCLCHECK(netSendProxy(&sendArgs));
+
+    gettimeofday(&tv_end, NULL);
+    uint64_t timelap = ((uint64_t)(tv_end.tv_sec - tv_prev.tv_sec)*1000*1000 + tv_end.tv_usec - tv_prev.tv_usec);
+    if (timelap > 100000UL) {
+      uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);
+      if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\r",
+        sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
+        send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
+        *sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
+      if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\r",
+        recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
+        recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
+        *sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
+        *recvErrorCount);
+      gettimeofday(&tv_prev, NULL);
+    }
  } while (runSend || runRecv);

  CUDACHECK(hipDeviceSynchronize());
@@ -516,12 +541,12 @@ int main(int argc,char* argv[])
  gettimeofday(&tv_end, NULL);
  uint64_t total_time = ((uint64_t)(tv_end.tv_sec - tv_start.tv_sec)*1000*1000 + tv_end.tv_usec - tv_start.tv_usec);

-  if (send_byte) printf("# Send %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
-    (total_time) ? (double)send_byte/total_time/1000.0 : 0,
+  if (send_byte) printf("# Send %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes)\n",
+    sendArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)send_byte/total_time/1000.0 : 0,
    send_byte, total_time, send_bw_count ? (float)send_bw_cumulative/send_bw_count : 0, send_bw_count,
    *sourceCycle ? (double)(*sourceBytes)*sizeof(Pack128)/((double)(*sourceCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sourceBytes*sizeof(Pack128));
-  if (recv_byte) printf("# Recv %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Data Error Counts %ld\n",
-    (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
+  if (recv_byte) printf("# Recv %3ld%% %6.2f GB/s (%ld bytes %ld us) Proxy %6.2f GB/s (%d mmts) Kernel %6.2f GB/s (%ld bytes) Errors %ld\n",
+    recvArgs.head*100/(NCCL_STEPS*iterations*DEFAULT_CYCLES), (total_time) ? (double)recv_byte/total_time/1000.0 : 0,
    recv_byte, total_time, recv_bw_count ? (float)recv_bw_cumulative/recv_bw_count : 0, recv_bw_count,
    *sinkCycle ? (double)(*sinkBytes)*sizeof(Pack128)/((double)(*sinkCycle)/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, *sinkBytes*sizeof(Pack128),
    *recvErrorCount);