TransferBench p2p benchmark mode (#444)

* [TransferBench] Adding a p2p benchmark mode * [TransferBench] Switching to using single sync mode by default (USE_SINGLE_SYNC=1) [ROCm/rccl commit: 550d732d6c]
2021-10-21 15:28:16 -06:00
commit b795cc090b
@@ -40,7 +40,7 @@ public:

    useHipCall      = GetEnvVar("USE_HIP_CALL"     , 0);
    useMemset       = GetEnvVar("USE_MEMSET"       , 0);
-    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 0);
+    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 1);
    useInteractive  = GetEnvVar("USE_INTERACTIVE"  , 0);
    combineTiming   = GetEnvVar("COMBINE_TIMING"   , 0);
    showAddr        = GetEnvVar("SHOW_ADDR"        , 0);
@@ -50,24 +50,8 @@ int main(int argc, char **argv)
    exit(0);
  }

-  // Check that Link configuration file can be opened
-  FILE* fp = fopen(argv[1], "r");
-  if (!fp)
-  {
-    printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
-    exit(1);
-  }
-
-  // Check for NUMA library support
-  if (numa_available() == -1)
-  {
-    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
-    exit(1);
-  }
-
  // Collect environment variables / display current run configuration
  EnvVars ev;
-  ev.DisplayEnvVars();

  // Determine number of bytes to run per Link
  // If a non-zero number of bytes is specified, use it
@@ -92,6 +76,30 @@ int main(int argc, char **argv)
  for (auto N : valuesOfN)
    maxN = std::max(maxN, N);

+  // Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
+  if (!strcmp(argv[1], "p2p"))
+  {
+    // Execute peer to peer benchmark mode
+    RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float));
+    exit(0);
+  }
+
+  // Check that Link configuration file can be opened
+  FILE* fp = fopen(argv[1], "r");
+  if (!fp)
+  {
+    printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
+    exit(1);
+  }
+
+  // Check for NUMA library support
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+  ev.DisplayEnvVars();
+
  int const initOffset = ev.byteOffset / sizeof(float);
  std::stack<std::thread> threads;

@@ -381,6 +389,7 @@ void DisplayUsage(char const* cmdName)
  printf("Usage: %s configFile <N>\n", cmdName);

  printf("  configFile: File containing Links to execute (see below for format)\n");
+  printf("              Specifying \"p2p\" as the configFile will execute a peer to peer benchmark\n");
  printf("  N         : (Optional) Number of bytes to transfer per link.\n");
  printf("              If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
  printf("              If 0 is specified, a range of Ns will be benchmarked\n");
@@ -880,8 +889,8 @@ void CheckPages(char* array, size_t numBytes, int targetId)
  if (mistakeCount > 0)
  {
    printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
-    // NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now
-    // exit(1);
+    printf("[ERROR] Ensure up-to-date ROCm is installed\n");
+    exit(1);
  }
 }

@@ -1086,3 +1095,173 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
      link.totalTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
  }
 }
+
+void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
+{
+  // Collect the number of available CPUs/GPUs on this machine
+  int numGpus;
+  HIP_CALL(hipGetDeviceCount(&numGpus));
+  int const numCpus = numa_num_configured_nodes();
+  int const numDevices = numCpus + numGpus;
+
+  // Enable peer to peer for each GPU
+  for (int i = 0; i < numGpus; i++)
+    for (int j = 0; j < numGpus; j++)
+      if (i != j) EnablePeerAccess(i, j);
+
+  printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float));
+  printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink);
+
+  // Perform unidirectional / bidirectional
+  for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
+  {
+    // Print header
+    printf("%sdirectional copy peak bandwidth GB/s\n", isBidirectional ? "Bi" : "Uni");
+    printf("%10s", "D/D");
+    for (int i = 0; i < numCpus; i++)
+      printf("%7s %02d", "CPU", i);
+    for (int i = 0; i < numGpus; i++)
+      printf("%7s %02d", "GPU", i);
+    printf("\n");
+
+    // Loop over all possible src/dst pairs
+    for (int src = 0; src < numDevices; src++)
+    {
+      MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
+      int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
+      printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
+
+      for (int dst = 0; dst < numDevices; dst++)
+      {
+        MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
+        int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
+
+        double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex);
+        if (bandwidth == 0)
+          printf("%10s", "N/A");
+        else
+          printf("%10.2f", bandwidth);
+        fflush(stdout);
+      }
+      printf("\n");
+    }
+    printf("\n");
+  }
+}
+
+double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
+                        MemType srcMemType, int srcIndex,
+                        MemType dstMemType, int dstIndex)
+{
+  Link links[2];
+  int const initOffset = ev.byteOffset / sizeof(float);
+
+  // Skip bidirectional on same device
+  if (isBidirectional && srcMemType == dstMemType && srcIndex == dstIndex) return 0.0f;
+  
+  // Prepare Links
+  links[0].srcMemType = links[0].exeMemType = links[1].dstMemType = srcMemType;
+  links[0].srcIndex   = links[0].exeIndex   = links[1].dstIndex   = srcIndex;
+  links[0].dstMemType = links[1].exeMemType = links[1].srcMemType = dstMemType;
+  links[0].dstIndex   = links[1].exeIndex   = links[1].srcIndex   = dstIndex;
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    AllocateMemory(links[i].srcMemType, links[i].srcIndex, N * sizeof(float) + ev.byteOffset, &links[i].srcMem);
+    AllocateMemory(links[i].dstMemType, links[i].dstIndex, N * sizeof(float) + ev.byteOffset, &links[i].dstMem);
+    links[i].totalTime = 0.0;
+
+    CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].srcMem + initOffset);
+    if (links[i].exeMemType == MEM_GPU)
+    {
+      HIP_CALL(hipDeviceGetAttribute(&links[i].numBlocksToUse, hipDeviceAttributeMultiprocessorCount, links[i].exeIndex));
+      HIP_CALL(hipSetDevice(links[i].exeIndex));
+      HIP_CALL(hipEventCreate(&links[i].startEvent));
+      HIP_CALL(hipEventCreate(&links[i].stopEvent));
+      HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * links[i].numBlocksToUse));
+      HIP_CALL(hipStreamCreate(&links[i].stream));
+
+      size_t assigned = 0;
+      int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse);
+      for (int j = 0; j < links[i].numBlocksToUse; j++)
+      {
+        BlockParam param;
+        int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+        size_t leftover = N - assigned;
+        size_t roundedN = (leftover + 31) / 32;
+        param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+        param.src = links[i].srcMem + assigned + initOffset;
+        param.dst = links[i].dstMem + assigned + initOffset;
+        assigned += param.N;
+
+        HIP_CALL(hipMemcpy(&links[i].blockParam[j], &param, sizeof(BlockParam), hipMemcpyHostToDevice));
+      }
+    }
+    else
+    {
+      links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam));
+      // For CPU-based copy, divded based on the number of child threads
+      size_t assigned = 0;
+      int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink);
+      for (int j = 0; j < ev.numCpuPerLink; j++)
+      {
+        int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+        size_t leftover = N - assigned;
+        size_t roundedN = (leftover + 31) / 32;
+        links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+        links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset;
+        links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset;
+        assigned += links[i].blockParam[j].N;
+      }
+    }
+  }
+
+  std::stack<std::thread> threads;
+
+  // Perform iteration
+  for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
+  {
+    // Perform timed iterations
+    for (int i = 0; i <= isBidirectional; i++)
+      threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i])));
+
+    // Wait for all threads to finish
+    for (int i = 0; i <= isBidirectional; i++)
+    {
+      threads.top().join();
+      threads.pop();
+    }
+  }
+
+  // Validate that each link has transferred correctly
+  for (int i = 0; i <= isBidirectional; i++)
+    CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].dstMem + initOffset);
+
+  // Collect aggregate bandwidth
+  double totalBandwidth = 0;
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations);
+    double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
+    totalBandwidth += linkBandwidthGbs;
+  }
+
+  // Release GPU memory
+  for (int i = 0; i <= isBidirectional; i++)
+  {
+    DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
+    DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
+
+    if (links[i].exeMemType == MEM_GPU)
+      {
+        HIP_CALL(hipEventDestroy(links[i].startEvent));
+        HIP_CALL(hipEventDestroy(links[i].stopEvent));
+        HIP_CALL(hipStreamDestroy(links[i].stream));
+        HIP_CALL(hipFree(links[i].blockParam));
+      }
+      else if (links[i].exeMemType == MEM_CPU)
+      {
+        free(links[i].blockParam);
+      }
+  }
+  return totalBandwidth;
+}
@@ -122,7 +122,10 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
 void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
 void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
-
+void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
+double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
+                        MemType srcMemType, int srcIndex,
+                        MemType dstMemType, int dstIndex);

 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
 std::string GetDesc(MemType srcMemType, int srcIndex,