Upgrading various TransferBench features (#257)

2020-08-19 09:47:19 -06:00
parent a51e4071e3
commit ec9af40fcd
@@ -32,12 +32,11 @@ THE SOFTWARE.
 #include <iostream>
 #include <sstream>
 #include <hip/hip_runtime.h>
+#include <hip/hip_ext.h>
 #include "copy_kernel.h"
 #include "TransferBench.hpp"
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
-#endif

 int main(int argc, char **argv)
 {
@@ -47,12 +46,22 @@ int main(int argc, char **argv)
    printf("Usage: %s configFile <N>\n", argv[0]);
    printf("- configFile: file describing topologies to test\n");
    printf("  Each line should contain a single topology\n");
+    printf("  Either:\n");
+    printf("    Method 1: #Links followed by triplets:\n");
    printf("    L - number of links followed by L white-space separated triples (src, dst, # blocks)\n");
    printf("    For example:\n");
-    printf("      2 0 1 3  1 0 3\n");
+    printf("      2  0 1 3  1 0 3\n");
+    printf("      would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
+    printf("  Or:\n");
+    printf("    Method 2: -#Links #BlocksToUse, followed by (src,dst) pairs\n");
+    printf("    -#Links - (negative) number of links\n");
+    printf("    #BlocksToUse - # of threadblocks/CUs to use per link\n");
+    printf("    Example:\n");
+    printf("    -2 3   0 1    1 0\n");
    printf("      would define 2 links each using 3 threadblocks from GPU0 -> GPU1, and GPU1->GPU0\n");
    printf("- N: (Optional) Number of bytes to transfer per link.\n");
    printf("     If not specified, defaults to 2^28=256MB. Must be a multiple of 128 bytes\n");
+    printf("     If 0 is specified, a range of Ns will be benchmarked\n");
    printf("\n");
    printf("Environment variables:\n");
    printf("======================\n");
@@ -64,27 +73,83 @@ int main(int argc, char **argv)
    printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
    printf(" USE_SLEEP        - Adds a 100ms sleep after sync (for profiling)\n");
    printf(" REUSE_STREAMS    - Re-uses streams instead of creating/destroying per topology\n");
+
+    printf("\nDetected topology:\n");
+    int numDevices;
+    HIP_CALL(hipGetDeviceCount(&numDevices));
+
+    printf("        |");
+    for (int j = 0; j < numDevices; j++)
+      printf(" GPU %02d |", j);
+    printf("\n");
+    for (int j = 0; j <= numDevices; j++)
+      printf("--------+");
+    printf("\n");
+
+    for (int i = 0; i < numDevices; i++)
+    {
+      printf(" GPU %02d |", i);
+      for (int j = 0; j < numDevices; j++)
+      {
+        if (i == j)
+          printf("    -   |");
+        else
+        {
+          uint32_t linkType, hopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+          printf(" %s-%d |",
+                 linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "  HT" :
+                 linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? " QPI" :
+                 linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
+                 linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFB" :
+                 linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "????",
+                 hopCount);
+        }
+      }
+      printf("\n");
+    }
    exit(0);
  }

  // Parse number of bytes to use (or use default if not specified)
+  std::vector<size_t> Nvector;
  size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : (1<<28);
-  size_t N = numBytesPerLink / sizeof(float);
+
  if (numBytesPerLink % 128)
  {
    printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 128\n", numBytesPerLink);
    exit(1);
  }
-  printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
+  if (numBytesPerLink == 0)
+  {
+    printf("Operating on range of sizes\n");
+    for (int N = 256; N <= (1<<27); N *= 2)
+    {
+      int decimationFactor = 1;
+      int delta = std::max(32, N / decimationFactor);
+      int curr = N;
+      while (curr < N * 2)
+      {
+        Nvector.push_back(curr);
+        curr += delta;
+      }
+    }
+  }
+  else
+  {
+    size_t N = numBytesPerLink / sizeof(float);
+    printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
+    Nvector.push_back(N);
+  }

  // Collect environment variables / display current run configuration
-  bool useHipCall = getenv("USE_HIP_CALL");
-  bool useMemset = getenv("USE_MEMSET");
-  bool useCoarseMem = getenv("USE_COARSE_MEM");
-  bool useSingleSync = getenv("USE_SINGLE_SYNC");
+  bool useHipCall     = getenv("USE_HIP_CALL");
+  bool useMemset      = getenv("USE_MEMSET");
+  bool useCoarseMem   = getenv("USE_COARSE_MEM");
+  bool useSingleSync  = getenv("USE_SINGLE_SYNC");
  bool useInteractive = getenv("USE_INTERACTIVE");
-  bool useSleep = getenv("USE_SLEEP");
-  bool reuseStreams = getenv("REUSE_STREAMS");
+  bool useSleep       = getenv("USE_SLEEP");
+  bool reuseStreams   = getenv("REUSE_STREAMS");

  int numWarmups = 3;
  int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
@@ -102,7 +167,7 @@ int main(int argc, char **argv)
      printf("Using DMA copy engines (disable by setting HSA_ENABLE_SDMA=0)\n");
  }
  if (useSingleSync)
-    printf("Synchronizing only once, after all iterations (disables GPU timers)\n");
+    printf("Synchronizing only once, after all iterations\n");
  else
    printf("Synchronizing per iteration  (disable via USE_SINGLE_SYNC)\n");

@@ -133,23 +198,15 @@ int main(int argc, char **argv)

  // Print header
  printf("%*s", MAX_NAME_LEN, "");
-  printf("%*s | ", 8*(numDevices+1), "Bandwidth (GB/s)");
-  printf("%*s", 8*(numDevices+1), "Duration (msec)");
-  printf(" | Overhead\n");
+  printf("%*s | ", 8*(numDevices+1), "GPU-event measured Bandwidth (GB/s)");
+  printf("CPU BW  |    Duration (msec) | Launch\n");
  printf("%-*s", MAX_NAME_LEN, "Configuration");
  for (int i = 0; i < numDevices; i++)
    printf("  GPU %02d", i);
-  printf("   Total");
-  printf(" | ");
-  for (int i = 0; i < numDevices; i++)
-    printf("  GPU %02d", i);
-  printf(" CpuTime");
-  printf(" |   (msec)\n");
+  printf("   Total |  (GB/s) |  Max GPU  CPU-Time | Overhead\n");

  for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
-  printf("=|=");
-  for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
-  printf("=|=========\n");
+  printf("=|=========|====================|=========\n");

  // Read configuration file
  FILE* fp = fopen(argv[1], "r");
@@ -173,267 +230,269 @@ int main(int argc, char **argv)
    int const numLinks = links.size();
    if (numLinks == 0) continue;

-    // Clear counters
-    int linkCount[numDevices];
-    for (int i = 0; i < numDevices; i++)
-      linkCount[i] = 0;
-
-    float* linkSrcMem[numLinks];
-    float* linkDstMem[numLinks];
-    hipStream_t streams[numLinks];
-    hipEvent_t startEvents[numLinks];
-    hipEvent_t stopEvents[numLinks];
-    std::vector<BlockParam> cpuBlockParams[numLinks];
-    BlockParam* gpuBlockParams[numLinks];
-
-    char name[MAX_NAME_LEN+1] = {};
-
-    for (int i = 0; i < numLinks; i++)
+    for (auto N : Nvector)
    {
-      int const src = links[i].srcGpu;
-      int const dst = links[i].dstGpu;
-      if (src < 0 || src >= numDevices ||
-          dst < 0 || dst >= numDevices)
-      {
-        printf("[ERROR] Invalid link (%d to %d). Total devices: %d\n", src, dst, numDevices);
-        exit(1);
-      }
-      snprintf(name + strlen(name), MAX_NAME_LEN, "%d->%d:%d ", src, dst, links[i].numBlocksToUse);
+      // Clear counters
+      int linkCount[numDevices];
+      for (int i = 0; i < numDevices; i++)
+        linkCount[i] = 0;

-      // Enable peer-to-peer access if this is the first time seeing this pair
-      auto linkPair = std::make_pair(src, dst);
-      linkMap[linkPair]++;
-      if (linkMap[linkPair] == 1 && src != dst)
+      float* linkSrcMem[numLinks];
+      float* linkDstMem[numLinks];
+      hipStream_t streams[numLinks];
+      hipEvent_t startEvents[numLinks];
+      hipEvent_t stopEvents[numLinks];
+      std::vector<BlockParam> cpuBlockParams[numLinks];
+      BlockParam* gpuBlockParams[numLinks];
+
+      char name[MAX_NAME_LEN+1] = {};
+
+      for (int i = 0; i < numLinks; i++)
      {
-        int canAccess;
-        HIP_CALL(hipDeviceCanAccessPeer(&canAccess, src, dst));
-        if (!canAccess)
+        int const src = links[i].srcGpu;
+        int const dst = links[i].dstGpu;
+        if (src < 0 || src >= numDevices ||
+            dst < 0 || dst >= numDevices)
        {
-          printf("[ERROR] Unable to enable peer access between device %d and %d\n", src, dst);
+          printf("[ERROR] Invalid link (%d to %d). Total devices: %d\n", src, dst, numDevices);
          exit(1);
        }
-        HIP_CALL(hipSetDevice(src));
-        HIP_CALL(hipDeviceEnablePeerAccess(dst, 0));
-      }
+        snprintf(name + strlen(name), MAX_NAME_LEN, "%d->%d:%d ", src, dst, links[i].numBlocksToUse);

-      // Allocate GPU memory on source GPU / streams / events
-      HIP_CALL(hipSetDevice(src));
-      if (reuseStreams)
-      {
-        // Create new stream if necessary
-        if (streamCache[src].size() <= linkCount[src])
+        // Enable peer-to-peer access if this is the first time seeing this pair
+        auto linkPair = std::make_pair(src, dst);
+        linkMap[linkPair]++;
+        if (linkMap[linkPair] == 1 && src != dst)
        {
-          streamCache[src].resize(linkCount[src] + 1);
-          HIP_CALL(hipStreamCreate(&streamCache[src][linkCount[src]]));
+          int canAccess;
+          HIP_CALL(hipDeviceCanAccessPeer(&canAccess, src, dst));
+          if (!canAccess)
+          {
+            printf("[ERROR] Unable to enable peer access between device %d and %d\n", src, dst);
+            exit(1);
+          }
+          HIP_CALL(hipSetDevice(src));
+          HIP_CALL(hipDeviceEnablePeerAccess(dst, 0));
        }
-        streams[i] = streamCache[src][linkCount[src]];
-      }
-      else
-      {
-        HIP_CALL(hipStreamCreate(&streams[i]));
-      }
-      HIP_CALL(hipEventCreate(&startEvents[i]));
-      HIP_CALL(hipEventCreate(&stopEvents[i]));
-      HIP_CALL(hipMalloc((void **)&linkSrcMem[i], numBytesPerLink));
-      HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
-      CheckOrFill(N, linkSrcMem[i], false, useMemset, useHipCall);

-      // Count # of links / total blocks each GPU will be working on
-      linkCount[src]++;
+        // Allocate GPU memory on source GPU / streams / events
+        HIP_CALL(hipSetDevice(src));
+        if (reuseStreams)
+        {
+          // Create new stream if necessary
+          if (streamCache[src].size() <= linkCount[src])
+          {
+            streamCache[src].resize(linkCount[src] + 1);
+            HIP_CALL(hipStreamCreate(&streamCache[src][linkCount[src]]));
+          }
+          streams[i] = streamCache[src][linkCount[src]];
+        }
+        else
+        {
+          HIP_CALL(hipStreamCreate(&streams[i]));
+        }
+        HIP_CALL(hipEventCreate(&startEvents[i]));
+        HIP_CALL(hipEventCreate(&stopEvents[i]));
+        HIP_CALL(hipMalloc((void **)&linkSrcMem[i], N * sizeof(float)));
+        HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
+        CheckOrFill(N, linkSrcMem[i], false, useMemset, useHipCall);

-      // Allocate GPU memory on destination GPU
-      HIP_CALL(hipSetDevice(links[i].dstGpu));
-      if (useCoarseMem)
-        HIP_CALL(hipMalloc((void**)&linkDstMem[i], numBytesPerLink));
-      else
-        HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], numBytesPerLink, hipDeviceMallocFinegrained));
+        // Count # of links / total blocks each GPU will be working on
+        linkCount[src]++;

-      // Each block needs to know src/dst pointers and how many elements to transfer
-      // Figure out the sub-array each block does for this link
-      // NOTE: Have each sub-array to work on multiple of 32-floats (128-bytes),
-      //       but divide as evenly as possible
-      // NOTE: N is always a multiple of 32
-      int blocksWithExtra = (N / 32) % links[i].numBlocksToUse;
-      int perBlockBaseN   = (N / 32) / links[i].numBlocksToUse * 32;
-      for (int j = 0; j < links[i].numBlocksToUse; j++)
-      {
-        BlockParam param;
-        param.N   = perBlockBaseN + ((j < blocksWithExtra) ? 32 : 0);
-        param.src = linkSrcMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
-                                                            j : blocksWithExtra) * 32);
-        param.dst = linkDstMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
-                                                            j : blocksWithExtra) * 32);
-        cpuBlockParams[i].push_back(param);
+        // Allocate GPU memory on destination GPU
+        HIP_CALL(hipSetDevice(links[i].dstGpu));
+        if (useCoarseMem)
+          HIP_CALL(hipMalloc((void**)&linkDstMem[i], N * sizeof(float)));
+        else
+          HIP_CALL(hipExtMallocWithFlags((void**)&linkDstMem[i], N * sizeof(float), hipDeviceMallocFinegrained));
+
+        // Each block needs to know src/dst pointers and how many elements to transfer
+        // Figure out the sub-array each block does for this link
+        // NOTE: Have each sub-array to work on multiple of 32-floats (128-bytes),
+        //       but divide as evenly as possible
+        // NOTE: N is always a multiple of 32
+        int blocksWithExtra = (N / 32) % links[i].numBlocksToUse;
+        int perBlockBaseN   = (N / 32) / links[i].numBlocksToUse * 32;
+        for (int j = 0; j < links[i].numBlocksToUse; j++)
+        {
+          BlockParam param;
+          param.N   = perBlockBaseN + ((j < blocksWithExtra) ? 32 : 0);
+          param.src = linkSrcMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
+                                                              j : blocksWithExtra) * 32);
+          param.dst = linkDstMem[i] + ((j * perBlockBaseN) + ((j < blocksWithExtra) ?
+                                                              j : blocksWithExtra) * 32);
+          cpuBlockParams[i].push_back(param);
+        }
+
+        HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
+                           sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice));
      }

-      HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
-                         sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice));
-    }
+      // Launch kernels (warmup iterations are not counted)
+      double totalCpuTime = 0;
+      double totalGpuTime[numDevices];
+      for (int i = 0; i < numDevices; i++) totalGpuTime[i] = 0.0;

-    // Launch kernels (warmup iterations are not counted)
-    double totalCpuTime = 0;
-    double totalGpuTime[numDevices];
-    for (int i = 0; i < numDevices; i++) totalGpuTime[i] = 0.0;
-
-    for (int iteration = -numWarmups; iteration < numIterations; iteration++)
-    {
-      if (useInteractive && iteration == 0)
+      for (int iteration = -numWarmups; iteration < numIterations; iteration++)
      {
-        printf("Hit <Enter> to continue: ");
+        if (useInteractive && iteration == 0)
+        {
+          printf("Hit <Enter> to continue: ");
+          scanf("%*c");
+          printf("\n");
+        }
+
+        auto cpuStart = std::chrono::high_resolution_clock::now();
+
+#pragma omp parallel for num_threads(numLinks)
+        for (int i = 0; i < numLinks; i++)
+        {
+          HIP_CALL(hipSetDevice(links[i].srcGpu));
+
+          hipEvent_t startEvent = nullptr;
+          hipEvent_t stopEvent = nullptr;
+          if (!useSingleSync || iteration == 0)
+            startEvent = startEvents[i];
+          if (!useSingleSync || iteration == numIterations - 1)
+            stopEvent = stopEvents[i];
+
+          if (useHipCall)
+          {
+            if (startEvent != nullptr)
+              HIP_CALL(hipEventRecord(startEvent, streams[i]));
+            if (useMemset)
+            {
+              HIP_CALL(hipMemsetAsync(linkDstMem[i], 42, N * sizeof(float), streams[i]));
+            }
+            else
+            {
+              HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
+                                      N * sizeof(float), hipMemcpyDeviceToDevice,
+                                      streams[i]));
+            }
+            if (stopEvent != nullptr)
+              HIP_CALL(hipEventRecord(stopEvent, streams[i]));
+          }
+          else
+          {
+            hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel,
+                                  dim3(links[i].numBlocksToUse, 1, 1),
+                                  dim3(BLOCKSIZE, 1, 1),
+                                  0,
+                                  streams[i],
+                                  startEvent,
+                                  stopEvent,
+                                  0,
+                                  gpuBlockParams[i]);
+          }
+        }
+
+        // Synchronize per iteration, unless in single sync mode, in which case
+        // synchronize during last warmup / last actual iteration
+        if (!useSingleSync || iteration == -1  || iteration == numIterations - 1)
+        {
+          for (int i = 0; i < numLinks; i++)
+            hipStreamSynchronize(streams[i]);
+        }
+
+        auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+        double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+        if (useSleep) usleep(100000);
+
+        if (iteration >= 0)
+        {
+          totalCpuTime += deltaSec;
+
+          for (int i = 0; i < numDevices; i++)
+          {
+            // Collect GPU information only if this is the last iteration for single sync mode
+            if (useSingleSync && iteration != numIterations - 1)
+            {
+              totalGpuTime[i] = 0.00;
+            }
+            else
+            {
+              // Multiple links running on the same device may be running simultaneously
+              // so try to figure out the first/last event across all links
+              float maxTime = 0.0f;
+              for (int j = 0; j < numLinks; j++)
+              {
+                if (links[j].srcGpu != i) continue;
+                for (int k = 0; k < numLinks; k++)
+                {
+                  if (links[k].srcGpu != i) continue;
+
+                  float gpuDeltaMsec;
+                  HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
+                  maxTime = std::max(maxTime, gpuDeltaMsec);
+                }
+              }
+              totalGpuTime[i] += maxTime / 1000.0;
+            }
+          }
+        }
+      }
+
+      if (useInteractive)
+      {
+        printf("Transfers complete. Hit <Enter> to continue: ");
        scanf("%*c");
        printf("\n");
      }

-      auto cpuStart = std::chrono::high_resolution_clock::now();
-
-      #pragma omp parallel for num_threads(numLinks)
+      // Validate that each link has transferred correctly
      for (int i = 0; i < numLinks; i++)
+        CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
+
+      // Report timings
+      double totalGpuBandwidth = 0;
+      snprintf(name + strlen(name), MAX_NAME_LEN, "[%lu] ", N * sizeof(float));
+
+      printf("%-*s", MAX_NAME_LEN, name);
+      for (int i = 0; i < numDevices; i++)
      {
-        HIP_CALL(hipSetDevice(links[i].srcGpu));
-
-        if (!useSingleSync || iteration == 0)
-          HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
-
-        if (useHipCall)
+        if (linkCount[i] == 0)
        {
-          if (useMemset)
-          {
-            HIP_CALL(hipMemsetAsync(linkDstMem[i], 42, numBytesPerLink, streams[i]));
-          }
-          else
-          {
-            HIP_CALL(hipMemcpyAsync(linkDstMem[i], linkSrcMem[i],
-                                    numBytesPerLink, hipMemcpyDeviceToDevice,
-                                    streams[i]));
-          }
+          printf("%8.3f", 0.0f);
        }
        else
        {
-          if (useMemset)
-          {
-            hipLaunchKernelGGL(MemsetKernel,
-                               dim3(links[i].numBlocksToUse, 1, 1),
-                               dim3(BLOCKSIZE, 1, 1),
-                               0,
-                               streams[i],
-                               gpuBlockParams[i]);
-          }
-          else
-          {
-            hipLaunchKernelGGL(CopyKernel,
-                               dim3(links[i].numBlocksToUse, 1, 1),
-                               dim3(BLOCKSIZE, 1, 1),
-                               0,
-                               streams[i],
-                               gpuBlockParams[i]);
-          }
-        }
-        if (!useSingleSync || iteration == numIterations - 1)
-          HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
-      }
-
-      // Synchronize per iteration, unless in single sync mode, in which case
-      // synchronize during last warmup / last actual iteration
-      if (!useSingleSync || iteration == -1  || iteration == numIterations - 1)
-      {
-        for (int i = 0; i < numLinks; i++)
-          hipStreamSynchronize(streams[i]);
-      }
-
-      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
-      double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
-      if (useSleep) usleep(100000);
-
-      if (iteration >= 0)
-      {
-        totalCpuTime += deltaSec;
-
-        for (int i = 0; i < numDevices; i++)
-        {
-          // Collect GPU information only if this is the last iteration for single sync mode
-          if (useSingleSync && iteration != numIterations - 1)
-          {
-            totalGpuTime[i] = 0.00;
-          }
-          else
-          {
-            // Multiple links running on the same device may be running simultaneously
-            // so try to figure out the first/last event across all links
-            float maxTime = 0.0f;
-            for (int j = 0; j < numLinks; j++)
-            {
-              if (links[j].srcGpu != i) continue;
-              for (int k = 0; k < numLinks; k++)
-              {
-                if (links[k].srcGpu != i) continue;
-
-                float gpuDeltaMsec;
-                HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[j], stopEvents[k]));
-                maxTime = std::max(maxTime, gpuDeltaMsec);
-              }
-            }
-            totalGpuTime[i] += maxTime / 1000.0;
-          }
+          totalGpuTime[i] /= (1.0 * numIterations);
+          double linkBandwidth = (linkCount[i] * N * sizeof(float) / 1.0E9) / totalGpuTime[i];
+          printf("%8.3f", linkBandwidth);
+          totalGpuBandwidth += linkBandwidth;
        }
      }
-    }
+      // Print off total bandwidth
+      totalCpuTime /= numIterations;
+      printf("%8.3f", totalGpuBandwidth);
+      printf(" |");

-    if (useInteractive)
-    {
-      printf("Transfers complete. Hit <Enter> to continue: ");
-      scanf("%*c");
-      printf("\n");
-    }
-
-    // Validate that each link has transferred correctly
-    for (int i = 0; i < numLinks; i++)
-      CheckOrFill(N, linkDstMem[i], true, useMemset, useHipCall);
-
-    // Report timings
-    printf("%-*s", MAX_NAME_LEN, name);
-    for (int i = 0; i < numDevices; i++)
-    {
-      if (linkCount[i] == 0)
+      double maxGpuTime = 0.0;
+      for (int i = 0; i < numDevices; i++)
      {
-        printf("%8.3f", 0.0f);
+        if (linkCount[i] != 0)
+          maxGpuTime = std::max(maxGpuTime, totalGpuTime[i]);
      }
-      else
-      {
-        totalGpuTime[i] /= (1.0 * numIterations);
-        printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
-      }
-    }
-    // Print off bandwidth (based on CPU wall-time timer)
-    totalCpuTime /= numIterations;
-    printf("%8.3f", (numLinks * numBytesPerLink / 1.0E9) / totalCpuTime);
-    printf(" | ");
+      printf("%8.3f | %8.3f  %8.3f | %6.2f%%\n",
+             ((numLinks * N * sizeof(float) / 1.0E9) / totalCpuTime),
+             maxGpuTime * 1000.0f,
+             totalCpuTime * 1000.0f,
+             (totalCpuTime - maxGpuTime) / totalCpuTime * 100.0f);

-    double maxGpuTime = 0;
-    for (int i = 0; i < numDevices; i++)
-    {
-      if (linkCount[i] == 0)
+      // Release GPU memory
+      for (int i = 0; i < numLinks; i++)
      {
-        printf("%8.3f", 0.0f);
-      }
-      else
-      {
-        printf("%8.3f", totalGpuTime[i] * 1000.0f);
-        maxGpuTime = std::max(maxGpuTime, totalGpuTime[i]);
-      }
-    }
-    printf("%8.3f | %8.3f\n", totalCpuTime * 1000.0f, (totalCpuTime - maxGpuTime) * 1000.0f);
-
-    // Release GPU memory
-    for (int i = 0; i < numLinks; i++)
-    {
-      HIP_CALL(hipFree(linkSrcMem[i]));
-      HIP_CALL(hipFree(linkDstMem[i]));
-      HIP_CALL(hipFree(gpuBlockParams[i]));
-      if (!reuseStreams)
-        HIP_CALL(hipStreamDestroy(streams[i]));
-      HIP_CALL(hipEventDestroy(startEvents[i]));
-      HIP_CALL(hipEventDestroy(stopEvents[i]));
+        HIP_CALL(hipFree(linkSrcMem[i]));
+        HIP_CALL(hipFree(linkDstMem[i]));
+        HIP_CALL(hipFree(gpuBlockParams[i]));
+        if (!reuseStreams)
+          HIP_CALL(hipStreamDestroy(streams[i]));
+        HIP_CALL(hipEventDestroy(startEvents[i]));
+        HIP_CALL(hipEventDestroy(stopEvents[i]));

+      }
    }
  }
  fclose(fp);
@@ -447,9 +506,7 @@ int main(int argc, char **argv)

  // Print link information
  for (int i = 0; i < MAX_NAME_LEN + (8 * (numDevices + 1)); i++) printf("=");
-  printf("=|=");
-  for (int i = 0; i < (8 * (numDevices + 1)); i++) printf("=");
-  printf("=|=========\n");
+  printf("=|=========|====================|=========\n");
  printf("Link topology:\n");
  uint32_t linkType;
  uint32_t hopCount;
@@ -94,12 +94,31 @@ void ParseLinks(char const* line, std::vector<Link>& links)
    iss.clear();
    iss.str(line);
    iss >> numLinks;
-    links.resize(numLinks);
    if (iss.fail()) return;

-
-    for (int i = 0; i < numLinks; i++)
+    if (numLinks > 0)
+    {
+      // Method 1: Take in triples (src, dst, # blocks to use)
+      links.resize(numLinks);
+      for (int i = 0; i < numLinks; i++)
        iss >> links[i].srcGpu >> links[i].dstGpu >> links[i].numBlocksToUse;
+
+    }
+    else
+    {
+      // Method 2: Read common # blocks to use, then read (src, dst) doubles
+      int numBlocksToUse;
+      iss >> numBlocksToUse;
+      if (iss.fail()) return;
+
+      numLinks *= -1;
+      links.resize(numLinks);
+      for (int i = 0; i < numLinks; i++)
+      {
+        iss >> links[i].srcGpu >> links[i].dstGpu;
+        links[i].numBlocksToUse = numBlocksToUse;
+      }
+    }
 }

 // Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches