From ae0c4092c7c0149391dfc30e6dcbc3ea68726bfa Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Fri, 11 Dec 2020 10:21:14 -0700
Subject: [PATCH] [TransferBench] Adding ability to perform CPU-executed
 copies, various upgrades (#309)

* Adding CPU based execution, fixing typos, adding Fine-grained mem
* Exposing sampling factor when generating range of data sizes
* Refactoring how Links are launched, now once per thread
* Documentation updates
---
 tools/TransferBench/EnvVars.hpp       | 137 ++++
 tools/TransferBench/Makefile          |   2 +-
 tools/TransferBench/TransferBench.cpp | 988 +++++++++++++++-----------
 tools/TransferBench/TransferBench.hpp | 104 ++-
 tools/TransferBench/example.cfg       |  68 +-
 5 files changed, 802 insertions(+), 497 deletions(-)
 create mode 100644 tools/TransferBench/EnvVars.hpp

diff --git a/tools/TransferBench/EnvVars.hpp b/tools/TransferBench/EnvVars.hpp
new file mode 100644
index 0000000000..258f0ee33c
--- /dev/null
+++ b/tools/TransferBench/EnvVars.hpp
@@ -0,0 +1,137 @@
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+// This class manages environment variable that affect TransferBench
+class EnvVars
+{
+public:
+  // Default configuration values
+  int const DEFAULT_NUM_WARMUPS      =  3;
+  int const DEFAULT_NUM_ITERATIONS   = 10;
+  int const DEFAULT_SAMPLING_FACTOR  =  1;
+  int const DEFAULT_NUM_CPU_PER_LINK =  4;
+
+  // Environment variables
+  int useHipCall;      // Use hipMemcpy/hipMemset instead of custom shader kernels
+  int useMemset;       // Perform a memset instead of a copy (ignores source memory)
+  int useSingleSync;   // Perform synchronization only once after all iterations instead of per iteration
+  int useInteractive;  // Pause for user-input before starting transfer loop
+  int useSleep;        // Adds a 100ms sleep after each synchronization
+  int combineTiming;   // Combines the timing with kernel launch
+  int showAddr;        // Print out memory addresses for each Link
+  int outputToCsv;     // Output in CSV format
+  int byteOffset;      // Byte-offset for memory allocations
+  int numWarmups;      // Number of un-timed warmup iterations to perform
+  int numIterations;   // Number of timed iterations to perform
+  int samplingFactor;  // Affects how many different values of N are generated (when N set to 0)
+  int numCpuPerLink;   // Number of CPU child threads to use per CPU link
+
+  // Constructor that collects values
+  EnvVars()
+  {
+    useHipCall      = GetEnvVar("USE_HIP_CALL"     , 0);
+    useMemset       = GetEnvVar("USE_MEMSET"       , 0);
+    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 0);
+    useInteractive  = GetEnvVar("USE_INTERACTIVE"  , 0);
+    combineTiming   = GetEnvVar("COMBINE_TIMING"   , 0);
+    showAddr        = GetEnvVar("SHOW_ADDR"        , 0);
+    outputToCsv     = GetEnvVar("OUTPUT_TO_CSV"    , 0);
+    byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
+    numWarmups      = GetEnvVar("NUM_WARMUPS"      , DEFAULT_NUM_WARMUPS);
+    numIterations   = GetEnvVar("NUM_ITERATIONS"   , DEFAULT_NUM_ITERATIONS);
+    samplingFactor  = GetEnvVar("SAMPLING_FACTOR"  , DEFAULT_SAMPLING_FACTOR);
+    numCpuPerLink   = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
+
+    // Perform some basic validation
+    if (byteOffset % sizeof(float))
+    {
+      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
+      exit(1);
+    }
+    if (numWarmups < 0)
+    {
+      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
+      exit(1);
+    }
+    if (numIterations <= 0)
+    {
+      printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
+      exit(1);
+    }
+    if (samplingFactor < 1)
+    {
+      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
+      exit(1);
+    }
+    if (numCpuPerLink < 1)
+    {
+      printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
+      exit(1);
+    }
+  }
+
+  // Display info on the env vars that can be used
+  static void DisplayUsage()
+  {
+    printf("Environment variables:\n");
+    printf("======================\n");
+    printf(" USE_HIP_CALL       - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
+    printf(" USE_MEMSET         - Perform a memset instead of a copy (ignores source memory)\n");
+    printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
+    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
+    printf(" COMBINE_TIMING     - Combines timing with launch (potentially lower timing overhead)\n");
+    printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
+    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
+    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
+    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
+    printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+    printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
+    printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
+  }
+
+  // Display env var settings
+  void DisplayEnvVars() const
+  {
+    if (!outputToCsv)
+    {
+      printf("Run configuration\n");
+      printf("=====================================================\n");
+      printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
+             useHipCall ? "HIP functions" : "custom kernels");
+      printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
+             useMemset ? "memset" : "memcopy");
+      if (useHipCall && !useMemset)
+      {
+        char* env = getenv("HSA_ENABLE_SDMA");
+        printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
+               (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+      }
+      printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
+             useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
+      printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
+             useInteractive ? "interactive" : "non-interactive");
+      printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
+             combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
+      printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
+             showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
+      printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
+             outputToCsv ? "CSV" : "console");
+      printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
+      printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
+      printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
+      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
+      printf("\n");
+    }
+  };
+
+private:
+  // Helper function that gets parses environment variable or sets to default value
+  int GetEnvVar(std::string const varname, int defaultValue)
+  {
+    if (getenv(varname.c_str()))
+      return atoi(getenv(varname.c_str()));
+    return defaultValue;
+  }
+};
+
+#endif
diff --git a/tools/TransferBench/Makefile b/tools/TransferBench/Makefile
index af0b4597b7..c51bd47b4c 100644
--- a/tools/TransferBench/Makefile
+++ b/tools/TransferBench/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc
 
 EXE=TransferBench
-CXXFLAGS = -O3 -I../../src/include -I.
+CXXFLAGS = -O3 -I../../src/include -I. -lnuma
 
 all: $(EXE)
 
diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index 531589695d..f6d90fdb29 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -24,11 +24,13 @@ THE SOFTWARE.
 // on the same node
 
 #include "TransferBench.hpp"
+#include <numa.h>
+#include <numaif.h>
+#include <stack>
+#include <thread>
 
 // Simple configuration parameters
-size_t const DEFAULT_BYTES_PER_LINK = (1<<26);
-int    const DEFAULT_NUM_WARMUPS    = 3;
-int    const DEFAULT_NUM_ITERATIONS = 10;
+size_t const DEFAULT_BYTES_PER_LINK = (1<<26);  // Amount of data transferred per Link
 
 int main(int argc, char **argv)
 {
@@ -41,130 +43,13 @@ int main(int argc, char **argv)
   }
 
   // If a negative value is listed for N, generate a comprehensive config file for this node
-  if (argc > 2 && atoi(argv[2]) < 0)
+  if (argc > 2 && atoll(argv[2]) < 0)
   {
     GenerateConfigFile(argv[1], -1*atoi(argv[2]));
     exit(0);
   }
 
-  // Collect environment variables / display current run configuration
-  bool useHipCall      = getenv("USE_HIP_CALL");       // Use hipMemcpy/hipMemset instead of custom shader kernels
-  bool useMemset       = getenv("USE_MEMSET");         // Perform a memset instead of a copy (ignores source memory)
-  bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM");  // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
-  bool useSingleSync   = getenv("USE_SINGLE_SYNC");    // Perform synchronization only once after all iterations instead of per iteration
-  bool useInteractive  = getenv("USE_INTERACTIVE");    // Pause for user-input before starting transfer loop
-  bool useSleep        = getenv("USE_SLEEP");          // Adds a 100ms sleep after each synchronization
-  bool combineTiming   = getenv("COMBINE_TIMING");     // Combines the timing with kernel launch
-  bool reuseStreams    = getenv("REUSE_STREAMS");      // Re-use streams instead of creating / destroying per test
-  bool showAddr        = getenv("SHOW_ADDR");          // Print out memory addresses for each Link
-  bool outputToCsv     = getenv("OUTPUT_TO_CSV");      // Output in CSV format
-  int  byteOffset      = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
-  int  numWarmups      = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
-  int  numIterations   = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
-
-  // Determine number of bytes to run per link
-  // If a non-zero number of bytes is specified, use it
-  // Otherwise generate array of bytes values to execute over
-  std::vector<size_t> valuesOfN;
-  size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK;
-
-  if (numBytesPerLink % 4)
-  {
-    printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink);
-    exit(1);
-  }
-
-  if (numBytesPerLink != 0)
-  {
-    size_t N = numBytesPerLink / sizeof(float);
-    valuesOfN.push_back(N);
-  }
-  else
-  {
-    for (int N = 256; N <= (1<<27); N *= 2)
-    {
-      int decimationFactor = 1;  // This can be modified to increase number of samples between powers of two
-      int delta = std::max(32, N / decimationFactor);
-      int curr = N;
-      while (curr < N * 2)
-      {
-        valuesOfN.push_back(curr);
-        curr += delta;
-      }
-    }
-  }
-
-  if (byteOffset % 4)
-  {
-    printf("[ERROR] byteOffset must be a multiple of 4\n");
-    exit(1);
-  }
-  int initOffset = byteOffset / sizeof(float);
-
-  char *env;
-  if (!outputToCsv)
-  {
-    printf("Run configuration\n");
-    printf("=====================================================\n");
-    printf("%-20s %8s: Using %s\n",
-           "USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
-           useHipCall ? "HIP functions" : "custom kernels");
-    printf("%-20s %8s: Performing %s\n",
-           "USE_MEMSET", useMemset ? "(set)" : "(unset)",
-           useMemset ? "memset" : "memcopy");
-    if (useHipCall && !useMemset)
-    {
-      env = getenv("HSA_ENABLE_SDMA");
-      printf("%-20s %8s: %s\n",
-             "HSA_ENABLE_SDMA", env ? env : "(unset)",
-             (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
-    }
-    printf("%-20s %8s: GPU destination memory type: %s-grained\n",
-           "USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
-           useFineGrainMem ? "fine" : "coarse");
-    printf("%-20s %8s: %s\n",
-           "USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
-           useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
-    printf("%-20s %8s: Running in %s mode\n",
-           "USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
-           useInteractive ? "interactive" : "non-interactive");
-    printf("%-20s %8s: %s\n",
-           "USE_SLEEP", useSleep ? "(set)" : "(unset)",
-           useSleep ? "Add sleep after each sync" : "No sleep per sync");
-    printf("%-20s %8s: %s\n",
-           "COMBINE_TIMING", combineTiming ? "(set)" : "(unset)",
-           combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
-    printf("%-20s %8s: %s\n",
-           "REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
-           reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
-    printf("%-20s %8s: %s\n",
-           "SHOW_ADDR", showAddr ? "(set)" : "(unset)",
-           showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
-    env = getenv("OUTPUT_TO_CSV");
-    printf("%-20s %8s: Output to csv\n",
-           "OUTPUT_TO_CSV", env ? env : "(unset)");
-    env = getenv("BYTE_OFFSET");
-    printf("%-20s %8s: Using byte offset of %d\n",
-           "BYTE_OFFSET", env ? env : "(unset)", byteOffset);
-    env = getenv("NUM_WARMUPS");
-    printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
-           "NUM_WARMUPS", env ? env : "(unset)", numWarmups);
-    env = getenv("NUM_ITERATIONS");
-    printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
-           "NUM_ITERATIONS", env ? env : "(unset)", numIterations);
-    printf("\n");
-  }
-
-  // Collect the number of available CPUs/GPUs on this machine
-  int numGpuDevices;
-  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-  if (numGpuDevices < 1)
-  {
-    printf("[ERROR] No GPU devices found\n");
-    exit(1);
-  }
-
-  // Read configuration file
+  // Check that Link configuration file can be opened
   FILE* fp = fopen(argv[1], "r");
   if (!fp)
   {
@@ -172,148 +57,177 @@ int main(int argc, char **argv)
     exit(1);
   }
 
-  // Track links that get used
-  std::map<std::pair<int, int>, int> linkMap;
-  std::vector<std::vector<hipStream_t>> streamCache(numGpuDevices);
-
-  // Print CSV header
-  if (outputToCsv)
+  // Check for NUMA library support
+  if (numa_available() == -1)
   {
-    printf("Test,NumBytes,ExeGpu,SrcMem,DstMem,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,numWarmups,numIters,useHipCall,useMemSet,useFineGrain,useSingleSync,resuseStreams\n");
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
   }
 
-  // Loop over each line in the configuration file
+  // Collect environment variables / display current run configuration
+  EnvVars ev;
+  ev.DisplayEnvVars();
+
+  // Determine number of bytes to run per Link
+  // If a non-zero number of bytes is specified, use it
+  // Otherwise generate array of bytes values to execute over
+  std::vector<size_t> valuesOfN;
+  size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK;
+  PopulateTestSizes(numBytesPerLink, ev.samplingFactor, valuesOfN);
+
+  // Find the largest N to be used - memory will only be allocated once per link config
+  size_t maxN = valuesOfN[0];
+  for (auto N : valuesOfN)
+    maxN = std::max(maxN, N);
+
+  int const initOffset = ev.byteOffset / sizeof(float);
+  std::stack<std::thread> threads;
+
+  // Collect the number of available CPUs/GPUs on this machine
+  int numGpuDevices;
+  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+  int const numCpuDevices = numa_num_configured_nodes();
+
+  // Track links that get used
+  std::set<std::pair<int, int>> peerAccessTracker;
+
+  // Print CSV header
+  if (ev.outputToCsv)
+  {
+    printf("Test,NumBytes,Executor,SrcMem,DstMem,CUs,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters,useHipCall,useMemSet,useSingleSync,combinedTiming\n");
+  }
+
+  // Loop over each line in the Link configuration file
   int testNum = 0;
   char line[2048];
   while(fgets(line, 2048, fp))
   {
     // Parse links from configuration file
     std::vector<Link> links;
-    ParseLinks(line, links);
+    ParseLinks(line, numCpuDevices, numGpuDevices, links);
 
     int const numLinks = links.size();
     if (numLinks == 0) continue;
     testNum++;
 
+    // Prepare link
+    for (int i = 0; i < numLinks; i++)
+    {
+      // Get some aliases to link variables
+      MemType const& exeMemType  = links[i].exeMemType;
+      int     const& exeIndex    = links[i].exeIndex;
+      MemType const& srcMemType  = links[i].srcMemType;
+      MemType const& dstMemType  = links[i].dstMemType;
+      int     const& srcIndex    = links[i].srcIndex;
+      int     const& dstIndex    = links[i].dstIndex;
+      int     const& blocksToUse = links[i].numBlocksToUse;
+
+      // Enable peer-to-peer access if necessary (can only be called once per unique pair)
+      if (exeMemType == MEM_GPU)
+      {
+        // Ensure executing GPU can access source memory
+        if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
+        {
+          auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
+           if (!peerAccessTracker.count(exeSrcPair))
+          {
+            EnablePeerAccess(exeIndex, srcIndex);
+            peerAccessTracker.insert(exeSrcPair);
+          }
+        }
+
+        // Ensure executing GPU can access destination memory
+        if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
+        {
+          auto exeDstPair = std::make_pair(exeIndex, dstIndex);
+          if (!peerAccessTracker.count(exeDstPair))
+          {
+            EnablePeerAccess(exeIndex, dstIndex);
+            peerAccessTracker.insert(exeDstPair);
+          }
+        }
+      }
+
+      // Allocate (maximum) source / destination memory based on type / device index
+      AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].srcMem);
+      AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].dstMem);
+
+      // Prepare execution agent
+      if (exeMemType == MEM_GPU)
+      {
+        HIP_CALL(hipSetDevice(exeIndex));
+        HIP_CALL(hipEventCreate(&links[i].startEvent));
+        HIP_CALL(hipEventCreate(&links[i].stopEvent));
+        HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * blocksToUse));
+        HIP_CALL(hipStreamCreate(&links[i].stream));
+      }
+      else if (exeMemType == MEM_CPU)
+      {
+        links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam));
+      }
+    }
+
     // Loop over all the different number of bytes to use per Link
     for (auto N : valuesOfN)
     {
-      if (!outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
-      float*                  linkSrcMem[numLinks];        // Source memory per Link
-      float*                  linkDstMem[numLinks];        // Destination memory per Link
-      hipStream_t             streams[numLinks];           // hipStream to use per Link
-      hipEvent_t              startEvents[numLinks];       // Start event per Link
-      hipEvent_t              stopEvents[numLinks];        // Stop event per Link
-      std::vector<BlockParam> cpuBlockParams[numLinks];    // CPU copy of block parameters
-      BlockParam*             gpuBlockParams[numLinks];    // GPU copy of block parameters
-
-      // Clear counters
-      int linkCount[numGpuDevices];
-      for (int i = 0; i < numGpuDevices; i++)
-        linkCount[i] = 0;
+      if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
 
+      // Prepare links based on current N
       for (int i = 0; i < numLinks; i++)
       {
-        MemType srcMemType  = links[i].srcMemType;
-        MemType dstMemType  = links[i].dstMemType;
-        int     exeIndex    = links[i].exeIndex;
-        int     srcIndex    = links[i].srcIndex;
-        int     dstIndex    = links[i].dstIndex;
-        int     blocksToUse = links[i].numBlocksToUse;
-
-        // Check for valid src/dst indices
-        if ((srcIndex < 0 || srcIndex >= numGpuDevices) ||
-            (dstIndex < 0 || dstIndex >= numGpuDevices) ||
-            (exeIndex < 0 || exeIndex >= numGpuDevices))
-        {
-          printf("[ERROR] Invalid link %d:(%c%d->%c%d) GPU index must be between 0 and %d inclusively\n",
-                 exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices-1);
-          exit(1);
-        }
-
-        // Enable peer-to-peer access if this is the first time seeing this pair
-        if (srcMemType == MEM_GPU && dstMemType == MEM_GPU)
-        {
-          auto linkPair = std::make_pair(srcIndex, dstIndex);
-          linkMap[linkPair]++;
-          if (linkMap[linkPair] == 1 && srcIndex != dstIndex)
-          {
-            int canAccess;
-            HIP_CALL(hipDeviceCanAccessPeer(&canAccess, srcIndex, dstIndex));
-            if (!canAccess)
-            {
-              printf("[ERROR] Unable to enable peer access between GPU devices %d and %d\n", srcIndex, dstIndex);
-              exit(1);
-            }
-            HIP_CALL(hipSetDevice(srcIndex));
-            HIP_CALL(hipDeviceEnablePeerAccess(dstIndex, 0));
-          }
-        }
-
-        // Allocate hipEvents / hipStreams on executing GPU
-        HIP_CALL(hipSetDevice(exeIndex));
-        HIP_CALL(hipEventCreate(&startEvents[i]));
-        HIP_CALL(hipEventCreate(&stopEvents[i]));
-        HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks));
-        if (reuseStreams)
-        {
-          // If re-using streams, create new stream, otherwise point to existing stream
-          if (streamCache[exeIndex].size() <= linkCount[exeIndex])
-          {
-            streamCache[exeIndex].resize(linkCount[exeIndex] + 1);
-            HIP_CALL(hipStreamCreate(&streamCache[exeIndex][linkCount[exeIndex]]));
-          }
-          streams[i] = streamCache[exeIndex][linkCount[exeIndex]];
-        }
-        else
-        {
-          HIP_CALL(hipStreamCreate(&streams[i]));
-        }
-
-        // Allocate source / destination memory based on type / device index
-        AllocateMemory(srcMemType, srcIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkSrcMem[i]);
-        AllocateMemory(dstMemType, dstIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkDstMem[i]);
-
         // Initialize source memory with patterned data
-        CheckOrFill(MODE_FILL, N, useMemset, useHipCall, linkSrcMem[i] + initOffset);
-
-        // Count # of links / total blocks each GPU will be working on
-        linkCount[exeIndex]++;
-
+        CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, links[i].srcMem + initOffset);
 
         // Each block needs to know src/dst pointers and how many elements to transfer
         // Figure out the sub-array each block does for this Link
         // - Partition N as evenly as posible, but try to keep blocks as multiples of 32,
         //   except the very last one, for alignment reasons
-        size_t assigned = 0;
-        int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse);
-        for (int j = 0; j < links[i].numBlocksToUse; j++)
+        if (links[i].exeMemType == MEM_GPU)
         {
-          BlockParam param;
-          int blocksLeft = std::max(0, maxNumBlocksToUse - j);
-          size_t leftover = N - assigned;
-          size_t roundedN = (leftover + 31) / 32;
-          param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
-          param.src = linkSrcMem[i] + assigned + initOffset;
-          param.dst = linkDstMem[i] + assigned + initOffset;
-          assigned += param.N;
-          cpuBlockParams[i].push_back(param);
+          size_t assigned = 0;
+          int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse);
+          for (int j = 0; j < links[i].numBlocksToUse; j++)
+          {
+            BlockParam param;
+            int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+            size_t leftover = N - assigned;
+            size_t roundedN = (leftover + 31) / 32;
+            param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+            param.src = links[i].srcMem + assigned + initOffset;
+            param.dst = links[i].dstMem + assigned + initOffset;
+            assigned += param.N;
+
+            HIP_CALL(hipMemcpy(&links[i].blockParam[j], &param, sizeof(BlockParam), hipMemcpyHostToDevice));
+          }
+        }
+        else if (links[i].exeMemType == MEM_CPU)
+        {
+          // For CPU-based copy, divded based on the number of child threads
+          size_t assigned = 0;
+          int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink);
+          for (int j = 0; j < ev.numCpuPerLink; j++)
+          {
+            int blocksLeft = std::max(0, maxNumBlocksToUse - j);
+            size_t leftover = N - assigned;
+            size_t roundedN = (leftover + 31) / 32;
+            links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
+            links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset;
+            links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset;
+            assigned += links[i].blockParam[j].N;
+          }
         }
 
-        HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(),
-                           sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice));
+        // Initialize timing
+        links[i].totalTime = 0.0;
       }
 
-      // Launch kernels (warmup iterations are not counted)
       double totalCpuTime = 0;
-      double totalGpuTime[numLinks];
 
-      for (int i = 0; i < numLinks; i++) totalGpuTime[i] = 0.0;
-
-      for (int iteration = -numWarmups; iteration < numIterations; iteration++)
+      // Launch kernels (warmup iterations are not counted)
+      for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
       {
         // Pause before starting first timed iteration in interactive mode
-        if (useInteractive && iteration == 0)
+        if (ev.useInteractive && iteration == 0)
         {
           printf("Hit <Enter> to continue: ");
           scanf("%*c");
@@ -323,80 +237,26 @@ int main(int argc, char **argv)
         // Start CPU timing for this iteration
         auto cpuStart = std::chrono::high_resolution_clock::now();
 
-        // Enqueue all links
+        // Execute all links in parallel
+        for (int i = 0; i < numLinks; i++)
+          threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i])));
+
+        // Wait for all threads to finish
         for (int i = 0; i < numLinks; i++)
         {
-          HIP_CALL(hipSetDevice(links[i].exeIndex));
-
-          bool recordStart = (!useSingleSync || iteration == 0);
-          bool recordStop  = (!useSingleSync || iteration == numIterations - 1);
-
-          if (useHipCall)
-          {
-            // Record start event
-            if (recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
-
-            // Execute hipMemset / hipMemcpy
-            if (useMemset)
-              HIP_CALL(hipMemsetAsync(linkDstMem[i] + initOffset, 42, N * sizeof(float), streams[i]));
-            else
-              HIP_CALL(hipMemcpyAsync(linkDstMem[i] + initOffset,
-                                      linkSrcMem[i] + initOffset,
-                                      N * sizeof(float), hipMemcpyDeviceToDevice,
-                                      streams[i]));
-            // Record stop event
-            if (recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
-          }
-          else
-          {
-            if (!combineTiming && recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
-            hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel,
-                                  dim3(links[i].numBlocksToUse, 1, 1),
-                                  dim3(BLOCKSIZE, 1, 1),
-                                  0, streams[i],
-                                  (combineTiming && recordStart) ? startEvents[i] : NULL,
-                                  (combineTiming && recordStop)  ?  stopEvents[i] : NULL,
-                                  0, gpuBlockParams[i]);
-            if (!combineTiming & recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
-          }
-        }
-
-        // Synchronize per iteration, unless in single sync mode, in which case
-        // synchronize during last warmup / last actual iteration
-        if (!useSingleSync || iteration == -1 || iteration == numIterations - 1)
-        {
-          for (int i = 0; i < numLinks; i++)
-          {
-            HIP_CALL(hipSetDevice(links[i].exeIndex));
-            hipStreamSynchronize(streams[i]);
-          }
+          threads.top().join();
+          threads.pop();
         }
 
         // Stop CPU timing for this iteration
         auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
         double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
-        if (useSleep) usleep(100000);
 
-        if (iteration >= 0)
-        {
-          totalCpuTime += deltaSec;
-
-          // Record GPU timing
-          if (!useSingleSync || iteration == numIterations - 1)
-          {
-            for (int i = 0; i < numLinks; i++)
-            {
-              HIP_CALL(hipSetDevice(links[i].exeIndex));
-              HIP_CALL(hipEventSynchronize(stopEvents[i]));
-              float gpuDeltaMsec;
-              HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[i], stopEvents[i]));
-              totalGpuTime[i] += gpuDeltaMsec;
-            }
-          }
-        }
+        if (iteration >= 0) totalCpuTime += deltaSec;
       }
 
-      if (useInteractive)
+      // Pause for interactive mode
+      if (ev.useInteractive)
       {
         printf("Transfers complete. Hit <Enter> to continue: ");
         scanf("%*c");
@@ -405,89 +265,100 @@ int main(int argc, char **argv)
 
       // Validate that each link has transferred correctly
       for (int i = 0; i < numLinks; i++)
-        CheckOrFill(MODE_CHECK, N, useMemset, useHipCall, linkDstMem[i] + initOffset);
+        CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, links[i].dstMem + initOffset);
 
       // Report timings
-      totalCpuTime = totalCpuTime / (1.0 * numIterations) * 1000;
+      totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000;
       double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime;
       for (int i = 0; i < numLinks; i++)
       {
-        double linkDurationMsec = totalGpuTime[i] / (1.0 * numIterations);
+        double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations);
         double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
-        if (!outputToCsv)
+        if (!ev.outputToCsv)
         {
-          printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %9s |",
+          printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s",
                  i + 1,
                  MemTypeStr[links[i].srcMemType], links[i].srcIndex,
-                 links[i].exeIndex, links[i].numBlocksToUse,
+                 MemTypeStr[links[i].exeMemType], links[i].exeIndex,
+                 links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse,
                  MemTypeStr[links[i].dstMemType], links[i].dstIndex,
                  linkBandwidthGbs, linkDurationMsec,
                  GetLinkDesc(links[i]).c_str());
-          if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
+          if (ev.showAddr) printf(" %16p | %16p |", links[i].srcMem + initOffset, links[i].dstMem + initOffset);
           printf("\n");
         }
         else
         {
-          printf("%d,%lu,%02d,%c%02d,%c%02d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%s,%s,%s,%s,%s\n",
-                 testNum, N * sizeof(float), links[i].exeIndex,
+          printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%d,%s,%s,%s,%s\n",
+                 testNum, N * sizeof(float),
                  MemTypeStr[links[i].srcMemType], links[i].srcIndex,
+                 MemTypeStr[links[i].exeMemType], links[i].exeIndex,
                  MemTypeStr[links[i].dstMemType], links[i].dstIndex,
+                 links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse,
                  linkBandwidthGbs, linkDurationMsec,
                  GetLinkDesc(links[i]).c_str(),
-                 linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset,
-                 numWarmups, numIterations,
-                 useHipCall ? "true" : "false",
-                 useMemset ? "true" : "false",
-                 useFineGrainMem ? "true" : "false",
-                 useSingleSync ? "true" : "false",
-                 reuseStreams ? "true" : "false");
+                 links[i].srcMem + initOffset, links[i].dstMem + initOffset,
+                 ev.byteOffset,
+                 ev.numWarmups, ev.numIterations,
+                 ev.useHipCall ? "true" : "false",
+                 ev.useMemset ? "true" : "false",
+                 ev.useSingleSync ? "true" : "false",
+                 ev.combineTiming ? "true" : "false");
         }
       }
 
       // Display aggregate statistics
-      if (!outputToCsv)
+      if (!ev.outputToCsv)
       {
         printf(" Aggregate Bandwidth (CPU timed)    | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
       }
       else
       {
-        printf("%d,%lu,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%s,%s,%s,%s,%s\n",
-               testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, numWarmups, numIterations,
-               useHipCall ? "true" : "false",
-               useMemset ? "true" : "false",
-               useFineGrainMem ? "true" : "false",
-               useSingleSync ? "true" : "false",
-               reuseStreams ? "true" : "false");
+        printf("%d,%lu,ALL,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%d,%s,%s,%s,%s\n",
+               testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
+               ev.numWarmups, ev.numIterations,
+               ev.useHipCall ? "true" : "false",
+               ev.useMemset ? "true" : "false",
+               ev.useSingleSync ? "true" : "false",
+               ev.combineTiming ? "true" : "false");
       }
+    }
 
-      // Release GPU memory
-      for (int i = 0; i < numLinks; i++)
+    // Release GPU memory
+    for (int i = 0; i < numLinks; i++)
+    {
+      DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
+      DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
+
+      if (links[i].exeMemType == MEM_GPU)
       {
-        DeallocateMemory(links[i].srcMemType, links[i].srcIndex, linkSrcMem[i]);
-        DeallocateMemory(links[i].dstMemType, links[i].dstIndex, linkDstMem[i]);
-        HIP_CALL(hipFree(gpuBlockParams[i]));
-        if (!reuseStreams)
-          HIP_CALL(hipStreamDestroy(streams[i]));
-        HIP_CALL(hipEventDestroy(startEvents[i]));
-        HIP_CALL(hipEventDestroy(stopEvents[i]));
+        HIP_CALL(hipEventDestroy(links[i].startEvent));
+        HIP_CALL(hipEventDestroy(links[i].stopEvent));
+        HIP_CALL(hipStreamDestroy(links[i].stream));
+        HIP_CALL(hipFree(links[i].blockParam));
+      }
+      else if (links[i].exeMemType == MEM_CPU)
+      {
+        free(links[i].blockParam);
       }
     }
   }
   fclose(fp);
 
-  // Clean up stream cache if re-using streams
-  if (reuseStreams)
-  {
-    for (auto streamVector : streamCache)
-      for (auto stream : streamVector)
-        HIP_CALL(hipStreamDestroy(stream));
-  }
-
   return 0;
 }
 
 void DisplayUsage(char const* cmdName)
 {
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+  int numGpuDevices;
+  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+  int const numCpuDevices = numa_num_configured_nodes();
+
   printf("Usage: %s configFile <N>\n", cmdName);
 
   printf("  configFile: File containing Links to execute (see below for format)\n");
@@ -498,57 +369,48 @@ void DisplayUsage(char const* cmdName)
   printf("\n");
   printf("Configfile Format:\n");
   printf("==================\n");
-  printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location\n");
+  printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU\n");
   printf("Each single line in the configuration file defines a set of Links to run in parallel\n");
   printf("\n");
   printf("There are two ways to specify the configuration file:\n");
   printf("\n");
   printf("1) Basic\n");
-  printf("   The basic specification assumes the same number of threadblocks/CUs used per link\n");
+  printf("   The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link\n");
   printf("   A positive number of Links is specified followed by that number of triplets describing each Link\n");
   printf("\n");
-  printf("   #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)\n");
+  printf("   #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n");
   printf("\n");
   printf("2) Advanced\n");
-  printf("   The advanced specification allows different number of threadblocks/CUs used per Link\n");
+  printf("   The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link\n");
   printf("   A negative number of links is specified, followed by quadruples describing each Link\n");
-  printf("   -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)\n");
+  printf("   -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)\n");
   printf("\n");
   printf("Argument Details:\n");
   printf("  #Links  :   Number of Links to be run in parallel\n");
-  printf("  #CUs    :   Number of threadblocks/CUs to use for a Link\n");
-  printf("  GpuIndex:   0-indexed GPU id executing the Link\n");
+  printf("  #CUs    :   Number of threadblocks/CUs to use for a GPU-executed Link\n");
   printf("  srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode\n");
+  printf("  Executor:   Executor are specified by a character indicating executor type, followed by device index (0-indexed)\n");
+  printf("              - C: CPU-executed  (Indexed from 0 to %d)\n", numCpuDevices-1);
+  printf("              - G: GPU-executed  (Indexed from 0 to %d)\n", numGpuDevices-1);
   printf("  dstMemL :   Destination memory location (Where the data is to be written to)\n");
-  printf("              Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)\n");
+  printf("\n");
+  printf("              Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)\n");
   printf("              Supported memory locations are:\n");
-  printf("              - P:    Pinned host memory   (on CPU, on NUMA node closest to provided GPU index)\n");
-  printf("              - G:    Global device memory (on GPU)\n");
-  printf("Round brackets may be included for human clarity, but will be ignored\n");
+  printf("              - C:    Pinned host memory       (on NUMA node, indexed from 0 to %d)\n", numCpuDevices-1);
+  printf("              - G:    Global device memory     (on GPU device indexed from 0 to %d)\n", numGpuDevices-1);
+  printf("              - F:    Fine-grain device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1);
   printf("\n");
   printf("Examples:\n");
-  printf("1 4 (0 G0 G1)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n");
-  printf("1 4 (0 G1 G0)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0\n");
-  printf("1 4 (2 P0 G2)              Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n");
-  printf("2 4 (0 G0 G1) (1 G1 G0)    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n");
-  printf("-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n");
+  printf("1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n");
+  printf("1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0\n");
+  printf("1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n");
+  printf("2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n");
+  printf("-2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n");
   printf("\n");
+  printf("Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary\n");
   printf("\n");
-  printf("Environment variables:\n");
-  printf("======================\n");
-  printf(" USE_HIP_CALL       - Use hipMemcpy/hipMemset instead of custom shader kernels\n");
-  printf(" USE_MEMSET         - Perform a memset instead of a copy (ignores source memory)\n");
-  printf(" USE_FINEGRAIN_MEM  - Allocate fine-grained GPU memory instead of coarse-grained GPU memory\n");
-  printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
-  printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
-  printf(" USE_SLEEP          - Adds a 100ms sleep after each synchronization\n");
-  printf(" COMBINE_TIMING     - Combines timing with launch (potentially lower timing overhead)\n");
-  printf(" REUSE_STREAMS      - Re-use streams instead of creating / destroying per test\n");
-  printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
-  printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
-  printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
-  printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
-  printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+
+  EnvVars::DisplayUsage();
 }
 
 void GenerateConfigFile(char const* cfgFile, int numBlocks)
@@ -556,13 +418,28 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   // Detect number of available GPUs and skip if less than 2
   int numGpuDevices;
   HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-  printf("Generated configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
+  printf("Generating configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
   if (numGpuDevices < 2)
   {
     printf("Skipping. (Less than 2 GPUs detected)\n");
     exit(0);
   }
 
+  // Check first to see if file exists, and issue warning
+  FILE* exists = fopen(cfgFile, "r");
+  if (exists)
+  {
+    fclose(exists);
+    printf("[WARN] File %s alreadys exists.  Enter 'Y' to confirm overwrite\n", cfgFile);
+    char ch;
+    scanf(" %c", &ch);
+    if (ch != 'Y' && ch != 'y')
+    {
+      printf("Aborting\n");
+      exit(0);
+    }
+  }
+
   // Open config file for writing
   FILE* fp = fopen(cfgFile, "w");
   if (!fp)
@@ -574,18 +451,18 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   // CU testing
   fprintf(fp, "# CU scaling tests\n");
   for (int i = 1; i < 16; i++)
-    fprintf(fp, "1 %d (0 G0 G1)\n", i);
+    fprintf(fp, "1 %d (G0->G0->G1)\n", i);
   fprintf(fp, "\n");
 
   // Pinned memory testing
   fprintf(fp, "# Pinned CPU memory read tests\n");
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, "1 %d (%d C%d G%d)\n", numBlocks, i, i, i);
+    fprintf(fp, "1 %d (C0->G%d->G%d)\n", numBlocks, i, i);
   fprintf(fp, "\n");
 
   fprintf(fp, "# Pinned CPU memory write tests\n");
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, "1 %d (%d G%d C%d)\n", numBlocks, i, i, i);
+    fprintf(fp, "1 %d (G%d->G%d->C0)\n", numBlocks, i, i);
   fprintf(fp, "\n");
 
   // Single link testing GPU testing
@@ -594,7 +471,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, "1 %d (%d G%d G%d)\n", numBlocks, i, i, j);
+      fprintf(fp, "1 %d (G%d->G%d->G%d)\n", numBlocks, i, i, j);
     }
   fprintf(fp, "\n");
 
@@ -604,7 +481,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, "2 %d (%d G%d G%d) (%d G%d G%d)\n", numBlocks, i, i, j, j, j, i);
+      fprintf(fp, "2 %d (G%d->G%d->G%d) (G%d->G%d->G%d)\n", numBlocks, i, i, j, j, j, i);
     }
   fprintf(fp, "\n");
 
@@ -613,7 +490,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fprintf(fp, "%d %d", numGpuDevices, numBlocks);
   for (int i = 0; i < numGpuDevices; i++)
   {
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices);
   }
   fprintf(fp, "\n\n");
 
@@ -621,23 +498,23 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fprintf(fp, "# Simple bi-directional ring\n");
   fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks);
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices);
   for (int i = 0; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
+    fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
   fprintf(fp, "\n\n");
 
   // Broadcast from GPU 0
   fprintf(fp, "# GPU 0 Broadcast\n");
   fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
   for (int i = 1; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", 0, 0, i);
+    fprintf(fp, " (G%d->G%d->G%d)", 0, 0, i);
   fprintf(fp, "\n\n");
 
   // Gather to GPU 0
   fprintf(fp, "# GPU 0 Gather\n");
   fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
   for (int i = 1; i < numGpuDevices; i++)
-    fprintf(fp, " (%d G%d G%d)", 0, i, 0);
+    fprintf(fp, " (G%d->G%d->G%d)", 0, i, 0);
   fprintf(fp, "\n\n");
 
   // Full stress test
@@ -647,7 +524,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
     for (int j = 0; j < numGpuDevices; j++)
     {
       if (i == j) continue;
-      fprintf(fp, " (%d G%d G%d)", i, i, j);
+      fprintf(fp, " (G%d->G%d->G%d)", i, i, j);
     }
   fprintf(fp, "\n\n");
 
@@ -692,29 +569,87 @@ void DisplayTopology()
   }
 }
 
-void ParseMemType(std::string const& token, MemType* memType, int* memIndex)
+void PopulateTestSizes(size_t const numBytesPerLink,
+                       int const samplingFactor,
+                       std::vector<size_t>& valuesOfN)
+{
+  valuesOfN.clear();
+
+  // If the number of bytes is specified, use it
+  if (numBytesPerLink != 0)
+  {
+    if (numBytesPerLink % 4)
+    {
+      printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink);
+      exit(1);
+    }
+    size_t N = numBytesPerLink / sizeof(float);
+    valuesOfN.push_back(N);
+  }
+  else
+  {
+    // Otherwise generate a range of values
+    // (Powers of 2, with samplingFactor samples between successive powers of 2)
+    for (int N = 256; N <= (1<<27); N *= 2)
+    {
+      int delta = std::max(32, N / samplingFactor);
+      int curr = N;
+      while (curr < N * 2)
+      {
+        valuesOfN.push_back(curr);
+        curr += delta;
+      }
+    }
+  }
+}
+
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex)
 {
   char typeChar;
   if (sscanf(token.c_str(), " %c %d", &typeChar, memIndex) != 2)
   {
-    printf("Error parsing memory type token %s\n", token.c_str());
+    printf("[ERROR] Unable to parse memory type token %s - expecting either 'C' or 'G' or 'F' followed by an index\n", token.c_str());
     exit(1);
   }
 
   switch (typeChar)
   {
-  case 'C': case 'c': *memType = MEM_CPU; break;
-  case 'G': case 'g': *memType = MEM_GPU; break;
-  default: printf("Unrecognized memory type %s\n", token.c_str()); exit(1);
+  case 'C': case 'c':
+    *memType = MEM_CPU;
+    if (*memIndex < 0 || *memIndex >= numCpus)
+    {
+      printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  case 'G': case 'g':
+    *memType = MEM_GPU;
+    if (*memIndex < 0 || *memIndex >= numGpus)
+    {
+      printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  case 'F': case 'f':
+    *memType = MEM_GPU_FINE;
+    if (*memIndex < 0 || *memIndex >= numGpus)
+    {
+      printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex);
+      exit(1);
+    }
+    break;
+  default:
+    printf("[ERROR] Unrecognized memory type %s.  Expecting either 'C' or 'G' or 'F'\n", token.c_str());
+    exit(1);
   }
 }
 
-// Helper function to parse a link of link definitions
-void ParseLinks(char* line, std::vector<Link>& links)
+// Helper function to parse a list of link definitions
+void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links)
 {
-  // Replace any round brackets with spaces
+  // Replace any round brackets or '->' with spaces,
   for (int i = 0; line[i]; i++)
-    if (line[i] == '(' || line[i] == ')') line[i] = ' ';
+    if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
 
   links.clear();
   int numLinks = 0;
@@ -725,11 +660,12 @@ void ParseLinks(char* line, std::vector<Link>& links)
   iss >> numLinks;
   if (iss.fail()) return;
 
+  std::string exeMem;
   std::string srcMem;
   std::string dstMem;
   if (numLinks > 0)
   {
-    // Method 1: Take in triples (exeGpu, srcMem, dstMem)
+    // Method 1: Take in triples (srcMem, exeMem, dstMem)
     int numBlocksToUse;
     iss >> numBlocksToUse;
     if (numBlocksToUse <= 0)
@@ -740,47 +676,99 @@ void ParseLinks(char* line, std::vector<Link>& links)
     links.resize(numLinks);
     for (int i = 0; i < numLinks; i++)
     {
-      iss >> links[i].exeIndex >> srcMem >> dstMem;
-      ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex);
-      ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex);
+      iss >> srcMem >> exeMem >> dstMem;
+      ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
+      ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
+      ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);
       links[i].numBlocksToUse = numBlocksToUse;
+      if (links[i].exeMemType != MEM_CPU && links[i].exeMemType != MEM_GPU)
+      {
+        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n");
+        exit(1);
+      }
     }
   }
   else
   {
-    // Method 2: Read in quads (exeGpu, srcMem, dstMem,  Read common # blocks to use, then read (src, dst) doubles
+    // Method 2: Read in quads (srcMem, exeMem, dstMem,  Read common # blocks to use, then read (src, dst) doubles
     numLinks *= -1;
     links.resize(numLinks);
 
     for (int i = 0; i < numLinks; i++)
     {
-      iss >> links[i].exeIndex >> srcMem >> dstMem >> links[i].numBlocksToUse;
-      ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex);
-      ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex);
+      iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse;
+      ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
+      ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
+      ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);
+      if (links[i].exeMemType != MEM_CPU || links[i].exeMemType != MEM_GPU)
+      {
+        printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n");
+        exit(1);
+      }
+
     }
   }
 }
 
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr)
+void EnablePeerAccess(int const deviceId, int const peerDeviceId)
 {
-  HIP_CALL(hipSetDevice(devIndex));
+  int canAccess;
+  HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
+  if (!canAccess)
+  {
+    printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
+    exit(1);
+  }
+  HIP_CALL(hipSetDevice(deviceId));
+  HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0));
+}
+
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr)
+{
+  if (numBytes == 0)
+  {
+    printf("[ERROR] Unable to allocate 0 bytes\n");
+    exit(1);
+  }
 
   if (memType == MEM_CPU)
   {
-    // // Allocate pinned-memory on NUMA node closest to the selected GPU
-    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocPortable));
+    // Set numa policy prior to call to hipHostMalloc
+    unsigned long nodemask = (1ULL << devIndex);
+    long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
+    if (retCode)
+    {
+      printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", devIndex);
+      exit(1);
+    }
+
+    // Allocate host-pinned memory (should respect NUMA mem policy)
+    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
+
+    // Check that the allocated pages are actually on the correct NUMA node
+    CheckPages((char*)*memPtr, numBytes, devIndex);
+
+    // Reset to default numa mem policy
+    retCode = set_mempolicy(MPOL_DEFAULT, NULL, 8);
+    if (retCode)
+    {
+      printf("[ERROR] Unable reset to default NUMA memory policy\n");
+      exit(1);
+    }
   }
   else if (memType == MEM_GPU)
   {
-    // Allocate GPU memory
-    if (useFineGrainMem)
-      HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
-    else
-      HIP_CALL(hipMalloc((void**)memPtr, numBytes));
+    // Allocate GPU memory on appropriate device
+    HIP_CALL(hipSetDevice(devIndex));
+    HIP_CALL(hipMalloc((void**)memPtr, numBytes));
+  }
+  else if (memType == MEM_GPU_FINE)
+  {
+    HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
   }
   else
   {
-    printf("Error: Unsupported memory type %d\n", memType);
+    printf("[ERROR] Unsupported memory type %d\n", memType);
     exit(1);
   }
 }
@@ -791,12 +779,51 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr)
   {
     HIP_CALL(hipHostFree(memPtr));
   }
-  else if (memType == MEM_GPU)
+  else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
   {
     HIP_CALL(hipFree(memPtr));
   }
 }
 
+void CheckPages(char* array, size_t numBytes, int targetId)
+{
+  unsigned long const pageSize = getpagesize();
+  unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
+
+  std::vector<void *> pages(numPages);
+  std::vector<int> status(numPages);
+
+  pages[0] = array;
+  for (int i = 1; i < numPages; i++)
+  {
+    pages[i] = (char*)pages[i-1] + pageSize;
+  }
+
+  long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
+  if (retCode)
+  {
+    printf("[ERROR] Unable to collect page info\n");
+    exit(1);
+  }
+
+  size_t mistakeCount = 0;
+  for (int i = 0; i < numPages; i++)
+  {
+    if (status[i] < 0)
+    {
+      printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
+      exit(1);
+    }
+    if (status[i] != targetId) mistakeCount++;
+  }
+  if (mistakeCount > 0)
+  {
+    printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
+    // NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now
+    // exit(1);
+  }
+}
+
 // Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr)
 {
@@ -858,31 +885,132 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
   return result;
 }
 
+std::string GetDesc(MemType srcMemType, int srcIndex,
+                    MemType dstMemType, int dstIndex)
+{
+  if (srcMemType == MEM_CPU)
+  {
+    if (dstMemType == MEM_CPU)
+      return (srcIndex == dstIndex) ? "LOCAL" : "NUMA";
+    else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
+      return "PCIE";
+    else
+      goto error;
+  }
+  else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE)
+  {
+    if (dstMemType == MEM_CPU)
+      return "PCIE";
+    else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE)
+    {
+      if (srcIndex == dstIndex) return "LOCAL";
+      else
+      {
+        uint32_t linkType, hopCount;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount));
+        return GetLinkTypeDesc(linkType, hopCount);
+      }
+    }
+    else
+      goto error;
+  }
+error:
+  printf("[ERROR] Unrecognized memory type\n");
+  exit(1);
+}
+
 std::string GetLinkDesc(Link const& link)
 {
-  std::string result = "";
-
-  // Currently only describe links between src/dst on GPU
-  if (link.srcMemType == MEM_GPU && link.dstMemType == MEM_GPU)
-  {
-    if (link.exeIndex != link.srcIndex)
-    {
-      uint32_t linkType, hopCount;
-      HIP_CALL(hipExtGetLinkTypeAndHopCount(link.srcIndex, link.exeIndex, &linkType, &hopCount));
-      result += GetLinkTypeDesc(linkType, hopCount);
-    }
-
-    if (link.exeIndex != link.dstIndex)
-    {
-      uint32_t linkType, hopCount;
-      HIP_CALL(hipExtGetLinkTypeAndHopCount(link.exeIndex, link.dstIndex, &linkType, &hopCount));
-      if (result != "") result += "+";
-      result += GetLinkTypeDesc(linkType, hopCount);
-    }
-  }
-  else
-  {
-    result = "???";
-  }
-  return result;
+  return GetDesc(link.srcMemType, link.srcIndex, link.exeMemType, link.exeIndex) + "-"
+    + GetDesc(link.exeMemType, link.exeIndex, link.dstMemType, link.dstIndex);
+}
+
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
+{
+  // GPU execution agent
+  if (link.exeMemType == MEM_GPU)
+  {
+    // Switch to executing GPU
+    HIP_CALL(hipSetDevice(link.exeIndex));
+
+    bool recordStart = (!ev.useSingleSync || iteration == 0);
+    bool recordStop  = (!ev.useSingleSync || iteration == ev.numIterations - 1);
+
+    int const initOffset = ev.byteOffset / sizeof(float);
+
+    if (ev.useHipCall)
+    {
+      // Record start event
+      if (recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream));
+
+      // Execute hipMemset / hipMemcpy
+      if (ev.useMemset)
+        HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), link.stream));
+      else
+        HIP_CALL(hipMemcpyAsync(link.dstMem + initOffset,
+                                link.srcMem + initOffset,
+                                N * sizeof(float), hipMemcpyDefault,
+                                link.stream));
+      // Record stop event
+      if (recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream));
+    }
+    else
+    {
+      if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream));
+      hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
+                            dim3(link.numBlocksToUse, 1, 1),
+                            dim3(BLOCKSIZE, 1, 1),
+                            0, link.stream,
+                            (ev.combineTiming && recordStart) ? link.startEvent : NULL,
+                            (ev.combineTiming && recordStop)  ? link.stopEvent : NULL,
+                            0, link.blockParam);
+      if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream));
+    }
+
+    // Synchronize per iteration, unless in single sync mode, in which case
+    // synchronize during last warmup / last actual iteration
+    if (!ev.useSingleSync || iteration == -1 || iteration == ev.numIterations - 1)
+    {
+      HIP_CALL(hipStreamSynchronize(link.stream));
+    }
+
+    if (iteration >= 0)
+    {
+      // Record GPU timing
+      if (!ev.useSingleSync || iteration == ev.numIterations - 1)
+      {
+        HIP_CALL(hipEventSynchronize(link.stopEvent));
+        float gpuDeltaMsec;
+        HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, link.startEvent, link.stopEvent));
+        link.totalTime += gpuDeltaMsec;
+      }
+    }
+  }
+  else if (link.exeMemType == MEM_CPU) // CPU execution agent
+  {
+    // Force this thread and all child threads onto correct NUMA node
+    if (numa_run_on_node(link.exeIndex))
+    {
+      printf("[ERROR] Unable to set CPU to NUMA node %d\n", link.exeIndex);
+      exit(1);
+    }
+
+    std::vector<std::thread> childThreads;
+
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    // Launch child-threads to perform memcopies
+    for (int i = 0; i < ev.numCpuPerLink; i++)
+      childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(link.blockParam[i])));
+
+    // Wait for child-threads to finish
+    for (int i = 0; i < ev.numCpuPerLink; i++)
+      childThreads[i].join();
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+
+    // Record time if not a warmup iteration
+    if (iteration >= 0)
+      link.totalTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
+  }
 }
diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp
index 1291bf68f7..6a08e80f4e 100644
--- a/tools/TransferBench/TransferBench.hpp
+++ b/tools/TransferBench/TransferBench.hpp
@@ -33,7 +33,9 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
 #include <hsa/hsa_ext_amd.h>
+
 #include "copy_kernel.h"
+#include "EnvVars.hpp"
 
 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
@@ -50,11 +52,12 @@ THE SOFTWARE.
 // Different src/dst memory types supported
 typedef enum
 {
-  MEM_CPU = 0,             // Pinned CPU memory
-  MEM_GPU = 1              // Global GPU memory
+  MEM_CPU      = 0,    // Pinned CPU memory
+  MEM_GPU      = 1,    // Coarse-grained global GPU memory
+  MEM_GPU_FINE = 2     // Fine-grained global GPU memory
 } MemType;
 
-char const MemTypeStr[3] = "CG";
+char const MemTypeStr[4] = "CGF";
 
 typedef enum
 {
@@ -62,17 +65,6 @@ typedef enum
   MODE_CHECK = 1          // Check data against pattern
 } ModeType;
 
-// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
-struct Link
-{
-  int     exeIndex;        // GPU to execute on
-  MemType srcMemType;      // Source memory type
-  int     srcIndex;        // Source device index
-  MemType dstMemType;      // Destination memory type
-  int     dstIndex;        // Destination device index
-  int     numBlocksToUse;  // Number of threadblocks to use for this Link
-};
-
 // Each threadblock copies N floats from src to dst
 struct BlockParam
 {
@@ -81,46 +73,90 @@ struct BlockParam
     float* dst;
 };
 
+// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
+struct Link
+{
+  // Link config
+  MemType exeMemType;      // Link executor type (CPU or GPU)
+  int     exeIndex;        // Executor index (NUMA node for CPU / device ID for GPU)
+  MemType srcMemType;      // Source memory type
+  int     srcIndex;        // Source device index
+  MemType dstMemType;      // Destination memory type
+  int     dstIndex;        // Destination device index
+  int     numBlocksToUse;  // Number of threadblocks to use for this Link
+
+  // Link implementation
+  float*      srcMem;      // Source memory
+  float*      dstMem;      // Destination memory
+
+  hipEvent_t  startEvent;
+  hipEvent_t  stopEvent;
+  hipStream_t stream;
+  BlockParam* blockParam;
+
+  double totalTime;
+};
+
 void DisplayUsage(char const* cmdName);                      // Display usage instructions
 void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
 void DisplayTopology();                                      // Display GPU topology
-void ParseLinks(char* line, std::vector<Link>& links);       // Parse Link information
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
+void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
+void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links);       // Parse Link information
+void EnablePeerAccess(int const deviceId, int const peerDeviceId);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
 void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
+void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
+
+
 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+std::string GetDesc(MemType srcMemType, int srcIndex,
+                    MemType dstMemType, int dstIndex);
 std::string GetLinkDesc(Link const& link);
 
-
-#define MAX_NAME_LEN 64
 #define BLOCKSIZE 256
 #define COPY_UNROLL 4
 #define MEMSET_UNROLL 4
 
 // GPU copy kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-CopyKernel(BlockParam* blockParams)
+GpuCopyKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
-    float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
+  float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
 
-    Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
+  Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
 }
 
 // GPU set kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-MemsetKernel(BlockParam* blockParams)
+GpuMemsetKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
 
-    // Use non-zero value
-    #pragma unroll MEMSET_UNROLL
-    for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
-    {
-      dst[tid] = 1234.0;
-    }
+  // Use non-zero value
+  #pragma unroll MEMSET_UNROLL
+  for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
+  {
+    dst[tid] = 1234.0;
+  }
+}
+
+// CPU copy kernel
+void CpuCopyKernel(BlockParam const& blockParams)
+{
+  memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
+}
+
+// CPU memset kernel
+void CpuMemsetKernel(BlockParam const& blockParams)
+{
+  for (int i = 0; i < blockParams.N; i++)
+    blockParams.dst[i] = 1234.0;
 }
diff --git a/tools/TransferBench/example.cfg b/tools/TransferBench/example.cfg
index b14762f9e3..2d546f433b 100644
--- a/tools/TransferBench/example.cfg
+++ b/tools/TransferBench/example.cfg
@@ -1,39 +1,43 @@
-#Configfile Format:
-#==================
-#A Link is defined as a uni-directional transfer from src memory location to dst memory location
-#Each single line in the configuration file defines a set of Links to run in parallel
+# Configfile Format:
+# ==================
+# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
+# Each single line in the configuration file defines a set of Links to run in parallel
 
-#There are two ways to specify the configuration file:
+# There are two ways to specify the configuration file:
 
-#1) Basic
-#   The basic specification assumes the same number of threadblocks/CUs used per link
-#   A positive number of Links is specified followed by that number of triplets describing each Link
+# 1) Basic
+#    The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
+#    A positive number of Links is specified followed by that number of triplets describing each Link
 
-   #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)
+#    #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
 
-#2) Advanced
-#   The advanced specification allows different number of threadblocks/CUs used per Link
-#   A negative number of links is specified, followed by quadruples describing each Link
-#   -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)
+# 2) Advanced
+#    The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
+#    A negative number of links is specified, followed by quadruples describing each Link
+#    -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
 
-#Argument Details:
-#  #Links  :   Number of Links to be run in parallel
-#  #CUs    :   Number of threadblocks/CUs to use for a Link
-#  GpuIndex:   0-indexed GPU id executing the Link
-#  srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
-#  dstMemL :   Destination memory location (Where the data is to be written to)
-#              Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)
-#              Supported memory locations are:
-#              - P:    Pinned host memory   (on CPU, on NUMA node closest to provided GPU index)
-#              - G:    Global device memory (on GPU)
-#Round brackets may be included for human clarity, but will be ignored
+# Argument Details:
+#   #Links  :   Number of Links to be run in parallel
+#   #CUs    :   Number of threadblocks/CUs to use for a GPU-executed Link
+#   srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
+#   Executor:   Executor are specified by a character indicating executor type, followed by device index (0-indexed)
+#               - C: CPU-executed  (Indexed from 0 to 1)
+#               - G: GPU-executed  (Indexed from 0 to 3)
+#   dstMemL :   Destination memory location (Where the data is to be written to)
 
-#Examples:
-#1 4 (0 G0 G1)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
-#1 4 (0 G1 G0)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0
-#1 4 (2 P0 G2)              Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
-#2 4 (0 G0 G1) (1 G1 G0)    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
-#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+#               Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
+#               Supported memory locations are:
+#               - C:    Pinned host memory       (on NUMA node, indexed from 0 to 1)
+#               - G:    Global device memory     (on GPU device indexed from 0 to 3)
+#               - F:    Fine-grain device memory (on GPU device indexed from 0 to 3)
 
-# Single link between GPUs 0 and 1
-1 1 (0 G0 G1)
+# Examples:
+# 1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
+# 1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
+# 1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
+# 2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
+# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
+
+# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
+1 4 (G0->G0->G1)