[TransferBench] Adding ability to perform CPU-executed copies, various upgrades (#309)

* Adding CPU based execution, fixing typos, adding Fine-grained mem * Exposing sampling factor when generating range of data sizes * Refactoring how Links are launched, now once per thread * Documentation updates
2020-12-11 10:21:14 -07:00
@@ -0,0 +1,137 @@
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+// This class manages environment variable that affect TransferBench
+class EnvVars
+{
+public:
+  // Default configuration values
+  int const DEFAULT_NUM_WARMUPS      =  3;
+  int const DEFAULT_NUM_ITERATIONS   = 10;
+  int const DEFAULT_SAMPLING_FACTOR  =  1;
+  int const DEFAULT_NUM_CPU_PER_LINK =  4;
+
+  // Environment variables
+  int useHipCall;      // Use hipMemcpy/hipMemset instead of custom shader kernels
+  int useMemset;       // Perform a memset instead of a copy (ignores source memory)
+  int useSingleSync;   // Perform synchronization only once after all iterations instead of per iteration
+  int useInteractive;  // Pause for user-input before starting transfer loop
+  int useSleep;        // Adds a 100ms sleep after each synchronization
+  int combineTiming;   // Combines the timing with kernel launch
+  int showAddr;        // Print out memory addresses for each Link
+  int outputToCsv;     // Output in CSV format
+  int byteOffset;      // Byte-offset for memory allocations
+  int numWarmups;      // Number of un-timed warmup iterations to perform
+  int numIterations;   // Number of timed iterations to perform
+  int samplingFactor;  // Affects how many different values of N are generated (when N set to 0)
+  int numCpuPerLink;   // Number of CPU child threads to use per CPU link
+
+  // Constructor that collects values
+  EnvVars()
+  {
+    useHipCall      = GetEnvVar("USE_HIP_CALL"     , 0);
+    useMemset       = GetEnvVar("USE_MEMSET"       , 0);
+    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 0);
+    useInteractive  = GetEnvVar("USE_INTERACTIVE"  , 0);
+    combineTiming   = GetEnvVar("COMBINE_TIMING"   , 0);
+    showAddr        = GetEnvVar("SHOW_ADDR"        , 0);
+    outputToCsv     = GetEnvVar("OUTPUT_TO_CSV"    , 0);
+    byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
+    numWarmups      = GetEnvVar("NUM_WARMUPS"      , DEFAULT_NUM_WARMUPS);
+    numIterations   = GetEnvVar("NUM_ITERATIONS"   , DEFAULT_NUM_ITERATIONS);
+    samplingFactor  = GetEnvVar("SAMPLING_FACTOR"  , DEFAULT_SAMPLING_FACTOR);
+    numCpuPerLink   = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
+
+    // Perform some basic validation
+    if (byteOffset % sizeof(float))
+    {
+      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
+      exit(1);
+    }
+    if (numWarmups < 0)
+    {
+      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
+      exit(1);
+    }
+    if (numIterations <= 0)
+    {
+      printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
+      exit(1);
+    }
+    if (samplingFactor < 1)
+    {
+      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
+      exit(1);
+    }
+    if (numCpuPerLink < 1)
+    {
+      printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
+      exit(1);
+    }
+  }
+
+  // Display info on the env vars that can be used
+  static void DisplayUsage()
+  {
+    printf("Environment variables:\n");
+    printf("======================\n");
+    printf(" USE_HIP_CALL       - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
+    printf(" USE_MEMSET         - Perform a memset instead of a copy (ignores source memory)\n");
+    printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
+    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
+    printf(" COMBINE_TIMING     - Combines timing with launch (potentially lower timing overhead)\n");
+    printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
+    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
+    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
+    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
+    printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
+    printf(" SAMPLING_FACTOR=F  - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
+    printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
+  }
+
+  // Display env var settings
+  void DisplayEnvVars() const
+  {
+    if (!outputToCsv)
+    {
+      printf("Run configuration\n");
+      printf("=====================================================\n");
+      printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
+             useHipCall ? "HIP functions" : "custom kernels");
+      printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
+             useMemset ? "memset" : "memcopy");
+      if (useHipCall && !useMemset)
+      {
+        char* env = getenv("HSA_ENABLE_SDMA");
+        printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
+               (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+      }
+      printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
+             useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
+      printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
+             useInteractive ? "interactive" : "non-interactive");
+      printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
+             combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
+      printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
+             showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
+      printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
+             outputToCsv ? "CSV" : "console");
+      printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
+      printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
+      printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
+      printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
+      printf("\n");
+    }
+  };
+
+private:
+  // Helper function that gets parses environment variable or sets to default value
+  int GetEnvVar(std::string const varname, int defaultValue)
+  {
+    if (getenv(varname.c_str()))
+      return atoi(getenv(varname.c_str()));
+    return defaultValue;
+  }
+};
+
+#endif
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc

 EXE=TransferBench
-CXXFLAGS = -O3 -I../../src/include -I.
+CXXFLAGS = -O3 -I../../src/include -I. -lnuma

 all: $(EXE)

@@ -33,7 +33,9 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
 #include <hsa/hsa_ext_amd.h>
+
 #include "copy_kernel.h"
+#include "EnvVars.hpp"

 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
@@ -50,11 +52,12 @@ THE SOFTWARE.
 // Different src/dst memory types supported
 typedef enum
 {
-  MEM_CPU = 0,             // Pinned CPU memory
-  MEM_GPU = 1              // Global GPU memory
+  MEM_CPU      = 0,    // Pinned CPU memory
+  MEM_GPU      = 1,    // Coarse-grained global GPU memory
+  MEM_GPU_FINE = 2     // Fine-grained global GPU memory
 } MemType;

-char const MemTypeStr[3] = "CG";
+char const MemTypeStr[4] = "CGF";

 typedef enum
 {
@@ -62,17 +65,6 @@ typedef enum
  MODE_CHECK = 1          // Check data against pattern
 } ModeType;

-// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
-struct Link
-{
-  int     exeIndex;        // GPU to execute on
-  MemType srcMemType;      // Source memory type
-  int     srcIndex;        // Source device index
-  MemType dstMemType;      // Destination memory type
-  int     dstIndex;        // Destination device index
-  int     numBlocksToUse;  // Number of threadblocks to use for this Link
-};
-
 // Each threadblock copies N floats from src to dst
 struct BlockParam
 {
@@ -81,46 +73,90 @@ struct BlockParam
    float* dst;
 };

+// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
+struct Link
+{
+  // Link config
+  MemType exeMemType;      // Link executor type (CPU or GPU)
+  int     exeIndex;        // Executor index (NUMA node for CPU / device ID for GPU)
+  MemType srcMemType;      // Source memory type
+  int     srcIndex;        // Source device index
+  MemType dstMemType;      // Destination memory type
+  int     dstIndex;        // Destination device index
+  int     numBlocksToUse;  // Number of threadblocks to use for this Link
+
+  // Link implementation
+  float*      srcMem;      // Source memory
+  float*      dstMem;      // Destination memory
+
+  hipEvent_t  startEvent;
+  hipEvent_t  stopEvent;
+  hipStream_t stream;
+  BlockParam* blockParam;
+
+  double totalTime;
+};
+
 void DisplayUsage(char const* cmdName);                      // Display usage instructions
 void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
 void DisplayTopology();                                      // Display GPU topology
-void ParseLinks(char* line, std::vector<Link>& links);       // Parse Link information
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
+void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
+void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links);       // Parse Link information
+void EnablePeerAccess(int const deviceId, int const peerDeviceId);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
 void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
+void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
+
+
 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+std::string GetDesc(MemType srcMemType, int srcIndex,
+                    MemType dstMemType, int dstIndex);
 std::string GetLinkDesc(Link const& link);

-
-#define MAX_NAME_LEN 64
 #define BLOCKSIZE 256
 #define COPY_UNROLL 4
 #define MEMSET_UNROLL 4

 // GPU copy kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-CopyKernel(BlockParam* blockParams)
+GpuCopyKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
-    float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
+  float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;

-    Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
+  Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
 }

 // GPU set kernel
 __global__ void __launch_bounds__(BLOCKSIZE)
-MemsetKernel(BlockParam* blockParams)
+GpuMemsetKernel(BlockParam* blockParams)
 {
-    // Collect the arguments for this block
-    int N = blockParams[blockIdx.x].N;
-    float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
+  // Collect the arguments for this block
+  int N = blockParams[blockIdx.x].N;
+  float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;

-    // Use non-zero value
-    #pragma unroll MEMSET_UNROLL
-    for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
-    {
-      dst[tid] = 1234.0;
-    }
+  // Use non-zero value
+  #pragma unroll MEMSET_UNROLL
+  for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
+  {
+    dst[tid] = 1234.0;
+  }
+}
+
+// CPU copy kernel
+void CpuCopyKernel(BlockParam const& blockParams)
+{
+  memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
+}
+
+// CPU memset kernel
+void CpuMemsetKernel(BlockParam const& blockParams)
+{
+  for (int i = 0; i < blockParams.N; i++)
+    blockParams.dst[i] = 1234.0;
 }
@@ -1,39 +1,43 @@
-#Configfile Format:
-#==================
-#A Link is defined as a uni-directional transfer from src memory location to dst memory location
-#Each single line in the configuration file defines a set of Links to run in parallel
+# Configfile Format:
+# ==================
+# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
+# Each single line in the configuration file defines a set of Links to run in parallel

-#There are two ways to specify the configuration file:
+# There are two ways to specify the configuration file:

-#1) Basic
-#   The basic specification assumes the same number of threadblocks/CUs used per link
-#   A positive number of Links is specified followed by that number of triplets describing each Link
+# 1) Basic
+#    The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
+#    A positive number of Links is specified followed by that number of triplets describing each Link

-   #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)
+#    #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)

-#2) Advanced
-#   The advanced specification allows different number of threadblocks/CUs used per Link
-#   A negative number of links is specified, followed by quadruples describing each Link
-#   -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)
+# 2) Advanced
+#    The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
+#    A negative number of links is specified, followed by quadruples describing each Link
+#    -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)

-#Argument Details:
-#  #Links  :   Number of Links to be run in parallel
-#  #CUs    :   Number of threadblocks/CUs to use for a Link
-#  GpuIndex:   0-indexed GPU id executing the Link
-#  srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
-#  dstMemL :   Destination memory location (Where the data is to be written to)
-#              Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)
-#              Supported memory locations are:
-#              - P:    Pinned host memory   (on CPU, on NUMA node closest to provided GPU index)
-#              - G:    Global device memory (on GPU)
-#Round brackets may be included for human clarity, but will be ignored
+# Argument Details:
+#   #Links  :   Number of Links to be run in parallel
+#   #CUs    :   Number of threadblocks/CUs to use for a GPU-executed Link
+#   srcMemL :   Source memory location (Where the data is to be read from). Ignored in memset mode
+#   Executor:   Executor are specified by a character indicating executor type, followed by device index (0-indexed)
+#               - C: CPU-executed  (Indexed from 0 to 1)
+#               - G: GPU-executed  (Indexed from 0 to 3)
+#   dstMemL :   Destination memory location (Where the data is to be written to)

-#Examples:
-#1 4 (0 G0 G1)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
-#1 4 (0 G1 G0)              Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0
-#1 4 (2 P0 G2)              Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
-#2 4 (0 G0 G1) (1 G1 G0)    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
-#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+#               Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
+#               Supported memory locations are:
+#               - C:    Pinned host memory       (on NUMA node, indexed from 0 to 1)
+#               - G:    Global device memory     (on GPU device indexed from 0 to 3)
+#               - F:    Fine-grain device memory (on GPU device indexed from 0 to 3)

-# Single link between GPUs 0 and 1
-1 1 (0 G0 G1)
+# Examples:
+# 1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
+# 1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
+# 1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
+# 2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
+# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
+
+# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
+1 4 (G0->G0->G1)