Transfer bench single stream mode (#531)

- Adding single stream mode - Removing some unused env vars - Adding output to CSV mode for p2p benchmark, topology listing modes [ROCm/rccl commit: def6832287]
2022-04-08 15:20:55 -06:00
commit e61ff3ce37
@@ -0,0 +1,16 @@
+# Changelog for TransferBench
+
+## v1.01
+### Added
+- Adding USE_SINGLE_STREAM feature
+  - All Links that execute on the same GPU device are executed with a single kernel launch on a single stream
+  - Does not work with USE_HIP_CALL and forces USE_SINGLE_SYNC to collect timings
+  - Adding ability to request coherent / fine-grained host memory ('B')
+### Changed
+- Separating TransferBench from RCCL repo
+- Peer-to-peer benchmark mode now works OUTPUT_TO_CSV
+- Toplogy display now works with OUTPUT_TO_CSV
+- Moving documentation about config file into example.cfg
+### Removed
+- Removed config file generation
+- Removed show pointer address environment variable (SHOW_ADDR)
@@ -25,6 +25,8 @@ THE SOFTWARE.

 #include <algorithm>

+#define TB_VERSION "1.01"
+
 // This class manages environment variable that affect TransferBench
 class EnvVars
 {
@@ -40,9 +42,7 @@ public:
  int useMemset;       // Perform a memset instead of a copy (ignores source memory)
  int useSingleSync;   // Perform synchronization only once after all iterations instead of per iteration
  int useInteractive;  // Pause for user-input before starting transfer loop
-  int useSleep;        // Adds a 100ms sleep after each synchronization
  int combineTiming;   // Combines the timing with kernel launch
-  int showAddr;        // Print out memory addresses for each Link
  int outputToCsv;     // Output in CSV format
  int byteOffset;      // Byte-offset for memory allocations
  int numWarmups;      // Number of un-timed warmup iterations to perform
@@ -52,6 +52,7 @@ public:
  int sharedMemBytes;  // Amount of shared memory to use per threadblock
  int blockBytes;      // Each CU, except the last, gets a multiple of this many bytes to copy
  int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
+  int useSingleStream; // Use a single stream per device instead of per Link. Can not be used with USE_HIP_CALL

  std::vector<float> fillPattern; // Pattern of floats used to fill source data

@@ -67,7 +68,6 @@ public:
    useSingleSync   = GetEnvVar("USE_SINGLE_SYNC"  , 1);
    useInteractive  = GetEnvVar("USE_INTERACTIVE"  , 0);
    combineTiming   = GetEnvVar("COMBINE_TIMING"   , 0);
-    showAddr        = GetEnvVar("SHOW_ADDR"        , 0);
    outputToCsv     = GetEnvVar("OUTPUT_TO_CSV"    , 0);
    byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
    numWarmups      = GetEnvVar("NUM_WARMUPS"      , DEFAULT_NUM_WARMUPS);
@@ -77,6 +77,7 @@ public:
    sharedMemBytes  = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
    blockBytes      = GetEnvVar("BLOCK_BYTES"      , 256);
    usePcieIndexing = GetEnvVar("USE_PCIE_INDEX"   , 0);
+    useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 0);

    // Check for fill pattern
    char* pattern = getenv("FILL_PATTERN");
@@ -172,6 +173,11 @@ public:
      printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
      exit(1);
    }
+    if (useSingleStream && useHipCall)
+    {
+      printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
+      exit(1);
+    }
  }

  // Display info on the env vars that can be used
@@ -184,7 +190,6 @@ public:
    printf(" USE_SINGLE_SYNC    - Perform synchronization only once after all iterations instead of per iteration\n");
    printf(" USE_INTERACTIVE    - Pause for user-input before starting transfer loop\n");
    printf(" COMBINE_TIMING     - Combines timing with launch (potentially lower timing overhead)\n");
-    printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
    printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
    printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
    printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
@@ -195,6 +200,7 @@ public:
    printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
    printf(" BLOCK_BYTES=B      - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
    printf(" USE_PCIE_INDEX     - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
+    printf(" USE_SINGLE_STREAM  - Use single stream per device instead of per link.  Cannot be used with USE_HIP_CALL\n");
  }

  // Display env var settings
@@ -202,7 +208,7 @@ public:
  {
    if (!outputToCsv)
    {
-      printf("Run configuration\n");
+      printf("Run configuration (TransferBench v%s)\n", TB_VERSION);
      printf("=====================================================\n");
      printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
             useHipCall ? "HIP functions" : "custom kernels");
@@ -220,8 +226,6 @@ public:
             useInteractive ? "interactive" : "non-interactive");
      printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
             combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
-      printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
-             showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
      printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
             outputToCsv ? "CSV" : "console");
      printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
@@ -242,13 +246,13 @@ public:
             getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
      printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
      printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
+      printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", useSingleStream, (useSingleStream ? "device" : "Link"));
      printf("\n");
    }
  };

-private:
  // Helper function that gets parses environment variable or sets to default value
-  int GetEnvVar(std::string const varname, int defaultValue)
+  static int GetEnvVar(std::string const varname, int defaultValue)
  {
    if (getenv(varname.c_str()))
      return atoi(getenv(varname.c_str()));
@@ -36,6 +36,7 @@ GpuCopyKernel(BlockParam* blockParams)
  int          Nrem = blockParams[blockIdx.x].N;
  float const* src  = blockParams[blockIdx.x].src;
  float*       dst  = blockParams[blockIdx.x].dst;
+  if (threadIdx.x == 0) blockParams[blockIdx.x].startCycle = __builtin_amdgcn_s_memrealtime();

  // Operate on wavefront granularity
  int numWaves = BLOCKSIZE   / WARP_SIZE; // Number of wavefronts per threadblock
@@ -68,30 +69,34 @@ GpuCopyKernel(BlockParam* blockParams)
    loop1Offset += loop1Inc;
  }
  Nrem -= loop1Nelem;
-  if (Nrem == 0) return;
-
-  // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
-  int const loop2Npack  = Nrem / FLOATS_PER_PACK;
-  int const loop2Nelem  = loop2Npack * FLOATS_PER_PACK;
-  int const loop2Inc    = BLOCKSIZE;
-  int       loop2Offset = threadIdx.x;
-
-  packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
-  packedDst = (PackedFloat_t      *)(dst + loop1Nelem);
-  while (loop2Offset < loop2Npack)
+  if (Nrem > 0)
  {
-    packedDst[loop2Offset] = packedSrc[loop2Offset];
-    loop2Offset += loop2Inc;
-  }
-  Nrem -= loop2Nelem;
-  if (Nrem == 0) return;
+    // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
+    int const loop2Npack  = Nrem / FLOATS_PER_PACK;
+    int const loop2Nelem  = loop2Npack * FLOATS_PER_PACK;
+    int const loop2Inc    = BLOCKSIZE;
+    int       loop2Offset = threadIdx.x;

-  // Deal with leftovers less than FLOATS_PER_PACK)
-  if (threadIdx.x < Nrem)
-  {
-    int offset = loop1Nelem + loop2Nelem + threadIdx.x;
-    dst[offset] = src[offset];
+    packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
+    packedDst = (PackedFloat_t      *)(dst + loop1Nelem);
+    while (loop2Offset < loop2Npack)
+    {
+      packedDst[loop2Offset] = packedSrc[loop2Offset];
+      loop2Offset += loop2Inc;
+    }
+    Nrem -= loop2Nelem;
+
+    // Deal with leftovers less than FLOATS_PER_PACK)
+    if (threadIdx.x < Nrem)
+    {
+      int offset = loop1Nelem + loop2Nelem + threadIdx.x;
+      dst[offset] = src[offset];
+    }
  }
+
+  __threadfence_system();
+  if (threadIdx.x == 0)
+    blockParams[blockIdx.x].stopCycle = __builtin_amdgcn_s_memrealtime();
 }

 #define MEMSET_UNROLL 8
@@ -1,12 +1,9 @@
-# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
-HIP_PATH?= $(wildcard /opt/rocm/hip)
-ifeq (,$(HIP_PATH))
-HIP_PATH=../../..
-endif
-HIPCC=$(HIP_PATH)/bin/hipcc
+# Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ROCM_PATH ?= /opt/rocm
+HIPCC=$(ROCM_PATH)/bin/hipcc

 EXE=TransferBench
-CXXFLAGS = -O3 -I. -lnuma -L$(HIP_PATH)/../hsa/lib -lhsa-runtime64
+CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64

 all: $(EXE)

@@ -0,0 +1,14 @@
+# TransferBench
+
+TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs).
+
+## Requirements
+
+1. ROCm stack installed on the system (HIP runtime)
+2. libnuma installed on system
+
+## Building
+  To build TransferBench:
+* `make`
+
+  If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -49,15 +49,19 @@ THE SOFTWARE.
        }                                                               \
    } while (0)

+// Simple configuration parameters
+size_t const DEFAULT_BYTES_PER_LINK = (1<<26);  // Amount of data transferred per Link
+
 // Different src/dst memory types supported
 typedef enum
 {
-  MEM_CPU      = 0,    // Pinned CPU memory
+  MEM_CPU      = 0,    // Coarse-grained pinned CPU memory
  MEM_GPU      = 1,    // Coarse-grained global GPU memory
-  MEM_GPU_FINE = 2     // Fine-grained global GPU memory
+  MEM_CPU_FINE = 2,    // Fine-grained pinned CPU memory
+  MEM_GPU_FINE = 3     // Fine-grained global GPU memory
 } MemType;

-char const MemTypeStr[4] = "CGF";
+char const MemTypeStr[5] = "CGBF";

 typedef enum
 {
@@ -68,55 +72,99 @@ typedef enum
 // Each threadblock copies N floats from src to dst
 struct BlockParam
 {
-    int N;
-    float* src;
-    float* dst;
+  int       N;
+  float*    src;
+  float*    dst;
+  long long startCycle;
+  long long stopCycle;
 };

-// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
+// Each Link is a uni-direction operation from a src memory to dst memory
 struct Link
 {
+  int     linkIndex;           // Link identifier
+
  // Link config
-  MemType exeMemType;      // Link executor type (CPU or GPU)
-  int     exeIndex;        // Executor index (NUMA node for CPU / device ID for GPU)
-  MemType srcMemType;      // Source memory type
-  int     srcIndex;        // Source device index
-  MemType dstMemType;      // Destination memory type
-  int     dstIndex;        // Destination device index
-  int     numBlocksToUse;  // Number of threadblocks to use for this Link
+  MemType exeMemType;          // Link executor type (CPU or GPU)
+  int     exeIndex;            // Executor index (NUMA node for CPU / device ID for GPU)
+  MemType srcMemType;          // Source memory type
+  int     srcIndex;            // Source device index
+  MemType dstMemType;          // Destination memory type
+  int     dstIndex;            // Destination device index
+  int     numBlocksToUse;      // Number of threadblocks to use for this Link

-  // Link implementation
-  float*      srcMem;      // Source memory
-  float*      dstMem;      // Destination memory
+  // Memory
+  float*  srcMem;              // Source memory
+  float*  dstMem;              // Destination memory

-  hipEvent_t  startEvent;
-  hipEvent_t  stopEvent;
-  hipStream_t stream;
-  BlockParam* blockParam;
+  // How memory is split across threadblocks / CPU cores
+  std::vector<BlockParam> blockParam;
+  BlockParam* blockParamGpuPtr;

+  // Results
+  double  linkTime;
+
+  // Prepares src memory and how to divide N elements across threadblocks/threads
+  void PrepareBlockParams(EnvVars const& ev, size_t const N);
+};
+
+typedef std::pair<MemType, int> Executor;
+
+struct ExecutorInfo
+{
+  std::vector<Link> links;       // Links to execute
+
+  // For GPU-Executors
+  int                      totalBlocks;   // Total number of CUs/CPU threads to use
+  BlockParam*              blockParamGpu; // Copy of block parameters in GPU device memory
+  std::vector<hipStream_t> streams;
+  std::vector<hipEvent_t>  startEvents;
+  std::vector<hipEvent_t>  stopEvents;
+
+  // Results
  double totalTime;
 };

-void DisplayUsage(char const* cmdName);                      // Display usage instructions
-void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
-void DisplayTopology();                                      // Display GPU topology
-void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
-void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
-void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links);       // Parse Link information
+typedef std::map<Executor, ExecutorInfo> LinkMap;
+
+// Display usage instructions
+void DisplayUsage(char const* cmdName);
+
+// Display detected GPU topology / CPU numa nodes
+void DisplayTopology(bool const outputToCsv);
+
+// Build array of test sizes based on sampling factor
+void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor,
+                       std::vector<size_t>& valuesofN);
+
+void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
+                  MemType* memType, int* memIndex);
+
+void ParseLinks(char* line, int numCpus, int numGpus,
+                LinkMap& linkMap);
+
 void EnablePeerAccess(int const deviceId, int const peerDeviceId);
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
-void DeallocateMemory(MemType memType, float* memPtr);
+void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
+void DeallocateMemory(MemType memType, void* memPtr);
 void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
-void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
+void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx);
 void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
-double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
-                        MemType srcMemType, int srcIndex,
-                        MemType dstMemType, int dstIndex,
-                        int readMode);
+
+// Return the maximum bandwidth measured for given (src/dst) pair
+double GetPeakBandwidth(EnvVars const& ev,
+                        size_t  const  N,
+                        int     const  isBidirectional,
+                        int     const  readMode,
+                        int     const  numBlocksToUse,
+                        MemType const  srcMemType,
+                        int     const  srcIndex,
+                        MemType const  dstMemType,
+                        int     const  dstIndex);

 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
 std::string GetDesc(MemType srcMemType, int srcIndex,
                    MemType dstMemType, int dstIndex);
 std::string GetLinkDesc(Link const& link);
 int RemappedIndex(int const origIdx, MemType const memType);
+int GetWallClockRate(int deviceId);
@@ -1,6 +1,7 @@
-# Configfile Format:
+# ConfigFile Format:
 # ==================
-# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
+# A Link is defined as a uni-directional transfer from src memory location to dst memory location
+# executed by either CPU or GPU
 # Each single line in the configuration file defines a set of Links to run in parallel

 # There are two ways to specify the configuration file:
@@ -25,19 +26,22 @@
 #               - G: GPU-executed  (Indexed from 0 to 3)
 #   dstMemL :   Destination memory location (Where the data is to be written to)

-#               Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
+#               Memory locations are specified by a character indicating memory type,
+#               followed by device index (0-indexed)
 #               Supported memory locations are:
-#               - C:    Pinned host memory       (on NUMA node, indexed from 0 to 1)
-#               - G:    Global device memory     (on GPU device indexed from 0 to 3)
-#               - F:    Fine-grain device memory (on GPU device indexed from 0 to 3)
+#               - C:    Pinned host memory       (on NUMA node, indexed from 0 to [# NUMA nodes-1])
+#               - B:    Fine-grain host memory   (on NUMA node, indexed from 0 to [# NUMA nodes-1])
+#               - G:    Global device memory     (on GPU device indexed from 0 to [# GPUs - 1])
+#               - F:    Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])

 # Examples:
-# 1 4 (G0->G0->G1)             Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
-# 1 4 (G1->C0->G0)             Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
-# 1 4 (C0->G2->G2)             Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
-# 2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
-# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
+# 1 4 (G0->G0->G1)             Single link using 4 CUs on GPU0 to copy from GPU0 to GPU1
+# 1 4 (C1->G2->G0)             Single link using 4 CUs on GPU2 to copy from CPU1 to GPU0
+# 2 4 G0->G0->G1 G1->G1->G0    Runs 2 Links in parallel.  GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
+# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel.  GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs
+
 # Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
+# Lines starting with # will be ignored. Lines starting with ## will be echoed to output

 # Single GPU-executed link between GPUs 0 and 1 using 4 CUs
 1 4 (G0->G0->G1)