[TransferBench] Adding ability to perform CPU-executed copies, various upgrades (#309)

* Adding CPU based execution, fixing typos, adding Fine-grained mem
* Exposing sampling factor when generating range of data sizes
* Refactoring how Links are launched, now once per thread
* Documentation updates
这个提交包含在:
gilbertlee-amd
2020-12-11 10:21:14 -07:00
提交者 GitHub
父节点 b80ae551b1
当前提交 ae0c4092c7
修改 5 个文件,包含 802 行新增497 行删除
+137
查看文件
@@ -0,0 +1,137 @@
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// This class manages environment variable that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_LINK = 4;
// Environment variables
int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels
int useMemset; // Perform a memset instead of a copy (ignores source memory)
int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration
int useInteractive; // Pause for user-input before starting transfer loop
int useSleep; // Adds a 100ms sleep after each synchronization
int combineTiming; // Combines the timing with kernel launch
int showAddr; // Print out memory addresses for each Link
int outputToCsv; // Output in CSV format
int byteOffset; // Byte-offset for memory allocations
int numWarmups; // Number of un-timed warmup iterations to perform
int numIterations; // Number of timed iterations to perform
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int numCpuPerLink; // Number of CPU child threads to use per CPU link
// Constructor that collects values
EnvVars()
{
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
useMemset = GetEnvVar("USE_MEMSET" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
combineTiming = GetEnvVar("COMBINE_TIMING" , 0);
showAddr = GetEnvVar("SHOW_ADDR" , 0);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
// Perform some basic validation
if (byteOffset % sizeof(float))
{
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
exit(1);
}
if (numWarmups < 0)
{
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
exit(1);
}
if (numIterations <= 0)
{
printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
exit(1);
}
if (samplingFactor < 1)
{
printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
exit(1);
}
if (numCpuPerLink < 1)
{
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
exit(1);
}
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n");
printf(" SHOW_ADDR - Print out memory addresses for each Link\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
}
// Display env var settings
void DisplayEnvVars() const
{
if (!outputToCsv)
{
printf("Run configuration\n");
printf("=====================================================\n");
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
useHipCall ? "HIP functions" : "custom kernels");
printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
useMemset ? "memset" : "memcopy");
if (useHipCall && !useMemset)
{
char* env = getenv("HSA_ENABLE_SDMA");
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
}
printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
printf("\n");
}
};
private:
// Helper function that gets parses environment variable or sets to default value
int GetEnvVar(std::string const varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
};
#endif
+1 -1
查看文件
@@ -6,7 +6,7 @@ endif
HIPCC=$(HIP_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I../../src/include -I.
CXXFLAGS = -O3 -I../../src/include -I. -lnuma
all: $(EXE)
文件差异内容过多而无法显示 加载差异
+70 -34
查看文件
@@ -33,7 +33,9 @@ THE SOFTWARE.
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include <hsa/hsa_ext_amd.h>
#include "copy_kernel.h"
#include "EnvVars.hpp"
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
@@ -50,11 +52,12 @@ THE SOFTWARE.
// Different src/dst memory types supported
typedef enum
{
MEM_CPU = 0, // Pinned CPU memory
MEM_GPU = 1 // Global GPU memory
MEM_CPU = 0, // Pinned CPU memory
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_GPU_FINE = 2 // Fine-grained global GPU memory
} MemType;
char const MemTypeStr[3] = "CG";
char const MemTypeStr[4] = "CGF";
typedef enum
{
@@ -62,17 +65,6 @@ typedef enum
MODE_CHECK = 1 // Check data against pattern
} ModeType;
// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
struct Link
{
int exeIndex; // GPU to execute on
MemType srcMemType; // Source memory type
int srcIndex; // Source device index
MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Link
};
// Each threadblock copies N floats from src to dst
struct BlockParam
{
@@ -81,46 +73,90 @@ struct BlockParam
float* dst;
};
// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
struct Link
{
// Link config
MemType exeMemType; // Link executor type (CPU or GPU)
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
MemType srcMemType; // Source memory type
int srcIndex; // Source device index
MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Link
// Link implementation
float* srcMem; // Source memory
float* dstMem; // Destination memory
hipEvent_t startEvent;
hipEvent_t stopEvent;
hipStream_t stream;
BlockParam* blockParam;
double totalTime;
};
void DisplayUsage(char const* cmdName); // Display usage instructions
void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
void DisplayTopology(); // Display GPU topology
void ParseLinks(char* line, std::vector<Link>& links); // Parse Link information
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links); // Parse Link information
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetLinkDesc(Link const& link);
#define MAX_NAME_LEN 64
#define BLOCKSIZE 256
#define COPY_UNROLL 4
#define MEMSET_UNROLL 4
// GPU copy kernel
__global__ void __launch_bounds__(BLOCKSIZE)
CopyKernel(BlockParam* blockParams)
GpuCopyKernel(BlockParam* blockParams)
{
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
}
// GPU set kernel
__global__ void __launch_bounds__(BLOCKSIZE)
MemsetKernel(BlockParam* blockParams)
GpuMemsetKernel(BlockParam* blockParams)
{
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
// Use non-zero value
#pragma unroll MEMSET_UNROLL
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
{
dst[tid] = 1234.0;
}
// Use non-zero value
#pragma unroll MEMSET_UNROLL
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
{
dst[tid] = 1234.0;
}
}
// CPU copy kernel
void CpuCopyKernel(BlockParam const& blockParams)
{
memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
}
// CPU memset kernel
void CpuMemsetKernel(BlockParam const& blockParams)
{
for (int i = 0; i < blockParams.N; i++)
blockParams.dst[i] = 1234.0;
}
+36 -32
查看文件
@@ -1,39 +1,43 @@
#Configfile Format:
#==================
#A Link is defined as a uni-directional transfer from src memory location to dst memory location
#Each single line in the configuration file defines a set of Links to run in parallel
# Configfile Format:
# ==================
# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
# Each single line in the configuration file defines a set of Links to run in parallel
#There are two ways to specify the configuration file:
# There are two ways to specify the configuration file:
#1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per link
# A positive number of Links is specified followed by that number of triplets describing each Link
# 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
# A positive number of Links is specified followed by that number of triplets describing each Link
#Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)
# #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
#2) Advanced
# The advanced specification allows different number of threadblocks/CUs used per Link
# A negative number of links is specified, followed by quadruples describing each Link
# -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)
# 2) Advanced
# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
# A negative number of links is specified, followed by quadruples describing each Link
# -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
#Argument Details:
# #Links : Number of Links to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a Link
# GpuIndex: 0-indexed GPU id executing the Link
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# dstMemL : Destination memory location (Where the data is to be written to)
# Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)
# Supported memory locations are:
# - P: Pinned host memory (on CPU, on NUMA node closest to provided GPU index)
# - G: Global device memory (on GPU)
#Round brackets may be included for human clarity, but will be ignored
# Argument Details:
# #Links : Number of Links to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Link
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to 1)
# - G: GPU-executed (Indexed from 0 to 3)
# dstMemL : Destination memory location (Where the data is to be written to)
#Examples:
#1 4 (0 G0 G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
#1 4 (0 G1 G0) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0
#1 4 (2 P0 G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
#2 4 (0 G0 G1) (1 G1 G0) Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
# Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to 1)
# - G: Global device memory (on GPU device indexed from 0 to 3)
# - F: Fine-grain device memory (on GPU device indexed from 0 to 3)
# Single link between GPUs 0 and 1
1 1 (0 G0 G1)
# Examples:
# 1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
# 1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
# 1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)