[TransferBench] Adding ability to perform CPU-executed copies, various upgrades (#309)
* Adding CPU based execution, fixing typos, adding Fine-grained mem * Exposing sampling factor when generating range of data sizes * Refactoring how Links are launched, now once per thread * Documentation updates
这个提交包含在:
@@ -0,0 +1,137 @@
|
||||
#ifndef ENVVARS_HPP
|
||||
#define ENVVARS_HPP
|
||||
|
||||
// This class manages environment variable that affect TransferBench
|
||||
class EnvVars
|
||||
{
|
||||
public:
|
||||
// Default configuration values
|
||||
int const DEFAULT_NUM_WARMUPS = 3;
|
||||
int const DEFAULT_NUM_ITERATIONS = 10;
|
||||
int const DEFAULT_SAMPLING_FACTOR = 1;
|
||||
int const DEFAULT_NUM_CPU_PER_LINK = 4;
|
||||
|
||||
// Environment variables
|
||||
int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels
|
||||
int useMemset; // Perform a memset instead of a copy (ignores source memory)
|
||||
int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration
|
||||
int useInteractive; // Pause for user-input before starting transfer loop
|
||||
int useSleep; // Adds a 100ms sleep after each synchronization
|
||||
int combineTiming; // Combines the timing with kernel launch
|
||||
int showAddr; // Print out memory addresses for each Link
|
||||
int outputToCsv; // Output in CSV format
|
||||
int byteOffset; // Byte-offset for memory allocations
|
||||
int numWarmups; // Number of un-timed warmup iterations to perform
|
||||
int numIterations; // Number of timed iterations to perform
|
||||
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
|
||||
int numCpuPerLink; // Number of CPU child threads to use per CPU link
|
||||
|
||||
// Constructor that collects values
|
||||
EnvVars()
|
||||
{
|
||||
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
|
||||
useMemset = GetEnvVar("USE_MEMSET" , 0);
|
||||
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0);
|
||||
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
|
||||
combineTiming = GetEnvVar("COMBINE_TIMING" , 0);
|
||||
showAddr = GetEnvVar("SHOW_ADDR" , 0);
|
||||
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
|
||||
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
|
||||
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
|
||||
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
|
||||
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
|
||||
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
|
||||
|
||||
// Perform some basic validation
|
||||
if (byteOffset % sizeof(float))
|
||||
{
|
||||
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
|
||||
exit(1);
|
||||
}
|
||||
if (numWarmups < 0)
|
||||
{
|
||||
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
|
||||
exit(1);
|
||||
}
|
||||
if (numIterations <= 0)
|
||||
{
|
||||
printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n");
|
||||
exit(1);
|
||||
}
|
||||
if (samplingFactor < 1)
|
||||
{
|
||||
printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
|
||||
exit(1);
|
||||
}
|
||||
if (numCpuPerLink < 1)
|
||||
{
|
||||
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Display info on the env vars that can be used
|
||||
static void DisplayUsage()
|
||||
{
|
||||
printf("Environment variables:\n");
|
||||
printf("======================\n");
|
||||
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
|
||||
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
|
||||
printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n");
|
||||
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
|
||||
printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n");
|
||||
printf(" SHOW_ADDR - Print out memory addresses for each Link\n");
|
||||
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
|
||||
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
|
||||
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
|
||||
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
|
||||
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
|
||||
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
|
||||
}
|
||||
|
||||
// Display env var settings
|
||||
void DisplayEnvVars() const
|
||||
{
|
||||
if (!outputToCsv)
|
||||
{
|
||||
printf("Run configuration\n");
|
||||
printf("=====================================================\n");
|
||||
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
|
||||
useHipCall ? "HIP functions" : "custom kernels");
|
||||
printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset,
|
||||
useMemset ? "memset" : "memcopy");
|
||||
if (useHipCall && !useMemset)
|
||||
{
|
||||
char* env = getenv("HSA_ENABLE_SDMA");
|
||||
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
|
||||
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
|
||||
}
|
||||
printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync,
|
||||
useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
|
||||
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
|
||||
useInteractive ? "interactive" : "non-interactive");
|
||||
printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming,
|
||||
combineTiming ? "Using combined timing+launch" : "Using separate timing / launch");
|
||||
printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr,
|
||||
showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
|
||||
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
|
||||
outputToCsv ? "CSV" : "console");
|
||||
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
|
||||
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
|
||||
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
|
||||
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
// Helper function that gets parses environment variable or sets to default value
|
||||
int GetEnvVar(std::string const varname, int defaultValue)
|
||||
{
|
||||
if (getenv(varname.c_str()))
|
||||
return atoi(getenv(varname.c_str()));
|
||||
return defaultValue;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -6,7 +6,7 @@ endif
|
||||
HIPCC=$(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE=TransferBench
|
||||
CXXFLAGS = -O3 -I../../src/include -I.
|
||||
CXXFLAGS = -O3 -I../../src/include -I. -lnuma
|
||||
|
||||
all: $(EXE)
|
||||
|
||||
|
||||
+558
-430
文件差异内容过多而无法显示
加载差异
@@ -33,7 +33,9 @@ THE SOFTWARE.
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_ext.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include "copy_kernel.h"
|
||||
#include "EnvVars.hpp"
|
||||
|
||||
// Helper macro for catching HIP errors
|
||||
#define HIP_CALL(cmd) \
|
||||
@@ -50,11 +52,12 @@ THE SOFTWARE.
|
||||
// Different src/dst memory types supported
|
||||
typedef enum
|
||||
{
|
||||
MEM_CPU = 0, // Pinned CPU memory
|
||||
MEM_GPU = 1 // Global GPU memory
|
||||
MEM_CPU = 0, // Pinned CPU memory
|
||||
MEM_GPU = 1, // Coarse-grained global GPU memory
|
||||
MEM_GPU_FINE = 2 // Fine-grained global GPU memory
|
||||
} MemType;
|
||||
|
||||
char const MemTypeStr[3] = "CG";
|
||||
char const MemTypeStr[4] = "CGF";
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@@ -62,17 +65,6 @@ typedef enum
|
||||
MODE_CHECK = 1 // Check data against pattern
|
||||
} ModeType;
|
||||
|
||||
// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
|
||||
struct Link
|
||||
{
|
||||
int exeIndex; // GPU to execute on
|
||||
MemType srcMemType; // Source memory type
|
||||
int srcIndex; // Source device index
|
||||
MemType dstMemType; // Destination memory type
|
||||
int dstIndex; // Destination device index
|
||||
int numBlocksToUse; // Number of threadblocks to use for this Link
|
||||
};
|
||||
|
||||
// Each threadblock copies N floats from src to dst
|
||||
struct BlockParam
|
||||
{
|
||||
@@ -81,46 +73,90 @@ struct BlockParam
|
||||
float* dst;
|
||||
};
|
||||
|
||||
// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU
|
||||
struct Link
|
||||
{
|
||||
// Link config
|
||||
MemType exeMemType; // Link executor type (CPU or GPU)
|
||||
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
|
||||
MemType srcMemType; // Source memory type
|
||||
int srcIndex; // Source device index
|
||||
MemType dstMemType; // Destination memory type
|
||||
int dstIndex; // Destination device index
|
||||
int numBlocksToUse; // Number of threadblocks to use for this Link
|
||||
|
||||
// Link implementation
|
||||
float* srcMem; // Source memory
|
||||
float* dstMem; // Destination memory
|
||||
|
||||
hipEvent_t startEvent;
|
||||
hipEvent_t stopEvent;
|
||||
hipStream_t stream;
|
||||
BlockParam* blockParam;
|
||||
|
||||
double totalTime;
|
||||
};
|
||||
|
||||
void DisplayUsage(char const* cmdName); // Display usage instructions
|
||||
void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
|
||||
void DisplayTopology(); // Display GPU topology
|
||||
void ParseLinks(char* line, std::vector<Link>& links); // Parse Link information
|
||||
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
|
||||
void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector<size_t>& valuesofN);
|
||||
void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex);
|
||||
void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links); // Parse Link information
|
||||
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
|
||||
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
|
||||
void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
|
||||
void CheckPages(char* byteArray, size_t numBytes, int targetId);
|
||||
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
|
||||
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
|
||||
|
||||
|
||||
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
|
||||
std::string GetDesc(MemType srcMemType, int srcIndex,
|
||||
MemType dstMemType, int dstIndex);
|
||||
std::string GetLinkDesc(Link const& link);
|
||||
|
||||
|
||||
#define MAX_NAME_LEN 64
|
||||
#define BLOCKSIZE 256
|
||||
#define COPY_UNROLL 4
|
||||
#define MEMSET_UNROLL 4
|
||||
|
||||
// GPU copy kernel
|
||||
__global__ void __launch_bounds__(BLOCKSIZE)
|
||||
CopyKernel(BlockParam* blockParams)
|
||||
GpuCopyKernel(BlockParam* blockParams)
|
||||
{
|
||||
// Collect the arguments for this block
|
||||
int N = blockParams[blockIdx.x].N;
|
||||
const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
|
||||
float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
|
||||
// Collect the arguments for this block
|
||||
int N = blockParams[blockIdx.x].N;
|
||||
const float* __restrict__ src = (float* )blockParams[blockIdx.x].src;
|
||||
float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst;
|
||||
|
||||
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
|
||||
Copy<COPY_UNROLL, BLOCKSIZE>(dst, src, N);
|
||||
}
|
||||
|
||||
// GPU set kernel
|
||||
__global__ void __launch_bounds__(BLOCKSIZE)
|
||||
MemsetKernel(BlockParam* blockParams)
|
||||
GpuMemsetKernel(BlockParam* blockParams)
|
||||
{
|
||||
// Collect the arguments for this block
|
||||
int N = blockParams[blockIdx.x].N;
|
||||
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
|
||||
// Collect the arguments for this block
|
||||
int N = blockParams[blockIdx.x].N;
|
||||
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
|
||||
|
||||
// Use non-zero value
|
||||
#pragma unroll MEMSET_UNROLL
|
||||
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
|
||||
{
|
||||
dst[tid] = 1234.0;
|
||||
}
|
||||
// Use non-zero value
|
||||
#pragma unroll MEMSET_UNROLL
|
||||
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
|
||||
{
|
||||
dst[tid] = 1234.0;
|
||||
}
|
||||
}
|
||||
|
||||
// CPU copy kernel
|
||||
void CpuCopyKernel(BlockParam const& blockParams)
|
||||
{
|
||||
memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
|
||||
}
|
||||
|
||||
// CPU memset kernel
|
||||
void CpuMemsetKernel(BlockParam const& blockParams)
|
||||
{
|
||||
for (int i = 0; i < blockParams.N; i++)
|
||||
blockParams.dst[i] = 1234.0;
|
||||
}
|
||||
|
||||
+36
-32
@@ -1,39 +1,43 @@
|
||||
#Configfile Format:
|
||||
#==================
|
||||
#A Link is defined as a uni-directional transfer from src memory location to dst memory location
|
||||
#Each single line in the configuration file defines a set of Links to run in parallel
|
||||
# Configfile Format:
|
||||
# ==================
|
||||
# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU
|
||||
# Each single line in the configuration file defines a set of Links to run in parallel
|
||||
|
||||
#There are two ways to specify the configuration file:
|
||||
# There are two ways to specify the configuration file:
|
||||
|
||||
#1) Basic
|
||||
# The basic specification assumes the same number of threadblocks/CUs used per link
|
||||
# A positive number of Links is specified followed by that number of triplets describing each Link
|
||||
# 1) Basic
|
||||
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link
|
||||
# A positive number of Links is specified followed by that number of triplets describing each Link
|
||||
|
||||
#Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)
|
||||
# #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
|
||||
|
||||
#2) Advanced
|
||||
# The advanced specification allows different number of threadblocks/CUs used per Link
|
||||
# A negative number of links is specified, followed by quadruples describing each Link
|
||||
# -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)
|
||||
# 2) Advanced
|
||||
# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link
|
||||
# A negative number of links is specified, followed by quadruples describing each Link
|
||||
# -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)
|
||||
|
||||
#Argument Details:
|
||||
# #Links : Number of Links to be run in parallel
|
||||
# #CUs : Number of threadblocks/CUs to use for a Link
|
||||
# GpuIndex: 0-indexed GPU id executing the Link
|
||||
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
|
||||
# dstMemL : Destination memory location (Where the data is to be written to)
|
||||
# Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)
|
||||
# Supported memory locations are:
|
||||
# - P: Pinned host memory (on CPU, on NUMA node closest to provided GPU index)
|
||||
# - G: Global device memory (on GPU)
|
||||
#Round brackets may be included for human clarity, but will be ignored
|
||||
# Argument Details:
|
||||
# #Links : Number of Links to be run in parallel
|
||||
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Link
|
||||
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
|
||||
# Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed)
|
||||
# - C: CPU-executed (Indexed from 0 to 1)
|
||||
# - G: GPU-executed (Indexed from 0 to 3)
|
||||
# dstMemL : Destination memory location (Where the data is to be written to)
|
||||
|
||||
#Examples:
|
||||
#1 4 (0 G0 G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
|
||||
#1 4 (0 G1 G0) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0
|
||||
#1 4 (2 P0 G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
|
||||
#2 4 (0 G0 G1) (1 G1 G0) Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
|
||||
#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
|
||||
# Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)
|
||||
# Supported memory locations are:
|
||||
# - C: Pinned host memory (on NUMA node, indexed from 0 to 1)
|
||||
# - G: Global device memory (on GPU device indexed from 0 to 3)
|
||||
# - F: Fine-grain device memory (on GPU device indexed from 0 to 3)
|
||||
|
||||
# Single link between GPUs 0 and 1
|
||||
1 1 (0 G0 G1)
|
||||
# Examples:
|
||||
# 1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1
|
||||
# 1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0
|
||||
# 1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2
|
||||
# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs
|
||||
# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs
|
||||
# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
|
||||
|
||||
# Single GPU-executed link between GPUs 0 and 1 using 4 CUs
|
||||
1 4 (G0->G0->G1)
|
||||
|
||||
在新工单中引用
屏蔽一个用户