From ae0c4092c7c0149391dfc30e6dcbc3ea68726bfa Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 11 Dec 2020 10:21:14 -0700 Subject: [PATCH] [TransferBench] Adding ability to perform CPU-executed copies, various upgrades (#309) * Adding CPU based execution, fixing typos, adding Fine-grained mem * Exposing sampling factor when generating range of data sizes * Refactoring how Links are launched, now once per thread * Documentation updates --- tools/TransferBench/EnvVars.hpp | 137 ++++ tools/TransferBench/Makefile | 2 +- tools/TransferBench/TransferBench.cpp | 988 +++++++++++++++----------- tools/TransferBench/TransferBench.hpp | 104 ++- tools/TransferBench/example.cfg | 68 +- 5 files changed, 802 insertions(+), 497 deletions(-) create mode 100644 tools/TransferBench/EnvVars.hpp diff --git a/tools/TransferBench/EnvVars.hpp b/tools/TransferBench/EnvVars.hpp new file mode 100644 index 0000000000..258f0ee33c --- /dev/null +++ b/tools/TransferBench/EnvVars.hpp @@ -0,0 +1,137 @@ +#ifndef ENVVARS_HPP +#define ENVVARS_HPP + +// This class manages environment variable that affect TransferBench +class EnvVars +{ +public: + // Default configuration values + int const DEFAULT_NUM_WARMUPS = 3; + int const DEFAULT_NUM_ITERATIONS = 10; + int const DEFAULT_SAMPLING_FACTOR = 1; + int const DEFAULT_NUM_CPU_PER_LINK = 4; + + // Environment variables + int useHipCall; // Use hipMemcpy/hipMemset instead of custom shader kernels + int useMemset; // Perform a memset instead of a copy (ignores source memory) + int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration + int useInteractive; // Pause for user-input before starting transfer loop + int useSleep; // Adds a 100ms sleep after each synchronization + int combineTiming; // Combines the timing with kernel launch + int showAddr; // Print out memory addresses for each Link + int outputToCsv; // Output in CSV format + int byteOffset; // Byte-offset for memory allocations + int numWarmups; // Number of un-timed warmup iterations to perform + int numIterations; // Number of timed iterations to perform + int samplingFactor; // Affects how many different values of N are generated (when N set to 0) + int numCpuPerLink; // Number of CPU child threads to use per CPU link + + // Constructor that collects values + EnvVars() + { + useHipCall = GetEnvVar("USE_HIP_CALL" , 0); + useMemset = GetEnvVar("USE_MEMSET" , 0); + useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0); + useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); + combineTiming = GetEnvVar("COMBINE_TIMING" , 0); + showAddr = GetEnvVar("SHOW_ADDR" , 0); + outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); + byteOffset = GetEnvVar("BYTE_OFFSET" , 0); + numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); + numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); + samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); + numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK); + + // Perform some basic validation + if (byteOffset % sizeof(float)) + { + printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float)); + exit(1); + } + if (numWarmups < 0) + { + printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n"); + exit(1); + } + if (numIterations <= 0) + { + printf("[ERROR] NUM_ITERATIONS must be set to a positive number\n"); + exit(1); + } + if (samplingFactor < 1) + { + printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n"); + exit(1); + } + if (numCpuPerLink < 1) + { + printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n"); + exit(1); + } + } + + // Display info on the env vars that can be used + static void DisplayUsage() + { + printf("Environment variables:\n"); + printf("======================\n"); + printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n"); + printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n"); + printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n"); + printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); + printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n"); + printf(" SHOW_ADDR - Print out memory addresses for each Link\n"); + printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); + printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); + printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); + printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); + printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); + printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n"); + } + + // Display env var settings + void DisplayEnvVars() const + { + if (!outputToCsv) + { + printf("Run configuration\n"); + printf("=====================================================\n"); + printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall, + useHipCall ? "HIP functions" : "custom kernels"); + printf("%-20s = %12d : Performing %s\n", "USE_MEMSET", useMemset, + useMemset ? "memset" : "memcopy"); + if (useHipCall && !useMemset) + { + char* env = getenv("HSA_ENABLE_SDMA"); + printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env, + (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines"); + } + printf("%-20s = %12d : %s\n", "USE_SINGLE_SYNC", useSingleSync, + useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration"); + printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive, + useInteractive ? "interactive" : "non-interactive"); + printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming, + combineTiming ? "Using combined timing+launch" : "Using separate timing / launch"); + printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr, + showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses"); + printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv, + outputToCsv ? "CSV" : "console"); + printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); + printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); + printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations); + printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink); + printf("\n"); + } + }; + +private: + // Helper function that gets parses environment variable or sets to default value + int GetEnvVar(std::string const varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + } +}; + +#endif diff --git a/tools/TransferBench/Makefile b/tools/TransferBench/Makefile index af0b4597b7..c51bd47b4c 100644 --- a/tools/TransferBench/Makefile +++ b/tools/TransferBench/Makefile @@ -6,7 +6,7 @@ endif HIPCC=$(HIP_PATH)/bin/hipcc EXE=TransferBench -CXXFLAGS = -O3 -I../../src/include -I. +CXXFLAGS = -O3 -I../../src/include -I. -lnuma all: $(EXE) diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index 531589695d..f6d90fdb29 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -24,11 +24,13 @@ THE SOFTWARE. // on the same node #include "TransferBench.hpp" +#include +#include +#include +#include // Simple configuration parameters -size_t const DEFAULT_BYTES_PER_LINK = (1<<26); -int const DEFAULT_NUM_WARMUPS = 3; -int const DEFAULT_NUM_ITERATIONS = 10; +size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link int main(int argc, char **argv) { @@ -41,130 +43,13 @@ int main(int argc, char **argv) } // If a negative value is listed for N, generate a comprehensive config file for this node - if (argc > 2 && atoi(argv[2]) < 0) + if (argc > 2 && atoll(argv[2]) < 0) { GenerateConfigFile(argv[1], -1*atoi(argv[2])); exit(0); } - // Collect environment variables / display current run configuration - bool useHipCall = getenv("USE_HIP_CALL"); // Use hipMemcpy/hipMemset instead of custom shader kernels - bool useMemset = getenv("USE_MEMSET"); // Perform a memset instead of a copy (ignores source memory) - bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM"); // Allocate fine-grained GPU memory instead of coarse-grained GPU memory - bool useSingleSync = getenv("USE_SINGLE_SYNC"); // Perform synchronization only once after all iterations instead of per iteration - bool useInteractive = getenv("USE_INTERACTIVE"); // Pause for user-input before starting transfer loop - bool useSleep = getenv("USE_SLEEP"); // Adds a 100ms sleep after each synchronization - bool combineTiming = getenv("COMBINE_TIMING"); // Combines the timing with kernel launch - bool reuseStreams = getenv("REUSE_STREAMS"); // Re-use streams instead of creating / destroying per test - bool showAddr = getenv("SHOW_ADDR"); // Print out memory addresses for each Link - bool outputToCsv = getenv("OUTPUT_TO_CSV"); // Output in CSV format - int byteOffset = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations - int numWarmups = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS; - int numIterations = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS; - - // Determine number of bytes to run per link - // If a non-zero number of bytes is specified, use it - // Otherwise generate array of bytes values to execute over - std::vector valuesOfN; - size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK; - - if (numBytesPerLink % 4) - { - printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink); - exit(1); - } - - if (numBytesPerLink != 0) - { - size_t N = numBytesPerLink / sizeof(float); - valuesOfN.push_back(N); - } - else - { - for (int N = 256; N <= (1<<27); N *= 2) - { - int decimationFactor = 1; // This can be modified to increase number of samples between powers of two - int delta = std::max(32, N / decimationFactor); - int curr = N; - while (curr < N * 2) - { - valuesOfN.push_back(curr); - curr += delta; - } - } - } - - if (byteOffset % 4) - { - printf("[ERROR] byteOffset must be a multiple of 4\n"); - exit(1); - } - int initOffset = byteOffset / sizeof(float); - - char *env; - if (!outputToCsv) - { - printf("Run configuration\n"); - printf("=====================================================\n"); - printf("%-20s %8s: Using %s\n", - "USE_HIP_CALL", useHipCall ? "(set)" : "(unset)", - useHipCall ? "HIP functions" : "custom kernels"); - printf("%-20s %8s: Performing %s\n", - "USE_MEMSET", useMemset ? "(set)" : "(unset)", - useMemset ? "memset" : "memcopy"); - if (useHipCall && !useMemset) - { - env = getenv("HSA_ENABLE_SDMA"); - printf("%-20s %8s: %s\n", - "HSA_ENABLE_SDMA", env ? env : "(unset)", - (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines"); - } - printf("%-20s %8s: GPU destination memory type: %s-grained\n", - "USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)", - useFineGrainMem ? "fine" : "coarse"); - printf("%-20s %8s: %s\n", - "USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)", - useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration"); - printf("%-20s %8s: Running in %s mode\n", - "USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)", - useInteractive ? "interactive" : "non-interactive"); - printf("%-20s %8s: %s\n", - "USE_SLEEP", useSleep ? "(set)" : "(unset)", - useSleep ? "Add sleep after each sync" : "No sleep per sync"); - printf("%-20s %8s: %s\n", - "COMBINE_TIMING", combineTiming ? "(set)" : "(unset)", - combineTiming ? "Using combined timing+launch" : "Using separate timing / launch"); - printf("%-20s %8s: %s\n", - "REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)", - reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology"); - printf("%-20s %8s: %s\n", - "SHOW_ADDR", showAddr ? "(set)" : "(unset)", - showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses"); - env = getenv("OUTPUT_TO_CSV"); - printf("%-20s %8s: Output to csv\n", - "OUTPUT_TO_CSV", env ? env : "(unset)"); - env = getenv("BYTE_OFFSET"); - printf("%-20s %8s: Using byte offset of %d\n", - "BYTE_OFFSET", env ? env : "(unset)", byteOffset); - env = getenv("NUM_WARMUPS"); - printf("%-20s %8s: Running %d warmup iteration(s) per topology\n", - "NUM_WARMUPS", env ? env : "(unset)", numWarmups); - env = getenv("NUM_ITERATIONS"); - printf("%-20s %8s: Running %d timed iteration(s) per topology\n", - "NUM_ITERATIONS", env ? env : "(unset)", numIterations); - printf("\n"); - } - - // Collect the number of available CPUs/GPUs on this machine - int numGpuDevices; - HIP_CALL(hipGetDeviceCount(&numGpuDevices)); - if (numGpuDevices < 1) - { - printf("[ERROR] No GPU devices found\n"); - exit(1); - } - - // Read configuration file + // Check that Link configuration file can be opened FILE* fp = fopen(argv[1], "r"); if (!fp) { @@ -172,148 +57,177 @@ int main(int argc, char **argv) exit(1); } - // Track links that get used - std::map, int> linkMap; - std::vector> streamCache(numGpuDevices); - - // Print CSV header - if (outputToCsv) + // Check for NUMA library support + if (numa_available() == -1) { - printf("Test,NumBytes,ExeGpu,SrcMem,DstMem,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,numWarmups,numIters,useHipCall,useMemSet,useFineGrain,useSingleSync,resuseStreams\n"); + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); } - // Loop over each line in the configuration file + // Collect environment variables / display current run configuration + EnvVars ev; + ev.DisplayEnvVars(); + + // Determine number of bytes to run per Link + // If a non-zero number of bytes is specified, use it + // Otherwise generate array of bytes values to execute over + std::vector valuesOfN; + size_t const numBytesPerLink = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_LINK; + PopulateTestSizes(numBytesPerLink, ev.samplingFactor, valuesOfN); + + // Find the largest N to be used - memory will only be allocated once per link config + size_t maxN = valuesOfN[0]; + for (auto N : valuesOfN) + maxN = std::max(maxN, N); + + int const initOffset = ev.byteOffset / sizeof(float); + std::stack threads; + + // Collect the number of available CPUs/GPUs on this machine + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + + // Track links that get used + std::set> peerAccessTracker; + + // Print CSV header + if (ev.outputToCsv) + { + printf("Test,NumBytes,Executor,SrcMem,DstMem,CUs,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters,useHipCall,useMemSet,useSingleSync,combinedTiming\n"); + } + + // Loop over each line in the Link configuration file int testNum = 0; char line[2048]; while(fgets(line, 2048, fp)) { // Parse links from configuration file std::vector links; - ParseLinks(line, links); + ParseLinks(line, numCpuDevices, numGpuDevices, links); int const numLinks = links.size(); if (numLinks == 0) continue; testNum++; + // Prepare link + for (int i = 0; i < numLinks; i++) + { + // Get some aliases to link variables + MemType const& exeMemType = links[i].exeMemType; + int const& exeIndex = links[i].exeIndex; + MemType const& srcMemType = links[i].srcMemType; + MemType const& dstMemType = links[i].dstMemType; + int const& srcIndex = links[i].srcIndex; + int const& dstIndex = links[i].dstIndex; + int const& blocksToUse = links[i].numBlocksToUse; + + // Enable peer-to-peer access if necessary (can only be called once per unique pair) + if (exeMemType == MEM_GPU) + { + // Ensure executing GPU can access source memory + if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex) + { + auto exeSrcPair = std::make_pair(exeIndex, srcIndex); + if (!peerAccessTracker.count(exeSrcPair)) + { + EnablePeerAccess(exeIndex, srcIndex); + peerAccessTracker.insert(exeSrcPair); + } + } + + // Ensure executing GPU can access destination memory + if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex) + { + auto exeDstPair = std::make_pair(exeIndex, dstIndex); + if (!peerAccessTracker.count(exeDstPair)) + { + EnablePeerAccess(exeIndex, dstIndex); + peerAccessTracker.insert(exeDstPair); + } + } + } + + // Allocate (maximum) source / destination memory based on type / device index + AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].srcMem); + AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].dstMem); + + // Prepare execution agent + if (exeMemType == MEM_GPU) + { + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipEventCreate(&links[i].startEvent)); + HIP_CALL(hipEventCreate(&links[i].stopEvent)); + HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * blocksToUse)); + HIP_CALL(hipStreamCreate(&links[i].stream)); + } + else if (exeMemType == MEM_CPU) + { + links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam)); + } + } + // Loop over all the different number of bytes to use per Link for (auto N : valuesOfN) { - if (!outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); - float* linkSrcMem[numLinks]; // Source memory per Link - float* linkDstMem[numLinks]; // Destination memory per Link - hipStream_t streams[numLinks]; // hipStream to use per Link - hipEvent_t startEvents[numLinks]; // Start event per Link - hipEvent_t stopEvents[numLinks]; // Stop event per Link - std::vector cpuBlockParams[numLinks]; // CPU copy of block parameters - BlockParam* gpuBlockParams[numLinks]; // GPU copy of block parameters - - // Clear counters - int linkCount[numGpuDevices]; - for (int i = 0; i < numGpuDevices; i++) - linkCount[i] = 0; + if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); + // Prepare links based on current N for (int i = 0; i < numLinks; i++) { - MemType srcMemType = links[i].srcMemType; - MemType dstMemType = links[i].dstMemType; - int exeIndex = links[i].exeIndex; - int srcIndex = links[i].srcIndex; - int dstIndex = links[i].dstIndex; - int blocksToUse = links[i].numBlocksToUse; - - // Check for valid src/dst indices - if ((srcIndex < 0 || srcIndex >= numGpuDevices) || - (dstIndex < 0 || dstIndex >= numGpuDevices) || - (exeIndex < 0 || exeIndex >= numGpuDevices)) - { - printf("[ERROR] Invalid link %d:(%c%d->%c%d) GPU index must be between 0 and %d inclusively\n", - exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices-1); - exit(1); - } - - // Enable peer-to-peer access if this is the first time seeing this pair - if (srcMemType == MEM_GPU && dstMemType == MEM_GPU) - { - auto linkPair = std::make_pair(srcIndex, dstIndex); - linkMap[linkPair]++; - if (linkMap[linkPair] == 1 && srcIndex != dstIndex) - { - int canAccess; - HIP_CALL(hipDeviceCanAccessPeer(&canAccess, srcIndex, dstIndex)); - if (!canAccess) - { - printf("[ERROR] Unable to enable peer access between GPU devices %d and %d\n", srcIndex, dstIndex); - exit(1); - } - HIP_CALL(hipSetDevice(srcIndex)); - HIP_CALL(hipDeviceEnablePeerAccess(dstIndex, 0)); - } - } - - // Allocate hipEvents / hipStreams on executing GPU - HIP_CALL(hipSetDevice(exeIndex)); - HIP_CALL(hipEventCreate(&startEvents[i])); - HIP_CALL(hipEventCreate(&stopEvents[i])); - HIP_CALL(hipMalloc((void**)&gpuBlockParams[i], sizeof(BlockParam) * numLinks)); - if (reuseStreams) - { - // If re-using streams, create new stream, otherwise point to existing stream - if (streamCache[exeIndex].size() <= linkCount[exeIndex]) - { - streamCache[exeIndex].resize(linkCount[exeIndex] + 1); - HIP_CALL(hipStreamCreate(&streamCache[exeIndex][linkCount[exeIndex]])); - } - streams[i] = streamCache[exeIndex][linkCount[exeIndex]]; - } - else - { - HIP_CALL(hipStreamCreate(&streams[i])); - } - - // Allocate source / destination memory based on type / device index - AllocateMemory(srcMemType, srcIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkSrcMem[i]); - AllocateMemory(dstMemType, dstIndex, N * sizeof(float) + byteOffset, useFineGrainMem, &linkDstMem[i]); - // Initialize source memory with patterned data - CheckOrFill(MODE_FILL, N, useMemset, useHipCall, linkSrcMem[i] + initOffset); - - // Count # of links / total blocks each GPU will be working on - linkCount[exeIndex]++; - + CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, links[i].srcMem + initOffset); // Each block needs to know src/dst pointers and how many elements to transfer // Figure out the sub-array each block does for this Link // - Partition N as evenly as posible, but try to keep blocks as multiples of 32, // except the very last one, for alignment reasons - size_t assigned = 0; - int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse); - for (int j = 0; j < links[i].numBlocksToUse; j++) + if (links[i].exeMemType == MEM_GPU) { - BlockParam param; - int blocksLeft = std::max(0, maxNumBlocksToUse - j); - size_t leftover = N - assigned; - size_t roundedN = (leftover + 31) / 32; - param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; - param.src = linkSrcMem[i] + assigned + initOffset; - param.dst = linkDstMem[i] + assigned + initOffset; - assigned += param.N; - cpuBlockParams[i].push_back(param); + size_t assigned = 0; + int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse); + for (int j = 0; j < links[i].numBlocksToUse; j++) + { + BlockParam param; + int blocksLeft = std::max(0, maxNumBlocksToUse - j); + size_t leftover = N - assigned; + size_t roundedN = (leftover + 31) / 32; + param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; + param.src = links[i].srcMem + assigned + initOffset; + param.dst = links[i].dstMem + assigned + initOffset; + assigned += param.N; + + HIP_CALL(hipMemcpy(&links[i].blockParam[j], ¶m, sizeof(BlockParam), hipMemcpyHostToDevice)); + } + } + else if (links[i].exeMemType == MEM_CPU) + { + // For CPU-based copy, divded based on the number of child threads + size_t assigned = 0; + int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink); + for (int j = 0; j < ev.numCpuPerLink; j++) + { + int blocksLeft = std::max(0, maxNumBlocksToUse - j); + size_t leftover = N - assigned; + size_t roundedN = (leftover + 31) / 32; + links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; + links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset; + links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset; + assigned += links[i].blockParam[j].N; + } } - HIP_CALL(hipMemcpy(gpuBlockParams[i], cpuBlockParams[i].data(), - sizeof(BlockParam) * links[i].numBlocksToUse, hipMemcpyHostToDevice)); + // Initialize timing + links[i].totalTime = 0.0; } - // Launch kernels (warmup iterations are not counted) double totalCpuTime = 0; - double totalGpuTime[numLinks]; - for (int i = 0; i < numLinks; i++) totalGpuTime[i] = 0.0; - - for (int iteration = -numWarmups; iteration < numIterations; iteration++) + // Launch kernels (warmup iterations are not counted) + for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++) { // Pause before starting first timed iteration in interactive mode - if (useInteractive && iteration == 0) + if (ev.useInteractive && iteration == 0) { printf("Hit to continue: "); scanf("%*c"); @@ -323,80 +237,26 @@ int main(int argc, char **argv) // Start CPU timing for this iteration auto cpuStart = std::chrono::high_resolution_clock::now(); - // Enqueue all links + // Execute all links in parallel + for (int i = 0; i < numLinks; i++) + threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i]))); + + // Wait for all threads to finish for (int i = 0; i < numLinks; i++) { - HIP_CALL(hipSetDevice(links[i].exeIndex)); - - bool recordStart = (!useSingleSync || iteration == 0); - bool recordStop = (!useSingleSync || iteration == numIterations - 1); - - if (useHipCall) - { - // Record start event - if (recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i])); - - // Execute hipMemset / hipMemcpy - if (useMemset) - HIP_CALL(hipMemsetAsync(linkDstMem[i] + initOffset, 42, N * sizeof(float), streams[i])); - else - HIP_CALL(hipMemcpyAsync(linkDstMem[i] + initOffset, - linkSrcMem[i] + initOffset, - N * sizeof(float), hipMemcpyDeviceToDevice, - streams[i])); - // Record stop event - if (recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); - } - else - { - if (!combineTiming && recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i])); - hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel, - dim3(links[i].numBlocksToUse, 1, 1), - dim3(BLOCKSIZE, 1, 1), - 0, streams[i], - (combineTiming && recordStart) ? startEvents[i] : NULL, - (combineTiming && recordStop) ? stopEvents[i] : NULL, - 0, gpuBlockParams[i]); - if (!combineTiming & recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i])); - } - } - - // Synchronize per iteration, unless in single sync mode, in which case - // synchronize during last warmup / last actual iteration - if (!useSingleSync || iteration == -1 || iteration == numIterations - 1) - { - for (int i = 0; i < numLinks; i++) - { - HIP_CALL(hipSetDevice(links[i].exeIndex)); - hipStreamSynchronize(streams[i]); - } + threads.top().join(); + threads.pop(); } // Stop CPU timing for this iteration auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); - if (useSleep) usleep(100000); - if (iteration >= 0) - { - totalCpuTime += deltaSec; - - // Record GPU timing - if (!useSingleSync || iteration == numIterations - 1) - { - for (int i = 0; i < numLinks; i++) - { - HIP_CALL(hipSetDevice(links[i].exeIndex)); - HIP_CALL(hipEventSynchronize(stopEvents[i])); - float gpuDeltaMsec; - HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvents[i], stopEvents[i])); - totalGpuTime[i] += gpuDeltaMsec; - } - } - } + if (iteration >= 0) totalCpuTime += deltaSec; } - if (useInteractive) + // Pause for interactive mode + if (ev.useInteractive) { printf("Transfers complete. Hit to continue: "); scanf("%*c"); @@ -405,89 +265,100 @@ int main(int argc, char **argv) // Validate that each link has transferred correctly for (int i = 0; i < numLinks; i++) - CheckOrFill(MODE_CHECK, N, useMemset, useHipCall, linkDstMem[i] + initOffset); + CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, links[i].dstMem + initOffset); // Report timings - totalCpuTime = totalCpuTime / (1.0 * numIterations) * 1000; + totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000; double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime; for (int i = 0; i < numLinks; i++) { - double linkDurationMsec = totalGpuTime[i] / (1.0 * numIterations); + double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations); double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; - if (!outputToCsv) + if (!ev.outputToCsv) { - printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %9s |", + printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s", i + 1, MemTypeStr[links[i].srcMemType], links[i].srcIndex, - links[i].exeIndex, links[i].numBlocksToUse, + MemTypeStr[links[i].exeMemType], links[i].exeIndex, + links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse, MemTypeStr[links[i].dstMemType], links[i].dstIndex, linkBandwidthGbs, linkDurationMsec, GetLinkDesc(links[i]).c_str()); - if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset); + if (ev.showAddr) printf(" %16p | %16p |", links[i].srcMem + initOffset, links[i].dstMem + initOffset); printf("\n"); } else { - printf("%d,%lu,%02d,%c%02d,%c%02d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%s,%s,%s,%s,%s\n", - testNum, N * sizeof(float), links[i].exeIndex, + printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%d,%s,%s,%s,%s\n", + testNum, N * sizeof(float), MemTypeStr[links[i].srcMemType], links[i].srcIndex, + MemTypeStr[links[i].exeMemType], links[i].exeIndex, MemTypeStr[links[i].dstMemType], links[i].dstIndex, + links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse, linkBandwidthGbs, linkDurationMsec, GetLinkDesc(links[i]).c_str(), - linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset, - numWarmups, numIterations, - useHipCall ? "true" : "false", - useMemset ? "true" : "false", - useFineGrainMem ? "true" : "false", - useSingleSync ? "true" : "false", - reuseStreams ? "true" : "false"); + links[i].srcMem + initOffset, links[i].dstMem + initOffset, + ev.byteOffset, + ev.numWarmups, ev.numIterations, + ev.useHipCall ? "true" : "false", + ev.useMemset ? "true" : "false", + ev.useSingleSync ? "true" : "false", + ev.combineTiming ? "true" : "false"); } } // Display aggregate statistics - if (!outputToCsv) + if (!ev.outputToCsv) { printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime); } else { - printf("%d,%lu,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%s,%s,%s,%s,%s\n", - testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, numWarmups, numIterations, - useHipCall ? "true" : "false", - useMemset ? "true" : "false", - useFineGrainMem ? "true" : "false", - useSingleSync ? "true" : "false", - reuseStreams ? "true" : "false"); + printf("%d,%lu,ALL,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%d,%s,%s,%s,%s\n", + testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset, + ev.numWarmups, ev.numIterations, + ev.useHipCall ? "true" : "false", + ev.useMemset ? "true" : "false", + ev.useSingleSync ? "true" : "false", + ev.combineTiming ? "true" : "false"); } + } - // Release GPU memory - for (int i = 0; i < numLinks; i++) + // Release GPU memory + for (int i = 0; i < numLinks; i++) + { + DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem); + DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem); + + if (links[i].exeMemType == MEM_GPU) { - DeallocateMemory(links[i].srcMemType, links[i].srcIndex, linkSrcMem[i]); - DeallocateMemory(links[i].dstMemType, links[i].dstIndex, linkDstMem[i]); - HIP_CALL(hipFree(gpuBlockParams[i])); - if (!reuseStreams) - HIP_CALL(hipStreamDestroy(streams[i])); - HIP_CALL(hipEventDestroy(startEvents[i])); - HIP_CALL(hipEventDestroy(stopEvents[i])); + HIP_CALL(hipEventDestroy(links[i].startEvent)); + HIP_CALL(hipEventDestroy(links[i].stopEvent)); + HIP_CALL(hipStreamDestroy(links[i].stream)); + HIP_CALL(hipFree(links[i].blockParam)); + } + else if (links[i].exeMemType == MEM_CPU) + { + free(links[i].blockParam); } } } fclose(fp); - // Clean up stream cache if re-using streams - if (reuseStreams) - { - for (auto streamVector : streamCache) - for (auto stream : streamVector) - HIP_CALL(hipStreamDestroy(stream)); - } - return 0; } void DisplayUsage(char const* cmdName) { + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + int const numCpuDevices = numa_num_configured_nodes(); + printf("Usage: %s configFile \n", cmdName); printf(" configFile: File containing Links to execute (see below for format)\n"); @@ -498,57 +369,48 @@ void DisplayUsage(char const* cmdName) printf("\n"); printf("Configfile Format:\n"); printf("==================\n"); - printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location\n"); + printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU\n"); printf("Each single line in the configuration file defines a set of Links to run in parallel\n"); printf("\n"); printf("There are two ways to specify the configuration file:\n"); printf("\n"); printf("1) Basic\n"); - printf(" The basic specification assumes the same number of threadblocks/CUs used per link\n"); + printf(" The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link\n"); printf(" A positive number of Links is specified followed by that number of triplets describing each Link\n"); printf("\n"); - printf(" #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL)\n"); + printf(" #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n"); printf("\n"); printf("2) Advanced\n"); - printf(" The advanced specification allows different number of threadblocks/CUs used per Link\n"); + printf(" The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link\n"); printf(" A negative number of links is specified, followed by quadruples describing each Link\n"); - printf(" -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL)\n"); + printf(" -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)\n"); printf("\n"); printf("Argument Details:\n"); printf(" #Links : Number of Links to be run in parallel\n"); - printf(" #CUs : Number of threadblocks/CUs to use for a Link\n"); - printf(" GpuIndex: 0-indexed GPU id executing the Link\n"); + printf(" #CUs : Number of threadblocks/CUs to use for a GPU-executed Link\n"); printf(" srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode\n"); + printf(" Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed)\n"); + printf(" - C: CPU-executed (Indexed from 0 to %d)\n", numCpuDevices-1); + printf(" - G: GPU-executed (Indexed from 0 to %d)\n", numGpuDevices-1); printf(" dstMemL : Destination memory location (Where the data is to be written to)\n"); - printf(" Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed)\n"); + printf("\n"); + printf(" Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)\n"); printf(" Supported memory locations are:\n"); - printf(" - P: Pinned host memory (on CPU, on NUMA node closest to provided GPU index)\n"); - printf(" - G: Global device memory (on GPU)\n"); - printf("Round brackets may be included for human clarity, but will be ignored\n"); + printf(" - C: Pinned host memory (on NUMA node, indexed from 0 to %d)\n", numCpuDevices-1); + printf(" - G: Global device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1); + printf(" - F: Fine-grain device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1); printf("\n"); printf("Examples:\n"); - printf("1 4 (0 G0 G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n"); - printf("1 4 (0 G1 G0) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0\n"); - printf("1 4 (2 P0 G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n"); - printf("2 4 (0 G0 G1) (1 G1 G0) Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n"); - printf("-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n"); + printf("1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n"); + printf("1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0\n"); + printf("1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n"); + printf("2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n"); + printf("-2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n"); printf("\n"); + printf("Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary\n"); printf("\n"); - printf("Environment variables:\n"); - printf("======================\n"); - printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels\n"); - printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n"); - printf(" USE_FINEGRAIN_MEM - Allocate fine-grained GPU memory instead of coarse-grained GPU memory\n"); - printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n"); - printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); - printf(" USE_SLEEP - Adds a 100ms sleep after each synchronization\n"); - printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n"); - printf(" REUSE_STREAMS - Re-use streams instead of creating / destroying per test\n"); - printf(" SHOW_ADDR - Print out memory addresses for each Link\n"); - printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); - printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); - printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); - printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); + + EnvVars::DisplayUsage(); } void GenerateConfigFile(char const* cfgFile, int numBlocks) @@ -556,13 +418,28 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) // Detect number of available GPUs and skip if less than 2 int numGpuDevices; HIP_CALL(hipGetDeviceCount(&numGpuDevices)); - printf("Generated configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks); + printf("Generating configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks); if (numGpuDevices < 2) { printf("Skipping. (Less than 2 GPUs detected)\n"); exit(0); } + // Check first to see if file exists, and issue warning + FILE* exists = fopen(cfgFile, "r"); + if (exists) + { + fclose(exists); + printf("[WARN] File %s alreadys exists. Enter 'Y' to confirm overwrite\n", cfgFile); + char ch; + scanf(" %c", &ch); + if (ch != 'Y' && ch != 'y') + { + printf("Aborting\n"); + exit(0); + } + } + // Open config file for writing FILE* fp = fopen(cfgFile, "w"); if (!fp) @@ -574,18 +451,18 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) // CU testing fprintf(fp, "# CU scaling tests\n"); for (int i = 1; i < 16; i++) - fprintf(fp, "1 %d (0 G0 G1)\n", i); + fprintf(fp, "1 %d (G0->G0->G1)\n", i); fprintf(fp, "\n"); // Pinned memory testing fprintf(fp, "# Pinned CPU memory read tests\n"); for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, "1 %d (%d C%d G%d)\n", numBlocks, i, i, i); + fprintf(fp, "1 %d (C0->G%d->G%d)\n", numBlocks, i, i); fprintf(fp, "\n"); fprintf(fp, "# Pinned CPU memory write tests\n"); for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, "1 %d (%d G%d C%d)\n", numBlocks, i, i, i); + fprintf(fp, "1 %d (G%d->G%d->C0)\n", numBlocks, i, i); fprintf(fp, "\n"); // Single link testing GPU testing @@ -594,7 +471,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) for (int j = 0; j < numGpuDevices; j++) { if (i == j) continue; - fprintf(fp, "1 %d (%d G%d G%d)\n", numBlocks, i, i, j); + fprintf(fp, "1 %d (G%d->G%d->G%d)\n", numBlocks, i, i, j); } fprintf(fp, "\n"); @@ -604,7 +481,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) for (int j = 0; j < numGpuDevices; j++) { if (i == j) continue; - fprintf(fp, "2 %d (%d G%d G%d) (%d G%d G%d)\n", numBlocks, i, i, j, j, j, i); + fprintf(fp, "2 %d (G%d->G%d->G%d) (G%d->G%d->G%d)\n", numBlocks, i, i, j, j, j, i); } fprintf(fp, "\n"); @@ -613,7 +490,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) fprintf(fp, "%d %d", numGpuDevices, numBlocks); for (int i = 0; i < numGpuDevices; i++) { - fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices); + fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices); } fprintf(fp, "\n\n"); @@ -621,23 +498,23 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) fprintf(fp, "# Simple bi-directional ring\n"); fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks); for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices); + fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices); for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, " (%d G%d G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices); + fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices); fprintf(fp, "\n\n"); // Broadcast from GPU 0 fprintf(fp, "# GPU 0 Broadcast\n"); fprintf(fp, "%d %d", numGpuDevices-1, numBlocks); for (int i = 1; i < numGpuDevices; i++) - fprintf(fp, " (%d G%d G%d)", 0, 0, i); + fprintf(fp, " (G%d->G%d->G%d)", 0, 0, i); fprintf(fp, "\n\n"); // Gather to GPU 0 fprintf(fp, "# GPU 0 Gather\n"); fprintf(fp, "%d %d", numGpuDevices-1, numBlocks); for (int i = 1; i < numGpuDevices; i++) - fprintf(fp, " (%d G%d G%d)", 0, i, 0); + fprintf(fp, " (G%d->G%d->G%d)", 0, i, 0); fprintf(fp, "\n\n"); // Full stress test @@ -647,7 +524,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) for (int j = 0; j < numGpuDevices; j++) { if (i == j) continue; - fprintf(fp, " (%d G%d G%d)", i, i, j); + fprintf(fp, " (G%d->G%d->G%d)", i, i, j); } fprintf(fp, "\n\n"); @@ -692,29 +569,87 @@ void DisplayTopology() } } -void ParseMemType(std::string const& token, MemType* memType, int* memIndex) +void PopulateTestSizes(size_t const numBytesPerLink, + int const samplingFactor, + std::vector& valuesOfN) +{ + valuesOfN.clear(); + + // If the number of bytes is specified, use it + if (numBytesPerLink != 0) + { + if (numBytesPerLink % 4) + { + printf("[ERROR] numBytesPerLink (%lu) must be a multiple of 4\n", numBytesPerLink); + exit(1); + } + size_t N = numBytesPerLink / sizeof(float); + valuesOfN.push_back(N); + } + else + { + // Otherwise generate a range of values + // (Powers of 2, with samplingFactor samples between successive powers of 2) + for (int N = 256; N <= (1<<27); N *= 2) + { + int delta = std::max(32, N / samplingFactor); + int curr = N; + while (curr < N * 2) + { + valuesOfN.push_back(curr); + curr += delta; + } + } + } +} + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex) { char typeChar; if (sscanf(token.c_str(), " %c %d", &typeChar, memIndex) != 2) { - printf("Error parsing memory type token %s\n", token.c_str()); + printf("[ERROR] Unable to parse memory type token %s - expecting either 'C' or 'G' or 'F' followed by an index\n", token.c_str()); exit(1); } switch (typeChar) { - case 'C': case 'c': *memType = MEM_CPU; break; - case 'G': case 'g': *memType = MEM_GPU; break; - default: printf("Unrecognized memory type %s\n", token.c_str()); exit(1); + case 'C': case 'c': + *memType = MEM_CPU; + if (*memIndex < 0 || *memIndex >= numCpus) + { + printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex); + exit(1); + } + break; + case 'G': case 'g': + *memType = MEM_GPU; + if (*memIndex < 0 || *memIndex >= numGpus) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex); + exit(1); + } + break; + case 'F': case 'f': + *memType = MEM_GPU_FINE; + if (*memIndex < 0 || *memIndex >= numGpus) + { + printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex); + exit(1); + } + break; + default: + printf("[ERROR] Unrecognized memory type %s. Expecting either 'C' or 'G' or 'F'\n", token.c_str()); + exit(1); } } -// Helper function to parse a link of link definitions -void ParseLinks(char* line, std::vector& links) +// Helper function to parse a list of link definitions +void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) { - // Replace any round brackets with spaces + // Replace any round brackets or '->' with spaces, for (int i = 0; line[i]; i++) - if (line[i] == '(' || line[i] == ')') line[i] = ' '; + if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; links.clear(); int numLinks = 0; @@ -725,11 +660,12 @@ void ParseLinks(char* line, std::vector& links) iss >> numLinks; if (iss.fail()) return; + std::string exeMem; std::string srcMem; std::string dstMem; if (numLinks > 0) { - // Method 1: Take in triples (exeGpu, srcMem, dstMem) + // Method 1: Take in triples (srcMem, exeMem, dstMem) int numBlocksToUse; iss >> numBlocksToUse; if (numBlocksToUse <= 0) @@ -740,47 +676,99 @@ void ParseLinks(char* line, std::vector& links) links.resize(numLinks); for (int i = 0; i < numLinks; i++) { - iss >> links[i].exeIndex >> srcMem >> dstMem; - ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex); - ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex); + iss >> srcMem >> exeMem >> dstMem; + ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex); + ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); + ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex); links[i].numBlocksToUse = numBlocksToUse; + if (links[i].exeMemType != MEM_CPU && links[i].exeMemType != MEM_GPU) + { + printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n"); + exit(1); + } } } else { - // Method 2: Read in quads (exeGpu, srcMem, dstMem, Read common # blocks to use, then read (src, dst) doubles + // Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles numLinks *= -1; links.resize(numLinks); for (int i = 0; i < numLinks; i++) { - iss >> links[i].exeIndex >> srcMem >> dstMem >> links[i].numBlocksToUse; - ParseMemType(srcMem, &links[i].srcMemType, &links[i].srcIndex); - ParseMemType(dstMem, &links[i].dstMemType, &links[i].dstIndex); + iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse; + ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex); + ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); + ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex); + if (links[i].exeMemType != MEM_CPU || links[i].exeMemType != MEM_GPU) + { + printf("[ERROR] Executor must either be CPU ('C') or GPU ('G')\n"); + exit(1); + } + } } } -void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr) +void EnablePeerAccess(int const deviceId, int const peerDeviceId) { - HIP_CALL(hipSetDevice(devIndex)); + int canAccess; + HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); + if (!canAccess) + { + printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId); + exit(1); + } + HIP_CALL(hipSetDevice(deviceId)); + HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0)); +} + +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr) +{ + if (numBytes == 0) + { + printf("[ERROR] Unable to allocate 0 bytes\n"); + exit(1); + } if (memType == MEM_CPU) { - // // Allocate pinned-memory on NUMA node closest to the selected GPU - HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocPortable)); + // Set numa policy prior to call to hipHostMalloc + unsigned long nodemask = (1ULL << devIndex); + long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); + if (retCode) + { + printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", devIndex); + exit(1); + } + + // Allocate host-pinned memory (should respect NUMA mem policy) + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); + + // Check that the allocated pages are actually on the correct NUMA node + CheckPages((char*)*memPtr, numBytes, devIndex); + + // Reset to default numa mem policy + retCode = set_mempolicy(MPOL_DEFAULT, NULL, 8); + if (retCode) + { + printf("[ERROR] Unable reset to default NUMA memory policy\n"); + exit(1); + } } else if (memType == MEM_GPU) { - // Allocate GPU memory - if (useFineGrainMem) - HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); - else - HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + // Allocate GPU memory on appropriate device + HIP_CALL(hipSetDevice(devIndex)); + HIP_CALL(hipMalloc((void**)memPtr, numBytes)); + } + else if (memType == MEM_GPU_FINE) + { + HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained)); } else { - printf("Error: Unsupported memory type %d\n", memType); + printf("[ERROR] Unsupported memory type %d\n", memType); exit(1); } } @@ -791,12 +779,51 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr) { HIP_CALL(hipHostFree(memPtr)); } - else if (memType == MEM_GPU) + else if (memType == MEM_GPU || memType == MEM_GPU_FINE) { HIP_CALL(hipFree(memPtr)); } } +void CheckPages(char* array, size_t numBytes, int targetId) +{ + unsigned long const pageSize = getpagesize(); + unsigned long const numPages = (numBytes + pageSize - 1) / pageSize; + + std::vector pages(numPages); + std::vector status(numPages); + + pages[0] = array; + for (int i = 1; i < numPages; i++) + { + pages[i] = (char*)pages[i-1] + pageSize; + } + + long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0); + if (retCode) + { + printf("[ERROR] Unable to collect page info\n"); + exit(1); + } + + size_t mistakeCount = 0; + for (int i = 0; i < numPages; i++) + { + if (status[i] < 0) + { + printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i); + exit(1); + } + if (status[i] != targetId) mistakeCount++; + } + if (mistakeCount > 0) + { + printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); + // NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now + // exit(1); + } +} + // Helper function to either fill a device pointer with pseudo-random data, or to check to see if it matches void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr) { @@ -858,31 +885,132 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount) return result; } +std::string GetDesc(MemType srcMemType, int srcIndex, + MemType dstMemType, int dstIndex) +{ + if (srcMemType == MEM_CPU) + { + if (dstMemType == MEM_CPU) + return (srcIndex == dstIndex) ? "LOCAL" : "NUMA"; + else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) + return "PCIE"; + else + goto error; + } + else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) + { + if (dstMemType == MEM_CPU) + return "PCIE"; + else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) + { + if (srcIndex == dstIndex) return "LOCAL"; + else + { + uint32_t linkType, hopCount; + HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount)); + return GetLinkTypeDesc(linkType, hopCount); + } + } + else + goto error; + } +error: + printf("[ERROR] Unrecognized memory type\n"); + exit(1); +} + std::string GetLinkDesc(Link const& link) { - std::string result = ""; - - // Currently only describe links between src/dst on GPU - if (link.srcMemType == MEM_GPU && link.dstMemType == MEM_GPU) - { - if (link.exeIndex != link.srcIndex) - { - uint32_t linkType, hopCount; - HIP_CALL(hipExtGetLinkTypeAndHopCount(link.srcIndex, link.exeIndex, &linkType, &hopCount)); - result += GetLinkTypeDesc(linkType, hopCount); - } - - if (link.exeIndex != link.dstIndex) - { - uint32_t linkType, hopCount; - HIP_CALL(hipExtGetLinkTypeAndHopCount(link.exeIndex, link.dstIndex, &linkType, &hopCount)); - if (result != "") result += "+"; - result += GetLinkTypeDesc(linkType, hopCount); - } - } - else - { - result = "???"; - } - return result; + return GetDesc(link.srcMemType, link.srcIndex, link.exeMemType, link.exeIndex) + "-" + + GetDesc(link.exeMemType, link.exeIndex, link.dstMemType, link.dstIndex); +} + +void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link) +{ + // GPU execution agent + if (link.exeMemType == MEM_GPU) + { + // Switch to executing GPU + HIP_CALL(hipSetDevice(link.exeIndex)); + + bool recordStart = (!ev.useSingleSync || iteration == 0); + bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1); + + int const initOffset = ev.byteOffset / sizeof(float); + + if (ev.useHipCall) + { + // Record start event + if (recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream)); + + // Execute hipMemset / hipMemcpy + if (ev.useMemset) + HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), link.stream)); + else + HIP_CALL(hipMemcpyAsync(link.dstMem + initOffset, + link.srcMem + initOffset, + N * sizeof(float), hipMemcpyDefault, + link.stream)); + // Record stop event + if (recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream)); + } + else + { + if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream)); + hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel, + dim3(link.numBlocksToUse, 1, 1), + dim3(BLOCKSIZE, 1, 1), + 0, link.stream, + (ev.combineTiming && recordStart) ? link.startEvent : NULL, + (ev.combineTiming && recordStop) ? link.stopEvent : NULL, + 0, link.blockParam); + if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream)); + } + + // Synchronize per iteration, unless in single sync mode, in which case + // synchronize during last warmup / last actual iteration + if (!ev.useSingleSync || iteration == -1 || iteration == ev.numIterations - 1) + { + HIP_CALL(hipStreamSynchronize(link.stream)); + } + + if (iteration >= 0) + { + // Record GPU timing + if (!ev.useSingleSync || iteration == ev.numIterations - 1) + { + HIP_CALL(hipEventSynchronize(link.stopEvent)); + float gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, link.startEvent, link.stopEvent)); + link.totalTime += gpuDeltaMsec; + } + } + } + else if (link.exeMemType == MEM_CPU) // CPU execution agent + { + // Force this thread and all child threads onto correct NUMA node + if (numa_run_on_node(link.exeIndex)) + { + printf("[ERROR] Unable to set CPU to NUMA node %d\n", link.exeIndex); + exit(1); + } + + std::vector childThreads; + + auto cpuStart = std::chrono::high_resolution_clock::now(); + + // Launch child-threads to perform memcopies + for (int i = 0; i < ev.numCpuPerLink; i++) + childThreads.push_back(std::thread(ev.useMemset ? CpuMemsetKernel : CpuCopyKernel, std::ref(link.blockParam[i]))); + + // Wait for child-threads to finish + for (int i = 0; i < ev.numCpuPerLink; i++) + childThreads[i].join(); + + auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; + + // Record time if not a warmup iteration + if (iteration >= 0) + link.totalTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + } } diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp index 1291bf68f7..6a08e80f4e 100644 --- a/tools/TransferBench/TransferBench.hpp +++ b/tools/TransferBench/TransferBench.hpp @@ -33,7 +33,9 @@ THE SOFTWARE. #include #include #include + #include "copy_kernel.h" +#include "EnvVars.hpp" // Helper macro for catching HIP errors #define HIP_CALL(cmd) \ @@ -50,11 +52,12 @@ THE SOFTWARE. // Different src/dst memory types supported typedef enum { - MEM_CPU = 0, // Pinned CPU memory - MEM_GPU = 1 // Global GPU memory + MEM_CPU = 0, // Pinned CPU memory + MEM_GPU = 1, // Coarse-grained global GPU memory + MEM_GPU_FINE = 2 // Fine-grained global GPU memory } MemType; -char const MemTypeStr[3] = "CG"; +char const MemTypeStr[4] = "CGF"; typedef enum { @@ -62,17 +65,6 @@ typedef enum MODE_CHECK = 1 // Check data against pattern } ModeType; -// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU -struct Link -{ - int exeIndex; // GPU to execute on - MemType srcMemType; // Source memory type - int srcIndex; // Source device index - MemType dstMemType; // Destination memory type - int dstIndex; // Destination device index - int numBlocksToUse; // Number of threadblocks to use for this Link -}; - // Each threadblock copies N floats from src to dst struct BlockParam { @@ -81,46 +73,90 @@ struct BlockParam float* dst; }; +// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU +struct Link +{ + // Link config + MemType exeMemType; // Link executor type (CPU or GPU) + int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) + MemType srcMemType; // Source memory type + int srcIndex; // Source device index + MemType dstMemType; // Destination memory type + int dstIndex; // Destination device index + int numBlocksToUse; // Number of threadblocks to use for this Link + + // Link implementation + float* srcMem; // Source memory + float* dstMem; // Destination memory + + hipEvent_t startEvent; + hipEvent_t stopEvent; + hipStream_t stream; + BlockParam* blockParam; + + double totalTime; +}; + void DisplayUsage(char const* cmdName); // Display usage instructions void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file void DisplayTopology(); // Display GPU topology -void ParseLinks(char* line, std::vector& links); // Parse Link information -void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr); +void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector& valuesofN); +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex); +void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links); // Parse Link information +void EnablePeerAccess(int const deviceId, int const peerDeviceId); +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr); void DeallocateMemory(MemType memType, int devIndex, float* memPtr); +void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr); +void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link); + + std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); +std::string GetDesc(MemType srcMemType, int srcIndex, + MemType dstMemType, int dstIndex); std::string GetLinkDesc(Link const& link); - -#define MAX_NAME_LEN 64 #define BLOCKSIZE 256 #define COPY_UNROLL 4 #define MEMSET_UNROLL 4 // GPU copy kernel __global__ void __launch_bounds__(BLOCKSIZE) -CopyKernel(BlockParam* blockParams) +GpuCopyKernel(BlockParam* blockParams) { - // Collect the arguments for this block - int N = blockParams[blockIdx.x].N; - const float* __restrict__ src = (float* )blockParams[blockIdx.x].src; - float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst; + // Collect the arguments for this block + int N = blockParams[blockIdx.x].N; + const float* __restrict__ src = (float* )blockParams[blockIdx.x].src; + float* __restrict__ dst = (float* )blockParams[blockIdx.x].dst; - Copy(dst, src, N); + Copy(dst, src, N); } // GPU set kernel __global__ void __launch_bounds__(BLOCKSIZE) -MemsetKernel(BlockParam* blockParams) +GpuMemsetKernel(BlockParam* blockParams) { - // Collect the arguments for this block - int N = blockParams[blockIdx.x].N; - float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst; + // Collect the arguments for this block + int N = blockParams[blockIdx.x].N; + float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst; - // Use non-zero value - #pragma unroll MEMSET_UNROLL - for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE) - { - dst[tid] = 1234.0; - } + // Use non-zero value + #pragma unroll MEMSET_UNROLL + for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE) + { + dst[tid] = 1234.0; + } +} + +// CPU copy kernel +void CpuCopyKernel(BlockParam const& blockParams) +{ + memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float)); +} + +// CPU memset kernel +void CpuMemsetKernel(BlockParam const& blockParams) +{ + for (int i = 0; i < blockParams.N; i++) + blockParams.dst[i] = 1234.0; } diff --git a/tools/TransferBench/example.cfg b/tools/TransferBench/example.cfg index b14762f9e3..2d546f433b 100644 --- a/tools/TransferBench/example.cfg +++ b/tools/TransferBench/example.cfg @@ -1,39 +1,43 @@ -#Configfile Format: -#================== -#A Link is defined as a uni-directional transfer from src memory location to dst memory location -#Each single line in the configuration file defines a set of Links to run in parallel +# Configfile Format: +# ================== +# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU +# Each single line in the configuration file defines a set of Links to run in parallel -#There are two ways to specify the configuration file: +# There are two ways to specify the configuration file: -#1) Basic -# The basic specification assumes the same number of threadblocks/CUs used per link -# A positive number of Links is specified followed by that number of triplets describing each Link +# 1) Basic +# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link +# A positive number of Links is specified followed by that number of triplets describing each Link - #Links #CUs (GPUIndex1 srcMem1 dstMem1) ... (GPUIndexL srcMemL dstMemL) +# #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL) -#2) Advanced -# The advanced specification allows different number of threadblocks/CUs used per Link -# A negative number of links is specified, followed by quadruples describing each Link -# -#Links (GPUIndex1 #CUs1 srcMem1 dstMem1) ... (GPUIndexL #CUsL srcMemL dstMemL) +# 2) Advanced +# The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link +# A negative number of links is specified, followed by quadruples describing each Link +# -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL) -#Argument Details: -# #Links : Number of Links to be run in parallel -# #CUs : Number of threadblocks/CUs to use for a Link -# GpuIndex: 0-indexed GPU id executing the Link -# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode -# dstMemL : Destination memory location (Where the data is to be written to) -# Memory locations are specified by a character indicating memory type, followed by GPU device index (0-indexed) -# Supported memory locations are: -# - P: Pinned host memory (on CPU, on NUMA node closest to provided GPU index) -# - G: Global device memory (on GPU) -#Round brackets may be included for human clarity, but will be ignored +# Argument Details: +# #Links : Number of Links to be run in parallel +# #CUs : Number of threadblocks/CUs to use for a GPU-executed Link +# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode +# Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed) +# - C: CPU-executed (Indexed from 0 to 1) +# - G: GPU-executed (Indexed from 0 to 3) +# dstMemL : Destination memory location (Where the data is to be written to) -#Examples: -#1 4 (0 G0 G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1 -#1 4 (0 G1 G0) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 1 and copies it to memory on GPU 0 -#1 4 (2 P0 G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2 -#2 4 (0 G0 G1) (1 G1 G0) Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs -#-2 (0 G0 G1 4) (1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs +# Memory locations are specified by a character indicating memory type, followed by device index (0-indexed) +# Supported memory locations are: +# - C: Pinned host memory (on NUMA node, indexed from 0 to 1) +# - G: Global device memory (on GPU device indexed from 0 to 3) +# - F: Fine-grain device memory (on GPU device indexed from 0 to 3) -# Single link between GPUs 0 and 1 -1 1 (0 G0 G1) +# Examples: +# 1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1 +# 1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0 +# 1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2 +# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs +# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs +# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary + +# Single GPU-executed link between GPUs 0 and 1 using 4 CUs +1 4 (G0->G0->G1)