From e61ff3ce37e0bf81ca845da1e90c583eb3b2aa7c Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 8 Apr 2022 15:20:55 -0600 Subject: [PATCH] Transfer bench single stream mode (#531) - Adding single stream mode - Removing some unused env vars - Adding output to CSV mode for p2p benchmark, topology listing modes [ROCm/rccl commit: def6832287b9f0b004c4e48313708ec5c96eac6e] --- .../rccl/tools/TransferBench/CHANGELOG.md | 16 + projects/rccl/tools/TransferBench/EnvVars.hpp | 22 +- projects/rccl/tools/TransferBench/Kernels.hpp | 47 +- projects/rccl/tools/TransferBench/Makefile | 11 +- projects/rccl/tools/TransferBench/README.md | 14 + .../tools/TransferBench/TransferBench.cpp | 1055 +++++++++-------- .../tools/TransferBench/TransferBench.hpp | 118 +- projects/rccl/tools/TransferBench/example.cfg | 26 +- 8 files changed, 700 insertions(+), 609 deletions(-) create mode 100644 projects/rccl/tools/TransferBench/CHANGELOG.md create mode 100644 projects/rccl/tools/TransferBench/README.md diff --git a/projects/rccl/tools/TransferBench/CHANGELOG.md b/projects/rccl/tools/TransferBench/CHANGELOG.md new file mode 100644 index 0000000000..8f13d5e2d9 --- /dev/null +++ b/projects/rccl/tools/TransferBench/CHANGELOG.md @@ -0,0 +1,16 @@ +# Changelog for TransferBench + +## v1.01 +### Added +- Adding USE_SINGLE_STREAM feature + - All Links that execute on the same GPU device are executed with a single kernel launch on a single stream + - Does not work with USE_HIP_CALL and forces USE_SINGLE_SYNC to collect timings + - Adding ability to request coherent / fine-grained host memory ('B') +### Changed +- Separating TransferBench from RCCL repo +- Peer-to-peer benchmark mode now works OUTPUT_TO_CSV +- Toplogy display now works with OUTPUT_TO_CSV +- Moving documentation about config file into example.cfg +### Removed +- Removed config file generation +- Removed show pointer address environment variable (SHOW_ADDR) diff --git a/projects/rccl/tools/TransferBench/EnvVars.hpp b/projects/rccl/tools/TransferBench/EnvVars.hpp index a84a3af185..95448739f8 100644 --- a/projects/rccl/tools/TransferBench/EnvVars.hpp +++ b/projects/rccl/tools/TransferBench/EnvVars.hpp @@ -25,6 +25,8 @@ THE SOFTWARE. #include +#define TB_VERSION "1.01" + // This class manages environment variable that affect TransferBench class EnvVars { @@ -40,9 +42,7 @@ public: int useMemset; // Perform a memset instead of a copy (ignores source memory) int useSingleSync; // Perform synchronization only once after all iterations instead of per iteration int useInteractive; // Pause for user-input before starting transfer loop - int useSleep; // Adds a 100ms sleep after each synchronization int combineTiming; // Combines the timing with kernel launch - int showAddr; // Print out memory addresses for each Link int outputToCsv; // Output in CSV format int byteOffset; // Byte-offset for memory allocations int numWarmups; // Number of un-timed warmup iterations to perform @@ -52,6 +52,7 @@ public: int sharedMemBytes; // Amount of shared memory to use per threadblock int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device + int useSingleStream; // Use a single stream per device instead of per Link. Can not be used with USE_HIP_CALL std::vector fillPattern; // Pattern of floats used to fill source data @@ -67,7 +68,6 @@ public: useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); combineTiming = GetEnvVar("COMBINE_TIMING" , 0); - showAddr = GetEnvVar("SHOW_ADDR" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0); numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); @@ -77,6 +77,7 @@ public: sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1); blockBytes = GetEnvVar("BLOCK_BYTES" , 256); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); + useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 0); // Check for fill pattern char* pattern = getenv("FILL_PATTERN"); @@ -172,6 +173,11 @@ public: printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n"); exit(1); } + if (useSingleStream && useHipCall) + { + printf("[ERROR] Single stream mode cannot be used with HIP calls\n"); + exit(1); + } } // Display info on the env vars that can be used @@ -184,7 +190,6 @@ public: printf(" USE_SINGLE_SYNC - Perform synchronization only once after all iterations instead of per iteration\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" COMBINE_TIMING - Combines timing with launch (potentially lower timing overhead)\n"); - printf(" SHOW_ADDR - Print out memory addresses for each Link\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); @@ -195,6 +200,7 @@ public: printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); + printf(" USE_SINGLE_STREAM - Use single stream per device instead of per link. Cannot be used with USE_HIP_CALL\n"); } // Display env var settings @@ -202,7 +208,7 @@ public: { if (!outputToCsv) { - printf("Run configuration\n"); + printf("Run configuration (TransferBench v%s)\n", TB_VERSION); printf("=====================================================\n"); printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall, useHipCall ? "HIP functions" : "custom kernels"); @@ -220,8 +226,6 @@ public: useInteractive ? "interactive" : "non-interactive"); printf("%-20s = %12d : %s\n", "COMBINE_TIMING", combineTiming, combineTiming ? "Using combined timing+launch" : "Using separate timing / launch"); - printf("%-20s = %12d : %s\n", "SHOW_ADDR", showAddr, - showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses"); printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv, outputToCsv ? "CSV" : "console"); printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); @@ -242,13 +246,13 @@ public: getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes); printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes); printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); + printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM", useSingleStream, (useSingleStream ? "device" : "Link")); printf("\n"); } }; -private: // Helper function that gets parses environment variable or sets to default value - int GetEnvVar(std::string const varname, int defaultValue) + static int GetEnvVar(std::string const varname, int defaultValue) { if (getenv(varname.c_str())) return atoi(getenv(varname.c_str())); diff --git a/projects/rccl/tools/TransferBench/Kernels.hpp b/projects/rccl/tools/TransferBench/Kernels.hpp index 40f8a713bd..6c8110c148 100644 --- a/projects/rccl/tools/TransferBench/Kernels.hpp +++ b/projects/rccl/tools/TransferBench/Kernels.hpp @@ -36,6 +36,7 @@ GpuCopyKernel(BlockParam* blockParams) int Nrem = blockParams[blockIdx.x].N; float const* src = blockParams[blockIdx.x].src; float* dst = blockParams[blockIdx.x].dst; + if (threadIdx.x == 0) blockParams[blockIdx.x].startCycle = __builtin_amdgcn_s_memrealtime(); // Operate on wavefront granularity int numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock @@ -68,30 +69,34 @@ GpuCopyKernel(BlockParam* blockParams) loop1Offset += loop1Inc; } Nrem -= loop1Nelem; - if (Nrem == 0) return; - - // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration - int const loop2Npack = Nrem / FLOATS_PER_PACK; - int const loop2Nelem = loop2Npack * FLOATS_PER_PACK; - int const loop2Inc = BLOCKSIZE; - int loop2Offset = threadIdx.x; - - packedSrc = (PackedFloat_t const*)(src + loop1Nelem); - packedDst = (PackedFloat_t *)(dst + loop1Nelem); - while (loop2Offset < loop2Npack) + if (Nrem > 0) { - packedDst[loop2Offset] = packedSrc[loop2Offset]; - loop2Offset += loop2Inc; - } - Nrem -= loop2Nelem; - if (Nrem == 0) return; + // 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration + int const loop2Npack = Nrem / FLOATS_PER_PACK; + int const loop2Nelem = loop2Npack * FLOATS_PER_PACK; + int const loop2Inc = BLOCKSIZE; + int loop2Offset = threadIdx.x; - // Deal with leftovers less than FLOATS_PER_PACK) - if (threadIdx.x < Nrem) - { - int offset = loop1Nelem + loop2Nelem + threadIdx.x; - dst[offset] = src[offset]; + packedSrc = (PackedFloat_t const*)(src + loop1Nelem); + packedDst = (PackedFloat_t *)(dst + loop1Nelem); + while (loop2Offset < loop2Npack) + { + packedDst[loop2Offset] = packedSrc[loop2Offset]; + loop2Offset += loop2Inc; + } + Nrem -= loop2Nelem; + + // Deal with leftovers less than FLOATS_PER_PACK) + if (threadIdx.x < Nrem) + { + int offset = loop1Nelem + loop2Nelem + threadIdx.x; + dst[offset] = src[offset]; + } } + + __threadfence_system(); + if (threadIdx.x == 0) + blockParams[blockIdx.x].stopCycle = __builtin_amdgcn_s_memrealtime(); } #define MEMSET_UNROLL 8 diff --git a/projects/rccl/tools/TransferBench/Makefile b/projects/rccl/tools/TransferBench/Makefile index d12ca52887..21bfb42854 100644 --- a/projects/rccl/tools/TransferBench/Makefile +++ b/projects/rccl/tools/TransferBench/Makefile @@ -1,12 +1,9 @@ -# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. -HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) -HIP_PATH=../../.. -endif -HIPCC=$(HIP_PATH)/bin/hipcc +# Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +ROCM_PATH ?= /opt/rocm +HIPCC=$(ROCM_PATH)/bin/hipcc EXE=TransferBench -CXXFLAGS = -O3 -I. -lnuma -L$(HIP_PATH)/../hsa/lib -lhsa-runtime64 +CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64 all: $(EXE) diff --git a/projects/rccl/tools/TransferBench/README.md b/projects/rccl/tools/TransferBench/README.md new file mode 100644 index 0000000000..4733c4dfe6 --- /dev/null +++ b/projects/rccl/tools/TransferBench/README.md @@ -0,0 +1,14 @@ +# TransferBench + +TransferBench is a simple utility capable of benchmarking simultaneous copies between user-specified devices (CPUs/GPUs). + +## Requirements + +1. ROCm stack installed on the system (HIP runtime) +2. libnuma installed on system + +## Building + To build TransferBench: +* `make` + + If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp index db222cc504..4ec3df6e05 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.cpp +++ b/projects/rccl/tools/TransferBench/TransferBench.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -31,23 +31,14 @@ THE SOFTWARE. #include "GetClosestNumaNode.hpp" #include "Kernels.hpp" -// Simple configuration parameters -size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link - int main(int argc, char **argv) { - // Display usage + // Display usage instructions and detected topology if (argc <= 1) { - DisplayUsage(argv[0]); - DisplayTopology(); - exit(0); - } - - // If a negative value is listed for N, generate a comprehensive config file for this node - if (argc > 2 && atoll(argv[2]) < 0) - { - GenerateConfigFile(argv[1], -1*atoi(argv[2])); + int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0); + if (!outputToCsv) DisplayUsage(argv[0]); + DisplayTopology(outputToCsv); exit(0); } @@ -90,7 +81,7 @@ int main(int argc, char **argv) // Perform either local read (+remote write) [EXE = SRC] or // remote read (+local write) [EXE = DST] int readMode = (!strcmp(argv[1], "p2p_rr") || !strcmp(argv[1], "g2g_rr") ? 1 : 0); - int skipCpu = (!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr") ? 1 : 0); + int skipCpu = (!strcmp(argv[1], "g2g" ) || !strcmp(argv[1], "g2g_rr") ? 1 : 0); // Execute peer to peer benchmark mode RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float), numBlocksToUse, readMode, skipCpu); @@ -121,13 +112,14 @@ int main(int argc, char **argv) HIP_CALL(hipGetDeviceCount(&numGpuDevices)); int const numCpuDevices = numa_num_configured_nodes(); - // Track links that get used + // Track unique pair of links that get used std::set> peerAccessTracker; // Print CSV header if (ev.outputToCsv) { - printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters,useHipCall,useMemSet,useSingleSync,combinedTiming\n"); + printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms)," + "LinkDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n"); } // Loop over each line in the Link configuration file @@ -135,74 +127,97 @@ int main(int argc, char **argv) char line[2048]; while(fgets(line, 2048, fp)) { - // Check if line is a comment - if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') - printf("%s", line); + // Check if line is a comment to be echoed to output (starts with ##) + if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line); // Parse links from configuration file - std::vector links; - ParseLinks(line, numCpuDevices, numGpuDevices, links); + LinkMap linkMap; + ParseLinks(line, numCpuDevices, numGpuDevices, linkMap); + if (linkMap.size() == 0) continue; - int const numLinks = links.size(); - if (numLinks == 0) continue; testNum++; - // Prepare link - for (int i = 0; i < numLinks; i++) + // Prepare (maximum) memory for each link + std::vector linkList; + for (auto& exeInfoPair : linkMap) { - // Get some aliases to link variables - MemType const& exeMemType = links[i].exeMemType; - MemType const& srcMemType = links[i].srcMemType; - MemType const& dstMemType = links[i].dstMemType; - int const& blocksToUse = links[i].numBlocksToUse; + ExecutorInfo& exeInfo = exeInfoPair.second; + exeInfo.totalTime = 0.0; + exeInfo.totalBlocks = 0; - // Get potentially remapped device indices - int const srcIndex = RemappedIndex(links[i].srcIndex, srcMemType); - int const exeIndex = RemappedIndex(links[i].exeIndex, exeMemType); - int const dstIndex = RemappedIndex(links[i].dstIndex, dstMemType); - - // Enable peer-to-peer access if necessary (can only be called once per unique pair) - if (exeMemType == MEM_GPU) + for (Link& link : exeInfo.links) { - // Ensure executing GPU can access source memory - if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex) + // Get some aliases to link variables + MemType const& exeMemType = link.exeMemType; + MemType const& srcMemType = link.srcMemType; + MemType const& dstMemType = link.dstMemType; + int const& blocksToUse = link.numBlocksToUse; + + // Get potentially remapped device indices + int const srcIndex = RemappedIndex(link.srcIndex, srcMemType); + int const exeIndex = RemappedIndex(link.exeIndex, exeMemType); + int const dstIndex = RemappedIndex(link.dstIndex, dstMemType); + + // Enable peer-to-peer access if necessary (can only be called once per unique pair) + if (exeMemType == MEM_GPU) { - auto exeSrcPair = std::make_pair(exeIndex, srcIndex); - if (!peerAccessTracker.count(exeSrcPair)) + // Ensure executing GPU can access source memory + if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex) { - EnablePeerAccess(exeIndex, srcIndex); - peerAccessTracker.insert(exeSrcPair); + auto exeSrcPair = std::make_pair(exeIndex, srcIndex); + if (!peerAccessTracker.count(exeSrcPair)) + { + EnablePeerAccess(exeIndex, srcIndex); + peerAccessTracker.insert(exeSrcPair); + } + } + + // Ensure executing GPU can access destination memory + if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex) + { + auto exeDstPair = std::make_pair(exeIndex, dstIndex); + if (!peerAccessTracker.count(exeDstPair)) + { + EnablePeerAccess(exeIndex, dstIndex); + peerAccessTracker.insert(exeDstPair); + } } } - // Ensure executing GPU can access destination memory - if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex) - { - auto exeDstPair = std::make_pair(exeIndex, dstIndex); - if (!peerAccessTracker.count(exeDstPair)) - { - EnablePeerAccess(exeIndex, dstIndex); - peerAccessTracker.insert(exeDstPair); - } - } + // Allocate (maximum) source / destination memory based on type / device index + AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&link.srcMem); + AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&link.dstMem); + link.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerLink : blocksToUse); + exeInfo.totalBlocks += link.blockParam.size(); + linkList.push_back(&link); } - // Allocate (maximum) source / destination memory based on type / device index - AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].srcMem); - AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, &links[i].dstMem); - - // Prepare execution agent + // Prepare GPU resources for GPU executors + MemType const exeMemType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); if (exeMemType == MEM_GPU) { - HIP_CALL(hipSetDevice(exeIndex)); - HIP_CALL(hipEventCreate(&links[i].startEvent)); - HIP_CALL(hipEventCreate(&links[i].stopEvent)); - HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * blocksToUse)); - HIP_CALL(hipStreamCreate(&links[i].stream)); - } - else if (exeMemType == MEM_CPU) - { - links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam)); + AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam), + (void**)&exeInfo.blockParamGpu); + + int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size(); + exeInfo.streams.resize(numLinksToRun); + exeInfo.startEvents.resize(numLinksToRun); + exeInfo.stopEvents.resize(numLinksToRun); + for (int i = 0; i < numLinksToRun; ++i) + { + HIP_CALL(hipSetDevice(exeIndex)); + HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); + HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); + HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); + } + + int linkOffset = 0; + for (int i = 0; i < exeInfo.links.size(); i++) + { + exeInfo.links[i].blockParamGpuPtr = exeInfo.blockParamGpu + linkOffset; + linkOffset += exeInfo.links[i].blockParam.size(); + } } } @@ -211,60 +226,32 @@ int main(int argc, char **argv) { if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); - // Prepare links based on current N - for (int i = 0; i < numLinks; i++) + // Prepare input memory and block parameters for current N + for (auto& exeInfoPair : linkMap) { - // Initialize source memory with patterned data - CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].srcMem + initOffset); + ExecutorInfo& exeInfo = exeInfoPair.second; + int linkOffset = 0; - // Each block needs to know src/dst pointers and how many elements to transfer - // Figure out the sub-array each block does for this Link - // - Partition N as evenly as posible, but try to keep blocks as multiples of BLOCK_BYTES bytes, - // except the very last one, for alignment reasons - int targetMultiple = ev.blockBytes / sizeof(float); - if (links[i].exeMemType == MEM_GPU) + for (int i = 0; i < exeInfo.links.size(); ++i) { - size_t assigned = 0; - int maxNumBlocksToUse = std::min((N + targetMultiple - 1) / targetMultiple, (size_t)links[i].numBlocksToUse); - for (int j = 0; j < links[i].numBlocksToUse; j++) - { - BlockParam param; - int blocksLeft = std::max(0, maxNumBlocksToUse - j); - size_t leftover = N - assigned; - size_t roundedN = (leftover + targetMultiple - 1) / targetMultiple; - param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * targetMultiple)) : 0; - param.src = links[i].srcMem + assigned + initOffset; - param.dst = links[i].dstMem + assigned + initOffset; - assigned += param.N; + Link& link = exeInfo.links[i]; + link.PrepareBlockParams(ev, N); - HIP_CALL(hipMemcpy(&links[i].blockParam[j], ¶m, sizeof(BlockParam), hipMemcpyHostToDevice)); + // Copy block parameters to GPU for GPU executors + if (link.exeMemType == MEM_GPU) + { + HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[linkOffset], + link.blockParam.data(), + link.blockParam.size() * sizeof(BlockParam), + hipMemcpyHostToDevice)); + linkOffset += link.blockParam.size(); } } - else if (links[i].exeMemType == MEM_CPU) - { - // For CPU-based copy, divded based on the number of child threads - size_t assigned = 0; - int maxNumBlocksToUse = std::min((N + targetMultiple - 1) / targetMultiple, (size_t)ev.numCpuPerLink); - for (int j = 0; j < ev.numCpuPerLink; j++) - { - int blocksLeft = std::max(0, maxNumBlocksToUse - j); - size_t leftover = N - assigned; - size_t roundedN = (leftover + targetMultiple - 1) / targetMultiple; - links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * targetMultiple)) : 0; - links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset; - links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset; - assigned += links[i].blockParam[j].N; - } - } - - // Initialize timing - links[i].totalTime = 0.0; } - double totalCpuTime = 0; - // Launch kernels (warmup iterations are not counted) + double totalCpuTime = 0; for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++) { // Pause before starting first timed iteration in interactive mode @@ -279,10 +266,16 @@ int main(int argc, char **argv) auto cpuStart = std::chrono::high_resolution_clock::now(); // Execute all links in parallel - for (int i = 0; i < numLinks; i++) - threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i]))); + for (auto& exeInfoPair : linkMap) + { + ExecutorInfo& exeInfo = exeInfoPair.second; + int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size(); + for (int i = 0; i < numLinksToRun; ++i) + threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(exeInfo), i)); + } // Wait for all threads to finish + int const numLinks = threads.size(); for (int i = 0; i < numLinks; i++) { threads.top().join(); @@ -293,6 +286,8 @@ int main(int argc, char **argv) auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; double deltaSec = std::chrono::duration_cast>(cpuDelta).count(); + + if (iteration >= 0) totalCpuTime += deltaSec; } @@ -305,48 +300,89 @@ int main(int argc, char **argv) } // Validate that each link has transferred correctly - for (int i = 0; i < numLinks; i++) - CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].dstMem + initOffset); + int const numLinks = linkList.size(); + for (auto link : linkList) + CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, link->dstMem + initOffset); // Report timings totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000; double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime; double maxGpuTime = 0; - for (int i = 0; i < numLinks; i++) + + if (ev.useSingleStream) { - double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations); - double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; - maxGpuTime = std::max(maxGpuTime, linkDurationMsec); - if (!ev.outputToCsv) + for (auto& exeInfoPair : linkMap) { - printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s", - i + 1, - MemTypeStr[links[i].srcMemType], links[i].srcIndex, - MemTypeStr[links[i].exeMemType], links[i].exeIndex, - links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse, - MemTypeStr[links[i].dstMemType], links[i].dstIndex, - linkBandwidthGbs, linkDurationMsec, - GetLinkDesc(links[i]).c_str()); - if (ev.showAddr) printf(" %16p | %16p |", links[i].srcMem + initOffset, links[i].dstMem + initOffset); - printf("\n"); + ExecutorInfo const& exeInfo = exeInfoPair.second; + MemType const exeMemType = exeInfoPair.first.first; + int const exeIndex = exeInfoPair.first.second; + + double exeDurationMsec = exeInfo.totalTime / (1.0 * ev.numIterations); + double exeBandwidthGbs = (exeInfo.links.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, exeDurationMsec); + + if (!ev.outputToCsv) + { + printf(" Executor: %cPU %02d (# Links %02lu)| %9.3f GB/s | %8.3f ms |\n", + MemTypeStr[exeMemType], exeIndex, exeInfo.links.size(), exeBandwidthGbs, exeDurationMsec); + for (auto link : exeInfo.links) + { + double linkDurationMsec = link.linkTime / (1.0 * ev.numIterations); + double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; + + printf(" Link %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%02d) -> %c%02d\n", + link.linkIndex, + linkBandwidthGbs, + linkDurationMsec, + MemTypeStr[link.srcMemType], link.srcIndex, + MemTypeStr[link.exeMemType], link.exeIndex, + link.exeMemType == MEM_CPU ? ev.numCpuPerLink : link.numBlocksToUse, + MemTypeStr[link.dstMemType], link.dstIndex); + } + } + else + { + printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%d\n", + testNum, N * sizeof(float), + MemTypeStr[exeMemType], exeIndex, + exeBandwidthGbs, exeDurationMsec, + ev.byteOffset, + ev.numWarmups, ev.numIterations); + } } - else + } + else + { + for (auto link : linkList) { - printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%d,%s,%s,%s,%s\n", - testNum, N * sizeof(float), - MemTypeStr[links[i].srcMemType], links[i].srcIndex, - MemTypeStr[links[i].exeMemType], links[i].exeIndex, - MemTypeStr[links[i].dstMemType], links[i].dstIndex, - links[i].exeMemType == MEM_CPU ? ev.numCpuPerLink : links[i].numBlocksToUse, - linkBandwidthGbs, linkDurationMsec, - GetLinkDesc(links[i]).c_str(), - links[i].srcMem + initOffset, links[i].dstMem + initOffset, - ev.byteOffset, - ev.numWarmups, ev.numIterations, - ev.useHipCall ? "true" : "false", - ev.useMemset ? "true" : "false", - ev.useSingleSync ? "true" : "false", - ev.combineTiming ? "true" : "false"); + double linkDurationMsec = link->linkTime / (1.0 * ev.numIterations); + double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, linkDurationMsec); + if (!ev.outputToCsv) + { + printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n", + link->linkIndex, + MemTypeStr[link->srcMemType], link->srcIndex, + MemTypeStr[link->exeMemType], link->exeIndex, + link->exeMemType == MEM_CPU ? ev.numCpuPerLink : link->numBlocksToUse, + MemTypeStr[link->dstMemType], link->dstIndex, + linkBandwidthGbs, linkDurationMsec, + GetLinkDesc(*link).c_str()); + } + else + { + printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%d\n", + testNum, N * sizeof(float), + MemTypeStr[link->srcMemType], link->srcIndex, + MemTypeStr[link->exeMemType], link->exeIndex, + MemTypeStr[link->dstMemType], link->dstIndex, + link->exeMemType == MEM_CPU ? ev.numCpuPerLink : link->numBlocksToUse, + linkBandwidthGbs, linkDurationMsec, + GetLinkDesc(*link).c_str(), + link->srcMem + initOffset, link->dstMem + initOffset, + ev.byteOffset, + ev.numWarmups, ev.numIterations); + } } } @@ -358,32 +394,41 @@ int main(int argc, char **argv) } else { - printf("%d,%lu,ALL,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%d,%s,%s,%s,%s\n", + printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%d\n", testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset, - ev.numWarmups, ev.numIterations, - ev.useHipCall ? "true" : "false", - ev.useMemset ? "true" : "false", - ev.useSingleSync ? "true" : "false", - ev.combineTiming ? "true" : "false"); + ev.numWarmups, ev.numIterations); } } // Release GPU memory - for (int i = 0; i < numLinks; i++) + for (auto exeInfoPair : linkMap) { - DeallocateMemory(links[i].srcMemType, links[i].srcMem); - DeallocateMemory(links[i].dstMemType, links[i].dstMem); + ExecutorInfo& exeInfo = exeInfoPair.second; + for (auto& link : exeInfo.links) + { + // Get some aliases to link variables + MemType const& exeMemType = link.exeMemType; + MemType const& srcMemType = link.srcMemType; + MemType const& dstMemType = link.dstMemType; - if (links[i].exeMemType == MEM_GPU) - { - HIP_CALL(hipEventDestroy(links[i].startEvent)); - HIP_CALL(hipEventDestroy(links[i].stopEvent)); - HIP_CALL(hipStreamDestroy(links[i].stream)); - HIP_CALL(hipFree(links[i].blockParam)); + // Allocate (maximum) source / destination memory based on type / device index + DeallocateMemory(srcMemType, link.srcMem); + DeallocateMemory(dstMemType, link.dstMem); + link.blockParam.clear(); } - else if (links[i].exeMemType == MEM_CPU) + + MemType const exeMemType = exeInfoPair.first.first; + int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); + if (exeMemType == MEM_GPU) { - free(links[i].blockParam); + DeallocateMemory(exeMemType, exeInfo.blockParamGpu); + int const numLinksToRun = ev.useSingleStream ? 1 : exeInfo.links.size(); + for (int i = 0; i < numLinksToRun; ++i) + { + HIP_CALL(hipEventDestroy(exeInfo.startEvents[i])); + HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i])); + HIP_CALL(hipStreamDestroy(exeInfo.streams[i])); + } } } } @@ -394,6 +439,9 @@ int main(int argc, char **argv) void DisplayUsage(char const* cmdName) { + printf("TransferBench v%s\n", TB_VERSION); + printf("========================================\n"); + if (numa_available() == -1) { printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); @@ -405,7 +453,7 @@ void DisplayUsage(char const* cmdName) printf("Usage: %s config \n", cmdName); printf(" config: Either:\n"); - printf(" - Filename of configFile containing Links to execute (see below for format)\n"); + printf(" - Filename of configFile containing Links to execute (see example.cfg for format)\n"); printf(" - Name of preset benchmark:\n"); printf(" p2p - All CPU/GPU pairs benchmark\n"); printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n"); @@ -413,203 +461,15 @@ void DisplayUsage(char const* cmdName) printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n"); printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n"); printf(" N : (Optional) Number of bytes to transfer per link.\n"); - printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK); + printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", + DEFAULT_BYTES_PER_LINK); printf(" If 0 is specified, a range of Ns will be benchmarked\n"); - printf(" If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n"); printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n"); printf("\n"); - printf("Configfile Format:\n"); - printf("==================\n"); - printf("A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU\n"); - printf("Each single line in the configuration file defines a set of Links to run in parallel\n"); - printf("\n"); - printf("There are two ways to specify the configuration file:\n"); - printf("\n"); - printf("1) Basic\n"); - printf(" The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Link\n"); - printf(" A positive number of Links is specified followed by that number of triplets describing each Link\n"); - printf("\n"); - printf(" #Links #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)\n"); - printf("\n"); - printf("2) Advanced\n"); - printf(" The advanced specification allows different number of threadblocks/CUs used per GPU-executed Link\n"); - printf(" A negative number of links is specified, followed by quadruples describing each Link\n"); - printf(" -#Links (srcMem1->Executor1->dstMem1 #CUs1) ... (srcMemL->ExecutorL->dstMemL #CUsL)\n"); - printf("\n"); - printf("Argument Details:\n"); - printf(" #Links : Number of Links to be run in parallel\n"); - printf(" #CUs : Number of threadblocks/CUs to use for a GPU-executed Link\n"); - printf(" srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode\n"); - printf(" Executor: Executor are specified by a character indicating executor type, followed by device index (0-indexed)\n"); - printf(" - C: CPU-executed (Indexed from 0 to %d)\n", numCpuDevices-1); - printf(" - G: GPU-executed (Indexed from 0 to %d)\n", numGpuDevices-1); - printf(" dstMemL : Destination memory location (Where the data is to be written to)\n"); - printf("\n"); - printf(" Memory locations are specified by a character indicating memory type, followed by device index (0-indexed)\n"); - printf(" Supported memory locations are:\n"); - printf(" - C: Pinned host memory (on NUMA node, indexed from 0 to %d)\n", numCpuDevices-1); - printf(" - G: Global device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1); - printf(" - F: Fine-grain device memory (on GPU device indexed from 0 to %d)\n", numGpuDevices-1); - printf("\n"); - printf("Examples:\n"); - printf("1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1\n"); - printf("1 4 (G1->C0->G0) Single Link that uses CPU 0 to read memory from GPU 1 and then copies it to memory on GPU 0\n"); - printf("1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2\n"); - printf("2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs\n"); - printf("-2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs\n"); - printf("\n"); - printf("Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary\n"); - printf("Lines starting with # will be ignored. Lines starting with ## will be echoed to output\n"); - printf("\n"); EnvVars::DisplayUsage(); } -void GenerateConfigFile(char const* cfgFile, int numBlocks) -{ - // Detect number of available GPUs and skip if less than 2 - int numGpuDevices; - HIP_CALL(hipGetDeviceCount(&numGpuDevices)); - printf("Generating configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks); - if (numGpuDevices < 2) - { - printf("Skipping. (Less than 2 GPUs detected)\n"); - exit(0); - } - - // Check first to see if file exists, and issue warning - FILE* exists = fopen(cfgFile, "r"); - if (exists) - { - fclose(exists); - printf("[WARN] File %s alreadys exists. Enter 'Y' to confirm overwrite\n", cfgFile); - char ch; - scanf(" %c", &ch); - if (ch != 'Y' && ch != 'y') - { - printf("Aborting\n"); - exit(0); - } - } - - // Open config file for writing - FILE* fp = fopen(cfgFile, "w"); - if (!fp) - { - printf("Unable to open [%s] for writing\n", cfgFile); - exit(1); - } - - // CU testing - fprintf(fp, "# CU scaling tests\n"); - for (int i = 1; i < 16; i++) - fprintf(fp, "1 %d (G0->G0->G1)\n", i); - fprintf(fp, "\n"); - - // Pinned memory testing - fprintf(fp, "# Pinned CPU memory read tests\n"); - for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, "1 %d (C0->G%d->G%d)\n", numBlocks, i, i); - fprintf(fp, "\n"); - - fprintf(fp, "# Pinned CPU memory write tests\n"); - for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, "1 %d (G%d->G%d->C0)\n", numBlocks, i, i); - fprintf(fp, "\n"); - - // Single link testing GPU testing - fprintf(fp, "# Unidirectional link GPU tests\n"); - for (int i = 0; i < numGpuDevices; i++) - for (int j = 0; j < numGpuDevices; j++) - { - if (i == j) continue; - fprintf(fp, "1 %d (G%d->G%d->G%d)\n", numBlocks, i, i, j); - } - fprintf(fp, "\n"); - - // Bi-directional link testing - fprintf(fp, "# Bi-directional link tests\n"); - for (int i = 0; i < numGpuDevices; i++) - for (int j = 0; j < numGpuDevices; j++) - { - if (i == j) continue; - fprintf(fp, "2 %d (G%d->G%d->G%d) (G%d->G%d->G%d)\n", numBlocks, i, i, j, j, j, i); - } - fprintf(fp, "\n"); - - // Simple uni-directional ring - fprintf(fp, "# Simple unidirectional ring\n"); - fprintf(fp, "%d %d", numGpuDevices, numBlocks); - for (int i = 0; i < numGpuDevices; i++) - { - fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices); - } - fprintf(fp, "\n\n"); - - // Simple bi-directional ring - fprintf(fp, "# Simple bi-directional ring\n"); - fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks); - for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+1)%numGpuDevices); - for (int i = 0; i < numGpuDevices; i++) - fprintf(fp, " (G%d->G%d->G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices); - fprintf(fp, "\n\n"); - - // Broadcast from GPU 0 - fprintf(fp, "# GPU 0 Broadcast\n"); - fprintf(fp, "%d %d", numGpuDevices-1, numBlocks); - for (int i = 1; i < numGpuDevices; i++) - fprintf(fp, " (G%d->G%d->G%d)", 0, 0, i); - fprintf(fp, "\n\n"); - - // Gather to GPU 0 - fprintf(fp, "# GPU 0 Gather\n"); - fprintf(fp, "%d %d", numGpuDevices-1, numBlocks); - for (int i = 1; i < numGpuDevices; i++) - fprintf(fp, " (G%d->G%d->G%d)", i, 0, 0); - fprintf(fp, "\n\n"); - - // Full stress test - fprintf(fp, "# Full stress test\n"); - fprintf(fp, "%d %d", numGpuDevices * (numGpuDevices-1), numBlocks); - for (int i = 0; i < numGpuDevices; i++) - for (int j = 0; j < numGpuDevices; j++) - { - if (i == j) continue; - fprintf(fp, " (G%d->G%d->G%d)", i, i, j); - } - fprintf(fp, "\n\n"); - - // All single-hop XGMI links - int numSingleHopXgmiLinks = 0; - for (int i = 0; i < numGpuDevices; i++) - for (int j = 0; j < numGpuDevices; j++) - { - if (i == j) continue; - uint32_t linkType, hopCount; - HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount)); - if (linkType == HSA_AMD_LINK_INFO_TYPE_XGMI && hopCount == 1) numSingleHopXgmiLinks++; - } - if (numSingleHopXgmiLinks > 0) - { - fprintf(fp, "# All single-hop links\n"); - fprintf(fp, "%d %d", numSingleHopXgmiLinks, numBlocks); - for (int i = 0; i < numGpuDevices; i++) - for (int j = 0; j < numGpuDevices; j++) - { - if (i == j) continue; - uint32_t linkType, hopCount; - HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount)); - if (linkType == HSA_AMD_LINK_INFO_TYPE_XGMI && hopCount == 1) - { - fprintf(fp, " (G%d G%d F%d)", i, i, j); - } - } - fprintf(fp, "\n\n"); - } - fclose(fp); -} - int RemappedIndex(int const origIdx, MemType const memType) { static std::vector remapping; @@ -650,44 +510,67 @@ int RemappedIndex(int const origIdx, MemType const memType) return remapping[origIdx]; } -void DisplayTopology() +void DisplayTopology(bool const outputToCsv) { int numGpuDevices; HIP_CALL(hipGetDeviceCount(&numGpuDevices)); - printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices); - printf(" |"); - for (int j = 0; j < numGpuDevices; j++) - printf(" GPU %02d |", j); - printf(" PCIe Bus ID | Closest NUMA\n"); - for (int j = 0; j <= numGpuDevices; j++) - printf("--------+"); - printf("--------------+-------------\n"); + + if (outputToCsv) + { + printf("NumCpus,%d\n", numa_num_configured_nodes()); + printf("NumGpus,%d\n", numGpuDevices); + printf("GPU"); + for (int j = 0; j < numGpuDevices; j++) + printf(",GPU %02d", j); + printf(",PCIe Bus ID,ClosestNUMA\n"); + } + else + { + printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices); + printf(" |"); + for (int j = 0; j < numGpuDevices; j++) + printf(" GPU %02d |", j); + printf(" PCIe Bus ID | Closest NUMA\n"); + for (int j = 0; j <= numGpuDevices; j++) + printf("--------+"); + printf("--------------+-------------\n"); + } char pciBusId[20]; + for (int i = 0; i < numGpuDevices; i++) { - printf(" GPU %02d |", i); + printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |"); for (int j = 0; j < numGpuDevices; j++) { if (i == j) - printf(" - |"); + { + if (outputToCsv) + printf("-,"); + else + printf(" - |"); + } else { uint32_t linkType, hopCount; HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, MEM_GPU), RemappedIndex(j, MEM_GPU), &linkType, &hopCount)); - printf(" %s-%d |", + printf("%s%s-%d%s", + outputToCsv ? "" : " ", linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" : linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" : linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" : linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" : linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????", - hopCount); + hopCount, outputToCsv ? "," : " |"); } } HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, RemappedIndex(i, MEM_GPU))); - printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU))); + if (outputToCsv) + printf("%s,%d\n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU))); + else + printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU))); } } @@ -730,30 +613,23 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus char typeChar; if (sscanf(token.c_str(), " %c %d", &typeChar, memIndex) != 2) { - printf("[ERROR] Unable to parse memory type token %s - expecting either 'C' or 'G' or 'F' followed by an index\n", token.c_str()); + printf("[ERROR] Unable to parse memory type token %s - expecting either 'B,C,G or F' followed by an index\n", + token.c_str()); exit(1); } switch (typeChar) { - case 'C': case 'c': - *memType = MEM_CPU; + case 'C': case 'c': case 'B': case 'b': + *memType = (typeChar == 'C' || typeChar == 'c') ? MEM_CPU : MEM_CPU_FINE; if (*memIndex < 0 || *memIndex >= numCpus) { printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", numCpus-1, *memIndex); exit(1); } break; - case 'G': case 'g': - *memType = MEM_GPU; - if (*memIndex < 0 || *memIndex >= numGpus) - { - printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex); - exit(1); - } - break; - case 'F': case 'f': - *memType = MEM_GPU_FINE; + case 'G': case 'g': case 'F': case 'f': + *memType = (typeChar == 'G' || typeChar == 'g') ? MEM_GPU : MEM_GPU_FINE; if (*memIndex < 0 || *memIndex >= numGpus) { printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", numGpus-1, *memIndex); @@ -761,24 +637,22 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus } break; default: - printf("[ERROR] Unrecognized memory type %s. Expecting either 'C' or 'G' or 'F'\n", token.c_str()); + printf("[ERROR] Unrecognized memory type %s. Expecting either 'B', 'C' or 'G' or 'F'\n", token.c_str()); exit(1); } } // Helper function to parse a list of link definitions -void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) +void ParseLinks(char* line, int numCpus, int numGpus, LinkMap& linkMap) { // Replace any round brackets or '->' with spaces, for (int i = 1; line[i]; i++) if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; - links.clear(); + linkMap.clear(); int numLinks = 0; - std::istringstream iss; - iss.clear(); - iss.str(line); + std::istringstream iss(line); iss >> numLinks; if (iss.fail()) return; @@ -795,50 +669,64 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse); exit(1); } - links.resize(numLinks); for (int i = 0; i < numLinks; i++) { + Link link; + link.linkIndex = i; iss >> srcMem >> exeMem >> dstMem; if (iss.fail()) { printf("Parsing error: Unable to read valid Link triplet (possibly missing a SRC or EXE or DST)\n"); exit(1); } - ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex); - ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); - ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex); - links[i].numBlocksToUse = numBlocksToUse; - if (links[i].exeMemType != MEM_CPU && links[i].exeMemType != MEM_GPU) + ParseMemType(srcMem, numCpus, numGpus, &link.srcMemType, &link.srcIndex); + ParseMemType(exeMem, numCpus, numGpus, &link.exeMemType, &link.exeIndex); + ParseMemType(dstMem, numCpus, numGpus, &link.dstMemType, &link.dstIndex); + link.numBlocksToUse = numBlocksToUse; + + // Ensure executor is either CPU or GPU + if (link.exeMemType != MEM_CPU && link.exeMemType != MEM_GPU) { printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n", - srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), links[i].numBlocksToUse); + srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), link.numBlocksToUse); exit(1); } + + Executor executor(link.exeMemType, link.exeIndex); + ExecutorInfo& executorInfo = linkMap[executor]; + executorInfo.totalBlocks += link.numBlocksToUse; + executorInfo.links.push_back(link); } } else { // Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles numLinks *= -1; - links.resize(numLinks); for (int i = 0; i < numLinks; i++) { - iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse; + Link link; + link.linkIndex = i; + iss >> srcMem >> exeMem >> dstMem >> link.numBlocksToUse; if (iss.fail()) { printf("Parsing error: Unable to read valid Link quadruple (possibly missing a SRC or EXE or DST or #CU)\n"); exit(1); } - ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex); - ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); - ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex); - if (links[i].exeMemType != MEM_CPU && links[i].exeMemType != MEM_GPU) + ParseMemType(srcMem, numCpus, numGpus, &link.srcMemType, &link.srcIndex); + ParseMemType(exeMem, numCpus, numGpus, &link.exeMemType, &link.exeIndex); + ParseMemType(dstMem, numCpus, numGpus, &link.dstMemType, &link.dstIndex); + if (link.exeMemType != MEM_CPU && link.exeMemType != MEM_GPU) { printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n" -, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), links[i].numBlocksToUse); +, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), link.numBlocksToUse); exit(1); } + + Executor executor(link.exeMemType, link.exeIndex); + ExecutorInfo& executorInfo = linkMap[executor]; + executorInfo.totalBlocks += link.numBlocksToUse; + executorInfo.links.push_back(link); } } } @@ -856,7 +744,7 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId) HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0)); } -void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr) +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) { if (numBytes == 0) { @@ -864,7 +752,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP exit(1); } - if (memType == MEM_CPU) + if (memType == MEM_CPU || memType == MEM_CPU_FINE) { // Set numa policy prior to call to hipHostMalloc // NOTE: It may be possible that the actual configured numa nodes do not start at 0 @@ -884,7 +772,15 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP } // Allocate host-pinned memory (should respect NUMA mem policy) - HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent)); + + if (memType == MEM_CPU_FINE) + { + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); + } + else + { + HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent)); + } // Check that the allocated pages are actually on the correct NUMA node CheckPages((char*)*memPtr, numBytes, numaIdx); @@ -915,9 +811,9 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP } } -void DeallocateMemory(MemType memType, float* memPtr) +void DeallocateMemory(MemType memType, void* memPtr) { - if (memType == MEM_CPU) + if (memType == MEM_CPU || memType == MEM_CPU_FINE) { HIP_CALL(hipHostFree(memPtr)); } @@ -1041,9 +937,9 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount) std::string GetDesc(MemType srcMemType, int srcIndex, MemType dstMemType, int dstIndex) { - if (srcMemType == MEM_CPU) + if (srcMemType == MEM_CPU || srcMemType == MEM_CPU_FINE) { - if (dstMemType == MEM_CPU) + if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE) return (srcIndex == dstIndex) ? "LOCAL" : "NUMA"; else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) return "PCIE"; @@ -1052,7 +948,7 @@ std::string GetDesc(MemType srcMemType, int srcIndex, } else if (srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) { - if (dstMemType == MEM_CPU) + if (dstMemType == MEM_CPU || dstMemType == MEM_CPU_FINE) return "PCIE"; else if (dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) { @@ -1080,64 +976,93 @@ std::string GetLinkDesc(Link const& link) + GetDesc(link.exeMemType, link.exeIndex, link.dstMemType, link.dstIndex); } -void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link) +void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx) { + Link& link = exeInfo.links[linkIdx]; + // GPU execution agent if (link.exeMemType == MEM_GPU) { // Switch to executing GPU - HIP_CALL(hipSetDevice(RemappedIndex(link.exeIndex, MEM_GPU))); + int const exeIndex = RemappedIndex(link.exeIndex, MEM_GPU); + HIP_CALL(hipSetDevice(exeIndex)); - bool recordStart = (!ev.useSingleSync || iteration == 0); - bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1); + hipStream_t& stream = exeInfo.streams[linkIdx]; + hipEvent_t& startEvent = exeInfo.startEvents[linkIdx]; + hipEvent_t& stopEvent = exeInfo.stopEvents[linkIdx]; + + bool recordStart = (!ev.useSingleSync || iteration == 0 || ev.useSingleStream); + bool recordStop = (!ev.useSingleSync || iteration == ev.numIterations - 1 || ev.useSingleStream); int const initOffset = ev.byteOffset / sizeof(float); if (ev.useHipCall) { // Record start event - if (recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream)); + if (recordStart) HIP_CALL(hipEventRecord(startEvent, stream)); // Execute hipMemset / hipMemcpy if (ev.useMemset) - HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), link.stream)); + HIP_CALL(hipMemsetAsync(link.dstMem + initOffset, 42, N * sizeof(float), stream)); else HIP_CALL(hipMemcpyAsync(link.dstMem + initOffset, link.srcMem + initOffset, N * sizeof(float), hipMemcpyDefault, - link.stream)); + stream)); // Record stop event - if (recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream)); + if (recordStop) HIP_CALL(hipEventRecord(stopEvent, stream)); } else { - if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(link.startEvent, link.stream)); + if (!ev.combineTiming && recordStart) HIP_CALL(hipEventRecord(startEvent, stream)); + int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalBlocks : link.numBlocksToUse; hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel, - dim3(link.numBlocksToUse, 1, 1), + dim3(numBlocksToRun, 1, 1), dim3(BLOCKSIZE, 1, 1), - ev.sharedMemBytes, link.stream, - (ev.combineTiming && recordStart) ? link.startEvent : NULL, - (ev.combineTiming && recordStop) ? link.stopEvent : NULL, - 0, link.blockParam); - if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(link.stopEvent, link.stream)); + ev.sharedMemBytes, stream, + (ev.combineTiming && recordStart) ? startEvent : NULL, + (ev.combineTiming && recordStop) ? stopEvent : NULL, + 0, link.blockParamGpuPtr); + if (!ev.combineTiming & recordStop) HIP_CALL(hipEventRecord(stopEvent, stream)); } // Synchronize per iteration, unless in single sync mode, in which case // synchronize during last warmup / last actual iteration if (!ev.useSingleSync || iteration == -1 || iteration == ev.numIterations - 1) { - HIP_CALL(hipStreamSynchronize(link.stream)); + HIP_CALL(hipStreamSynchronize(stream)); } if (iteration >= 0) { // Record GPU timing - if (!ev.useSingleSync || iteration == ev.numIterations - 1) + if (!ev.useSingleSync || iteration == ev.numIterations - 1 || ev.useSingleStream) { - HIP_CALL(hipEventSynchronize(link.stopEvent)); + HIP_CALL(hipEventSynchronize(stopEvent)); float gpuDeltaMsec; - HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, link.startEvent, link.stopEvent)); - link.totalTime += gpuDeltaMsec; + HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); + + if (ev.useSingleStream) + { + for (Link& currLink : exeInfo.links) + { + long long minStartCycle = currLink.blockParamGpuPtr[0].startCycle; + long long maxStopCycle = currLink.blockParamGpuPtr[0].stopCycle; + for (int i = 1; i < currLink.numBlocksToUse; i++) + { + minStartCycle = std::min(minStartCycle, currLink.blockParamGpuPtr[i].startCycle); + maxStopCycle = std::max(maxStopCycle, currLink.blockParamGpuPtr[i].stopCycle); + } + int const wallClockRate = GetWallClockRate(exeIndex); + double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); + currLink.linkTime += iterationTimeMs; + } + exeInfo.totalTime += gpuDeltaMsec; + } + else + { + link.linkTime += gpuDeltaMsec; + } } } } @@ -1166,7 +1091,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link) // Record time if not a warmup iteration if (iteration >= 0) - link.totalTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); + link.linkTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); } } @@ -1183,26 +1108,36 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, in for (int j = 0; j < numGpus; j++) if (i != j) EnablePeerAccess(i, j); - printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float)); - printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink); - printf("Using %d CUs per transfer\n", numBlocksToUse); + if (!ev.outputToCsv) + { + printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float)); + printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink); + printf("Using %d CUs per transfer\n", numBlocksToUse); + } + else + { + printf("SRC,DST,Direction,ReadMode,BW(GB/s),Bytes\n"); + } // Perform unidirectional / bidirectional for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) { // Print header - printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni", - readMode == 0 ? "Local" : "Remote", - readMode == 0 ? "Remote" : "Local"); - printf("%10s", "D/D"); - if (!skipCpu) + if (!ev.outputToCsv) { - for (int i = 0; i < numCpus; i++) - printf("%7s %02d", "CPU", i); + printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write]\n", isBidirectional ? "Bi" : "Uni", + readMode == 0 ? "Local" : "Remote", + readMode == 0 ? "Remote" : "Local"); + printf("%10s", "D/D"); + if (!skipCpu) + { + for (int i = 0; i < numCpus; i++) + printf("%7s %02d", "CPU", i); + } + for (int i = 0; i < numGpus; i++) + printf("%7s %02d", "GPU", i); + printf("\n"); } - for (int i = 0; i < numGpus; i++) - printf("%7s %02d", "GPU", i); - printf("\n"); // Loop over all possible src/dst pairs for (int src = 0; src < numDevices; src++) @@ -1210,95 +1145,107 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, in MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU); if (skipCpu && srcMemType == MEM_CPU) continue; int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus); - printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + if (!ev.outputToCsv) + printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex); for (int dst = 0; dst < numDevices; dst++) { MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU); if (skipCpu && dstMemType == MEM_CPU) continue; int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus); - double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex, readMode); - if (bandwidth == 0) - printf("%10s", "N/A"); + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, readMode, numBlocksToUse, + srcMemType, srcIndex, dstMemType, dstIndex); + if (!ev.outputToCsv) + { + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + } else - printf("%10.2f", bandwidth); + { + printf("%s %02d,%s %02d,%s,%s,%.2f,%lu\n", + srcMemType == MEM_CPU ? "CPU" : "GPU", + srcIndex, + dstMemType == MEM_CPU ? "CPU" : "GPU", + dstIndex, + isBidirectional ? "bidirectional" : "unidirectional", + readMode == 0 ? "Local" : "Remote", + bandwidth, + N * sizeof(float)); + } fflush(stdout); } - printf("\n"); + if (!ev.outputToCsv) printf("\n"); } - printf("\n"); + if (!ev.outputToCsv) printf("\n"); } } -double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, - MemType srcMemType, int srcIndex, - MemType dstMemType, int dstIndex, - int readMode) +double GetPeakBandwidth(EnvVars const& ev, + size_t const N, + int const isBidirectional, + int const readMode, + int const numBlocksToUse, + MemType const srcMemType, + int const srcIndex, + MemType const dstMemType, + int const dstIndex) { - Link links[2]; - int const initOffset = ev.byteOffset / sizeof(float); - // Skip bidirectional on same device if (isBidirectional && srcMemType == dstMemType && srcIndex == dstIndex) return 0.0f; + int const initOffset = ev.byteOffset / sizeof(float); + // Prepare Links - links[0].srcMemType = links[1].dstMemType = srcMemType; - links[0].srcIndex = links[1].dstIndex = RemappedIndex(srcIndex, srcMemType); - links[0].dstMemType = links[1].srcMemType = dstMemType; - links[0].dstIndex = links[1].srcIndex = RemappedIndex(dstIndex, dstMemType); - // Either perform local read / remote write, or remote read / local write - links[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType); - links[0].exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), links[0].exeMemType); - links[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType); - links[1].exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), links[1].exeMemType); + std::vector links; + ExecutorInfo exeInfo[2]; + for (int i = 0; i < 2; i++) + { + exeInfo[i].links.resize(1); + exeInfo[i].streams.resize(1); + exeInfo[i].startEvents.resize(1); + exeInfo[i].stopEvents.resize(1); + links.push_back(&exeInfo[i].links[0]); + } + + links[0]->srcMemType = links[1]->dstMemType = srcMemType; + links[0]->dstMemType = links[1]->srcMemType = dstMemType; + links[0]->srcIndex = links[1]->dstIndex = RemappedIndex(srcIndex, srcMemType); + links[0]->dstIndex = links[1]->srcIndex = RemappedIndex(dstIndex, dstMemType); + + // Either perform (local read + remote write), or (remote read + local write) + links[0]->exeMemType = (readMode == 0 ? srcMemType : dstMemType); + links[1]->exeMemType = (readMode == 0 ? dstMemType : srcMemType); + links[0]->exeIndex = RemappedIndex((readMode == 0 ? srcIndex : dstIndex), links[0]->exeMemType); + links[1]->exeIndex = RemappedIndex((readMode == 0 ? dstIndex : srcIndex), links[1]->exeMemType); for (int i = 0; i <= isBidirectional; i++) { - AllocateMemory(links[i].srcMemType, links[i].srcIndex, N * sizeof(float) + ev.byteOffset, &links[i].srcMem); - AllocateMemory(links[i].dstMemType, links[i].dstIndex, N * sizeof(float) + ev.byteOffset, &links[i].dstMem); - links[i].totalTime = 0.0; + AllocateMemory(links[i]->srcMemType, links[i]->srcIndex, + N * sizeof(float) + ev.byteOffset, (void**)&links[i]->srcMem); + AllocateMemory(links[i]->dstMemType, links[i]->dstIndex, + N * sizeof(float) + ev.byteOffset, (void**)&links[i]->dstMem); - CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].srcMem + initOffset); - if (links[i].exeMemType == MEM_GPU) + // Prepare block parameters on CPU + links[i]->numBlocksToUse = (links[i]->exeMemType == MEM_GPU) ? numBlocksToUse : ev.numCpuPerLink; + links[i]->blockParam.resize(links[i]->numBlocksToUse); + links[i]->PrepareBlockParams(ev, N); + + if (links[i]->exeMemType == MEM_GPU) { - HIP_CALL(hipDeviceGetAttribute(&links[i].numBlocksToUse, hipDeviceAttributeMultiprocessorCount, links[i].exeIndex)); - HIP_CALL(hipSetDevice(links[i].exeIndex)); - HIP_CALL(hipEventCreate(&links[i].startEvent)); - HIP_CALL(hipEventCreate(&links[i].stopEvent)); - HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * links[i].numBlocksToUse)); - HIP_CALL(hipStreamCreate(&links[i].stream)); + // Copy block parameters onto GPU + AllocateMemory(MEM_GPU, links[i]->exeIndex, numBlocksToUse * sizeof(BlockParam), + (void **)&links[i]->blockParamGpuPtr); + HIP_CALL(hipMemcpy(links[i]->blockParamGpuPtr, + links[i]->blockParam.data(), + numBlocksToUse * sizeof(BlockParam), + hipMemcpyHostToDevice)); - size_t assigned = 0; - int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse); - for (int j = 0; j < links[i].numBlocksToUse; j++) - { - BlockParam param; - int blocksLeft = std::max(0, maxNumBlocksToUse - j); - size_t leftover = N - assigned; - size_t roundedN = (leftover + 31) / 32; - param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; - param.src = links[i].srcMem + assigned + initOffset; - param.dst = links[i].dstMem + assigned + initOffset; - assigned += param.N; - - HIP_CALL(hipMemcpy(&links[i].blockParam[j], ¶m, sizeof(BlockParam), hipMemcpyHostToDevice)); - } - } - else - { - links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam)); - // For CPU-based copy, divded based on the number of child threads - size_t assigned = 0; - int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink); - for (int j = 0; j < ev.numCpuPerLink; j++) - { - int blocksLeft = std::max(0, maxNumBlocksToUse - j); - size_t leftover = N - assigned; - size_t roundedN = (leftover + 31) / 32; - links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; - links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset; - links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset; - assigned += links[i].blockParam[j].N; - } + // Prepare GPU resources + HIP_CALL(hipSetDevice(links[i]->exeIndex)); + HIP_CALL(hipStreamCreate(&exeInfo[i].streams[0])); + HIP_CALL(hipEventCreate(&exeInfo[i].startEvents[0])); + HIP_CALL(hipEventCreate(&exeInfo[i].stopEvents[0])); } } @@ -1309,7 +1256,7 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, { // Perform timed iterations for (int i = 0; i <= isBidirectional; i++) - threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i]))); + threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(exeInfo[i]), 0)); // Wait for all threads to finish for (int i = 0; i <= isBidirectional; i++) @@ -1321,13 +1268,13 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, // Validate that each link has transferred correctly for (int i = 0; i <= isBidirectional; i++) - CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].dstMem + initOffset); + CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i]->dstMem + initOffset); // Collect aggregate bandwidth double totalBandwidth = 0; for (int i = 0; i <= isBidirectional; i++) { - double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations); + double linkDurationMsec = links[i]->linkTime / (1.0 * ev.numIterations); double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; totalBandwidth += linkBandwidthGbs; } @@ -1335,20 +1282,76 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, // Release GPU memory for (int i = 0; i <= isBidirectional; i++) { - DeallocateMemory(links[i].srcMemType, links[i].srcMem); - DeallocateMemory(links[i].dstMemType, links[i].dstMem); + DeallocateMemory(links[i]->srcMemType, links[i]->srcMem); + DeallocateMemory(links[i]->dstMemType, links[i]->dstMem); - if (links[i].exeMemType == MEM_GPU) - { - HIP_CALL(hipEventDestroy(links[i].startEvent)); - HIP_CALL(hipEventDestroy(links[i].stopEvent)); - HIP_CALL(hipStreamDestroy(links[i].stream)); - HIP_CALL(hipFree(links[i].blockParam)); - } - else if (links[i].exeMemType == MEM_CPU) - { - free(links[i].blockParam); - } + if (links[i]->exeMemType == MEM_GPU) + { + DeallocateMemory(MEM_GPU, links[i]->blockParamGpuPtr); + HIP_CALL(hipStreamDestroy(exeInfo[i].streams[0])); + HIP_CALL(hipEventDestroy(exeInfo[i].startEvents[0])); + HIP_CALL(hipEventDestroy(exeInfo[i].stopEvents[0])); + } } return totalBandwidth; } + +void Link::PrepareBlockParams(EnvVars const& ev, size_t const N) +{ + int const initOffset = ev.byteOffset / sizeof(float); + + // Initialize source memory with patterned data + CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, this->srcMem + initOffset); + + // Each block needs to know src/dst pointers and how many elements to transfer + // Figure out the sub-array each block does for this Link + // - Partition N as evenly as possible, but try to keep blocks as multiples of BLOCK_BYTES bytes, + // except the very last one, for alignment reasons + int const targetMultiple = ev.blockBytes / sizeof(float); + int const maxNumBlocksToUse = std::min((N + targetMultiple - 1) / targetMultiple, this->blockParam.size()); + size_t assigned = 0; + for (int j = 0; j < this->blockParam.size(); j++) + { + int const blocksLeft = std::max(0, maxNumBlocksToUse - j); + size_t const leftover = N - assigned; + size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple; + + BlockParam& param = this->blockParam[j]; + param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * targetMultiple)) : 0; + param.src = this->srcMem + assigned + initOffset; + param.dst = this->dstMem + assigned + initOffset; + param.startCycle = 0; + param.stopCycle = 0; + assigned += param.N; + } + + this->linkTime = 0.0; +} + +// NOTE: This is a stop-gap solution until HIP provides wallclock values +int GetWallClockRate(int deviceId) +{ + static std::vector wallClockPerDeviceMhz; + + if (wallClockPerDeviceMhz.size() == 0) + { + int numGpuDevices; + HIP_CALL(hipGetDeviceCount(&numGpuDevices)); + wallClockPerDeviceMhz.resize(numGpuDevices); + + hipDeviceProp_t prop; + for (int i = 0; i < numGpuDevices; i++) + { + HIP_CALL(hipGetDeviceProperties(&prop, i)); + int value = 25000; + switch (prop.gcnArch) + { + case 906: case 910: value = 25000; break; + default: + printf("Unrecognized GCN arch %d\n", prop.gcnArch); + } + wallClockPerDeviceMhz[i] = value; + } + } + return wallClockPerDeviceMhz[deviceId]; +} diff --git a/projects/rccl/tools/TransferBench/TransferBench.hpp b/projects/rccl/tools/TransferBench/TransferBench.hpp index bec6a50b52..6c801ebccc 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.hpp +++ b/projects/rccl/tools/TransferBench/TransferBench.hpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -49,15 +49,19 @@ THE SOFTWARE. } \ } while (0) +// Simple configuration parameters +size_t const DEFAULT_BYTES_PER_LINK = (1<<26); // Amount of data transferred per Link + // Different src/dst memory types supported typedef enum { - MEM_CPU = 0, // Pinned CPU memory + MEM_CPU = 0, // Coarse-grained pinned CPU memory MEM_GPU = 1, // Coarse-grained global GPU memory - MEM_GPU_FINE = 2 // Fine-grained global GPU memory + MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory + MEM_GPU_FINE = 3 // Fine-grained global GPU memory } MemType; -char const MemTypeStr[4] = "CGF"; +char const MemTypeStr[5] = "CGBF"; typedef enum { @@ -68,55 +72,99 @@ typedef enum // Each threadblock copies N floats from src to dst struct BlockParam { - int N; - float* src; - float* dst; + int N; + float* src; + float* dst; + long long startCycle; + long long stopCycle; }; -// Each Link is a uni-direction operation from a src memory to dst memory executed by a specific GPU +// Each Link is a uni-direction operation from a src memory to dst memory struct Link { + int linkIndex; // Link identifier + // Link config - MemType exeMemType; // Link executor type (CPU or GPU) - int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) - MemType srcMemType; // Source memory type - int srcIndex; // Source device index - MemType dstMemType; // Destination memory type - int dstIndex; // Destination device index - int numBlocksToUse; // Number of threadblocks to use for this Link + MemType exeMemType; // Link executor type (CPU or GPU) + int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) + MemType srcMemType; // Source memory type + int srcIndex; // Source device index + MemType dstMemType; // Destination memory type + int dstIndex; // Destination device index + int numBlocksToUse; // Number of threadblocks to use for this Link - // Link implementation - float* srcMem; // Source memory - float* dstMem; // Destination memory + // Memory + float* srcMem; // Source memory + float* dstMem; // Destination memory - hipEvent_t startEvent; - hipEvent_t stopEvent; - hipStream_t stream; - BlockParam* blockParam; + // How memory is split across threadblocks / CPU cores + std::vector blockParam; + BlockParam* blockParamGpuPtr; + // Results + double linkTime; + + // Prepares src memory and how to divide N elements across threadblocks/threads + void PrepareBlockParams(EnvVars const& ev, size_t const N); +}; + +typedef std::pair Executor; + +struct ExecutorInfo +{ + std::vector links; // Links to execute + + // For GPU-Executors + int totalBlocks; // Total number of CUs/CPU threads to use + BlockParam* blockParamGpu; // Copy of block parameters in GPU device memory + std::vector streams; + std::vector startEvents; + std::vector stopEvents; + + // Results double totalTime; }; -void DisplayUsage(char const* cmdName); // Display usage instructions -void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file -void DisplayTopology(); // Display GPU topology -void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, std::vector& valuesofN); -void ParseMemType(std::string const& token, int const numCpus, int const numGpus, MemType* memType, int* memIndex); -void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links); // Parse Link information +typedef std::map LinkMap; + +// Display usage instructions +void DisplayUsage(char const* cmdName); + +// Display detected GPU topology / CPU numa nodes +void DisplayTopology(bool const outputToCsv); + +// Build array of test sizes based on sampling factor +void PopulateTestSizes(size_t const numBytesPerLink, int const samplingFactor, + std::vector& valuesofN); + +void ParseMemType(std::string const& token, int const numCpus, int const numGpus, + MemType* memType, int* memIndex); + +void ParseLinks(char* line, int numCpus, int numGpus, + LinkMap& linkMap); + void EnablePeerAccess(int const deviceId, int const peerDeviceId); -void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr); -void DeallocateMemory(MemType memType, float* memPtr); +void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); +void DeallocateMemory(MemType memType, void* memPtr); void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector const& fillPattern, float* ptr); -void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link); +void RunLink(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const linkIdx); void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); -double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, - MemType srcMemType, int srcIndex, - MemType dstMemType, int dstIndex, - int readMode); + +// Return the maximum bandwidth measured for given (src/dst) pair +double GetPeakBandwidth(EnvVars const& ev, + size_t const N, + int const isBidirectional, + int const readMode, + int const numBlocksToUse, + MemType const srcMemType, + int const srcIndex, + MemType const dstMemType, + int const dstIndex); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetDesc(MemType srcMemType, int srcIndex, MemType dstMemType, int dstIndex); std::string GetLinkDesc(Link const& link); int RemappedIndex(int const origIdx, MemType const memType); +int GetWallClockRate(int deviceId); diff --git a/projects/rccl/tools/TransferBench/example.cfg b/projects/rccl/tools/TransferBench/example.cfg index 2d546f433b..ab0d1c2c08 100644 --- a/projects/rccl/tools/TransferBench/example.cfg +++ b/projects/rccl/tools/TransferBench/example.cfg @@ -1,6 +1,7 @@ -# Configfile Format: +# ConfigFile Format: # ================== -# A Link is defined as a uni-directional transfer from src memory location to dst memory location executed by either CPU or GPU +# A Link is defined as a uni-directional transfer from src memory location to dst memory location +# executed by either CPU or GPU # Each single line in the configuration file defines a set of Links to run in parallel # There are two ways to specify the configuration file: @@ -25,19 +26,22 @@ # - G: GPU-executed (Indexed from 0 to 3) # dstMemL : Destination memory location (Where the data is to be written to) -# Memory locations are specified by a character indicating memory type, followed by device index (0-indexed) +# Memory locations are specified by a character indicating memory type, +# followed by device index (0-indexed) # Supported memory locations are: -# - C: Pinned host memory (on NUMA node, indexed from 0 to 1) -# - G: Global device memory (on GPU device indexed from 0 to 3) -# - F: Fine-grain device memory (on GPU device indexed from 0 to 3) +# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1]) +# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1]) +# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1]) +# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1]) # Examples: -# 1 4 (G0->G0->G1) Single Link that uses 4 CUs on GPU 0 that reads memory from GPU 0 and copies it to memory on GPU 1 -# 1 4 (G1->C0->G0) Single Link that uses 4 CUs on GPU 0 that reads memory from CPU 1 and copies it to memory on GPU 0 -# 1 4 (C0->G2->G2) Single Link that uses 4 CUs on GPU 2 that reads memory from CPU 0 and copies it to memory on GPU 2 -# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU 0 - > GPU1, and GP1 -> GPU 0, each with 4 CUs -# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU 0 - > GPU 1 using four CUs, and GPU1 -> GPU 0 using two CUs +# 1 4 (G0->G0->G1) Single link using 4 CUs on GPU0 to copy from GPU0 to GPU1 +# 1 4 (C1->G2->G0) Single link using 4 CUs on GPU2 to copy from CPU1 to GPU0 +# 2 4 G0->G0->G1 G1->G1->G0 Runs 2 Links in parallel. GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs +# -2 (G0 G0 G1 4) (G1 G1 G0 2) Runs 2 Links in parallel. GPU0 to GPU1 with 4 CUs, and GPU1 to GPU0 with 2 CUs + # Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary +# Lines starting with # will be ignored. Lines starting with ## will be echoed to output # Single GPU-executed link between GPUs 0 and 1 using 4 CUs 1 4 (G0->G0->G1)