From 269f07fbc3a36db2d6ce4fb4175d7bab77ea7951 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Tue, 12 Oct 2021 09:32:54 -0600
Subject: [PATCH] [TransferBench] Adding shared memory per threadblock env var.
 Defaulting to 1 threadblock per CU (#436)

---
Review notes (text between the "---" marker and the diffstat is not part of
the commit message):

* SHARED_MEM_BYTES is stored in EnvVars::sharedMemBytes and forwarded as the
  shared-memory argument of the hipExtLaunchKernelGGL call in RunLink(),
  replacing the previous constant 0.
* The upper bound is read with hipDeviceGetAttribute using
  hipDeviceAttributeMaxSharedMemoryPerMultiprocessor and a device argument of
  0. The return value of that call is not checked, so the bound stays 0 if the
  query fails.
* The default is maxSharedMemBytes / 2 + 1, i.e. more than half of the queried
  maximum, which matches the "1 threadblock per CU" intent in the subject.
* Revised relative to the original submission: the SHARED_MEM_BYTES settings
  line prints "(unset)" instead of "(unspecified)" (13 characters, which
  overflows the %12s column and disagrees with the FILL_PATTERN line changed
  in this same patch), and it names the unit ("shared mem bytes") the same way
  the usage text does.
* Indentation and column alignment of the context lines were reconstructed.
  If they do not match the tree byte-for-byte, apply with
  `git am --ignore-whitespace`.

 tools/TransferBench/EnvVars.hpp       | 18 +++++++++++++++++-
 tools/TransferBench/TransferBench.cpp |  2 +-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tools/TransferBench/EnvVars.hpp b/tools/TransferBench/EnvVars.hpp
index 2f92185be3..2e430c9f65 100644
--- a/tools/TransferBench/EnvVars.hpp
+++ b/tools/TransferBench/EnvVars.hpp
@@ -27,11 +27,17 @@ public:
   int numIterations;   // Number of timed iterations to perform
   int samplingFactor;  // Affects how many different values of N are generated (when N set to 0)
   int numCpuPerLink;   // Number of CPU child threads to use per CPU link
+  int sharedMemBytes;  // Amount of shared memory to use per threadblock
+
   std::vector<float> fillPattern; // Pattern of floats used to fill source data

   // Constructor that collects values
   EnvVars()
   {
+    int maxSharedMemBytes = 0; // Stays 0 if the query below fails (its return value is not checked)
+    hipDeviceGetAttribute(&maxSharedMemBytes,
+                          hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
+
     useHipCall     = GetEnvVar("USE_HIP_CALL"     , 0);
     useMemset      = GetEnvVar("USE_MEMSET"       , 0);
     useSingleSync  = GetEnvVar("USE_SINGLE_SYNC"  , 0);
@@ -44,6 +50,7 @@ public:
     numIterations  = GetEnvVar("NUM_ITERATIONS"   , DEFAULT_NUM_ITERATIONS);
     samplingFactor = GetEnvVar("SAMPLING_FACTOR"  , DEFAULT_SAMPLING_FACTOR);
     numCpuPerLink  = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
+    sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1); // Default: just over half of the queried maximum

     // Check for fill pattern
     char* pattern = getenv("FILL_PATTERN");
@@ -129,6 +136,11 @@ public:
       printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
       exit(1);
     }
+    if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
+    {
+      printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
+      exit(1);
+    }
   }

   // Display info on the env vars that can be used
@@ -149,6 +161,7 @@ public:
     printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
     printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
     printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
+    printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
   }

   // Display env var settings
@@ -182,7 +195,7 @@ public:
     printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
     printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
     printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
-    printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unspecified)");
+    printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
     if (fillPattern.size())
     {
       printf("Pattern: %s", getenv("FILL_PATTERN"));
@@ -192,6 +205,9 @@ public:
       printf("Pseudo-random: (Element i = i modulo 383 + 31)");
     }
     printf("\n");
+    printf("%-20s = %12s : Using %d shared mem bytes per threadblock\n", "SHARED_MEM_BYTES",
+           getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
+    printf("\n");
   }
 };

diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index d7b373ac8f..4ab2aad2dd 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -1027,7 +1027,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
       hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
                             dim3(link.numBlocksToUse, 1, 1),
                             dim3(BLOCKSIZE, 1, 1),
-                            0, link.stream,
+                            ev.sharedMemBytes, link.stream,
                             (ev.combineTiming && recordStart) ? link.startEvent : NULL,
                             (ev.combineTiming && recordStop) ? link.stopEvent : NULL,
                             0, link.blockParam);