[TransferBench] Adding shared memory per threadblock env var. Defaulting to 1 threadblock per CU (#436)

Αυτή η υποβολή περιλαμβάνεται σε:
gilbertlee-amd
2021-10-12 09:32:54 -06:00
υποβλήθηκε από GitHub
γονέας 2249a1d9d3
υποβολή 269f07fbc3
2 αρχεία άλλαξαν με 18 προσθήκες και 2 διαγραφές
@@ -27,11 +27,17 @@ public:
int numIterations; // Number of timed iterations to perform
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int numCpuPerLink; // Number of CPU child threads to use per CPU link
int sharedMemBytes; // Amount of shared memory to use per threadblock
std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Constructor that collects values
EnvVars()
{
int maxSharedMemBytes = 0;
hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
useMemset = GetEnvVar("USE_MEMSET" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0);
@@ -44,6 +50,7 @@ public:
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
@@ -129,6 +136,11 @@ public:
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
exit(1);
}
if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
{
printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
exit(1);
}
}
// Display info on the env vars that can be used
@@ -149,6 +161,7 @@ public:
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
}
// Display env var settings
@@ -182,7 +195,7 @@ public:
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unspecified)");
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
{
printf("Pattern: %s", getenv("FILL_PATTERN"));
@@ -192,6 +205,9 @@ public:
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
}
printf("\n");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unspecified)", sharedMemBytes);
printf("\n");
}
};
@@ -1027,7 +1027,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
dim3(link.numBlocksToUse, 1, 1),
dim3(BLOCKSIZE, 1, 1),
0, link.stream,
ev.sharedMemBytes, link.stream,
(ev.combineTiming && recordStart) ? link.startEvent : NULL,
(ev.combineTiming && recordStop) ? link.stopEvent : NULL,
0, link.blockParam);