[TransferBench] Adding shared memory per threadblock env var. Defaulting to 1 threadblock per CU (#436)
Αυτή η υποβολή περιλαμβάνεται σε:
υποβλήθηκε από
GitHub
γονέας
2249a1d9d3
υποβολή
269f07fbc3
@@ -27,11 +27,17 @@ public:
|
||||
int numIterations; // Number of timed iterations to perform
|
||||
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
|
||||
int numCpuPerLink; // Number of CPU child threads to use per CPU link
|
||||
int sharedMemBytes; // Amount of shared memory to use per threadblock
|
||||
|
||||
std::vector<float> fillPattern; // Pattern of floats used to fill source data
|
||||
|
||||
// Constructor that collects values
|
||||
EnvVars()
|
||||
{
|
||||
int maxSharedMemBytes = 0;
|
||||
hipDeviceGetAttribute(&maxSharedMemBytes,
|
||||
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
|
||||
|
||||
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
|
||||
useMemset = GetEnvVar("USE_MEMSET" , 0);
|
||||
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0);
|
||||
@@ -44,6 +50,7 @@ public:
|
||||
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
|
||||
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
|
||||
numCpuPerLink = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
|
||||
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
|
||||
|
||||
// Check for fill pattern
|
||||
char* pattern = getenv("FILL_PATTERN");
|
||||
@@ -129,6 +136,11 @@ public:
|
||||
printf("[ERROR] NUM_CPU_PER_LINK must be greater or equal to 1\n");
|
||||
exit(1);
|
||||
}
|
||||
if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
|
||||
{
|
||||
printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Display info on the env vars that can be used
|
||||
@@ -149,6 +161,7 @@ public:
|
||||
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
|
||||
printf(" NUM_CPU_PER_LINK=C - Use C threads per Link for CPU-executed copies\n");
|
||||
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
|
||||
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
|
||||
}
|
||||
|
||||
// Display env var settings
|
||||
@@ -182,7 +195,7 @@ public:
|
||||
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
|
||||
printf("%-20s = %12d : Running %d timed iteration(s) per topology\n", "NUM_ITERATIONS", numIterations, numIterations);
|
||||
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Link\n", "NUM_CPU_PER_LINK", numCpuPerLink, numCpuPerLink);
|
||||
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unspecified)");
|
||||
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
|
||||
if (fillPattern.size())
|
||||
{
|
||||
printf("Pattern: %s", getenv("FILL_PATTERN"));
|
||||
@@ -192,6 +205,9 @@ public:
|
||||
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
|
||||
}
|
||||
printf("\n");
|
||||
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
|
||||
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unspecified)", sharedMemBytes);
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1027,7 +1027,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
|
||||
hipExtLaunchKernelGGL(ev.useMemset ? GpuMemsetKernel : GpuCopyKernel,
|
||||
dim3(link.numBlocksToUse, 1, 1),
|
||||
dim3(BLOCKSIZE, 1, 1),
|
||||
0, link.stream,
|
||||
ev.sharedMemBytes, link.stream,
|
||||
(ev.combineTiming && recordStart) ? link.startEvent : NULL,
|
||||
(ev.combineTiming && recordStop) ? link.stopEvent : NULL,
|
||||
0, link.blockParam);
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user