TransferBench p2p benchmark mode (#444)

* [TransferBench] Adding a p2p benchmark mode
* [TransferBench] Switching to using single sync mode by default (USE_SINGLE_SYNC=1)

[ROCm/rccl commit: 550d732d6c]
This commit is contained in:
gilbertlee-amd
2021-10-21 15:28:16 -06:00
committed by GitHub
szülő aac9d57633
commit b795cc090b
3 fájl változott, egészen pontosan 202 új sor hozzáadva és 20 régi sor törölve
@@ -40,7 +40,7 @@ public:
useHipCall = GetEnvVar("USE_HIP_CALL" , 0);
useMemset = GetEnvVar("USE_MEMSET" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0);
useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
combineTiming = GetEnvVar("COMBINE_TIMING" , 0);
showAddr = GetEnvVar("SHOW_ADDR" , 0);
@@ -50,24 +50,8 @@ int main(int argc, char **argv)
exit(0);
}
// Check that Link configuration file can be opened
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
exit(1);
}
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Collect environment variables / display current run configuration
EnvVars ev;
ev.DisplayEnvVars();
// Determine number of bytes to run per Link
// If a non-zero number of bytes is specified, use it
@@ -92,6 +76,30 @@ int main(int argc, char **argv)
for (auto N : valuesOfN)
maxN = std::max(maxN, N);
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
if (!strcmp(argv[1], "p2p"))
{
// Execute peer to peer benchmark mode
RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float));
exit(0);
}
// Check that Link configuration file can be opened
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]);
exit(1);
}
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
ev.DisplayEnvVars();
int const initOffset = ev.byteOffset / sizeof(float);
std::stack<std::thread> threads;
@@ -381,6 +389,7 @@ void DisplayUsage(char const* cmdName)
printf("Usage: %s configFile <N>\n", cmdName);
printf(" configFile: File containing Links to execute (see below for format)\n");
printf(" Specifying \"p2p\" as the configFile will execute a peer to peer benchmark\n");
printf(" N : (Optional) Number of bytes to transfer per link.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
@@ -880,8 +889,8 @@ void CheckPages(char* array, size_t numBytes, int targetId)
if (mistakeCount > 0)
{
printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
// NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now
// exit(1);
printf("[ERROR] Ensure up-to-date ROCm is installed\n");
exit(1);
}
}
@@ -1086,3 +1095,173 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
link.totalTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
}
}
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{
// Collect the number of available CPUs/GPUs on this machine
int numGpus;
HIP_CALL(hipGetDeviceCount(&numGpus));
int const numCpus = numa_num_configured_nodes();
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float));
printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink);
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
// Print header
printf("%sdirectional copy peak bandwidth GB/s\n", isBidirectional ? "Bi" : "Uni");
printf("%10s", "D/D");
for (int i = 0; i < numCpus; i++)
printf("%7s %02d", "CPU", i);
for (int i = 0; i < numGpus; i++)
printf("%7s %02d", "GPU", i);
printf("\n");
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU);
int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus);
printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
for (int dst = 0; dst < numDevices; dst++)
{
MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus);
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex);
if (bandwidth == 0)
printf("%10s", "N/A");
else
printf("%10.2f", bandwidth);
fflush(stdout);
}
printf("\n");
}
printf("\n");
}
}
double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex)
{
Link links[2];
int const initOffset = ev.byteOffset / sizeof(float);
// Skip bidirectional on same device
if (isBidirectional && srcMemType == dstMemType && srcIndex == dstIndex) return 0.0f;
// Prepare Links
links[0].srcMemType = links[0].exeMemType = links[1].dstMemType = srcMemType;
links[0].srcIndex = links[0].exeIndex = links[1].dstIndex = srcIndex;
links[0].dstMemType = links[1].exeMemType = links[1].srcMemType = dstMemType;
links[0].dstIndex = links[1].exeIndex = links[1].srcIndex = dstIndex;
for (int i = 0; i <= isBidirectional; i++)
{
AllocateMemory(links[i].srcMemType, links[i].srcIndex, N * sizeof(float) + ev.byteOffset, &links[i].srcMem);
AllocateMemory(links[i].dstMemType, links[i].dstIndex, N * sizeof(float) + ev.byteOffset, &links[i].dstMem);
links[i].totalTime = 0.0;
CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].srcMem + initOffset);
if (links[i].exeMemType == MEM_GPU)
{
HIP_CALL(hipDeviceGetAttribute(&links[i].numBlocksToUse, hipDeviceAttributeMultiprocessorCount, links[i].exeIndex));
HIP_CALL(hipSetDevice(links[i].exeIndex));
HIP_CALL(hipEventCreate(&links[i].startEvent));
HIP_CALL(hipEventCreate(&links[i].stopEvent));
HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * links[i].numBlocksToUse));
HIP_CALL(hipStreamCreate(&links[i].stream));
size_t assigned = 0;
int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse);
for (int j = 0; j < links[i].numBlocksToUse; j++)
{
BlockParam param;
int blocksLeft = std::max(0, maxNumBlocksToUse - j);
size_t leftover = N - assigned;
size_t roundedN = (leftover + 31) / 32;
param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
param.src = links[i].srcMem + assigned + initOffset;
param.dst = links[i].dstMem + assigned + initOffset;
assigned += param.N;
HIP_CALL(hipMemcpy(&links[i].blockParam[j], &param, sizeof(BlockParam), hipMemcpyHostToDevice));
}
}
else
{
links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam));
// For CPU-based copy, divded based on the number of child threads
size_t assigned = 0;
int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink);
for (int j = 0; j < ev.numCpuPerLink; j++)
{
int blocksLeft = std::max(0, maxNumBlocksToUse - j);
size_t leftover = N - assigned;
size_t roundedN = (leftover + 31) / 32;
links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0;
links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset;
links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset;
assigned += links[i].blockParam[j].N;
}
}
}
std::stack<std::thread> threads;
// Perform iteration
for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++)
{
// Perform timed iterations
for (int i = 0; i <= isBidirectional; i++)
threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i])));
// Wait for all threads to finish
for (int i = 0; i <= isBidirectional; i++)
{
threads.top().join();
threads.pop();
}
}
// Validate that each link has transferred correctly
for (int i = 0; i <= isBidirectional; i++)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].dstMem + initOffset);
// Collect aggregate bandwidth
double totalBandwidth = 0;
for (int i = 0; i <= isBidirectional; i++)
{
double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations);
double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
totalBandwidth += linkBandwidthGbs;
}
// Release GPU memory
for (int i = 0; i <= isBidirectional; i++)
{
DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
if (links[i].exeMemType == MEM_GPU)
{
HIP_CALL(hipEventDestroy(links[i].startEvent));
HIP_CALL(hipEventDestroy(links[i].stopEvent));
HIP_CALL(hipStreamDestroy(links[i].stream));
HIP_CALL(hipFree(links[i].blockParam));
}
else if (links[i].exeMemType == MEM_CPU)
{
free(links[i].blockParam);
}
}
return totalBandwidth;
}
@@ -122,7 +122,10 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,