From 51d64894ff0733d2e83121bf88839c6079940f70 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Tue, 7 Sep 2021 15:28:16 -0600 Subject: [PATCH] [TransferBench] ConfigFile parsing fixes, adding additional info (#422) * [TransferBench] Adding GPU to NUMA distance detection, parsing fixes, config file generation fix * [TransferBench] Fixing up NUMA node detection by filtering pools --- tools/TransferBench/GetClosestNumaNode.hpp | 143 +++++++++++++++++++++ tools/TransferBench/Makefile | 2 +- tools/TransferBench/TransferBench.cpp | 29 +++-- 3 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 tools/TransferBench/GetClosestNumaNode.hpp diff --git a/tools/TransferBench/GetClosestNumaNode.hpp b/tools/TransferBench/GetClosestNumaNode.hpp new file mode 100644 index 0000000000..40bfda07a5 --- /dev/null +++ b/tools/TransferBench/GetClosestNumaNode.hpp @@ -0,0 +1,143 @@ +/* +Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+// Helper macro for checking HSA calls: on any status other than
+// HSA_STATUS_SUCCESS it reports the failing status with its source location
+// on stderr and terminates the process with exit(-1).
+#define HSA_CHECK(cmd)                                                      \
+  do {                                                                      \
+    hsa_status_t error = (cmd);                                             \
+    if (error != HSA_STATUS_SUCCESS) {                                      \
+      const char* errString = nullptr;                                      \
+      /* Never stream a null pointer if the status lookup itself fails */   \
+      if (hsa_status_string(error, &errString) != HSA_STATUS_SUCCESS ||     \
+          errString == nullptr)                                             \
+        errString = "unknown";                                              \
+      std::cerr << "Encountered HSA error (" << errString << ") at line "   \
+                << __LINE__ << " in file " << __FILE__ << "\n";             \
+      exit(-1);                                                             \
+    }                                                                       \
+  } while (0)
+
+// Structure to hold HSA agent information
+struct AgentData
+{
+  bool isInitialized;                  // Set once the agent lists have been collected
+  std::vector<hsa_agent_t> cpuAgents;  // CPU agents, in HSA enumeration order
+  std::vector<hsa_agent_t> gpuAgents;  // GPU agents, in HSA enumeration order
+  std::vector<int> closestNumaNode;    // Per GPU: index into cpuAgents of the closest CPU agent
+};
+
+// Memory pool callback: stores a coarse-grained pool of the agent in *data
+// (data must point to a hsa_amd_memory_pool_t). Pools that are not
+// coarse-grained are ignored; if several qualify, the last one visited wins.
+// *data is left untouched when no pool qualifies.
+inline hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
+{
+  hsa_amd_memory_pool_t* poolData = static_cast<hsa_amd_memory_pool_t*>(data);
+
+  // Check memory pool flags
+  uint32_t poolFlags = 0;
+  HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags));
+
+  // Only consider coarse-grained pools
+  if (poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
+    *poolData = pool;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Agent callback: files each agent into the CPU or GPU list of the AgentData
+// passed through data. Every GPU also receives a closestNumaNode entry,
+// initially 0. Agents of any other device type are ignored.
+inline hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
+{
+  AgentData* agentData = static_cast<AgentData*>(data);
+
+  // Get the device type
+  hsa_device_type_t deviceType;
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType));
+  if (deviceType == HSA_DEVICE_TYPE_CPU)
+  {
+    agentData->cpuAgents.push_back(agent);
+  }
+  else if (deviceType == HSA_DEVICE_TYPE_GPU)
+  {
+    agentData->gpuAgents.push_back(agent);
+    agentData->closestNumaNode.push_back(0);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Returns the lazily built agent table. The first call enumerates all HSA
+// agents and, for each GPU, selects the CPU agent with the smallest summed
+// NUMA distance over the links to that GPU's coarse-grained memory pool.
+// Later calls return the cached result.
+// NOTE(review): no hsa_init() call is made here; presumably the HSA runtime
+// has already been initialized by the caller (e.g. through HIP) - confirm.
+inline AgentData& GetAgentData()
+{
+  static AgentData agentData = {};
+  if (agentData.isInitialized) return agentData;
+  agentData.isInitialized = true;
+
+  // Add all detected agents to the list
+  HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
+
+  // Loop over each GPU
+  for (size_t i = 0; i < agentData.gpuAgents.size(); i++)
+  {
+    // Collect memory pool; the handle stays 0 if the callback never stores one
+    hsa_amd_memory_pool_t pool = {};
+    HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
+    if (pool.handle == 0)
+    {
+      // Without a pool there is nothing to measure against; keep the default
+      std::cerr << "[WARN] No coarse-grained memory pool found for GPU " << i
+                << "; closest NUMA node defaults to 0\n";
+      continue;
+    }
+
+    // Loop over each CPU agent and check distance
+    int bestDistance = -1;
+    for (size_t j = 0; j < agentData.cpuAgents.size(); j++)
+    {
+      // Determine number of hops from GPU memory pool to CPU agent
+      uint32_t hops = 0;
+      HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                   pool,
+                                                   HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
+                                                   &hops));
+
+      // Gather link info, one entry per hop. The query is skipped when there
+      // are no hops so that it is never handed a zero-sized buffer.
+      std::vector<hsa_amd_memory_pool_link_info_t> linkInfo(hops);
+      if (hops > 0)
+      {
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
+                                                     linkInfo.data()));
+      }
+
+      int numaDist = 0;
+      for (hsa_amd_memory_pool_link_info_t const& link : linkInfo)
+        numaDist += link.numa_distance;
+
+      // Strict '<' keeps the lowest-indexed CPU agent when distances tie
+      if (bestDistance == -1 || numaDist < bestDistance)
+      {
+        // The CPU agent's position in cpuAgents is what gets reported as the NUMA node
+        agentData.closestNumaNode[i] = static_cast<int>(j);
+        bestDistance = numaDist;
+      }
+    }
+  }
+  return agentData;
+}
+
+// Returns closest CPU NUMA node to provided GPU
+// Prints an error and exits with status 1 if gpuIdx is not a valid GPU index.
+// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
+inline int GetClosestNumaNode(int gpuIdx)
+{
+  AgentData const& agentData = GetAgentData();
+  if (gpuIdx < 0 || static_cast<size_t>(gpuIdx) >= agentData.closestNumaNode.size())
+  {
+    printf("[ERROR] GPU index (%d) is out of bounds\n", gpuIdx);
+    exit(1);
+  }
+  return agentData.closestNumaNode[gpuIdx];
+}
diff --git a/tools/TransferBench/Makefile b/tools/TransferBench/Makefile
index c51bd47b4c..62d1046fb7 100644
--- a/tools/TransferBench/Makefile
+++ b/tools/TransferBench/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc
EXE=TransferBench -CXXFLAGS = -O3 -I../../src/include -I. -lnuma +CXXFLAGS = -O3 -I../../src/include -I. -lnuma -L$(HIP_PATH)/../hsa/lib -lhsa-runtime64 all: $(EXE) diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp index 899c71fc55..6fac7df90a 100644 --- a/tools/TransferBench/TransferBench.cpp +++ b/tools/TransferBench/TransferBench.cpp @@ -24,6 +24,7 @@ THE SOFTWARE. // on the same node #include "TransferBench.hpp" +#include "GetClosestNumaNode.hpp" #include #include #include @@ -270,10 +271,12 @@ int main(int argc, char **argv) // Report timings totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000; double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime; + double maxGpuTime = 0; for (int i = 0; i < numLinks; i++) { double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations); double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; + maxGpuTime = std::max(maxGpuTime, linkDurationMsec); if (!ev.outputToCsv) { printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s", @@ -310,7 +313,8 @@ int main(int argc, char **argv) // Display aggregate statistics if (!ev.outputToCsv) { - printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime); + printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime, + totalCpuTime - maxGpuTime); } else { @@ -514,7 +518,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) fprintf(fp, "# GPU 0 Gather\n"); fprintf(fp, "%d %d", numGpuDevices-1, numBlocks); for (int i = 1; i < numGpuDevices; i++) - fprintf(fp, " (G%d->G%d->G%d)", 0, i, 0); + fprintf(fp, " (G%d->G%d->G%d)", i, 0, 0); fprintf(fp, "\n\n"); // Full stress test @@ -533,17 +537,16 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks) void DisplayTopology() { - printf("\nDetected topology:\n"); int numGpuDevices; 
HIP_CALL(hipGetDeviceCount(&numGpuDevices)); - + printf("\nDetected topology: %d CPU NUMA node(s) %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices); printf(" |"); for (int j = 0; j < numGpuDevices; j++) printf(" GPU %02d |", j); - printf(" PCIe Bus ID\n"); + printf(" PCIe Bus ID | Closest NUMA\n"); for (int j = 0; j <= numGpuDevices; j++) printf("--------+"); - printf("-------------\n"); + printf("--------------+-------------\n"); char pciBusId[20]; for (int i = 0; i < numGpuDevices; i++) @@ -567,7 +570,7 @@ void DisplayTopology() } } HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); - printf(" %s\n", pciBusId); + printf(" %11s | %d \n", pciBusId, GetClosestNumaNode(i)); } } @@ -670,7 +673,7 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) // Method 1: Take in triples (srcMem, exeMem, dstMem) int numBlocksToUse; iss >> numBlocksToUse; - if (numBlocksToUse <= 0) + if (numBlocksToUse <= 0 || iss.fail()) { printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse); exit(1); @@ -679,6 +682,11 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) for (int i = 0; i < numLinks; i++) { iss >> srcMem >> exeMem >> dstMem; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Link triplet (possibly missing a SRC or EXE or DST)\n"); + exit(1); + } ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex); ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex); @@ -699,6 +707,11 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector& links) for (int i = 0; i < numLinks; i++) { iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse; + if (iss.fail()) + { + printf("Parsing error: Unable to read valid Link quadruple (possibly missing a SRC or EXE or DST or #CU)\n"); + exit(1); + } ParseMemType(srcMem, numCpus, numGpus, 
&links[i].srcMemType, &links[i].srcIndex); ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex); ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);