From 51d64894ff0733d2e83121bf88839c6079940f70 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Tue, 7 Sep 2021 15:28:16 -0600
Subject: [PATCH] [TransferBench] ConfigFile parsing fixes, adding additional
 info (#422)

* [TransferBench] Adding GPU to NUMA distance detection, parsing fixes, config file generation fix

* [TransferBench] Fixing up NUMA node detection by filtering pools
---
 tools/TransferBench/GetClosestNumaNode.hpp | 143 +++++++++++++++++++++
 tools/TransferBench/Makefile               |   2 +-
 tools/TransferBench/TransferBench.cpp      |  29 +++--
 3 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 tools/TransferBench/GetClosestNumaNode.hpp
diff --git a/tools/TransferBench/GetClosestNumaNode.hpp b/tools/TransferBench/GetClosestNumaNode.hpp
new file mode 100644
index 0000000000..40bfda07a5
--- /dev/null
+++ b/tools/TransferBench/GetClosestNumaNode.hpp
@@ -0,0 +1,143 @@
+/*
+Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// Helper macro for checking HSA calls
+#define HSA_CHECK(cmd)                                                  \
+  do {                                                                  \
+    hsa_status_t error = (cmd);                                         \
+    if (error != HSA_STATUS_SUCCESS) {                                  \
+      const char* errString = NULL;                                     \
+      hsa_status_string(error, &errString);                             \
+      std::cerr << "Encountered HSA error (" << errString << ") at line " \
+                << __LINE__ << " in file " << __FILE__ << "\n";         \
+      exit(-1);                                                         \
+    }                                                                   \
+  } while (0)
+
+// Structure to hold HSA agent information (filled in by AgentInfoCallback / GetAgentData)
+struct AgentData
+{
+  bool isInitialized;                  // Set by GetAgentData() once it has started its one-time setup
+  std::vector<hsa_agent_t> cpuAgents;  // Agents reporting HSA_DEVICE_TYPE_CPU, in iteration order
+  std::vector<hsa_agent_t> gpuAgents;  // Agents reporting HSA_DEVICE_TYPE_GPU, in iteration order
+  std::vector<int> closestNumaNode;    // One entry per gpuAgents element: index into cpuAgents with smallest distance
+};
+
+// Memory-pool iteration callback: stores a coarse-grained pool into the hsa_amd_memory_pool_t that 'data' points to
+hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
+{
+  hsa_amd_memory_pool_t* const selected = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
+
+  // Read this pool's global flags
+  uint32_t flags;
+  HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags));
+
+  // Store coarse-grained pools only (a later match replaces an earlier one); others leave the output untouched
+  const bool isCoarseGrained = (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) != 0;
+  if (isCoarseGrained)
+    *selected = pool;
+  return HSA_STATUS_SUCCESS;
+}
+
+// Agent-iteration callback: files each CPU or GPU agent into the AgentData passed via 'data'
+hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
+{
+  AgentData* const collected = reinterpret_cast<AgentData*>(data);
+
+  // Ask the runtime what kind of device this agent is
+  hsa_device_type_t kind;
+  HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &kind));
+
+  if (kind == HSA_DEVICE_TYPE_CPU)
+    collected->cpuAgents.push_back(agent);
+  else if (kind == HSA_DEVICE_TYPE_GPU)
+  {
+    collected->gpuAgents.push_back(agent);
+    collected->closestNumaNode.push_back(0);  // default entry, one per GPU
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Builds (on first call only, without locking) the agent lists and each GPU's closest CPU agent index
+AgentData& GetAgentData()
+{
+  static AgentData agentData = {};
+
+  if (!agentData.isInitialized)
+  {
+    agentData.isInitialized = true;
+
+    // Add all detected agents to the list
+    HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
+
+    // Loop over each GPU
+    for (size_t i = 0; i < agentData.gpuAgents.size(); i++)
+    {
+      // Collect memory pool (zeroed first: MemPoolInfoCallback may never write it)
+      hsa_amd_memory_pool_t pool = {};
+      HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
+
+      // Loop over each CPU agent and check distance
+      int bestDistance = -1;
+      for (size_t j = 0; j < agentData.cpuAgents.size(); j++)
+      {
+        // Determine number of hops from GPU memory pool to CPU agent
+        uint32_t hops = 0;
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
+                                                     &hops));
+        // Gather link info (at least one zeroed entry, so the buffer is never NULL or empty)
+        std::vector<hsa_amd_memory_pool_link_info_t> linkInfo(hops ? hops : 1);
+        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
+                                                     pool,
+                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
+                                                     linkInfo.data()));
+        // Total NUMA distance is the sum over all hops
+        int numaDist = 0;
+        for (uint32_t k = 0; k < hops; k++)
+          numaDist += linkInfo[k].numa_distance;
+        // First CPU agent with the smallest distance wins; its index is stored as the NUMA node
+        // NOTE(review): assumes CPU agent order matches NUMA node numbering - confirm
+        if (bestDistance == -1 || numaDist < bestDistance)
+        {
+          agentData.closestNumaNode[i] = static_cast<int>(j);
+          bestDistance = numaDist;
+        }
+      }
+    }
+  }
+  return agentData;
+}
+
+// Returns closest CPU NUMA node to provided GPU
+// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
+int GetClosestNumaNode(int gpuIdx)
+{
+  AgentData& agentData = GetAgentData();
+  if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size())
+  {
+    printf("[ERROR] GPU index out is out of bounds\n");
+    exit(1);
+  }
+  return agentData.closestNumaNode[gpuIdx];
+}
diff --git a/tools/TransferBench/Makefile b/tools/TransferBench/Makefile
index c51bd47b4c..62d1046fb7 100644
--- a/tools/TransferBench/Makefile
+++ b/tools/TransferBench/Makefile
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc
 
 EXE=TransferBench
-CXXFLAGS = -O3 -I../../src/include -I. -lnuma
+CXXFLAGS = -O3 -I../../src/include -I. -lnuma -L$(HIP_PATH)/../hsa/lib -lhsa-runtime64
 
 all: $(EXE)
 
diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index 899c71fc55..6fac7df90a 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -24,6 +24,7 @@ THE SOFTWARE.
 // on the same node
 
 #include "TransferBench.hpp"
+#include "GetClosestNumaNode.hpp"
 #include <numa.h>
 #include <numaif.h>
 #include <stack>
@@ -270,10 +271,12 @@ int main(int argc, char **argv)
       // Report timings
       totalCpuTime = totalCpuTime / (1.0 * ev.numIterations) * 1000;
       double totalBandwidthGbs = (numLinks * N * sizeof(float) / 1.0E6) / totalCpuTime;
+      double maxGpuTime = 0;
       for (int i = 0; i < numLinks; i++)
       {
         double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations);
         double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
+        maxGpuTime = std::max(maxGpuTime, linkDurationMsec);
         if (!ev.outputToCsv)
         {
           printf(" Link %02d: %c%02d -> [%cPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s",
@@ -310,7 +313,8 @@ int main(int argc, char **argv)
       // Display aggregate statistics
       if (!ev.outputToCsv)
       {
-        printf(" Aggregate Bandwidth (CPU timed)    | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
+        printf(" Aggregate Bandwidth (CPU timed)    | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
+               totalCpuTime - maxGpuTime);
       }
       else
       {
@@ -514,7 +518,7 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fprintf(fp, "# GPU 0 Gather\n");
   fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
   for (int i = 1; i < numGpuDevices; i++)
-    fprintf(fp, " (G%d->G%d->G%d)", 0, i, 0);
+    fprintf(fp, " (G%d->G%d->G%d)", i, 0, 0);
   fprintf(fp, "\n\n");
 
   // Full stress test
@@ -533,17 +537,16 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
 
 void DisplayTopology()
 {
-  printf("\nDetected topology:\n");
   int numGpuDevices;
   HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-
+  printf("\nDetected topology: %d CPU NUMA node(s)   %d GPU device(s)\n", numa_num_configured_nodes(), numGpuDevices);
   printf("        |");
   for (int j = 0; j < numGpuDevices; j++)
     printf(" GPU %02d |", j);
-  printf(" PCIe Bus ID\n");
+  printf(" PCIe Bus ID  | Closest NUMA\n");
   for (int j = 0; j <= numGpuDevices; j++)
     printf("--------+");
-  printf("-------------\n");
+  printf("--------------+-------------\n");
 
   char pciBusId[20];
   for (int i = 0; i < numGpuDevices; i++)
@@ -567,7 +570,7 @@ void DisplayTopology()
       }
     }
     HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
-    printf(" %s\n", pciBusId);
+    printf(" %11s |  %d  \n", pciBusId, GetClosestNumaNode(i));
   }
 }
 
@@ -670,7 +673,7 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links)
     // Method 1: Take in triples (srcMem, exeMem, dstMem)
     int numBlocksToUse;
     iss >> numBlocksToUse;
-    if (numBlocksToUse <= 0)
+    if (numBlocksToUse <= 0 || iss.fail())
     {
       printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
       exit(1);
@@ -679,6 +682,11 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links)
     for (int i = 0; i < numLinks; i++)
     {
       iss >> srcMem >> exeMem >> dstMem;
+      if (iss.fail())
+      {
+        printf("Parsing error: Unable to read valid Link triplet (possibly missing a SRC or EXE or DST)\n");
+        exit(1);
+      }
       ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
       ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
       ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);
@@ -699,6 +707,11 @@ void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links)
     for (int i = 0; i < numLinks; i++)
     {
       iss >> srcMem >> exeMem >> dstMem >> links[i].numBlocksToUse;
+      if (iss.fail())
+      {
+        printf("Parsing error: Unable to read valid Link quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
+        exit(1);
+      }
       ParseMemType(srcMem, numCpus, numGpus, &links[i].srcMemType, &links[i].srcIndex);
       ParseMemType(exeMem, numCpus, numGpus, &links[i].exeMemType, &links[i].exeIndex);
       ParseMemType(dstMem, numCpus, numGpus, &links[i].dstMemType, &links[i].dstIndex);