From b2deea27f5ce7faff346c15fb5676ef6736f55c3 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Wed, 2 Feb 2022 08:51:41 -0700
Subject: [PATCH] TransferBench: Adding ability to reindex GPUs based on PCIe
 address (#494)

[ROCm/rccl commit: 84d5fce7ddb75448449ec116b56fc082e15dcd1f]
---
 projects/rccl/tools/TransferBench/EnvVars.hpp |  4 +
 .../tools/TransferBench/TransferBench.cpp     | 82 +++++++++++++++----
 .../tools/TransferBench/TransferBench.hpp     |  3 +-
 3 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/projects/rccl/tools/TransferBench/EnvVars.hpp b/projects/rccl/tools/TransferBench/EnvVars.hpp
index ee6254bc3a..a84a3af185 100644
--- a/projects/rccl/tools/TransferBench/EnvVars.hpp
+++ b/projects/rccl/tools/TransferBench/EnvVars.hpp
@@ -51,6 +51,7 @@ public:
   int numCpuPerLink;   // Number of CPU child threads to use per CPU link
   int sharedMemBytes;  // Amount of shared memory to use per threadblock
   int blockBytes;      // Each CU, except the last, gets a multiple of this many bytes to copy
+  int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
 
   std::vector<float> fillPattern; // Pattern of floats used to fill source data
 
@@ -75,6 +76,7 @@ public:
     numCpuPerLink   = GetEnvVar("NUM_CPU_PER_LINK" , DEFAULT_NUM_CPU_PER_LINK);
     sharedMemBytes  = GetEnvVar("SHARED_MEM_BYTES" , maxSharedMemBytes / 2 + 1);
     blockBytes      = GetEnvVar("BLOCK_BYTES"      , 256);
+    usePcieIndexing = GetEnvVar("USE_PCIE_INDEX"   , 0);
 
     // Check for fill pattern
     char* pattern = getenv("FILL_PATTERN");
@@ -192,6 +194,7 @@ public:
     printf(" FILL_PATTERN=STR   - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F).  Must be even number of digits, (byte-level big-endian)\n");
     printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
     printf(" BLOCK_BYTES=B      - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" USE_PCIE_INDEX     - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
   }
 
   // Display env var settings
@@ -238,6 +241,7 @@ public:
       printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
              getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
       printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
+      printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
       printf("\n");
     }
   };
diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp
index 187e6e4a21..0b9fc6e1e4 100644
--- a/projects/rccl/tools/TransferBench/TransferBench.cpp
+++ b/projects/rccl/tools/TransferBench/TransferBench.cpp
@@ -152,13 +152,15 @@ int main(int argc, char **argv)
     {
       // Get some aliases to link variables
       MemType const& exeMemType  = links[i].exeMemType;
-      int     const& exeIndex    = links[i].exeIndex;
       MemType const& srcMemType  = links[i].srcMemType;
       MemType const& dstMemType  = links[i].dstMemType;
-      int     const& srcIndex    = links[i].srcIndex;
-      int     const& dstIndex    = links[i].dstIndex;
       int     const& blocksToUse = links[i].numBlocksToUse;
 
+      // Get potentially remapped device indices
+      int const srcIndex = RemappedIndex(links[i].srcIndex, srcMemType);
+      int const exeIndex = RemappedIndex(links[i].exeIndex, exeMemType);
+      int const dstIndex = RemappedIndex(links[i].dstIndex, dstMemType);
+
       // Enable peer-to-peer access if necessary (can only be called once per unique pair)
       if (exeMemType == MEM_GPU)
       {
@@ -166,7 +168,7 @@ int main(int argc, char **argv)
         if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
         {
           auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
-           if (!peerAccessTracker.count(exeSrcPair))
+          if (!peerAccessTracker.count(exeSrcPair))
           {
             EnablePeerAccess(exeIndex, srcIndex);
             peerAccessTracker.insert(exeSrcPair);
@@ -369,8 +371,8 @@ int main(int argc, char **argv)
     // Release GPU memory
     for (int i = 0; i < numLinks; i++)
     {
-      DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
-      DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
+      DeallocateMemory(links[i].srcMemType, links[i].srcMem);
+      DeallocateMemory(links[i].dstMemType, links[i].dstMem);
 
       if (links[i].exeMemType == MEM_GPU)
       {
@@ -608,6 +610,46 @@ void GenerateConfigFile(char const* cfgFile, int numBlocks)
   fclose(fp);
 }
 
+int RemappedIndex(int const origIdx, MemType const memType)
+{
+  static std::vector<int> remapping;
+
+  // No need to re-map CPU devices
+  if (memType == MEM_CPU) return origIdx;
+
+  // Build remapping on first use
+  if (remapping.empty())
+  {
+    int numGpuDevices;
+    HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+    remapping.resize(numGpuDevices);
+
+    int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
+    if (!usePcieIndexing)
+    {
+      // For HIP-based indexing no remapping is necessary
+      for (int i = 0; i < numGpuDevices; ++i)
+        remapping[i] = i;
+    }
+    else
+    {
+      // Collect PCIe address for each GPU
+      std::vector<std::pair<std::string, int>> mapping;
+      char pciBusId[20];
+      for (int i = 0; i < numGpuDevices; ++i)
+      {
+        HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
+        mapping.push_back(std::make_pair(pciBusId, i));
+      }
+      // Sort GPUs by PCIe address then use that as mapping
+      std::sort(mapping.begin(), mapping.end());
+      for (int i = 0; i < numGpuDevices; ++i)
+        remapping[i] = mapping[i].second;
+    }
+  }
+  return remapping[origIdx];
+}
+
 void DisplayTopology()
 {
   int numGpuDevices;
@@ -632,7 +674,9 @@ void DisplayTopology()
       else
       {
         uint32_t linkType, hopCount;
-        HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, MEM_GPU),
+                                              RemappedIndex(j, MEM_GPU),
+                                              &linkType, &hopCount));
         printf(" %s-%d |",
                linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "  HT" :
                linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? " QPI" :
@@ -642,8 +686,8 @@ void DisplayTopology()
                hopCount);
       }
     }
-    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
-    printf(" %11s |  %d  \n", pciBusId, GetClosestNumaNode(i));
+    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, RemappedIndex(i, MEM_GPU)));
+    printf(" %11s |  %d  \n", pciBusId, GetClosestNumaNode(RemappedIndex(i, MEM_GPU)));
   }
 }
 
@@ -863,7 +907,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP
   }
 }
 
-void DeallocateMemory(MemType memType, int devIndex, float* memPtr)
+void DeallocateMemory(MemType memType, float* memPtr)
 {
   if (memType == MEM_CPU)
   {
@@ -1008,7 +1052,9 @@ std::string GetDesc(MemType srcMemType, int srcIndex,
       else
       {
         uint32_t linkType, hopCount;
-        HIP_CALL(hipExtGetLinkTypeAndHopCount(srcIndex, dstIndex, &linkType, &hopCount));
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(srcIndex, MEM_GPU),
+                                              RemappedIndex(dstIndex, MEM_GPU),
+                                              &linkType, &hopCount));
         return GetLinkTypeDesc(linkType, hopCount);
       }
     }
@@ -1032,7 +1078,7 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link)
   if (link.exeMemType == MEM_GPU)
   {
     // Switch to executing GPU
-    HIP_CALL(hipSetDevice(link.exeIndex));
+    HIP_CALL(hipSetDevice(RemappedIndex(link.exeIndex, MEM_GPU)));
 
     bool recordStart = (!ev.useSingleSync || iteration == 0);
     bool recordStop  = (!ev.useSingleSync || iteration == ev.numIterations - 1);
@@ -1188,14 +1234,14 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
 
   // Prepare Links
   links[0].srcMemType = links[1].dstMemType = srcMemType;
-  links[0].srcIndex   = links[1].dstIndex   = srcIndex;
+  links[0].srcIndex   = links[1].dstIndex   = RemappedIndex(srcIndex, srcMemType);
   links[0].dstMemType = links[1].srcMemType = dstMemType;
-  links[0].dstIndex   = links[1].srcIndex   = dstIndex;
+  links[0].dstIndex   = links[1].srcIndex   = RemappedIndex(dstIndex, dstMemType);
   // Either perform local read / remote write, or remote read / local write
   links[0].exeMemType = (readMode == 0 ? srcMemType : dstMemType);
-  links[0].exeIndex   = (readMode == 0 ? srcIndex   : dstIndex);
+  links[0].exeIndex   = RemappedIndex((readMode == 0 ? srcIndex   : dstIndex), links[0].exeMemType);
   links[1].exeMemType = (readMode == 0 ? dstMemType : srcMemType);
-  links[1].exeIndex   = (readMode == 0 ? dstIndex   : srcIndex);
+  links[1].exeIndex   = RemappedIndex((readMode == 0 ? dstIndex   : srcIndex), links[1].exeMemType);
 
   for (int i = 0; i <= isBidirectional; i++)
   {
@@ -1281,8 +1327,8 @@ double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional,
   // Release GPU memory
   for (int i = 0; i <= isBidirectional; i++)
   {
-    DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem);
-    DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem);
+    DeallocateMemory(links[i].srcMemType, links[i].srcMem);
+    DeallocateMemory(links[i].dstMemType, links[i].dstMem);
 
     if (links[i].exeMemType == MEM_GPU)
       {
diff --git a/projects/rccl/tools/TransferBench/TransferBench.hpp b/projects/rccl/tools/TransferBench/TransferBench.hpp
index ee9fefb5b5..bec6a50b52 100644
--- a/projects/rccl/tools/TransferBench/TransferBench.hpp
+++ b/projects/rccl/tools/TransferBench/TransferBench.hpp
@@ -105,7 +105,7 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
 void ParseLinks(char* line, int numCpus, int numGpus, std::vector<Link>& links);       // Parse Link information
 void EnablePeerAccess(int const deviceId, int const peerDeviceId);
 void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memPtr);
-void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
+void DeallocateMemory(MemType memType, float* memPtr);
 void CheckPages(char* byteArray, size_t numBytes, int targetId);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
 void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link);
@@ -119,3 +119,4 @@ std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
 std::string GetDesc(MemType srcMemType, int srcIndex,
                     MemType dstMemType, int dstIndex);
 std::string GetLinkDesc(Link const& link);
+int RemappedIndex(int const origIdx, MemType const memType);