From e58ec78d359860d1689a49f455df32c8be90a869 Mon Sep 17 00:00:00 2001
From: Audrey MP <aujadeva@gmail.com>
Date: Tue, 12 Sep 2023 15:34:40 -0400
Subject: [PATCH] Gcn arch name (#886)

We use CMake to determine whether we're compiling against a version of ROCm that supports gcnArchName, and handle architecture checking accordingly. This change adds a few helper functions as drop-in replacements for the places where we previously used gcnArch: sometimes to enable flags, and sometimes to set frequencies.
---
 CMakeLists.txt              | 16 +++++++
 src/clique/CliqueManager.cc | 31 ++++++-------
 src/clique/CliqueManager.h  |  2 +-
 src/graph/topo.cc           |  9 +++-
 src/graph/topo.h            | 22 +++++-----
 src/graph/tuning.cc         |  2 +-
 src/graph/xml.cc            |  1 +
 src/graph/xml.h             |  1 +
 src/include/archinfo.h      | 39 +++++++++++++++++
 src/init.cc                 | 16 +++----
 src/misc/archinfo.cc        | 86 +++++++++++++++++++++++++++++++++++++
 src/misc/npkit.cc           | 10 ++---
 src/transport/net.cc        |  2 +-
 src/transport/p2p.cc        |  2 +-
 tools/topo_expl/Makefile    |  3 +-
 tools/topo_expl/utils.cpp   |  8 ++--
 16 files changed, 193 insertions(+), 57 deletions(-)
 create mode 100644 src/include/archinfo.h
 create mode 100644 src/misc/archinfo.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c8bb9ffa16..90ace2446a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -122,6 +122,17 @@ message(STATUS "hipcc version:    ${hipcc_version_string}")
 ### Check for hipEventDisableSystemFence support
 check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)
 
+### Check for hipDeviceMallocUncached support
+check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+
+message(STATUS "HIP Library version: ${hip_VERSION_MINOR}")
+### gcnArchName replaces the deprecated gcnArch; fall back to gcnArch when hipcc is older than 5.7.31921
+if("${hipcc_version_string}" VERSION_LESS "5.7.31921")  # quoted so an empty version string cannot produce a malformed if()
+  set(HIP_NO_GCNARCHNAME ON)
+else()
+  set(HIP_NO_GCNARCHNAME OFF)
+endif()
+
 ### Check for indirect function call support
 if(ENABLE_IFC)
   if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -300,6 +311,7 @@ set(SRC_FILES
   src/graph/xml.cc
   src/graph/xml.h
   src/group.cc
+  src/include/archinfo.h
   src/include/align.h
   src/include/alloc.h
   src/include/argcheck.h
@@ -380,6 +392,7 @@ set(SRC_FILES
   src/include/utils.h
   src/init.cc
 #  src/init_nvtx.cc
+  src/misc/archinfo.cc
   src/misc/argcheck.cc
 # src/misc/cudawrap.cc
 # src/misc/gdrwrap.cc
@@ -547,6 +560,9 @@ else()
     target_compile_options(rccl PRIVATE --hipcc-func-supp)
   endif()
 endif()
+if(HIP_NO_GCNARCHNAME)
+  target_compile_definitions(rccl PRIVATE HIP_NO_GCNARCHNAME)
+endif()
 if (BUILD_BFD)
   if (HAVE_BFD)
     target_compile_definitions(rccl PRIVATE HAVE_BFD)
diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc
index 722c7ac1e1..7abfc67fda 100644
--- a/src/clique/CliqueManager.cc
+++ b/src/clique/CliqueManager.cc
@@ -61,7 +61,7 @@ CliqueManager::CliqueManager(int          const  rank,
   m_opIndexHead(0),
   m_opIndexTail(0),
   m_init(false),
-  m_gcnArch(0),
+  m_gcnArchName(char[256]),
   m_allReduceByteLimit(0),
   m_pinnedCliquePtrs(NULL),
   m_gpuBarrierGlobalCount(NULL),
@@ -243,13 +243,13 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
   CUDACHECK(hipGetDevice(&deviceId));
   hipDeviceProp_t devProp;
   CUDACHECK(hipGetDeviceProperties(&devProp, deviceId));
-  m_gcnArch = devProp.gcnArch;
+  m_gcnArchName = devProp.gcnArchName;
 
   // Establish when to use clique-based kernels based on input size
   SetByteLimits();
 
   m_init = true;
-  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch);
+  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArchName);
   return ncclSuccess;
 
 dropback:
@@ -266,12 +266,12 @@ void CliqueManager::SetByteLimits()
   m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit();
   if (m_allReduceByteLimit == 0)
   {
-    switch (m_gcnArch)
-    {
-    case 906: m_allReduceByteLimit =  16777216; break;
-    case 908: m_allReduceByteLimit =   8388608; break;
-    default:  m_allReduceByteLimit =  16777216; break;
-    }
+    if (IsArchMatch(m_gcnArchName, "gfx906"))
+      m_allReduceByteLimit = 16777216;
+    else if (IsArchMatch(m_gcnArchName, "gfx908"))
+      m_allReduceByteLimit = 8388608;
+    else
+      m_allReduceByteLimit = 16777216;
   }
 }
 
@@ -368,23 +368,18 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
     {
       // NOTE: These are currently based on collected data and not necessarily ideal for all hardware
       int numChannels;
-      switch (m_gcnArch)
-      {
-      case 906:
+      if (IsArchMatch(m_gcnArchName, "gfx906")) {
         if      (totalBytes <=   16384) numChannels =  1;
         else                            numChannels =  2;
-        break;
-      case 908:
+      } else if (IsArchMatch(m_gcnArchName, "gfx908")) {
         if      (totalBytes <=  131072) numChannels =  2;
         else if (totalBytes <=  524288) numChannels =  6;
         else if (totalBytes <= 1048576) numChannels = 13;
         else                            numChannels = 16;
-        break;
-      case 910:
+      } else if (IsArchMatch(m_gcnArchName, "gfx90a")) {
         if      (totalBytes <=  262144) numChannels =  4;
         else                            numChannels =  8;
-        break;
-      default:
+      } else {
         if      (totalBytes <=   65536) numChannels =  1;
         else if (totalBytes <=  262144) numChannels =  2;
         else if (totalBytes <=  524288) numChannels =  4;
diff --git a/src/clique/CliqueManager.h b/src/clique/CliqueManager.h
index caf2bdc8bd..33264759e7 100644
--- a/src/clique/CliqueManager.h
+++ b/src/clique/CliqueManager.h
@@ -95,7 +95,7 @@ protected:
   int32_t                      m_opIndexHead;                        // Track start of outstanding requests
   int32_t                      m_opIndexTail;                        // Track end of outstanding requests
   bool                         m_init;                               // Whether CliqueManager has been initialized
-  int                          m_gcnArch;                            // Device GCN arch value
+  char                         m_gcnArchName[256];                   // Device GCN arch name (e.g. "gfx90a")
   size_t                       m_allReduceByteLimit;                 // Byte limit for AllReduce
   cliqueDevicePtrs_t*          m_pinnedCliquePtrs;                   // Pinned-host-memory (device accessible) containing device pointers
   int*                         m_gpuBarrierGlobalCount;              // Part of GPU barrier (count variable shared across ranks)
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index f246630fce..345b37b6ab 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -369,7 +369,14 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
 
 ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
   NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
-  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gcn", &gpu->gpu.gcn));
+  NCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gpu->gpu.gcn));
+  if (strcmp(gpu->gpu.gcn, "906") == 0) {
+    gpu->gpu.gcn = "gfx906";
+  } else if (strcmp(gpu->gpu.gcn, "908") == 0) {
+    gpu->gpu.gcn = "gfx908";
+  } else if (strcmp(gpu->gpu.gcn, "910") == 0) {
+    gpu->gpu.gcn = "gfx90a";
+  }
   rcclHipDeviceArch_t arch;
   NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
   memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 1845b5f3e5..5cd973e9c8 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -10,6 +10,8 @@
 
 #include "graph.h"
 #include "core.h"
+#include "archinfo.h"
+#include <string.h>
 
 #define LOC_BW 5000.0
 #define SM60_NVLINK_BW 18.0
@@ -123,7 +125,7 @@ struct ncclTopoNode {
       int rank;
       int cudaCompCap;
       int gdrSupport;
-      int gcn;
+      const char* gcn;
       hipDeviceArch_t arch;
     }gpu;
     struct {
@@ -224,17 +226,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
 }
 
 // Returns XGMI speed in GB/s
-static float ncclTopoXGMISpeed(int gcn) {
-  switch (gcn) {
-    case 910:
-      return MI200_XGMI_WIDTH;
-    case 940:
-    case 941:
-    case 942:
-      return GFX94X_XGMI_WIDTH;
-    default:
-      return VEGA_XGMI_WIDTH;
-  }
+static float ncclTopoXGMISpeed(const char* gcn) {
+  // Select the XGMI width from the architecture name; any name that is not
+  // recognised below is given the Vega width.
+  if (IsArchMatch(gcn, "gfx90a")) return MI200_XGMI_WIDTH;
+  // IsArchMatch compares prefixes, so every name starting with "gfx94" lands here.
+  if (IsArchMatch(gcn, "gfx94")) return GFX94X_XGMI_WIDTH;
+  return VEGA_XGMI_WIDTH;
+}
 
 #if ENABLE_COLLTRACE
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 8a9f964db1..47f57f7e65 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -455,7 +455,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 #if defined(ENABLE_LL128)
       // Enable LL128 by default only on gfx90a with available tuning table
       pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
-        (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->ll128Enabled) ? 1 : 0;
+        (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0;
 #else
       pEnable = 0;
 #endif
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 138d48e401..d11f80864a 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -14,6 +14,7 @@
 #include "nvmlwrap.h"
 #include "xml.h"
 #include "rocm_smi_wrap.h"
+#include "archinfo.h"
 
 /*******************/
 /* XML File Parser */
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 5ffa6c90c5..d56fb32768 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -12,6 +12,7 @@
 #include "debug.h"
 #include "checks.h"
 #include <stdlib.h>
+#include "archinfo.h"
 
 // A few constraints to make the implementation easy
 #define MAX_STR_LEN 255
diff --git a/src/include/archinfo.h b/src/include/archinfo.h
new file mode 100644
index 0000000000..a4a163bc3f
--- /dev/null
+++ b/src/include/archinfo.h
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ARCHINFO_H_
+#define ARCHINFO_H_
+
+#include <string.h>
+
+/*
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
+*/
+
+void GcnArchNameFormat(char *gcnArchName, char* out);      // reduce a "gfx900:xnack+:..." style name to its leading token; NOTE(review): confirm out really receives it
+void GcnArchConvertToGcnArchName(int gcnArch, char* out);  // legacy numeric gcnArch 906/908/910 -> "gfx906"/"gfx908"/"gfx90a", written to out
+int GetGcnArchName(int deviceId, char* out);               // arch name of deviceId into out; returns 1 when built with HIP_NO_GCNARCHNAME, else 0
+double GetDeviceWallClockRateInKhz(int deviceId);          // 1.0E5 when the device arch name starts with "gfx94", otherwise 2.5E4
+bool IsArchMatch(char const* arch, char const* target);    // true when arch begins with target (prefix comparison)
+
+#endif // ARCHINFO_H_
diff --git a/src/init.cc b/src/init.cc
index e228120202..d19edd8b37 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -33,6 +33,7 @@
 #include <unistd.h>
 #include "graph/topo.h"
 #include "graph/xml.h"
+#include "archinfo.h"
 
 // [RCCL]
 #include "git_version.h"
@@ -174,15 +175,10 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
 void *ncclCommThreadMain(void *arg) {
   ncclComm_t comm = (ncclComm_t)arg;
   int head[MAXCHANNELS];
-  hipDeviceProp_t devProp;
   double vega_gpu_rtc_freq;
 
   memset(head, 0, sizeof(int)*MAXCHANNELS);
-  hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
-  if (devProp.gcnArch/10 == 94 && status == hipSuccess)
-    vega_gpu_rtc_freq = 1.0E8;
-  else
-    vega_gpu_rtc_freq = 2.5E7;
+  vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
   #define MAX_NAME_LENGTH 64
   char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
   for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -1230,17 +1226,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+       IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
     allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
        (comm->topo->type & RCCL_TOPO_CR8G))
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-      comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
     allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;
diff --git a/src/misc/archinfo.cc b/src/misc/archinfo.cc
new file mode 100644
index 0000000000..565e0a7ddb
--- /dev/null
+++ b/src/misc/archinfo.cc
@@ -0,0 +1,86 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "archinfo.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+void GcnArchNameFormat(char* gcnArchName, char* out) {
+  // Copies the text before the first ':' of a name such as "gfx900:xnack+:etc-" into out ("gfx900"), NUL-terminated.
+  // out must be large enough; strcspn (not strtok) keeps the call reentrant and leaves gcnArchName unmodified.
+  out[0] = '\0';
+  strncat(out, gcnArchName, strcspn(gcnArchName, ":"));  // strncat always appends the terminating NUL
+}
+
+void GcnArchConvertToGcnArchName(int gcnArch, char* gcnArchName) {
+  // gcnArch is deprecated in favour of gcnArchName, but older ROCm builds and some data files still
+  // carry the numeric value. Translate the numeric ids used by this codebase into their "gfxNNN"
+  // name, written NUL-terminated into gcnArchName (needs at least 7 bytes).
+  // Unrecognised ids produce an empty string, so callers never read an uninitialised buffer.
+  const char* name = "";
+  switch (gcnArch) {
+    case 906: name = "gfx906"; break;
+    case 908: name = "gfx908"; break;
+    case 910: name = "gfx90a"; break;  // numeric 910 is really gfx90a
+    // gfx94x: keeps the "gfx94" prefix checks working on builds that only expose gcnArch
+    case 940: name = "gfx940"; break;
+    case 941: name = "gfx941"; break;
+    case 942: name = "gfx942"; break;
+    default: break;
+  }
+  // every name above is 6 characters long, so this writes at most 7 bytes
+  strcpy(gcnArchName, name);
+}
+
+int GetGcnArchName(int deviceId, char* out) {
+  // Writes the architecture name of deviceId (e.g. "gfx90a") into out, whichever ROCm version we build against.
+  // Returns 0 when taken from gcnArchName, 1 when converted from the legacy gcnArch, -1 if HIP reports an error.
+  hipDeviceProp_t devProp;
+  hipError_t status = hipGetDeviceProperties(&devProp, deviceId);
+  if (status != hipSuccess) {
+    // Do not exit() from library code: return an empty name (matches no arch) so callers use their defaults.
+    out[0] = '\0';
+    return -1;
+  }
+#ifdef HIP_NO_GCNARCHNAME
+  // CMake defines HIP_NO_GCNARCHNAME for hipcc older than 5.7.31921: rely on the numeric gcnArch.
+  GcnArchConvertToGcnArchName(devProp.gcnArch, out);
+  return 1;
+#else
+  GcnArchNameFormat(devProp.gcnArchName, out);
+  return 0;
+#endif
+}
+
+double GetDeviceWallClockRateInKhz(int deviceId) {
+  // Wall-clock (RTC) rate in kHz for deviceId: 1.0E5 for names starting with "gfx94", 2.5E4 for everything else.
+  char gcn[256] = {0};  // real storage for the name (was an uninitialised pointer); zeroed so a failed lookup reads as ""
+  GetGcnArchName(deviceId, gcn);
+  if (strncmp("gfx94", gcn, 5) == 0)
+    return 1.0E5;
+  return 2.5E4;
+}
+
+bool IsArchMatch(char const* arch, char const* target) {
+  // Prefix match: true when arch begins with target (so "gfx94" matches "gfx942"); false if either pointer is NULL.
+  return arch != NULL && target != NULL && strncmp(arch, target, strlen(target)) == 0;
+}
diff --git a/src/misc/npkit.cc b/src/misc/npkit.cc
index 0302fbd92b..0c72779645 100644
--- a/src/misc/npkit.cc
+++ b/src/misc/npkit.cc
@@ -9,6 +9,7 @@
 
 #include "alloc.h"
 #include "npkit/npkit.h"
+#include "archinfo.h"
 
 uint64_t NpKit::rank_ = 0;
 
@@ -120,13 +121,8 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
   dump_file_path = dump_dir;
   dump_file_path += "/gpu_clock_rate_rank_";
   dump_file_path += std::to_string(rank_);
-  hipDeviceProp_t devProp;
-  int vega_gpu_rtc_freq_in_khz;
-  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (devProp.gcnArch/10 == 94)
-    vega_gpu_rtc_freq_in_khz = 100000;
-  else
-    vega_gpu_rtc_freq_in_khz = 25000;
+  // get the rtc frequency (in kHz) for device 0 from the archinfo helper, which selects it by GPU architecture name
+  double vega_gpu_rtc_freq_in_khz = GetDeviceWallClockRateInKhz(0);
   std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
   auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
   gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 274006d97f..7c454b87d8 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
   if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
   NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
   send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
+  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
     CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
     send->conn.curr_hdp_reg = req.curr_hdp_reg;
   }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index fd9e67c72d..67ad2c2c31 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
     return ncclInternalError;
   }
-  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
+  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
     CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile
index 7446d7b52a..23d76e8a16 100644
--- a/tools/topo_expl/Makefile
+++ b/tools/topo_expl/Makefile
@@ -9,7 +9,7 @@ EXE = topo_expl
 CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
 
 files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
-	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
+	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
 
 all: hipify $(EXE)
 
@@ -21,6 +21,7 @@ hipify:
 	mkdir -p hipify_rccl
 	cp -a ../../src/include/ hipify_rccl/
 	cp -a ../../src/graph/ hipify_rccl/
+	cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
 	hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
 	hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
 
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index 3ce4e7f5f5..8a3d04298f 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -862,17 +862,17 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
   allGather3Data[rank].nc = 2;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
     allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
        (comm->topo->type & RCCL_TOPO_CR8G))
     allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-      comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
     allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
     allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
   if (ringGraph.nChannels > MAXCHANNELS/2)
     allGather3Data[rank].nc = 1;