Gcn arch name (#886)

We use CMake to determine whether we're compiling against a version of ROCm that supports gcnArchName, and handle architecture checking accordingly. This change includes a few helper functions as drop-in replacements for the functionality we previously used gcnArch for: sometimes to enable flags, and sometimes to set frequencies.
2023-09-12 15:34:40 -04:00
commit e58ec78d35
@@ -122,6 +122,17 @@ message(STATUS "hipcc version:    ${hipcc_version_string}")
 ### Check for hipEventDisableSystemFence support
 check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVENT_DISABLE_FENCE)

+### Check for hipDeviceMallocUncached support
+check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
+
+message(STATUS "HIP Library version: ${hip_VERSION}")
+### gcnArch is deprecated: use gcnArchName unless hipcc is older than 5.7.31921
+if("${hipcc_version_string}" VERSION_LESS "5.7.31921")
+  set(HIP_NO_GCNARCHNAME ON)
+else()
+  set(HIP_NO_GCNARCHNAME OFF)
+endif()
+
 ### Check for indirect function call support
 if(ENABLE_IFC)
  if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -300,6 +311,7 @@ set(SRC_FILES
  src/graph/xml.cc
  src/graph/xml.h
  src/group.cc
+  src/include/archinfo.h
  src/include/align.h
  src/include/alloc.h
  src/include/argcheck.h
@@ -380,6 +392,7 @@ set(SRC_FILES
  src/include/utils.h
  src/init.cc
 #  src/init_nvtx.cc
+  src/misc/archinfo.cc
  src/misc/argcheck.cc
 # src/misc/cudawrap.cc
 # src/misc/gdrwrap.cc
@@ -547,6 +560,9 @@ else()
    target_compile_options(rccl PRIVATE --hipcc-func-supp)
  endif()
 endif()
+if(HIP_NO_GCNARCHNAME)  # ON when hipcc is older than 5.7.31921 (see the version check above)
+  target_compile_definitions(rccl PRIVATE HIP_NO_GCNARCHNAME)  # selects the gcnArch path in GetGcnArchName
+endif()
 if (BUILD_BFD)
  if (HAVE_BFD)
    target_compile_definitions(rccl PRIVATE HAVE_BFD)
@@ -61,7 +61,7 @@ CliqueManager::CliqueManager(int          const  rank,
  m_opIndexHead(0),
  m_opIndexTail(0),
  m_init(false),
-  m_gcnArch(0),
+  m_gcnArchName(),  // value-initialize the array so the name starts out as an empty string
  m_allReduceByteLimit(0),
  m_pinnedCliquePtrs(NULL),
  m_gpuBarrierGlobalCount(NULL),
@@ -243,13 +243,13 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
  CUDACHECK(hipGetDevice(&deviceId));
  hipDeviceProp_t devProp;
  CUDACHECK(hipGetDeviceProperties(&devProp, deviceId));
-  m_gcnArch = devProp.gcnArch;
+  strncpy(m_gcnArchName, devProp.gcnArchName, sizeof(m_gcnArchName) - 1);  // arrays cannot be assigned
+  m_gcnArchName[sizeof(m_gcnArchName) - 1] = '\0';                          // strncpy may not terminate

  // Establish when to use clique-based kernels based on input size
  SetByteLimits();

  m_init = true;
-  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %d]", m_cliqueMode, m_gcnArch);
+  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d) [GCN %s]", m_cliqueMode, m_gcnArchName);  // arch is a string now
  return ncclSuccess;

 dropback:
@@ -266,12 +266,12 @@ void CliqueManager::SetByteLimits()
  m_allReduceByteLimit = rcclParamAllReduceCliqueByteLimit();
  if (m_allReduceByteLimit == 0)
  {
-    switch (m_gcnArch)
-    {
-    case 906: m_allReduceByteLimit =  16777216; break;
-    case 908: m_allReduceByteLimit =   8388608; break;
-    default:  m_allReduceByteLimit =  16777216; break;
-    }
+    if (IsArchMatch(m_gcnArchName, "gfx906"))
+      m_allReduceByteLimit = 16777216;
+    else if (IsArchMatch(m_gcnArchName, "gfx908"))
+      m_allReduceByteLimit = 8388608;
+    else
+      m_allReduceByteLimit = 16777216;
  }
 }

@@ -368,23 +368,18 @@ ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
    {
      // NOTE: These are currently based on collected data and not necessarily ideal for all hardware
      int numChannels;
-      switch (m_gcnArch)
-      {
-      case 906:
+      if (IsArchMatch(m_gcnArchName, "gfx906")) {
        if      (totalBytes <=   16384) numChannels =  1;
        else                            numChannels =  2;
-        break;
-      case 908:
+      } else if (IsArchMatch(m_gcnArchName, "gfx908")) {
        if      (totalBytes <=  131072) numChannels =  2;
        else if (totalBytes <=  524288) numChannels =  6;
        else if (totalBytes <= 1048576) numChannels = 13;
        else                            numChannels = 16;
-        break;
-      case 910:
+      } else if (IsArchMatch(m_gcnArchName, "gfx90a")) {
        if      (totalBytes <=  262144) numChannels =  4;
        else                            numChannels =  8;
-        break;
-      default:
+      } else {
        if      (totalBytes <=   65536) numChannels =  1;
        else if (totalBytes <=  262144) numChannels =  2;
        else if (totalBytes <=  524288) numChannels =  4;
@@ -95,7 +95,7 @@ protected:
  int32_t                      m_opIndexHead;                        // Track start of outstanding requests
  int32_t                      m_opIndexTail;                        // Track end of outstanding requests
  bool                         m_init;                               // Whether CliqueManager has been initialized
-  int                          m_gcnArch;                            // Device GCN arch value
+  char                         m_gcnArchName[256];                   // Device GCN arch name (e.g. "gfx90a")
  size_t                       m_allReduceByteLimit;                 // Byte limit for AllReduce
  cliqueDevicePtrs_t*          m_pinnedCliquePtrs;                   // Pinned-host-memory (device accessible) containing device pointers
  int*                         m_gpuBarrierGlobalCount;              // Part of GPU barrier (count variable shared across ranks)
@@ -369,7 +369,14 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s

 ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
-  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gcn", &gpu->gpu.gcn));
+  NCCLCHECK(xmlGetAttr(xmlGpu, "gcn", &gpu->gpu.gcn));
+  if (gpu->gpu.gcn == NULL) return ncclInternalError;  // guard: a NULL value would crash strcmp below
+  // Older topology files carry the numeric gcnArch; map every value the integer-based code knew.
+  static const char* const legacyGcn[][2] = {{"906", "gfx906"}, {"908", "gfx908"}, {"910", "gfx90a"},
+                                             {"940", "gfx940"}, {"941", "gfx941"}, {"942", "gfx942"}};
+  for (const auto& legacy : legacyGcn) {
+    if (strcmp(gpu->gpu.gcn, legacy[0]) == 0) { gpu->gpu.gcn = legacy[1]; break; }
+  }
  rcclHipDeviceArch_t arch;
  NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));
  memcpy(&gpu->gpu.arch, &arch.arch, sizeof(hipDeviceArch_t));
@@ -10,6 +10,8 @@

 #include "graph.h"
 #include "core.h"
+#include "archinfo.h"
+#include <string.h>

 #define LOC_BW 5000.0
 #define SM60_NVLINK_BW 18.0
@@ -123,7 +125,7 @@ struct ncclTopoNode {
      int rank;
      int cudaCompCap;
      int gdrSupport;
-      int gcn;
+      const char* gcn;
      hipDeviceArch_t arch;
    }gpu;
    struct {
@@ -224,17 +226,13 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in
 }

 // Returns XGMI speed in GB/s
-static float ncclTopoXGMISpeed(int gcn) {
-  switch (gcn) {
-    case 910:
-      return MI200_XGMI_WIDTH;
-    case 940:
-    case 941:
-    case 942:
-      return GFX94X_XGMI_WIDTH;
-    default:
-      return VEGA_XGMI_WIDTH;
-  }
+static float ncclTopoXGMISpeed(const char* gcn) {
+  // IsArchMatch is a prefix comparison, so "gfx94" covers the whole gfx94x family.
+  // gfx90a maps to the MI200 width; every architecture not listed here
+  // uses the Vega width.
+  if (IsArchMatch(gcn, "gfx90a")) return MI200_XGMI_WIDTH;
+  if (IsArchMatch(gcn, "gfx94")) return GFX94X_XGMI_WIDTH;
+  return VEGA_XGMI_WIDTH;
 }

 #if ENABLE_COLLTRACE
@@ -455,7 +455,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
 #if defined(ENABLE_LL128)
      // Enable LL128 by default only on gfx90a with available tuning table
      pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
-        (comm->topo->nodes[GPU].nodes[0].gpu.gcn == 910 && comm->topo->ll128Enabled) ? 1 : 0;
+        (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled) ? 1 : 0;
 #else
      pEnable = 0;
 #endif
@@ -14,6 +14,7 @@
 #include "nvmlwrap.h"
 #include "xml.h"
 #include "rocm_smi_wrap.h"
+#include "archinfo.h"

 /*******************/
 /* XML File Parser */
@@ -12,6 +12,7 @@
 #include "debug.h"
 #include "checks.h"
 #include <stdlib.h>
+#include "archinfo.h"

 // A few constraints to make the implementation easy
 #define MAX_STR_LEN 255
@@ -0,0 +1,39 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ARCHINFO_H_
+#define ARCHINFO_H_
+
+#include <string.h>
+
+/*
+Helpers for obtaining a device's GCN architecture name and comparing it against
+a target such as "gfx90a". `out` parameters are caller-supplied character buffers.
+*/
+
+void GcnArchNameFormat(char *gcnArchName, char* out);       // meant to extract the part before the first ':'
+void GcnArchConvertToGcnArchName(int gcnArch, char* out);   // maps gcnArch 906/908/910 to gfx906/gfx908/gfx90a
+int GetGcnArchName(int deviceId, char* out);                // returns 1 under HIP_NO_GCNARCHNAME, otherwise 0
+double GetDeviceWallClockRateInKhz(int deviceId);           // 1.0E5 for gfx94*, 2.5E4 for any other device
+bool IsArchMatch(char const* arch, char const* target);     // true when arch starts with target
+
+#endif // ARCHINFO_H_
@@ -33,6 +33,7 @@
 #include <unistd.h>
 #include "graph/topo.h"
 #include "graph/xml.h"
+#include "archinfo.h"

 // [RCCL]
 #include "git_version.h"
@@ -174,15 +175,10 @@ RCCL_PARAM(KernelCollTraceEnable, "KERNEL_COLL_TRACE_ENABLE", 0);
 void *ncclCommThreadMain(void *arg) {
  ncclComm_t comm = (ncclComm_t)arg;
  int head[MAXCHANNELS];
-  hipDeviceProp_t devProp;
  double vega_gpu_rtc_freq;

  memset(head, 0, sizeof(int)*MAXCHANNELS);
-  hipError_t status = hipGetDeviceProperties(&devProp, comm->cudaDev);
-  if (devProp.gcnArch/10 == 94 && status == hipSuccess)
-    vega_gpu_rtc_freq = 1.0E8;
-  else
-    vega_gpu_rtc_freq = 2.5E7;
+  vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
  #define MAX_NAME_LENGTH 64
  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+2));
  for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
@@ -1230,17 +1226,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
  allGather3Data[rank].nc = 2;
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+       IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
    allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
    allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
       (comm->topo->type & RCCL_TOPO_CR8G))
    allGather3Data[rank].nc = 4;
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-      comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
    allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
  if (ringGraph.nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;
@@ -0,0 +1,86 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "archinfo.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+void GcnArchNameFormat(char* gcnArchName, char* out) {
+  // Copies the base architecture of a device string such as "gfx900:xnack+:blah-:etc-" into out
+  // ("gfx900"). Unlike strtok, strcspn keeps no hidden state and leaves gcnArchName unmodified.
+  out[0] = '\0';
+  strncat(out, gcnArchName, strcspn(gcnArchName, ":"));
+}
+
+void GcnArchConvertToGcnArchName(int gcnArch, char* gcnArchName) {
+  // gcnArch is deprecated in favour of gcnArchName, but older HIP versions and some data files still
+  // provide the numeric value. Only a handful of architectures were coded before the deprecation,
+  // so those are translated here; gcnArchName must have room for 7 bytes.
+  // Any other value yields an empty string (previously the first six bytes were left uninitialized),
+  // which matches no architecture and sends callers down their default paths.
+  const char* name = "";
+  switch (gcnArch) {
+    case 906: name = "gfx906"; break;
+    case 908: name = "gfx908"; break;
+    case 910: name = "gfx90a"; break;  // this is actually 90a
+    // 940-942 were recognised by the integer-based checks this helper replaces
+    case 940: name = "gfx940"; break;
+    case 941: name = "gfx941"; break;
+    case 942: name = "gfx942"; break;
+    default: break;
+  }
+  strcpy(gcnArchName, name);
+}
+
+int GetGcnArchName(int deviceId, char* out) {
+  // Writes the device's GCN arch name (e.g. "gfx90a") to out regardless of the ROCm version in use.
+  // Returns 1 if derived from the numeric gcnArch, 0 if from gcnArchName, -1 (out = "") on query failure.
+  hipDeviceProp_t devProp;
+  hipError_t status = hipGetDeviceProperties(&devProp, deviceId);
+  if (status != hipSuccess) {
+    // No exit() from library code: an empty name matches no arch, so callers use their defaults.
+    out[0] = '\0';
+    return -1;
+  }
+#ifdef HIP_NO_GCNARCHNAME
+  // hipcc is older than 5.7.31921 (see the CMake check), so rely on the numeric gcnArch.
+  GcnArchConvertToGcnArchName(devProp.gcnArch, out);
+  return 1;
+#else
+  GcnArchNameFormat(devProp.gcnArchName, out);
+  return 0;
+#endif
+}
+
+double GetDeviceWallClockRateInKhz(int deviceId) {
+  // Returns the device wall-clock rate in kHz: 1.0E5 for gfx94x parts, 2.5E4 for everything else
+  // (an empty name also yields 2.5E4).
+  // The name needs real storage; passing an uninitialized char* made GetGcnArchName write through it.
+  char gcn[256] = {0};  // 256 matches the size of hipDeviceProp_t::gcnArchName
+  GetGcnArchName(deviceId, gcn);
+  return IsArchMatch(gcn, "gfx94") ? 1.0E5 : 2.5E4;
+}
+
+bool IsArchMatch(char const* arch, char const* target) {
+  // Returns true when arch begins with target (so "gfx94" matches "gfx940"); NULL never matches.
+  return arch != NULL && target != NULL && strncmp(arch, target, strlen(target)) == 0;
+}
@@ -9,6 +9,7 @@

 #include "alloc.h"
 #include "npkit/npkit.h"
+#include "archinfo.h"

 uint64_t NpKit::rank_ = 0;

@@ -120,13 +121,8 @@ ncclResult_t NpKit::Dump(const std::string& dump_dir) {
  dump_file_path = dump_dir;
  dump_file_path += "/gpu_clock_rate_rank_";
  dump_file_path += std::to_string(rank_);
-  hipDeviceProp_t devProp;
-  int vega_gpu_rtc_freq_in_khz;
-  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (devProp.gcnArch/10 == 94)
-    vega_gpu_rtc_freq_in_khz = 100000;
-  else
-    vega_gpu_rtc_freq_in_khz = 25000;
+  // The wrapper picks the RTC rate from the arch name; keep an int so to_string still writes e.g. "100000"
+  int vega_gpu_rtc_freq_in_khz = static_cast<int>(GetDeviceWallClockRateInKhz(0));
  std::string clock_rate_str = std::to_string(vega_gpu_rtc_freq_in_khz);
  auto gpu_clock_rate_file = std::fstream(dump_file_path, std::ios::out);
  gpu_clock_rate_file.write(clock_rate_str.c_str(), clock_rate_str.length());
@@ -190,7 +190,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
  if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
  NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
  send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0;
-  if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
+  if (req.useGdr && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
    CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev));
    send->conn.curr_hdp_reg = req.curr_hdp_reg;
  }
@@ -354,7 +354,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
    return ncclInternalError;
  }
-  if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910 && comm->topo->nodes[GPU].nodes[0].gpu.gcn/10 != 94) {
+  if (!isXGMI && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94")) {
    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
  }
@@ -9,7 +9,7 @@ EXE = topo_expl
 CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL

 files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
-	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
+	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc

 all: hipify $(EXE)

@@ -21,6 +21,7 @@ hipify:
 	mkdir -p hipify_rccl
 	cp -a ../../src/include/ hipify_rccl/
 	cp -a ../../src/graph/ hipify_rccl/
+	cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
 	hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
 	hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*

@@ -862,17 +862,17 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
  allGather3Data[rank].nc = 2;
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-       comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx906") && allXgmi)
    allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx908"))
    allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
       (comm->topo->type & RCCL_TOPO_CR8G))
    allGather3Data[rank].nc = 4;
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks &&
-      comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+      IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
    allGather3Data[rank].nc = 4;
-  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx90a"))
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
  if (ringGraph.nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;